# Data Pipeline

1. Load and process CSV files published on https://data.gov.sg/collections/189/view.
2. Load any existing geocoded addresses.
3. Update geocoded addresses.
4. Make H3 geometries.
5. Write the processed data to disk for downstream analytics.

In [1]:
import sys
sys.dont_write_bytecode = True

import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas

import h3
import matplotlib.pyplot as plt
import contextily as cx

# Local imports.
from resale_flat_prices.csv_data.csv_data import CsvData
from resale_flat_prices.geocode.geocoded_addresses import GeocodedAddresses
from resale_flat_prices.h3_utils.h3_statistics import grid_ring_monthly_median_price
from resale_flat_prices.h3_utils.h3_vis_utils import plot_df


# Data directories (relative paths): raw CSV files and processed data.
csv_data_dir, processed_data_dir = (
    Path(directory)
    for directory in ("../data/ResaleFlatPrices/", "../data/processed_data/")
)

In [2]:
# Raw CSV files are published on https://data.gov.sg/collections/189/view and are
# expected to be present in csv_data_dir.
csv_data = CsvData(csv_data_dir, wanted_columns="default")

# Run the pipeline stages in order: load, then compile, then process.
for pipeline_step in (
    csv_data.load_csv_files,
    csv_data.compile_csv_data,
    csv_data.process_csv_data,
):
    pipeline_step()

compiled_shape = csv_data.df.shape
print("Loaded and compiled CSV data into shape {}.".format(compiled_shape))

Loaded and compiled CSV data into shape (940168, 21).


In [3]:
# Load the geocoded addresses that already exist on disk.
geocoded_json_path = processed_data_dir / "geocoded_addresses.json"
geocoded_addresses = GeocodedAddresses()
geocoded_addresses.read_json(geocoded_json_path)
print("Loaded {} existing geocoded addresses.".format(len(geocoded_addresses.address_dict)))

# Work out which addresses in the CSV data are not among the geocoded ones yet.
all_unique_addresses = set(csv_data.df["address"].unique())
all_unique_geocoded_addresses = geocoded_addresses.get_all_geocoded_addresses()
missing_addresses = all_unique_addresses.difference(all_unique_geocoded_addresses)
num_missing_addresses = len(missing_addresses)
print("Found {} new addresses to be geocoded in loaded CSV data.".format(num_missing_addresses))

# Update the geocoded addresses with the missing ones and write them back to the
# same JSON file they were loaded from.
if num_missing_addresses > 0:
    print("Updating {} new geocoded addresses.".format(num_missing_addresses))
    geocoded_addresses.update_geocoded_addresses(missing_addresses)
    geocoded_addresses.to_json(geocoded_json_path)

# Report every entry flagged by the latitude/longitude verification.
problem_addresses = geocoded_addresses.verify_geocoded_latitudes_and_longitudes(country="SINGAPORE")
num_problem_addresses = len(problem_addresses)
if num_problem_addresses > 0:
    warning_template = "Warning - the following {} addresses do not seem to have been geocoded correctly."
    print(warning_template.format(num_problem_addresses))
    for position, problem_address in enumerate(problem_addresses):
        print("{:05d}: {}.".format(position, problem_address))

Loaded 9867 existing geocoded addresses.
Found 0 new addresses to be geocoded in loaded CSV data.


In [4]:
# Merge geocoded addresses with the CSV data.
geocode_df = geocoded_addresses.address_dict_to_df()
csv_df = csv_data.get_df()

# how="left" keeps every row of csv_df; rows whose address has no match in
# geocode_df get missing values in the columns coming from geocode_df.
# validate="many_to_one" makes pandas raise MergeError if "address" is not unique
# in geocode_df; without it, a duplicated address would silently duplicate the
# matching resale rows in the merged output.
processed_data_df = pd.merge(
    left=csv_df,
    right=geocode_df,
    on="address",
    how="left",
    validate="many_to_one",
)

In [5]:
# Output the merged processed data to disk.
output_csv_file = "resale-flat-prices.csv.zip"
output_csv_path = processed_data_dir / output_csv_file

# Zip the CSV only when the file name ends in "zip"; otherwise write it uncompressed.
compression = "zip" if output_csv_file.endswith("zip") else None

print("Saving processed data to {}.".format(output_csv_path))
processed_data_df.to_csv(output_csv_path, index=False, compression=compression)

Saving processed data to ../data/processed_data/resale-flat-prices.csv.zip.
