# Data Pipeline

1. Load and process CSV files published on https://data.gov.sg/collections/189/view.
2. Load any existing geocoded addresses.
3. Update geocoded addresses.
4. Make H3 geometries.
5. Output data to disk for further downstream analytics.

In [None]:
import sys
sys.dont_write_bytecode = True

import os
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import geopandas

import h3
import matplotlib.pyplot as plt
import contextily as cx

# Local imports.
from resale_flat_prices.csv_data.csv_data import CsvData
from resale_flat_prices.geocode.geocoded_addresses import GeocodedAddresses
from resale_flat_prices.h3_utils.h3_statistics import grid_ring_monthly_median_price
from resale_flat_prices.h3_utils.h3_vis_utils import plot_df


# Locations on disk, relative to this notebook: the folder holding the resale
# flat price CSV files, and the folder holding processed artefacts (such as the
# geocoded addresses JSON).
csv_data_dir, processed_data_dir = (
    Path("../data/ResaleFlatPrices/"),
    Path("../data/processed_data/"),
)

In [None]:
# Step 1 of the pipeline: run the CsvData stages in order (load, compile,
# process) over the CSV directory, keeping the default column selection.
csv_data = CsvData(csv_data_dir, wanted_columns="default")
csv_data.load_csv_files()
csv_data.compile_csv_data()
csv_data.process_csv_data()

# Quick sanity check on the resulting table: its type and (rows, columns).
processed_csv_df = csv_data.df
print(type(processed_csv_df), processed_csv_df.shape)

In [None]:
# Step 2 of the pipeline: load any existing geocoded addresses from the JSON
# file kept in the processed-data directory.
geocoded_addresses_json = processed_data_dir / "geocoded_addresses.json"

geocoded_addresses = GeocodedAddresses()
geocoded_addresses.read_json(geocoded_addresses_json)

In [None]:
# Distinct addresses present in the resale data.
resale_addresses = csv_data.df["address"].unique()
all_unique_addresses = set(resale_addresses)

# Addresses that already have a geocoded entry.
all_unique_geocoded_addresses = geocoded_addresses.get_all_geocoded_addresses()

# Addresses seen in the resale data that have no geocoded entry yet; the count
# is printed so it is obvious whether any geocoding work is outstanding.
missing_addresses = all_unique_addresses.difference(all_unique_geocoded_addresses)
print(len(missing_addresses))

# Geocoded entries flagged by the latitude/longitude verification for
# Singapore; again only the count is printed.
problem_addresses = geocoded_addresses.verify_geocoded_latitudes_and_longitudes(
    country="SINGAPORE",
)
print(len(problem_addresses))

In [None]:
# H3 resolution passed to make_h3_geometries below.
RESOLUTION = 9

# Right side: geocoded addresses with their H3 geometries (EPSG:4326).
# Left side: the processed resale data.
right_df = geocoded_addresses.make_h3_geometries(resolution=RESOLUTION, crs="EPSG:4326")
left_df = csv_data.get_df()

# Left join on "address" so every resale row is kept, even when its address has
# no geocoded match; the result is wrapped as a GeoDataFrame that carries the
# CRS of the geocoded frame.
df = geopandas.GeoDataFrame(
    pd.merge(left_df, right_df, how="left", on="address"),
    crs=right_df.crs,
)

print(type(df), df.shape)

In [None]:
# Month whose median prices are extracted for mapping. It is compared against
# the "year_month" column exactly as before; change it here to map another month.
LATEST_YEAR_MONTH = "2024-10"

# Monthly median price per square metre for each H3 cell, computed by the
# project helper with a grid-ring distance of 1.
# NOTE(review): presumably the helper returns one row per (year_month, h3)
# pair - confirm against grid_ring_monthly_median_price.
median_prices_df = grid_ring_monthly_median_price(
    df,
    date_column="year_month",
    price_column="price_per_sqm",
    grid_ring_distance=1,
    h3_column_name="h3",
)
median_prices_df = geopandas.GeoDataFrame(median_prices_df)

# Keep only the rows for the month of interest.
median_prices_df_latest = median_prices_df[
    median_prices_df["year_month"] == LATEST_YEAR_MONTH
]

# `df` is the merged resale table, so the same H3 index appears on many rows.
# Merging against it directly would repeat every median-price row once per
# matching row of `df`, inflating the frame and stacking overlapping shapes
# when plotted with transparency. Reduce it to one geometry per H3 index first.
# NOTE(review): assumes every row sharing an "h3" value carries the same
# geometry (the H3 cell shape), so keeping the first one is lossless - confirm.
h3_geometries = df[["h3", "geometry"]].drop_duplicates(subset="h3")

# Attach the geometry of each cell and restore the GeoDataFrame type and CRS.
median_prices_df_latest = median_prices_df_latest.merge(
    right=h3_geometries,
    on="h3",
    how="inner",
)
median_prices_df_latest = geopandas.GeoDataFrame(median_prices_df_latest, crs=df.crs)

In [None]:
# Map the selected month's median price per square metre with the project
# plotting helper. The options are grouped by purpose and forwarded unchanged.
# NOTE(review): epsg=3857 is Web Mercator; presumably plot_df reprojects to it
# before drawing - confirm in plot_df.
map_options = {
    "epsg": 3857,
    "figsize": [10, 10],
    "alpha": 0.5,
    "edgecolor": None,
}
legend_options = {
    "categorical": False,
    "legend": True,
    "legend_kwds": {"label": "price_per_sqm"},
    "divider_kwds": {"position": "right", "size": "5%", "pad": 0.1},
}

plot_df(
    df=median_prices_df_latest,
    column="price_per_sqm",
    **map_options,
    **legend_options,
)