# Private Residential Condominium Raw Data Pipeline

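Processes the raw private residential property transaction data downloaded from https://www.ura.gov.sg/property-market-information/pmiResidentialTransactionSearch. The property type handled (landed, or condominium/apartment) is selected by `private_residential_property_type` in the first code cell; the notebook is currently configured for condominiums.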

In [1]:
import sys
sys.dont_write_bytecode = True

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import geopandas

# Local imports.
from property_prices.transaction_csv_data.private_csv_data import PrivateCsvData
from property_prices.geocode.geocoded_addresses import GeocodedAddresses


# --- Notebook configuration -------------------------------------------------
# NOTE: several names below carry a "landed" prefix, but the values they hold
# point at the condominium dataset. The names are kept because later cells
# refer to them.

# Property type being processed; later cells branch on whether this is "landed".
private_residential_property_type = "Condominium"

# Geocoding backend handed to GeocodedAddresses.
geocoder_service = "arcgis"

# Input directory holding the raw transaction CSV files, and the directory
# that receives everything this notebook writes.
csv_data_dir = Path("../data/PrivateResidentialPropertiesCondo/")
processed_data_dir = Path("../data/processed_data/")

# File names (relative to processed_data_dir) for the geocoded-address cache
# and for the merged transaction output. The output suffix selects the format
# used when saving.
landed_addresses_json_file = Path("condo_addresses.json")
output_landed_geojson_file = Path("condo_transactions.parquet")

In [None]:
# Load the raw transaction CSV files found in csv_data_dir and run the
# PrivateCsvData processing step on them.
#
# The log messages name the configured property type instead of hard-coding
# "landed", so they stay accurate whichever dataset this notebook points at.
property_type_label = private_residential_property_type.lower()
print("Loading {} transactions CSV data from {}.".format(property_type_label, csv_data_dir))

private_csv_data = PrivateCsvData(csv_data_dir, wanted_columns="default")
private_csv_data.load_csv_files()
private_csv_data.process_csv_data()

# Show a preview of the processed table and report its size.
display(private_csv_data.df.head())
print("    Loaded {} CSV data with shape {}.".format(property_type_label, private_csv_data.df.shape))

In [None]:
# Load any previously geocoded addresses from disk, so that only addresses
# that have not been geocoded before are sent to the geocoding service.
geocoded_addresses = GeocodedAddresses(geocoder=geocoder_service)
geocoded_json_path = processed_data_dir / landed_addresses_json_file
if geocoded_json_path.exists():
    print("Loading geocoded addresses from {}.".format(geocoded_json_path))
    geocoded_addresses.read_json(geocoded_json_path)
    print("    Loaded {} existing geocoded addresses.".format(len(geocoded_addresses.df)))

# Collect the unique addresses present in the CSV data.
is_landed = private_residential_property_type.lower() == "landed"
country_suffix = ", SINGAPORE"
if is_landed:
    csv_addresses = private_csv_data.df["street_name"]
else:
    # For condominiums and apartments, join the project name with the street name.
    csv_addresses = private_csv_data.df["project_name"] + ", " + private_csv_data.df["street_name"]
# Rows with a missing project or street name yield NaN, which cannot be
# geocoded and would break the string handling and sorting below.
all_unique_addresses = set(csv_addresses.dropna().unique())

all_unique_geocoded_addresses = geocoded_addresses.get_all_geocoded_addresses()
if is_landed:
    known_addresses = all_unique_geocoded_addresses
else:
    # Non-landed addresses are submitted to the geocoder with a country suffix
    # (see below) but are built here without it.
    # NOTE(review): whether GeocodedAddresses stores the address with or
    # without that suffix is not visible from this notebook; stripping it
    # makes the comparison correct either way and avoids re-geocoding
    # addresses that are already cached.
    known_addresses = {
        a[:-len(country_suffix)] if isinstance(a, str) and a.endswith(country_suffix) else a
        for a in all_unique_geocoded_addresses
    }

new_addresses = all_unique_addresses.difference(known_addresses)
if is_landed:
    missing_addresses = sorted(new_addresses)
else:
    # Append the country so the geocoder query is unambiguous.
    missing_addresses = sorted(s + country_suffix for s in new_addresses)

print("Found {} new addresses to be geocoded in the CSV data.".format(len(missing_addresses)))

In [None]:
# Geocode the addresses that are not in the cache yet and persist the updated
# cache to disk. Nothing is done when there are no new addresses.
if len(missing_addresses) > 0:
    # The returned value is reported below as the addresses that were not geocoded.
    error_address_list = geocoded_addresses.update_geocoded_addresses(missing_addresses)

    # Output geocoded addresses to disk; create the output directory first so
    # a fresh checkout does not lose the geocoding work to a missing folder.
    processed_data_dir.mkdir(parents=True, exist_ok=True)
    geocoded_addresses.to_json(processed_data_dir / landed_addresses_json_file)
    print("    Updated {} new geocoded addresses.".format(len(missing_addresses) - len(error_address_list)))

    if len(error_address_list) > 0:
        print("    The following addresses were not geocoded: {}".format(error_address_list))

In [None]:
# Sanity-check the geocoded coordinates and print every address the verifier
# reports as suspicious (for example, a point located outside Singapore).
problem_addresses = geocoded_addresses.verify_geocoded_latitudes_and_longitudes(country="SINGAPORE")
problem_count = len(problem_addresses)
if problem_count > 0:
    warning_template = "Warning - the following {} addresses do not seem to have been geocoded correctly."
    print(warning_template.format(problem_count))
    # One zero-padded, zero-based line per problematic address.
    for position, address in enumerate(problem_addresses):
        print("    {:05d}: {}.".format(position, address))

In [5]:
# Attach the geocoded geometry to every transaction row.
#
# The join key has to be built the same way as the address that was geocoded:
# the street name for landed property, and "project_name, street_name" for
# condominiums and apartments. Joining on the street name alone leaves
# non-landed rows without a geometry.
geocode_df = geocoded_addresses.df[["address", "geometry"]]
geocode_crs = geocode_df.crs

# Non-landed addresses are sent to the geocoder with a ", SINGAPORE" suffix.
# NOTE(review): whether the stored "address" keeps that suffix is not visible
# from this notebook; removing a trailing suffix makes the key match either way.
geocode_df = geocode_df.assign(
    _address_key=geocode_df["address"].str.replace(r", SINGAPORE$", "", regex=True)
)
# One geometry per key, so the left merge can never multiply transaction rows.
geocode_df = geocode_df.drop_duplicates(subset="_address_key")[["_address_key", "geometry"]]

private_csv_df = private_csv_data.get_df()
if private_residential_property_type.lower() == "landed":
    address_key = private_csv_df["street_name"]
else:
    address_key = private_csv_df["project_name"] + ", " + private_csv_df["street_name"]

# Left merge keeps every transaction; the temporary key column is dropped
# afterwards so the output has the CSV columns plus "geometry".
processed_data_df = pd.merge(
    left=private_csv_df.assign(_address_key=address_key),
    right=geocode_df,
    on="_address_key",
    how="left",
).drop(columns="_address_key")

# pd.merge returns a plain DataFrame; rebuild the GeoDataFrame with the CRS of
# the geocoded addresses.
processed_data_df = geopandas.GeoDataFrame(processed_data_df, crs=geocode_crs)

In [None]:
# Preview the first rows of the merged data and report its dimensions.
preview_rows = processed_data_df.head()
display(preview_rows)
print(processed_data_df.shape)

In [None]:
# Write the merged transaction data to disk. The output format is chosen from
# the suffix of output_landed_geojson_file:
#   .zip     -> zip-compressed CSV
#   .json    -> GeoJSON
#   .parquet -> brotli-compressed Parquet
out_path = processed_data_dir / output_landed_geojson_file
output_suffix = output_landed_geojson_file.suffix

# Fail loudly on an unknown suffix instead of announcing a save and then
# silently writing nothing.
if output_suffix not in (".zip", ".json", ".parquet"):
    raise ValueError(
        "Unsupported output file suffix '{}' for {}; expected .zip, .json or .parquet.".format(
            output_suffix, out_path
        )
    )

# Make sure the destination directory exists before writing.
processed_data_dir.mkdir(parents=True, exist_ok=True)

print("Saving processed {} transaction data to {}.".format(
    private_residential_property_type.lower(), out_path)
)
if output_suffix == ".zip":
    processed_data_df.to_csv(out_path, index=False, compression="zip")
elif output_suffix == ".json":
    processed_data_df.to_file(out_path, driver="GeoJSON")
else:
    processed_data_df.to_parquet(out_path, index=False, compression="brotli")