# Private Residential Transaction Price Indexing

In [1]:
import sys
sys.dont_write_bytecode = True

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

import matplotlib.pyplot as plt

# Local imports.
from property_prices.private_residential_data.private_residential_data import PrivateResidentialData


# Data directories.
# Folder holding the processed parquet files (path is relative to the working directory).
processed_data_dir = Path("../data/processed_data/")

# Input transactions file, and the name of the indexed output saved into the same folder.
file_name = "landed_transactions.parquet"
output_file_name = "landed_transactions-indexed.parquet"

# Column that is aggregated per month, modelled and turned into an index below.
# Alternative target column: "transacted_price".
price_column = "unit_price_psf"

In [None]:
# Read the processed transactions from parquet.
private_residential_data = PrivateResidentialData(processed_data_dir / file_name)
private_residential_data.read_parquet()

# Order rows chronologically, breaking ties by street name.
private_residential_data.df = private_residential_data.df.sort_values(
    by=["datetime", "street_name"]
)

# Sorted street names; the cells below build one time series per street.
unique_street_names = sorted(private_residential_data.df["street_name"].unique())

# Time span covered by the transactions.
_datetimes = private_residential_data.df["datetime"]
min_datetime, max_datetime = _datetimes.min(), _datetimes.max()
min_year = min_datetime.year

display(private_residential_data.df.head())
print("Loaded private residential data shape: {}.".format(private_residential_data.df.shape))

In [None]:
# Number of transactions recorded per street, most frequent first.
_street_counts = private_residential_data.df["street_name"].value_counts()
print(_street_counts)

In [None]:
# Build, for every street, a monthly time series of the median transaction price.

# Monthly grid spanning the data. 31 days are added to the stop value because
# np.arange excludes it, so the month of max_datetime stays in the grid.
month_grid = np.arange(
    min_datetime, max_datetime + np.timedelta64(31, "D"), dtype="datetime64[M]"
)
datetime_df = pd.DataFrame({"datetime": month_grid})

# Model feature: position of the month in the grid, scaled to (0, 1].
n_months = len(datetime_df)
datetime_df["X"] = np.arange(1, n_months + 1, 1) / n_months

dfs = {}
dfs_rent = {}
for street in unique_street_names:
    on_street = private_residential_data.df["street_name"] == street
    if not on_street.any():
        continue

    # Median price per datetime for this street.
    monthly_median = (
        private_residential_data.df.loc[on_street, ["datetime", price_column]]
        .groupby(["datetime"])
        .median()
        .reset_index()
    )
    # Attach the X feature from the grid and keep only months that have a price.
    dfs[street] = pd.merge(
        datetime_df, monthly_median, on=["datetime"], how="left"
    ).dropna()

print("{} unique resale street names.".format(len(dfs)))

# Prediction grid: every month of the span plus `future_months` extrapolated months.
X_pred_months = datetime_df["datetime"].values.astype("datetime64[M]")
X_pred = datetime_df["X"].values.reshape(-1, 1)

future_months = 1
for _month in range(future_months):
    next_month = X_pred_months[-1] + 1
    x_step = X_pred[-1] - X_pred[-2]
    X_pred_months = np.hstack([X_pred_months, next_month])
    X_pred = np.vstack([X_pred, X_pred[-1] + x_step])

In [None]:
# Transaction price indexing model training.
# One shallow random forest is fitted per street on the scaled month position X.
n_estimators = 50
max_depth = 2
min_samples_leaf = 2
max_depth_low_data = 1
low_data_threshold = 10
criterion = "absolute_error"

models = {}
y_preds = {}
scores = {}
for k, street_df in dfs.items():
    y = street_df[price_column].values
    X = street_df["X"].values.reshape(-1, 1)

    # Streets with fewer than `low_data_threshold` monthly points get a shallower forest.
    # (The commented-out alternative here was a LinearInversion model: l1 / order 3,
    # or l2 / order 2 for the low-data case.)
    tree_depth = max_depth if len(y) >= low_data_threshold else max_depth_low_data

    # NOTE: no random_state is passed, so fitted values can differ between runs.
    models[k] = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=tree_depth,
        criterion=criterion,
        min_samples_leaf=min_samples_leaf,
    )
    models[k].fit(X, y)

    # In-sample prediction, computed once and reused for the index and the R2 score.
    y_pred = models[k].predict(X)
    street_df["prediction"] = y_pred
    # Normalise so that the street's latest month with data has index 1.
    street_df["price_index"] = y_pred / y_pred[-1]

    # Prediction over the full monthly grid (including the extrapolated months).
    y_preds[k] = models[k].predict(X_pred)
    scores[k] = r2_score(y, y_pred)

In [36]:
# Update price DataFrames with price indexes.
# Stack the per-street index series into one long table with a single concat;
# concatenating inside a loop re-copies the accumulated frame on every iteration.
price_index_df = pd.concat(
    [dfs[k][["datetime", "price_index"]].assign(street_name=k) for k in dfs]
)

# Attach to every transaction the index of its street and datetime.
# validate="many_to_one" makes pandas raise if a (datetime, street_name) pair occurs
# more than once in price_index_df, which would otherwise duplicate transactions.
data_indexed_df = private_residential_data.df.merge(
    price_index_df,
    how="left",
    on=["datetime", "street_name"],
    validate="many_to_one",
)
# Sanity check: a left merge on unique keys must not change the number of rows.
assert len(private_residential_data.df) == len(data_indexed_df)

In [None]:
# Distribution of the per-street in-sample R2 scores.
r2_values = list(scores.values())
plt.hist(r2_values, bins=10)
plt.xlabel("R2 scores")
plt.ylabel("Histogram")
plt.grid(True)
plt.show()

In [None]:
# Aggregate the per-street R2 scores into one figure, weighting each street by its
# number of monthly data points. Streets whose score is NaN are left out.
aggregated_resale_r2 = 0.0
N = 0
for k in dfs:
    if np.isnan(scores[k]):
        continue
    n_points = len(dfs[k][price_column].values)
    aggregated_resale_r2 += scores[k] * n_points
    N += n_points

# Without any valid score the weighted mean is undefined; report NaN instead of
# failing with a ZeroDivisionError.
aggregated_resale_r2 = aggregated_resale_r2 / N if N > 0 else float("nan")
print("Resale price R2: {:.3f}.".format(aggregated_resale_r2))

In [None]:
# Plot every street's price index together with the overall median index.
plt.figure(figsize=[15, 5])

# Per-street indexes in the background. Only the first line carries a legend label,
# so the legend shows a single entry for all streets.
for i, k in enumerate(dfs):
    plt.plot(
        dfs[k]["datetime"],
        dfs[k]["price_index"],
        "tab:gray",
        alpha=0.5,
        label="Street price index" if i == 0 else "_nolegend_",
    )

# Median index over all transactions per datetime, drawn last so it sits on top.
_df = data_indexed_df[["datetime", "price_index"]].groupby(["datetime"]).median().reset_index()
plt.plot(_df["datetime"], _df["price_index"], "k", linewidth=2, label="Median price index")

plt.grid(True)
# Labels are taken from the plotted lines, so each entry matches what it describes.
plt.legend()
plt.xlabel("Datetime")
plt.ylabel("Normalized index")
plt.show()

In [None]:
# Inspect a single street: observed monthly prices against the model prediction.
k = "VERDE CRESCENT"

plt.figure(figsize=[15, 5])
if k in dfs:
    street_df = dfs[k]
    plt.plot(street_df["datetime"], street_df[price_column], "o")
    plt.plot(X_pred_months, y_preds[k])

plt.title(k)
plt.ylabel(price_column)
plt.grid(True)
plt.legend(["Price data", "Price prediction"])

plt.show()

In [None]:
# Output indexed prices to disk.
# Set to True to write data_indexed_df into the processed data folder.
save_to_disk = False

parquet_compression = "brotli"

if save_to_disk:
    # The output format is chosen from the extension of the output file name.
    out_path = processed_data_dir / output_file_name
    print("Saving processed resale flat prices data to {}.".format(out_path))
    if out_path.suffix == ".zip":
        data_indexed_df.to_csv(out_path, index=False, compression="zip")
    elif out_path.suffix == ".json":
        # NOTE(review): to_file is a GeoDataFrame method; this branch presumably only
        # works when data_indexed_df is a GeoDataFrame - confirm before relying on it.
        data_indexed_df.to_file(out_path, driver="GeoJSON")
    elif out_path.suffix == ".parquet":
        data_indexed_df.to_parquet(out_path, index=False, compression=parquet_compression)
    else:
        # Fail loudly instead of announcing a save and then writing nothing.
        raise ValueError(
            "Unsupported output file extension: {!r}.".format(out_path.suffix)
        )