In [1]:
import sys
sys.dont_write_bytecode = True

from pathlib import Path

import numpy as np
import pandas as pd
import geopandas

from sklearn.linear_model import LinearRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, ConstantKernel

import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import contextily as cx


# Local imports.
from resale_flat_prices.resale_flat_data.resale_flat_data import ResaleFlatData
from resale_flat_prices.resale_flat_data.rent_prices_data import RentPricesData
from resale_flat_prices.h3_utils.h3_statistics import monthly_median_price
from resale_flat_prices.vis_utils.vis_utils import plot_df

# File names of the processed (geocoded) resale and rental CSV archives.
# Nothing is read here; the paths are only assembled when the data is loaded.
rent_prices_data_csv_file = "rent-prices.csv.zip"
resale_flat_data_csv_file = "resale-flat-prices.csv.zip"

# Directories, relative to the notebook's working directory: the raw CSV
# downloads and the processed outputs respectively.
processed_data_dir = Path("../data/processed_data/")
csv_data_dir = Path("../data/ResaleFlatPrices/")

In [None]:
# Price column analysed in this notebook.
price_column = "price_per_sqft"

# Restrictions applied to the data: earliest year kept and the single street studied.
# To study a whole town instead, filter on the "town" column (e.g. "CENTRAL AREA").
min_year = 2021
street_name = "ANG MO KIO AVENUE 1"

# Resale flat data. Re-reading the CSV here keeps the cell idempotent: running it
# again always starts from the file on disk rather than an already-filtered frame.
resale_flat_data = ResaleFlatData(processed_data_dir / resale_flat_data_csv_file)
resale_flat_data.read_csv()
resale_flat_data.df = resale_flat_data.df.sort_values(["year_month", "town"])

# Quarter (1-4) in which each resale occurred, computed on the whole column at once
# instead of row by row; months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4.
resale_flat_data.df["quarter"] = np.ceil(resale_flat_data.df["month"] / 3).astype(int)

# Apply both restrictions with a single mask. The explicit copy gives an independent
# frame, so later column assignments (presumably make_point_geometries adds a geometry
# column - confirm in ResaleFlatData) do not write into a slice of the full table.
in_scope = (
    (resale_flat_data.df["year"] >= min_year)
    & (resale_flat_data.df["street_name_cleaned"] == street_name)
)
resale_flat_data.df = resale_flat_data.df[in_scope].copy()

resale_flat_data.make_point_geometries(crs = "EPSG:4326")
print("resale_flat_data.df.shape: {}.".format(resale_flat_data.df.shape))

In [None]:
# Names of the coarse and fine time keys, and of the combined "YYYY-M" label column.
large = "year"
small = "month"
combined = "year_months"

# First and last calendar year (inclusive) of the monthly grid. The grid is both the
# time axis of the model and the set of months predicted later on. Transactions that
# fall outside it are dropped by the left merge below.
grid_first_year = 2021
grid_last_year = 2024

# Median price for every (year, month) pair that has at least one transaction.
xdf = (
    resale_flat_data.df[[large, small, price_column]]
    .groupby([large, small], as_index = False)
    .median()
)

# Complete monthly grid, generated rather than typed out: one entry per calendar month.
years = [y for y in range(grid_first_year, grid_last_year + 1) for m in range(1, 13)]
months = [m for y in range(grid_first_year, grid_last_year + 1) for m in range(1, 13)]
year_months = ["{}-{}".format(y, m) for y, m in zip(years, months)]

datetime_df = pd.DataFrame({large: years, small: months, combined: year_months})

# Normalised time position in (0, 1]: the k-th month of the grid sits at k / n_months.
datetime_df["X"] = np.arange(1, len(datetime_df) + 1, 1) / len(datetime_df)

# Attach the observed medians to the grid; months without sales get NaN for the price.
xdf = pd.merge(datetime_df, xdf, on = [large, small], how = "left")

# The grid already carries the "YYYY-M" label, so it is reused under the name the
# later cells read instead of being rebuilt row by row.
xdf["year_month"] = xdf[combined]

# Keep only months with an observed median; the price is the only column that can be missing.
xdf = xdf.dropna(subset = [price_column])

display(xdf.head())
print("xdf.shape: {}.".format(xdf.shape))

In [None]:
# Training data: observed monthly medians and their normalised time positions.
y = xdf[price_column].values
X = xdf["X"].values

# Prediction grid: every month in datetime_df, including months with no sales.
X_pred_months = datetime_df["year_months"].values
X_pred = datetime_df["X"].values

# X is normalised so that consecutive months are 1 / len(datetime_df) apart, so a
# correlation length given in months must be converted into those units. Passing the
# raw value 12 made the kernel almost constant over the whole (0, 1] range, which
# leaves the Gram matrix close to singular.
length_scale_months = 12
kernel = RBF(length_scale_months / len(datetime_df), length_scale_bounds = "fixed")

# normalize_y: the GP prior has zero mean, so the prices are standardised before
# fitting and predictions are mapped back to the original scale.
# alpha: variance added to the diagonal of the kernel matrix. Monthly medians are noisy
# observations, and the term also keeps the smooth RBF Gram matrix well conditioned.
# It is expressed in units of the standardised target; tune as needed.
noise_level = 0.1
gpr = GaussianProcessRegressor(kernel = kernel, alpha = noise_level, normalize_y = True)
gpr.fit(X.reshape(-1, 1), y)
y_pred = gpr.predict(X_pred.reshape(-1, 1))

# Plot against the numeric time axis and label the ticks afterwards. Plotting the
# string labels directly makes the axis categorical, and any month missing from xdf
# would then be appended at the right-hand end, scrambling the prediction line.
fig, ax = plt.subplots(figsize = [15, 5])
ax.plot(X, y, "o", label = "monthly median")
ax.plot(X_pred, y_pred, label = "GP fit")
ax.set_xticks(X_pred)
ax.set_xticklabels(X_pred_months, rotation = 90)
ax.set_xlabel("year-month")
ax.set_ylabel(price_column)
ax.set_title("Monthly median {} with GP fit".format(price_column))
ax.legend()
plt.show()

In [None]:
# Map of the geometries in resale_flat_data.df, coloured by price_column, drawn on
# top of a CartoDB Positron basemap with the colour bar in its own axis.
fig, ax = plt.subplots(figsize = [12, 12])

# Reserve a narrow strip to the right of the map for the colour bar.
cax = make_axes_locatable(ax).append_axes(position = "right", size = "5%", pad = 0.1)

# Optional fixed view extent (longitude / latitude limits), currently disabled:
#ax.set_xlim([103.60152080468028, 104.0470051248534])
#ax.set_ylim([1.2359029533199608, 1.4733321131970046])

# Continuous colour scale on the price column, with the legend drawn into cax.
plot_options = {
    "column": price_column,
    "categorical": False,
    "cmap": 'viridis',
    "alpha": 1.0,
    "edgecolor": None,
    "legend": True,
    "legend_kwds": {"label": "price_per_sqft"},
}
resale_flat_data.df.plot(ax = ax, cax = cax, **plot_options)
ax.set_title("{} {}.".format("year_month", price_column))

# The basemap tiles are requested in the CRS of the plotted data.
cx.add_basemap(ax, crs = resale_flat_data.df.crs, source = cx.providers.CartoDB.Positron)
fig.tight_layout()
plt.show()