# Background

Objectives:
- To train a baseline model to predict resale HDB prices

# Import Libraries

In [5]:
import os

# Switch the working directory to the parent of the directory the kernel started in,
# so that the relative "data/..." paths used below resolve.
# The start directory is remembered on first execution: a plain relative chdir("../")
# would climb one level higher every time this cell is re-run.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, os.pardir))

In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.stats import spearmanr, pearsonr

# Import data

In [143]:
# Resale transactions: downloaded by the src/download_resale_hdb_dataset.py CLI script
df = pd.read_parquet("data/raw/resale_hdb_data.parquet")

# Housing CPI: downloaded from https://tablebuilder.singstat.gov.sg/table/TS/M212882
# The positional slice keeps rows 9..756 and only the first data column; the CSV's
# first column is read as the index and then restored as a regular column.
# NOTE(review): the row bounds are hard-coded for this particular download -
# re-check them if the CSV is refreshed.
df_cpi = (
    pd.read_csv("data/raw/cpi_housing.csv", index_col=0)
    .iloc[9:757, :1]
    .reset_index()
)
df_cpi.columns = ["month", "cpi"]

In [144]:
# Preview the first row of the transactions table to check the available columns
df.head(1)

Unnamed: 0,_id,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease
0,1,2000-01,ANG MO KIO,3 ROOM,170,ANG MO KIO AVE 4,07 TO 09,69,Improved,1986,147000,


In [145]:
# Preview the first five rows of the CPI table (month label and index value)
df_cpi.head(5)

Unnamed: 0,month,cpi
0,2023 Dec,115.2
1,2023 Nov,114.7
2,2023 Oct,115.5
3,2023 Sep,114.7
4,2023 Aug,114.1


# Clean data

In [146]:
# Canonical spelling for flat models that appear under several labels
# (upper-case vs title-case duplicates, and closely related variants)
replace_values = {
    "NEW GENERATION": "New Generation",
    "SIMPLIFIED": "Simplified",
    "STANDARD": "Standard",
    "MODEL A-MAISONETTE": "Maisonette",
    "MULTI GENERATION": "Multi Generation",
    "IMPROVED-MAISONETTE": "Executive Maisonette",
    "Improved-Maisonette": "Executive Maisonette",
    "Premium Maisonette": "Executive Maisonette",
    "2-ROOM": "2-room",
    "MODEL A": "Model A",
    "MAISONETTE": "Maisonette",
    "Model A-Maisonette": "Maisonette",
    "IMPROVED": "Improved",
    "TERRACE": "Terrace",
    "PREMIUM APARTMENT": "Premium Apartment",
    "Premium Apartment Loft": "Premium Apartment",
    "APARTMENT": "Apartment",
    "Type S1": "Type S1S2",
    "Type S2": "Type S1S2",
}

# In one pass: parse the month as a timestamp, make the price a float,
# unify the hyphenated flat-type label, then rename flat-model duplicates
df = df.assign(
    month=pd.to_datetime(df["month"], format="%Y-%m"),
    resale_price=df["resale_price"].astype(float),
    flat_type=df["flat_type"].str.replace("MULTI-GENERATION", "MULTI GENERATION"),
).replace({"flat_model": replace_values})

In [147]:
# Adjust resale price for inflation
# CPI month labels look like "2023 Dec"; trim stray whitespace, then parse them
# into timestamps so they can be joined against the transaction month.
df_cpi["month"] = pd.to_datetime(df_cpi["month"].str.strip(), format="%Y %b")

# Each transaction must pick up exactly one CPI value. validate="m:1" makes the
# merge fail loudly if a month appears twice in df_cpi, instead of silently
# duplicating transaction rows.
df = df.merge(df_cpi, on="month", how="left", validate="m:1")
df["cpi"] = df["cpi"].astype(float)

# Price re-expressed relative to a CPI of 100
df["real_price"] = (df["resale_price"] / df["cpi"]) * 100


# Keep only transactions dated before 2024 (no lower bound is applied here)
df = df[df["month"] < "2024-01-01"]

# Feature Engineering

In [148]:
# Calendar year of the transaction month
df["year"] = df["month"].dt.year

In [149]:
# Collapse the many town labels into a handful of regions.
# The table is written region -> towns for readability and then flattened into
# the town -> region lookup used by the mapping below.
towns_by_region = {
    "Central": [
        "BISHAN",
        "BUKIT MERAH",
        "BUKIT TIMAH",
        "CENTRAL AREA",
        "GEYLANG",
        "KALLANG/WHAMPOA",
        "MARINE PARADE",
        "QUEENSTOWN",
        "TOA PAYOH",
    ],
    "East": ["BEDOK", "PASIR RIS", "TAMPINES"],
    "North": ["SEMBAWANG", "WOODLANDS", "YISHUN"],
    "North East": ["ANG MO KIO", "HOUGANG", "PUNGGOL", "SENGKANG", "SERANGOON"],
    "West": [
        "BUKIT BATOK",
        "BUKIT PANJANG",
        "CHOA CHU KANG",
        "CLEMENTI",
        "JURONG EAST",
        "JURONG WEST",
    ],
}
d_region = {town: region for region, towns in towns_by_region.items() for town in towns}

# NOTE(review): a town missing from d_region ends up with a missing region - confirm
# that every town in the data is listed.
df["region"] = df["town"].map(d_region)

In [150]:
def get_lease_remaining_in_years(lease_info: "str | int | None") -> "float | None":
    """Convert remaining lease information to a number of years.

    Accepted inputs:

    * a string such as "61 years 04 months", "61 years" or "11 months";
    * a bare numeric string such as "70" (interpreted as years);
    * an integer number of years (Python ``int`` or a NumPy integer, which is
      what pandas passes to ``.apply`` for integer columns).

    Anything else (``None``, floats including NaN, strings without exactly one
    or two whole-number tokens) yields ``None``.

    Args:
        lease_info (str or int): The lease duration, either as a string with years
                                 and/or months or as an integer representing years.

    Returns:
        float or None: The total number of years of the lease, or None if input is invalid.

    """
    if isinstance(lease_info, str):
        tokens = lease_info.split()
        try:
            # Keep each whole-number token together with the word that follows it,
            # so a lone "<n> months" can be told apart from "<n> years".
            found = [
                (int(token), tokens[pos + 1].lower() if pos + 1 < len(tokens) else "")
                for pos, token in enumerate(tokens)
                if token.isdigit()
            ]
        except ValueError:
            # str.isdigit() accepts some characters (e.g. superscripts) that int() rejects.
            return None

        if len(found) == 2:  # Format: "X years Y months"
            return found[0][0] + (found[1][0] / 12)
        if len(found) == 1:
            value, unit = found[0]
            if unit.startswith("month"):  # Format: "Y months"
                return value / 12
            return float(value)  # Format: "X years" or a bare number
        return None

    # np.integer is checked explicitly because NumPy integers are not `int` subclasses.
    if isinstance(lease_info, (int, np.integer)):
        return float(lease_info)

    return None


# Normalise every remaining-lease entry to a float number of years (None when unparseable)
df["remaining_lease"] = df["remaining_lease"].apply(get_lease_remaining_in_years)

In [152]:
# Keep only the columns used from here on: the candidate features plus the
# inflation-adjusted target (real_price)
model_columns = [
    "town",
    "flat_type",
    "storey_range",
    "floor_area_sqm",
    "flat_model",
    "lease_commence_date",
    "year",
    "region",
    "real_price",
]
df = df[model_columns]

In [153]:
# label encode storeys
# Rows are sorted by storey range first; this fixes the row order that is kept
# after the index reset at the end of this cell.
df = df.sort_values(by="storey_range")
df["storey_range"] = df["storey_range"].astype("category").cat.codes  # label encode

# remove flat types with very few cases
df = df[~df["flat_type"].isin(["MULTI GENERATION", "1 ROOM"])]

# Re-categorize flat model to reduce num classes
replace_values = {
    "Executive Maisonette": "Maisonette",
    "Terrace": "Special",
    "Adjoined flat": "Special",
    "Type S1S2": "Special",
    "DBSS": "Special",
    "Model A2": "Model A",
    "Premium Apartment": "Apartment",
    "Improved": "Standard",
    "Simplified": "Model A",
    "2-room": "Standard",
}
df = df.replace({"flat_model": replace_values})

# Label encode flat type (ordered by size)
# An explicit map + integer cast is used instead of DataFrame.replace, which relied
# on implicit object -> int downcasting and triggered a pandas warning. The cast also
# fails immediately if a flat type outside this mapping is present, rather than
# leaving a string in a column that is expected to be numeric.
replace_values = {"2 ROOM": 0, "3 ROOM": 1, "4 ROOM": 2, "5 ROOM": 3, "EXECUTIVE": 4}
df = df.assign(flat_type=df["flat_type"].map(replace_values).astype("int64"))

df = df.reset_index(drop=True)
display(df["flat_model"].value_counts())

  df = df.replace({"flat_type": replace_values})


flat_model
Model A           327555
Standard          280127
New Generation    187619
Apartment          80826
Maisonette         31629
Special             5750
3Gen                  26
Name: count, dtype: int64

In [154]:
## dummy encoding
# Region: drop_first removes the first level so that it serves as the baseline
# (central is baseline)
df = pd.get_dummies(df, columns=["region"], prefix=["region"], drop_first=True)

# Flat model: encode every level, then drop the Standard column by hand
# so that Standard is the baseline
df = pd.get_dummies(df, columns=["flat_model"], prefix=["model"]).drop(columns="model_Standard")

In [155]:

# Train Test Split
# Target: inflation-adjusted price. Features: every remaining column except
# the target itself, town and year.
X = df.drop(columns=["real_price", "town", "year"])
y = df["real_price"]

# Hold out 10% of the rows as a shuffled test set; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
    shuffle=True,
)

print('Shape of X_train:', X_train.shape)
print('Shape of X_test:', X_test.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of y_test:', y_test.shape)

Shape of X_train: (822178, 14)
Shape of X_test: (91354, 14)
Shape of y_train: (822178,)
Shape of y_test: (91354,)


# Train baseline tree-based model

In [157]:
# Some dummy hyperparameter tuning code: every list holds a single value, so the
# search cross-validates exactly one configuration.
param_grid = {
    "max_features": [10],  # features considered when looking for a split
    "max_depth": [20],  # depth limit of each tree
    "min_samples_split": [15],  # samples a node must hold before it may be split
    "min_samples_leaf": [2],  # samples required in every leaf
}

forest = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=28)
rfr = GridSearchCV(
    estimator=forest,
    param_grid=param_grid,
    scoring="r2",
    cv=5,
    return_train_score=True,
)
rfr.fit(X_train, y_train)

print("Best parameters set found on Cross Validation:\n\n", rfr.best_params_)
print("\nCross Validation R\u00b2 score:\n\n", rfr.best_score_.round(3))

Best parameters set found on Cross Validation:

 {'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 15}

Cross Validation R² score:

 0.734


In [160]:
# Evaluate on the held-out test set
cv_predicted_test = rfr.predict(X_test)

# Goodness of fit, rank / linear correlation and absolute error between the
# true and predicted inflation-adjusted prices
cv_test_score = r2_score(y_test, cv_predicted_test)
spearman = spearmanr(y_test, cv_predicted_test)
pearson = pearsonr(y_test, cv_predicted_test)
cv_mae = mean_absolute_error(y_test, cv_predicted_test)

# Only the first element (the correlation coefficient) of each scipy result is reported
print(f'Test data R\u00b2 score: {cv_test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')
print(f'Test data Mean Absolute Error: {round(cv_mae)}')

Test data R² score: 0.736
Test data Spearman correlation: 0.845
Test data Pearson correlation: 0.858
Test data Mean Absolute Error: 63138


In [161]:
# Summary statistics of the test-set target, to put the error metrics in context
y_test.describe()

count    9.135400e+04
mean     3.691419e+05
std      1.625648e+05
min      1.938611e+04
25%      2.524441e+05
50%      3.529412e+05
75%      4.655870e+05
max      1.266562e+06
Name: real_price, dtype: float64

In [162]:
# Standard deviation of the test-set target in plain notation, computed from the
# data rather than pasted from the describe() output so it stays correct on re-run
y_test.std()

162564.8

# Conclusion

Observations:
- The performance of the baseline model is not bad, even though we used a tree-based model (Random Forest) rather than a linear model. The model seems to be underfitting, judging by the cross-validation and test R² scores (both about 0.73). An R² value of 0.7 means that 70% of the variance in the dependent variable can be explained by the independent variables; in other words, the model has a good level of explanatory power. It is important to note, however, that R² is not a measure of accuracy in an absolute sense.
- A Mean Absolute Error of 63k means that predictions are off by about 63k on average, in either direction. Compared with the standard deviation of the prices (162k), the MAE is smaller than the SD, which suggests that this MAE is acceptable for a baseline.
- We can consider the following methods to improve on the performance
    - Integrate more features like: distance to school, distance to mrt, etc. These features will most likely improve the predictive power because intuitively, it makes sense.
    - Try more advanced tree-boosting algorithms like XGBoost or LightGBM. Since the model is already underfitting, we can try a more complex algorithm to capture more of the underlying patterns. If overfitting then occurs, which is likely, we can deal with it using regularization and hyperparameter tuning.
    - Before we go deep into hyperparameter tuning, we need to perform error analysis, i.e. identify the worst predictions and understand what caused them. Then we can decide how to treat outliers and impute missing values.