# Background

Objectives:
- To train a baseline model to predict resale hdb prices
- To serve as a starting point for MLOps. 

# Import Libraries

In [1]:
import os
os.chdir("../")

In [176]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.stats import spearmanr, pearsonr

# Import data

In [195]:
# Donwloaded from src/download_resale_hdb_dataset.py cli script
df = pd.read_parquet("data/raw/resale_hdb_data.parquet")

# Downloaded from https://tablebuilder.singstat.gov.sg/table/TS/M212882
df_cpi = pd.read_csv("data/raw/cpi_housing.csv", index_col=0).iloc[9:757, :1].reset_index(drop=False)
df_cpi.columns = ["month", "cpi"]

In [196]:
df.head(1)

Unnamed: 0,_id,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease
0,1,2000-01,ANG MO KIO,3 ROOM,170,ANG MO KIO AVE 4,07 TO 09,69,Improved,1986,147000,


In [197]:
df_cpi.head(5)

Unnamed: 0,month,cpi
0,2023 Dec,115.2
1,2023 Nov,114.7
2,2023 Oct,115.5
3,2023 Sep,114.7
4,2023 Aug,114.1


# Clean data

In [198]:
# Convert columns to correct type
df['month'] = pd.to_datetime(df['month'], format='%Y-%m')
df["resale_price"] = df["resale_price"].astype(float)
df["floor_area_sqm"] = df["floor_area_sqm"].astype(float)


# Clean flat type
df["flat_type"] = df["flat_type"].str.replace("MULTI-GENERATION", "MULTI GENERATION")
# Rename flat model duplicates
replace_values = {
    "NEW GENERATION": "New Generation",
    "SIMPLIFIED": "Simplified",
    "STANDARD": "Standard",
    "MODEL A-MAISONETTE": "Maisonette",
    "MULTI GENERATION": "Multi Generation",
    "IMPROVED-MAISONETTE": "Executive Maisonette",
    "Improved-Maisonette": "Executive Maisonette",
    "Premium Maisonette": "Executive Maisonette",
    "2-ROOM": "2-room",
    "MODEL A": "Model A",
    "MAISONETTE": "Maisonette",
    "Model A-Maisonette": "Maisonette",
    "IMPROVED": "Improved",
    "TERRACE": "Terrace",
    "PREMIUM APARTMENT": "Premium Apartment",
    "Premium Apartment Loft": "Premium Apartment",
    "APARTMENT": "Apartment",
    "Type S1": "Type S1S2",
    "Type S2": "Type S1S2",
}
df = df.replace({"flat_model": replace_values})

In [199]:
# Adjust resale price for inflation
df_cpi["month"] = df_cpi["month"].apply(lambda x: x.strip())
df_cpi["month"] = pd.to_datetime(df_cpi["month"], format="%Y %b")
df = df.merge(df_cpi, on="month", how="left")
df["cpi"] = df["cpi"].astype(float)
df["real_price"] = (df["resale_price"] / df["cpi"]) * 100


# only include 1990 to 2023 data
df = df[df["month"] < "2024-01-01"]

# Feature Engineering

In [200]:
df['year'] = pd.DatetimeIndex(df['month']).year # extract out year

In [201]:
# reduce number of class of town to regions
d_region = {
    "ANG MO KIO": "North East",
    "BEDOK": "East",
    "BISHAN": "Central",
    "BUKIT BATOK": "West",
    "BUKIT MERAH": "Central",
    "BUKIT PANJANG": "West",
    "BUKIT TIMAH": "Central",
    "CENTRAL AREA": "Central",
    "CHOA CHU KANG": "West",
    "CLEMENTI": "West",
    "GEYLANG": "Central",
    "HOUGANG": "North East",
    "JURONG EAST": "West",
    "JURONG WEST": "West",
    "KALLANG/WHAMPOA": "Central",
    "MARINE PARADE": "Central",
    "PASIR RIS": "East",
    "PUNGGOL": "North East",
    "QUEENSTOWN": "Central",
    "SEMBAWANG": "North",
    "SENGKANG": "North East",
    "SERANGOON": "North East",
    "TAMPINES": "East",
    "TOA PAYOH": "Central",
    "WOODLANDS": "North",
    "YISHUN": "North",
}
df["region"] = df["town"].map(d_region)

In [202]:
def get_lease_remaining_in_years(lease_info: str or int) -> float:
    """Convert remaining lease information to a number of years.

    This function takes a lease duration expressed either as a string in the format
    "X years Y months" or as an integer representing the number of years, and converts
    it into a float representing the total number of years. If the input is a string
    but does not contain valid numbers, it returns None. If the input is neither a
    string nor an integer, it also returns None.

    Args:
        lease_info (str or int): The lease duration, either as a string with years and months
                                 or as an integer representing years.

    Returns:
        float or None: The total number of years of the lease, or None if input is invalid.

    """
    if isinstance(lease_info, str):
        try:
            yearmonth = [int(s) for s in lease_info.split() if s.isdigit()]
            if len(yearmonth) == 2:  # Format: "X years Y months"
                return yearmonth[0] + (yearmonth[1] / 12)
            elif len(yearmonth) == 1:  # Format: "X years"
                return float(yearmonth[0])
            else:
                return None
        except ValueError:
            return None
    elif isinstance(lease_info, int):
        return float(lease_info)
    else:
        return None


df["remaining_lease"] = df["remaining_lease"].apply(lambda x: get_lease_remaining_in_years(x))

In [203]:
# Select relevant columns
df = df[
    [
        "town",
        "flat_type",
        "storey_range",
        "floor_area_sqm",
        "flat_model",
        "remaining_lease",
        "year",
        "real_price",
    ]
]

In [204]:
# remove flat types with very few cases
df = df[~df["flat_type"].isin(["MULTI GENERATION", "1 ROOM"])]

# Re-categorize flat model to reduce num classes
replace_values = {
    "Executive Maisonette": "Maisonette",
    "Terrace": "Special",
    "Adjoined flat": "Special",
    "Type S1S2": "Special",
    "DBSS": "Special",
    "Model A2": "Model A",
    "Premium Apartment": "Apartment",
    "Improved": "Standard",
    "Simplified": "Model A",
    "2-room": "Standard",
}
df = df.replace({"flat_model": replace_values})

df = df.reset_index(drop=True)

In [205]:
df["flat_type"].value_counts()

flat_type
4 ROOM       347298
3 ROOM       293090
5 ROOM       192726
EXECUTIVE     68928
2 ROOM        11490
Name: count, dtype: int64

In [206]:
# Train Test Split
y = df["real_price"]
X = df.drop(["real_price", "year"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=True, random_state=0)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (822178, 6)
Shape of X_test: (91354, 6)
Shape of y_train: (822178,)
Shape of y_test: (91354,)


In [211]:
X_train.loc[879683].to_dict()

{'town': 'SENGKANG',
 'flat_type': '4 ROOM',
 'storey_range': '04 TO 06',
 'floor_area_sqm': 93.0,
 'flat_model': 'Model A',
 'remaining_lease': 95.0}

In [214]:
from sklearn.preprocessing import LabelEncoder

# Identifying non-numeric columns
non_numeric_columns = X_train.select_dtypes(include=['object']).columns
print(non_numeric_columns)

# Initialize a dictionary to store the label encoders for each column
label_encoders = {}

# Fit label encoders on the training set and transform both training and test sets
for col in non_numeric_columns:
    
    label_encoders[col] = LabelEncoder()
    X_train[col] = label_encoders[col].fit_transform(X_train[col])
    X_test[col] = label_encoders[col].transform(X_test[col])  # Transform the test set using the same encoder

Index(['town', 'flat_type', 'storey_range', 'flat_model'], dtype='object')


In [215]:
label_encoders.keys()

dict_keys(['town', 'flat_type', 'storey_range', 'flat_model'])

# Train baseline tree-based model

In [216]:
# Some dummy hyperparameter tuning code
param_grid = {
    "max_features": [10],  # max number of features considered for splitting a node
    "max_depth": [20],  # max number of levels in each decision tree
    "min_samples_split": [15],  # min number of data points placed in a node before the node is split
    "min_samples_leaf": [2],
}  
rfr = GridSearchCV(
    RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=28),
    param_grid,
    cv=5,
    scoring="r2",
    return_train_score=True,
)
rfr.fit(X_train, y_train)
print("Best parameters set found on Cross Validation:\n\n", rfr.best_params_)
print("\nCross Validation R\u00b2 score:\n\n", rfr.best_score_.round(3))

Best parameters set found on Cross Validation:

 {'max_depth': 20, 'max_features': 10, 'min_samples_leaf': 2, 'min_samples_split': 15}

Cross Validation R² score:

 0.566


In [217]:
# Evalute on test set
cv_predicted_test = rfr.predict(X_test)

cv_test_score = r2_score(y_test, cv_predicted_test)
spearman = spearmanr(y_test, cv_predicted_test)
pearson = pearsonr(y_test, cv_predicted_test)
cv_mae = mean_absolute_error(y_test, cv_predicted_test)

print(f'Test data R\u00b2 score: {cv_test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')
print(f'Test data Mean Absolute Error: {round(cv_mae)}')

Test data R² score: 0.565
Test data Spearman correlation: 0.789
Test data Pearson correlation: 0.765
Test data Mean Absolute Error: 80873


# Conclusion

**Notes to the reader**
- This notebook is designed as a basic framework for training a baseline model. It omits several crucial aspects typical in a data science project, such as exploratory data analysis (EDA), various encoding and transformation techniques, diverse metrics, feature importance evaluation, error analysis, and more.
- Often in academic settings, Random Forest is not considered a baseline model due to its non-parametric nature. Students usually prefer parametric models like linear regression as baseline. However, for complex and multifaceted data such as housing prices, a non-parametric model like Random Forest is more suitable. In contrast, using linear regression would require scaling, transformations, checking for linearity, homoscedasticity (by plotting residuals against labels), ensuring independence of errors, assessing residual normality through QQ-plots, evaluating multicollinearity, and so on.
- The primary focus of this project is to implement MLOps practices, hence less emphasis is placed on this notebook. It should not be considered a comprehensive guide to the scientific rigor required in machine learning applications.
- All code is modularized into Python modules and command-line interfaces (CLIs) within the src/ directory. ML engineers often modularize data science code for efficiency, better testing, and seamless integration with deployment, continuous integration/continuous deployment (CI/CD), and other workflows.

**Observations**
- This performance of the baseline model is not great. Although we used a tree-based model, i.e. Random Forest, instead of a linear model. The model seems to be underfitting based on the results train and test R\u00b2 metric. An R\u00b2 value of 0.55 means that 55% of the variance in the dependent vairable can be explained by the independent variables. It is important to note that R\u00b2 is not a measure of accuracy in an absolute sense. In other words, the model has a good level of explanatory power.
- A Mean Absolute Error (MAE) of $80,873 indicates an average error magnitude of approximately ±$80K. Considering the standard deviation of the prices is around $162K, the MAE being smaller than the standard deviation suggests that it might be an acceptable baseline.
- To enhance the model's performance, we could:
    -  Incorporate additional features, such as distance to schools or MRT stations. These intuitively relevant features are likely to boost predictive accuracy.
    -  Experiment with more advanced tree-boosting algorithms, like XGBoost or LightGBM. Given the current underfitting, a more complex algorithm could capture underlying patterns more effectively. Should overfitting arise, we can address it through regularization and hyperparameter tuning.
    - Conduct thorough error analysis to identify and understand the root causes of incorrect predictions. This process will guide our approach to managing outliers and imputing missing values, setting the stage for more focused model improvement efforts.