# HDB Resale Price Regression Models

**Models implemented:**
1. Linear Regression (OLS)
2. Non-linear Regression (polynomial terms for selected variables)
3. IV Regression (2SLS, exploratory)

**Common settings:**
- 80/20 train-test split
- 5-fold Cross-Validation (CV) for hyperparameter tuning
- RMSE (computed on log resale price) used as the main metric

In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

## Load and Prepare Data

In [None]:
# Path to the Excel workbook, relative to the notebook's working directory.
DATA_PATH = "../data/HDB_data_2021_sample.xlsx"

# Read the workbook and discard rows that have no resale_price.
df = pd.read_excel(DATA_PATH).dropna(subset=["resale_price"])

# Target: log(price), used instead of the raw price for nicer regression
# properties. The new column is appended after the existing ones.
df = df.assign(log_resale_price=np.log(df["resale_price"]))

### Feature Selection for Models

In [5]:
# "Full" design matrix (OLS, LASSO, Ridge, ENet): every column except the raw
# price, the log target, and year (year is constant 2021).
drop_cols_full = ["resale_price", "log_resale_price", "year"]
X_full = df.drop(columns=drop_cols_full)
y = df["log_resale_price"].to_numpy()

feature_names_full = list(X_full.columns)

# Non-linear regression, stepwise and IV work with a smaller, interpretable
# subset to keep things manageable.
nonlinear_features = [
    "floor_area_sqm",
    "Remaining_lease",
    "max_floor_lvl",
    "mature",
    "Dist_CBD",
    "Dist_nearest_station",
    "Dist_nearest_hospital",
]

# Silently skip any requested column that the dataframe does not contain;
# the listed order is preserved for the ones that remain.
nonlinear_features = [name for name in nonlinear_features if name in df.columns]

X_small = df.loc[:, nonlinear_features].copy()
feature_names_small = list(X_small.columns)

## Train-Test Split (80/20)

In [6]:
# Hold out 20% of the rows for testing. Both calls use the same seed and the
# same number of rows, so the full and small feature sets are partitioned
# identically.
full_split = train_test_split(X_full, y, test_size=0.2, random_state=42)
X_full_train, X_full_test, y_train, y_test = full_split

small_split = train_test_split(X_small, y, test_size=0.2, random_state=42)
X_small_train, X_small_test, y_small_train, y_small_test = small_split

## Utility: Compute RMSE

In [12]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

## Linear Regression (OLS)

In [13]:
# Baseline: plain OLS fitted on the full feature set.
ols_baseline = LinearRegression().fit(X_full_train, y_train)

# In-sample and held-out predictions from the same fitted model.
y_pred_train_ols, y_pred_test_ols = (
    ols_baseline.predict(X_part) for X_part in (X_full_train, X_full_test)
)

print(f"Baseline OLS - Train RMSE: {rmse(y_train, y_pred_train_ols):.4f}")
print(f"Baseline OLS - Test  RMSE: {rmse(y_test, y_pred_test_ols):.4f}")

Baseline OLS - Train RMSE: 0.0732
Baseline OLS - Test  RMSE: 0.0772


In [14]:
# "hypertuned" OLS:
# there's no real hyperparameter to tune for plain OLS,
# but we can do a 5-fold CV on the training data to estimate expected RMSE
# Shuffled 5-fold splitter with a fixed seed; the polynomial degree search
# further down reuses `kf`.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Refit a fresh OLS model on each training fold and score the held-out fold.
cv_rmse_scores = []
for train_idx, val_idx in kf.split(X_full_train):
    model = LinearRegression().fit(X_full_train.iloc[train_idx], y_train[train_idx])
    y_val_pred = model.predict(X_full_train.iloc[val_idx])
    cv_rmse_scores.append(rmse(y_train[val_idx], y_val_pred))

# Mean and (population) standard deviation of the per-fold RMSE.
cv_rmse_mean, cv_rmse_std = np.mean(cv_rmse_scores), np.std(cv_rmse_scores)
print(f"OLS 5-fold CV RMSE (train): {cv_rmse_mean:.4f} (std: {cv_rmse_std:.4f})")

OLS 5-fold CV RMSE (train): 0.0770 (std: 0.0011)


In [15]:
# Coefficient-based "feature importance" for OLS.
# Rank by |coefficient| but display the *signed* value, so the direction of
# each effect on log(resale_price) stays visible (printing .abs() hid it).
# NOTE(review): no scaling is applied in this notebook before fitting, so
# magnitudes may not be comparable across features on different scales.
ols_coefs = pd.Series(ols_baseline.coef_, index=feature_names_full)
top_ols_coefs = ols_coefs.sort_values(key=np.abs, ascending=False).head(10)
print("\nTop 10 OLS coefficients by absolute magnitude:")
print(top_ols_coefs)


Top 10 OLS coefficients by absolute magnitude:
flat_model_terrace                0.626531
postal_2digits_50                 0.267955
flat_model_2.room                 0.260992
flat_model_improved.maisonette    0.237784
town_BUKIT.TIMAH                  0.226117
postal_2digits_51                 0.209441
postal_2digits_18                 0.202498
flat_type_2.ROOM                  0.183313
postal_2digits_37                 0.173624
flat_model_type.s2                0.170621
dtype: float64


## Non-Linear Regression (Polynomial Terms on Selected Features)

In [16]:
# Baseline: degree-2 polynomial expansion of the selected features, then OLS.
# The expansion is fitted on the training split only and applied to both splits.
poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_small_train)
X_poly_train, X_poly_test = (
    poly.transform(X_part) for X_part in (X_small_train, X_small_test)
)

# Names of the expanded columns, in the order the transformer emits them.
poly_feature_names = poly.get_feature_names_out(feature_names_small)

poly_ols_baseline = LinearRegression().fit(X_poly_train, y_small_train)

y_pred_train_poly, y_pred_test_poly = (
    poly_ols_baseline.predict(X_part) for X_part in (X_poly_train, X_poly_test)
)

print(f"Baseline Poly (deg=2) - Train RMSE: {rmse(y_small_train, y_pred_train_poly):.4f}")
print(f"Baseline Poly (deg=2) - Test  RMSE: {rmse(y_small_test, y_pred_test_poly):.4f}")

Baseline Poly (deg=2) - Train RMSE: 0.1068
Baseline Poly (deg=2) - Test  RMSE: 0.1089


In [17]:
# Hyperparameter search: score polynomial degrees 1, 2 and 3 by 5-fold CV RMSE
# on the training split and keep the degree with the lowest mean RMSE.
# Relies on `kf` and `rmse` defined in earlier cells.
best_degree, best_cv_rmse = None, np.inf

for degree in (1, 2, 3):
    poly_tmp = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly_tmp = poly_tmp.fit_transform(X_small_train)

    # Per-fold validation RMSE for this degree.
    cv_rmse_scores = []
    for train_idx, val_idx in kf.split(X_poly_tmp):
        model = LinearRegression().fit(X_poly_tmp[train_idx], y_small_train[train_idx])
        y_val_pred = model.predict(X_poly_tmp[val_idx])
        cv_rmse_scores.append(rmse(y_small_train[val_idx], y_val_pred))

    mean_cv_rmse = np.mean(cv_rmse_scores)
    print(f"Degree {degree} Poly - 5-fold CV RMSE: {mean_cv_rmse:.4f}")

    # Strict improvement only: on a tie the lower (earlier) degree is kept.
    if not mean_cv_rmse < best_cv_rmse:
        continue
    best_cv_rmse, best_degree = mean_cv_rmse, degree

print(f"Best polynomial degree (by CV RMSE): {best_degree}")

Degree 1 Poly - 5-fold CV RMSE: 0.1205
Degree 2 Poly - 5-fold CV RMSE: 0.1079
Degree 3 Poly - 5-fold CV RMSE: 0.1022
Best polynomial degree (by CV RMSE): 3


In [18]:
# Final tuned model: refit on the whole training split using the degree
# selected by cross-validation, then score once on the held-out test split.
poly_best = PolynomialFeatures(degree=best_degree, include_bias=False).fit(X_small_train)
X_poly_train_best = poly_best.transform(X_small_train)
X_poly_test_best = poly_best.transform(X_small_test)

poly_ols_tuned = LinearRegression().fit(X_poly_train_best, y_small_train)

y_pred_test_poly_tuned = poly_ols_tuned.predict(X_poly_test_best)
print(f"Tuned Poly (deg={best_degree}) - Test RMSE: {rmse(y_small_test, y_pred_test_poly_tuned):.4f}")

Tuned Poly (deg=3) - Test RMSE: 0.1019
