# HDB Resale Price Regression Models

**Models implemented:**
1. Linear Regression (OLS)
2. Non-linear Regression (polynomial terms for selected variables)
3. IV Regression (2SLS, exploratory)
4. LASSO
5. Ridge
6. Elastic Net
7. Stepwise Selection (forward, on a subset of features)

**Common settings:**
- 80/20 train-test split
- 5-fold Cross-Validation (CV) for hypertuning
- RMSE used as main metric

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

## Load and Prepare Data

In [2]:
DATA_PATH = "../data/HDB_data_2021_sample.xlsx"

df = pd.read_excel(DATA_PATH)

# drop rows with missing resale_price
df = df.dropna(subset=["resale_price"])

# define target: use log(price) for nicer regression properties
df["log_resale_price"] = np.log(df["resale_price"])

### Feature Selection for Models

In [3]:
# for "full" models (OLS, LASSO, Ridge, ENet)
# we'll use all columns except the target and year (year is constant 2021)
drop_cols_full = ["resale_price", "log_resale_price", "year"]
X_full = df.drop(columns=drop_cols_full)
y = df["log_resale_price"].values

feature_names_full = X_full.columns.tolist()

# for non-linear regression & stepwise & IV,
# we work with a smaller, interpretable subset to keep things manageable.
nonlinear_features = [
    "floor_area_sqm",
    "Remaining_lease",
    "max_floor_lvl",
    "mature",
    "Dist_CBD",
    "Dist_nearest_station",
    "Dist_nearest_hospital",
]

# keep only those columns that actually exist in the dataframe
nonlinear_features = [f for f in nonlinear_features if f in df.columns]

X_small = df[nonlinear_features].copy()
feature_names_small = X_small.columns.tolist()

## Train-Test Split (80/20)

In [4]:
X_full_train, X_full_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42
)

X_small_train, X_small_test, y_small_train, y_small_test = train_test_split(
    X_small, y, test_size=0.2, random_state=42
)

## Utility: Compute RMSE

In [5]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

## Linear Regression (OLS)

In [6]:
# baseline: simple OLS on full feature set
ols_baseline = LinearRegression()
ols_baseline.fit(X_full_train, y_train)

y_pred_train_ols = ols_baseline.predict(X_full_train)
y_pred_test_ols = ols_baseline.predict(X_full_test)

print(f"Baseline OLS - Train RMSE: {rmse(y_train, y_pred_train_ols):.4f}")
print(f"Baseline OLS - Test  RMSE: {rmse(y_test, y_pred_test_ols):.4f}")

Baseline OLS - Train RMSE: 0.0732
Baseline OLS - Test  RMSE: 0.0772


In [7]:
# "hypertuned" OLS:
# there's no real hyperparameter to tune for plain OLS,
# but we can do a 5-fold CV on the training data to estimate expected RMSE
kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_rmse_scores = []
for train_idx, val_idx in kf.split(X_full_train):
    X_tr, X_val = X_full_train.iloc[train_idx], X_full_train.iloc[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    model = LinearRegression()
    model.fit(X_tr, y_tr)
    y_val_pred = model.predict(X_val)
    cv_rmse_scores.append(rmse(y_val, y_val_pred))

print(f"OLS 5-fold CV RMSE (train): {np.mean(cv_rmse_scores):.4f} "
      f"(std: {np.std(cv_rmse_scores):.4f})")

OLS 5-fold CV RMSE (train): 0.0770 (std: 0.0011)


In [8]:
# coefficient-based "feature importance" for OLS
ols_coefs = pd.Series(ols_baseline.coef_, index=feature_names_full)
print("\nTop 10 OLS coefficients by absolute magnitude:")
print(ols_coefs.abs().sort_values(ascending=False).head(10))


Top 10 OLS coefficients by absolute magnitude:
flat_model_terrace                0.626531
postal_2digits_50                 0.267955
flat_model_2.room                 0.260992
flat_model_improved.maisonette    0.237784
town_BUKIT.TIMAH                  0.226117
postal_2digits_51                 0.209441
postal_2digits_18                 0.202498
flat_type_2.ROOM                  0.183313
postal_2digits_37                 0.173624
flat_model_type.s2                0.170621
dtype: float64


## Non-Linear Regression (Polynomial Terms on Selected Features)

In [9]:
# baseline: degree=2 polynomial on selected features, Linear Regression
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly_train = poly.fit_transform(X_small_train)
X_poly_test = poly.transform(X_small_test)

poly_feature_names = poly.get_feature_names_out(feature_names_small)

poly_ols_baseline = LinearRegression()
poly_ols_baseline.fit(X_poly_train, y_small_train)

y_pred_train_poly = poly_ols_baseline.predict(X_poly_train)
y_pred_test_poly = poly_ols_baseline.predict(X_poly_test)

print(f"Baseline Poly (deg=2) - Train RMSE: {rmse(y_small_train, y_pred_train_poly):.4f}")
print(f"Baseline Poly (deg=2) - Test  RMSE: {rmse(y_small_test, y_pred_test_poly):.4f}")

Baseline Poly (deg=2) - Train RMSE: 0.1068
Baseline Poly (deg=2) - Test  RMSE: 0.1089


In [10]:
# hypertuned: try degrees 1, 2, 3 with 5-fold CV
best_degree = None
best_cv_rmse = np.inf

for degree in [1, 2, 3]:
    poly_tmp = PolynomialFeatures(degree=degree, include_bias=False)
    X_poly_tmp = poly_tmp.fit_transform(X_small_train)

    cv_rmse_scores = []

    for train_idx, val_idx in kf.split(X_poly_tmp):
        X_tr, X_val = X_poly_tmp[train_idx], X_poly_tmp[val_idx]
        y_tr, y_val = y_small_train[train_idx], y_small_train[val_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        y_val_pred = model.predict(X_val)
        cv_rmse_scores.append(rmse(y_val, y_val_pred))

    mean_cv_rmse = np.mean(cv_rmse_scores)
    print(f"Degree {degree} Poly - 5-fold CV RMSE: {mean_cv_rmse:.4f}")

    if mean_cv_rmse < best_cv_rmse:
        best_cv_rmse = mean_cv_rmse
        best_degree = degree

print(f"Best polynomial degree (by CV RMSE): {best_degree}")

Degree 1 Poly - 5-fold CV RMSE: 0.1205
Degree 2 Poly - 5-fold CV RMSE: 0.1079
Degree 3 Poly - 5-fold CV RMSE: 0.1022
Best polynomial degree (by CV RMSE): 3


In [11]:
# fit final tuned model with best degree
poly_best = PolynomialFeatures(degree=best_degree, include_bias=False)
X_poly_train_best = poly_best.fit_transform(X_small_train)
X_poly_test_best = poly_best.transform(X_small_test)

poly_ols_tuned = LinearRegression()
poly_ols_tuned.fit(X_poly_train_best, y_small_train)

y_pred_test_poly_tuned = poly_ols_tuned.predict(X_poly_test_best)
print(f"Tuned Poly (deg={best_degree}) - Test RMSE: {rmse(y_small_test, y_pred_test_poly_tuned):.4f}")

Tuned Poly (deg=3) - Test RMSE: 0.1019


## IV Regression (2SLS, Exploratory)

In [12]:
"""
IMPORTANT:
This is a SIMPLE 2SLS implementation WITHOUT statsmodels.

You must define:
- endog_var: the (suspected) endogenous regressor
- instrument_vars: list of instruments
- exog_vars: other exogenous controls

Below is just an EXAMPLE SETUP - you should adjust to what makes sense
economically for your project.
"""

# example variable choices – EDIT to match your actual IV story
endog_var = "Remaining_lease"          # endogenous regressor candidate
instrument_vars = ["max_floor_lvl"]    # instrument(s) – purely illustrative!
base_exog_vars = ["floor_area_sqm", "mature", "Dist_CBD", "Dist_nearest_station"]

# keep only variables that exist
iv_all_vars = [endog_var] + instrument_vars + base_exog_vars + ["log_resale_price"]
iv_all_vars = [v for v in iv_all_vars if v in df.columns]

iv_data = df[iv_all_vars].dropna()

if endog_var not in iv_data.columns or not all(z in iv_data.columns for z in instrument_vars):
    print("IV example not run: check that endog and instrument columns exist in df.")
else:
    # define y, endogenous regressor, instruments, and exog controls
    y_iv = iv_data["log_resale_price"].values
    endog = iv_data[[endog_var]]                      # (n, 1)
    Z = iv_data[instrument_vars]                      # instruments
    exog = iv_data[[v for v in base_exog_vars if v in iv_data.columns]]

    # 2a. first stage: endog ~ instruments + exog
    X_first = pd.concat([Z, exog], axis=1)
    first_stage_model = LinearRegression()
    first_stage_model.fit(X_first, endog)

    endog_hat = first_stage_model.predict(X_first)    # predicted Remaining_lease

    print("\nFirst-stage coefficients (endogenous on instruments + exog):")
    first_stage_coefs = pd.Series(
        first_stage_model.coef_.ravel(),
        index=X_first.columns
    )
    print(first_stage_coefs.sort_values(ascending=False))

    # 2b. second stage: y ~ predicted_endog + exog
    X_second = pd.concat(
        [pd.Series(endog_hat.ravel(), name=f"{endog_var}_hat"), exog.reset_index(drop=True)],
        axis=1
    )

    second_stage_model = LinearRegression()
    second_stage_model.fit(X_second, y_iv)

    y_iv_pred = second_stage_model.predict(X_second)
    iv_rmse = rmse(y_iv, y_iv_pred)

    print("\nSecond-stage 2SLS (manual) results:")
    print(f"2SLS In-sample RMSE: {iv_rmse:.4f}")

    second_stage_coefs = pd.Series(
        second_stage_model.coef_,
        index=X_second.columns
    )
    print("\n2SLS coefficients:")
    print(second_stage_coefs.sort_values(ascending=False))

    print(f"\n2SLS intercept: {second_stage_model.intercept_:.4f}")


First-stage coefficients (endogenous on instruments + exog):
max_floor_lvl           1.082473
Dist_CBD                1.033363
floor_area_sqm          0.001078
Dist_nearest_station   -4.292930
mature                 -6.489360
dtype: float64

Second-stage 2SLS (manual) results:
2SLS In-sample RMSE: 0.1558

2SLS coefficients:
mature                  0.192252
Remaining_lease_hat     0.016818
floor_area_sqm          0.009868
Dist_nearest_station   -0.020500
Dist_CBD               -0.026934
dtype: float64

2SLS intercept: 11.0784


## Penalised Regressions: LASSO, Ridge, Elastic Net

In [13]:
# standardise features for penalised models
scaler = StandardScaler()
X_full_train_scaled = scaler.fit_transform(X_full_train)
X_full_test_scaled = scaler.transform(X_full_test)

# helper: run baseline & tuned penalised model with GridSearchCV
def run_penalised_model(model_name, base_model, param_grid):
    """
    model_name: str ("LASSO", "Ridge", "ElasticNet")
    base_model: sklearn estimator (Lasso, Ridge, ElasticNet)
    param_grid: dict of hyperparameters for GridSearchCV
    """
    print(f"\n--- {model_name} ---")

    # baseline: fit with default hyperparameters
    base_model.fit(X_full_train_scaled, y_train)
    y_pred_test_base = base_model.predict(X_full_test_scaled)
    print(f"{model_name} Baseline - Test RMSE: {rmse(y_test, y_pred_test_base):.4f}")

    # hypertuned with 5-fold CV
    grid = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",  # we'll take sqrt later
        cv=5,
        n_jobs=-1
    )
    grid.fit(X_full_train_scaled, y_train)

    best_model = grid.best_estimator_
    y_pred_test_best = best_model.predict(X_full_test_scaled)
    best_rmse = rmse(y_test, y_pred_test_best)

    print(f"{model_name} Tuned - Best Params: {grid.best_params_}")
    print(f"{model_name} Tuned - Test RMSE:  {best_rmse:.4f}")

    # feature importance = absolute value of coefficients
    coefs = pd.Series(best_model.coef_, index=feature_names_full)
    print(f"\nTop 10 {model_name} coefficients by absolute magnitude:")
    print(coefs.abs().sort_values(ascending=False).head(10))

    return best_model

### LASSO

In [14]:
lasso_base = Lasso(max_iter=5000, random_state=42)
lasso_param_grid = {"alpha": [0.0005, 0.001, 0.01, 0.1, 1.0]}
best_lasso = run_penalised_model("LASSO", lasso_base, lasso_param_grid)


--- LASSO ---
LASSO Baseline - Test RMSE: 0.3236
LASSO Tuned - Best Params: {'alpha': 0.0005}
LASSO Tuned - Test RMSE:  0.0774

Top 10 LASSO coefficients by absolute magnitude:
floor_area_sqm           0.178885
Remaining_lease          0.154745
Dist_CBD                 0.062814
mature                   0.031116
Dist_nearest_GHawker     0.030710
storey_range_01.TO.03    0.027989
flat_type_3.ROOM         0.027887
max_floor_lvl            0.026193
Dist_nearest_station     0.025844
postal_2digits_44        0.024547
dtype: float64


### Ridge

In [15]:
ridge_base = Ridge(random_state=42)
ridge_param_grid = {"alpha": [0.1, 1.0, 10.0, 100.0]}
best_ridge = run_penalised_model("Ridge", ridge_base, ridge_param_grid)


--- Ridge ---
Ridge Baseline - Test RMSE: 0.0773
Ridge Tuned - Best Params: {'alpha': 1.0}
Ridge Tuned - Test RMSE:  0.0773

Top 10 Ridge coefficients by absolute magnitude:
floor_area_sqm             0.162985
Remaining_lease            0.153364
Dist_CBD                   0.097727
Dist_nearest_GAI_jc        0.073005
Dist_nearest_university    0.071881
Dist_nearest_jc            0.042982
flat_type_3.ROOM           0.036832
Dist_nearest_G_jc          0.034055
Dist_nearest_station       0.034002
flat_type_5.ROOM           0.030627
dtype: float64


### Elastic Net

In [16]:
enet_base = ElasticNet(max_iter=5000, random_state=42)
enet_param_grid = {
    "alpha": [0.0005, 0.001, 0.01, 0.1, 1.0],
    "l1_ratio": [0.2, 0.5, 0.8]
}
best_enet = run_penalised_model("ElasticNet", enet_base, enet_param_grid)


--- ElasticNet ---
ElasticNet Baseline - Test RMSE: 0.3236
ElasticNet Tuned - Best Params: {'alpha': 0.0005, 'l1_ratio': 0.2}
ElasticNet Tuned - Test RMSE:  0.0771

Top 10 ElasticNet coefficients by absolute magnitude:
floor_area_sqm             0.165953
Remaining_lease            0.153395
Dist_CBD                   0.086982
Dist_nearest_GAI_jc        0.040341
mature                     0.037534
Dist_nearest_university    0.034679
flat_type_3.ROOM           0.033808
Dist_nearest_station       0.031461
flat_type_5.ROOM           0.030557
storey_range_01.TO.03      0.028541
dtype: float64


## Stepwise Selection (Forward) on a Subset of Variables

In [17]:
"""
We:
- start with no predictors,
- at each step, try adding each remaining candidate variable,
- for each tentative model, evaluate 5-fold CV RMSE,
- add the variable that gives the biggest RMSE improvement,
- stop when no variable improves RMSE.

We use:
- X_small_train, y_small_train  (subset of interpretable features)
- X_small_test,  y_small_test   (to evaluate the final selected model)
"""

# use the same small feature set as before
candidate_features = list(X_small_train.columns)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

def cv_rmse_for_features(X, y, features, kf):
    """
    compute K-fold CV RMSE for LinearRegression using
    only the given list of features.
    """
    if len(features) == 0:
        # No features: predict mean
        y_mean = np.mean(y)
        return np.sqrt(np.mean((y - y_mean) ** 2))

    X_sub = X[features].values
    rmse_scores = []

    for train_idx, val_idx in kf.split(X_sub):
        X_tr, X_val = X_sub[train_idx], X_sub[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        model = LinearRegression()
        model.fit(X_tr, y_tr)
        y_val_pred = model.predict(X_val)
        rmse_scores.append(np.sqrt(np.mean((y_val - y_val_pred) ** 2)))

    return np.mean(rmse_scores)


def forward_stepwise_cv(X, y, candidate_features, kf, tol=1e-4, verbose=True):
    """
    forward stepwise selection based on K-fold CV RMSE.

    tol: minimum RMSE improvement required to keep adding variables.
    """
    selected = []
    remaining = list(candidate_features)
    current_best_rmse = cv_rmse_for_features(X, y, selected, kf)
    if verbose:
        print(f"Start: no features, CV RMSE = {current_best_rmse:.4f}")

    while len(remaining) > 0:
        best_feature = None
        best_rmse = current_best_rmse

        # try adding each remaining variable and compute CV RMSE
        for feat in remaining:
            trial_features = selected + [feat]
            trial_rmse = cv_rmse_for_features(X, y, trial_features, kf)

            if verbose:
                print(f"  Try adding {feat:>25s} -> CV RMSE = {trial_rmse:.4f}")

            if trial_rmse + tol < best_rmse:
                best_rmse = trial_rmse
                best_feature = feat

        # if we found a feature that improves RMSE, add it
        if best_feature is not None:
            selected.append(best_feature)
            remaining.remove(best_feature)
            current_best_rmse = best_rmse
            if verbose:
                print(f"--> Added {best_feature}, new best CV RMSE = {current_best_rmse:.4f}\n")
        else:
            # no further improvement
            if verbose:
                print("No further improvement in CV RMSE. Stopping forward selection.")
            break

    return selected, current_best_rmse

# run forward stepwise on the training set
selected_features, final_cv_rmse = forward_stepwise_cv(
    X_small_train,
    y_small_train,
    candidate_features,
    kf,
    tol=1e-4,
    verbose=True
)

print("\nSelected features by forward stepwise (CV RMSE):")
print(selected_features)
print(f"Final CV RMSE (train): {final_cv_rmse:.4f}")

# fit final model on train, evaluate on test
step_model = LinearRegression()
step_model.fit(X_small_train[selected_features], y_small_train)

y_step_train_pred = step_model.predict(X_small_train[selected_features])
y_step_test_pred  = step_model.predict(X_small_test[selected_features])

print(f"\nStepwise (CV-based) - Train RMSE: {rmse(y_small_train, y_step_train_pred):.4f}")
print(f"Stepwise (CV-based) - Test  RMSE: {rmse(y_small_test,  y_step_test_pred):.4f}")

Start: no features, CV RMSE = 0.3180
  Try adding            floor_area_sqm -> CV RMSE = 0.2324
  Try adding           Remaining_lease -> CV RMSE = 0.2915
  Try adding             max_floor_lvl -> CV RMSE = 0.2796
  Try adding                    mature -> CV RMSE = 0.3140
  Try adding                  Dist_CBD -> CV RMSE = 0.3116
  Try adding      Dist_nearest_station -> CV RMSE = 0.3160
  Try adding     Dist_nearest_hospital -> CV RMSE = 0.3163
--> Added floor_area_sqm, new best CV RMSE = 0.2324

  Try adding           Remaining_lease -> CV RMSE = 0.2038
  Try adding             max_floor_lvl -> CV RMSE = 0.1743
  Try adding                    mature -> CV RMSE = 0.2198
  Try adding                  Dist_CBD -> CV RMSE = 0.2035
  Try adding      Dist_nearest_station -> CV RMSE = 0.2278
  Try adding     Dist_nearest_hospital -> CV RMSE = 0.2236
--> Added max_floor_lvl, new best CV RMSE = 0.1743

  Try adding           Remaining_lease -> CV RMSE = 0.1677
  Try adding                    

In [18]:
# coefficients for interpretation
step_coefs = pd.Series(step_model.coef_, index=selected_features)
print("\nStepwise model coefficients:")
print(step_coefs.sort_values(ascending=False))
print(f"\nStepwise intercept: {step_model.intercept_:.4f}")


Stepwise model coefficients:
mature                   0.150569
floor_area_sqm           0.009856
Remaining_lease          0.009302
max_floor_lvl            0.008149
Dist_nearest_hospital   -0.006646
Dist_CBD                -0.017627
Dist_nearest_station    -0.055020
dtype: float64

Stepwise intercept: 11.4549
