## Import Libraries

In [7]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

## Load and Prepare Data

In [5]:
# Path of the Excel workbook to load, relative to this notebook.
DATA_PATH = "../data/HDB_data_2021_sample.xlsx"

# Read the workbook, keep only rows that have a resale price, and attach the
# modelling target (natural log of the price) in one non-mutating chain.
df = (
    pd.read_excel(DATA_PATH)
    .dropna(subset=["resale_price"])
    .assign(log_resale_price=lambda frame: np.log(frame["resale_price"]))
)

# Columns excluded from the predictor matrix used by the penalised models
# and PCR: the raw price, the log target, and "year". Only the ones that are
# actually present in the frame are dropped.
drop_cols_full = [
    col
    for col in ("resale_price", "log_resale_price", "year")
    if col in df.columns
]

# Predictor matrix and target vector (the target is on the log scale).
X_full = df.drop(columns=drop_cols_full)
y = df["log_resale_price"].to_numpy()

# Column order of X_full, kept as a plain list of names.
feature_names_full = X_full.columns.to_list()

print("Number of observations:", len(X_full))
print("Number of predictors (full):", len(feature_names_full))

Number of observations: 6000
Number of predictors (full): 228


## Train-Test Split & Scaling

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# 80/20 partition reproducible between runs.
X_full_train, X_full_test, y_train, y_test = train_test_split(
    X_full,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardisation (important for penalised models and PCA).
# The scaler is fitted on the training rows only and then applied, unchanged,
# to both partitions so that no test-set information enters the scaling.
scaler = StandardScaler().fit(X_full_train)
X_full_train_scaled = scaler.transform(X_full_train)
X_full_test_scaled = scaler.transform(X_full_test)

# performance metrics
def rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mae(y_true, y_pred):
    """Return the mean absolute error between ``y_true`` and ``y_pred``.

    Thin wrapper that forwards both arguments to ``mean_absolute_error``.
    """
    return mean_absolute_error(y_true=y_true, y_pred=y_pred)

def r2(y_true, y_pred):
    """Return the R² score of ``y_pred`` against ``y_true``.

    Thin wrapper that forwards both arguments to ``r2_score``.
    """
    return r2_score(y_true=y_true, y_pred=y_pred)

## Hybrid Modeling

In [8]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# 1. Tune Elastic Net on y (the log resale price)
enet = ElasticNet(max_iter=5000, random_state=42)

# Candidate penalty strengths (alpha) and L1/L2 mixing weights (l1_ratio).
enet_param_grid = dict(
    alpha=[0.0005, 0.001, 0.01, 0.1],
    l1_ratio=[0.2, 0.5, 0.8],
)

# 5-fold grid search; the score is negated MSE, so larger is better.
# NOTE(review): X_full_train_scaled was standardised on the whole training
# split before this search, so every validation fold contributed to the
# scaling statistics. Wrapping scaler + model in a Pipeline would avoid that;
# confirm whether it matters for the reported CV choice.
enet_grid = GridSearchCV(
    enet,
    enet_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
enet_grid.fit(X_full_train_scaled, y_train)

# Estimator exposed by the search for its best parameter combination.
enet_best = enet_grid.best_estimator_
print("Best ENet params:", enet_grid.best_params_)

# ENet predictions for the training rows and the held-out rows.
y_train_enet, y_test_enet = (
    enet_best.predict(X) for X in (X_full_train_scaled, X_full_test_scaled)
)

# Test-set metrics; these are in log-price units because y is log(resale_price).
print("ENet only - Test RMSE:", rmse(y_test, y_test_enet))
print("ENet only - Test MAE :", mae(y_test, y_test_enet))
print("ENet only - Test R²  :", r2(y_test, y_test_enet))

# 2. Residuals: the part of y_train that the tuned ENet does not reproduce.
# NOTE(review): these are in-sample ENet errors (the ENet was fitted on the
# same rows); out-of-fold ENet predictions may give a less optimistic
# residual target. Confirm whether that is worth the extra fitting cost.
r_train = np.subtract(y_train, y_train_enet)

# 3. Tune XGBoost on residuals
xgb = XGBRegressor(objective="reg:squarederror", random_state=42)

# 2 x 2 x 2 x 2 = 16 candidate settings, each evaluated with 5-fold CV.
xgb_param_grid = dict(
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5],
    n_estimators=[200, 300],
    subsample=[0.8, 1.0],
)

xgb_grid = GridSearchCV(
    xgb,
    xgb_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
xgb_grid.fit(X_full_train_scaled, r_train)

# Estimator exposed by the search for its best parameter combination.
xgb_best = xgb_grid.best_estimator_
print("Best XGB params (residual model):", xgb_grid.best_params_)

# Residual predictions for the training rows and the held-out rows.
r_train_hat, r_test_hat = (
    xgb_best.predict(X) for X in (X_full_train_scaled, X_full_test_scaled)
)

# 4. Hybrid predictions: ENet prediction plus the predicted residual.
y_train_hybrid = np.add(y_train_enet, r_train_hat)
y_test_hybrid = np.add(y_test_enet, r_test_hat)

# Test-set metrics for the combined model (log-price units, as for the ENet).
print("\nHybrid ENet + XGBoost-on-residuals:")
print("Test RMSE:", rmse(y_test, y_test_hybrid))
print("Test MAE :", mae(y_test, y_test_hybrid))
print("Test R²  :", r2(y_test, y_test_hybrid))

Best ENet params: {'alpha': 0.0005, 'l1_ratio': 0.2}
ENet only - Test RMSE: 0.07711534832473048
ENet only - Test MAE : 0.05936008220666693
ENet only - Test R²  : 0.9431135064712413
Best XGB params (residual model): {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}

Hybrid ENet + XGBoost-on-residuals:
Test RMSE: 0.06155184253455592
Test MAE : 0.04585764464116451
Test R²  : 0.9637582106790289
