In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# Load the pre-split training and test sets from the processed-data folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/train.csv',
        '../../data/processed/test.csv',
    )
)

In [4]:
# Feature groups used by the model. `flat_model` holds string labels and is
# one-hot encoded; the numerical columns are handed to the model unchanged.
categorical_cols = ['flat_model']
numerical_cols = ['time', 'storey_avg', 'floor_area_sqm', 'flat_type_encoded', 'remaining_lease_months']

# Column-wise preprocessing. Transformer order matters: the transformed matrix
# holds the passthrough numerical columns first, then the one-hot columns.
# handle_unknown='ignore' encodes a category not seen during fit as all zeros
# instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Prepare X and y for both datasets. Both splits select the same columns in
# the same order so the fitted preprocessor applies identically to each.
X_train = train_data[numerical_cols + categorical_cols]
y_train = train_data['resale_price']
X_test = test_data[numerical_cols + categorical_cols]
y_test = test_data['resale_price']

# Preview the raw training frame. Left as the cell's last expression so Jupyter
# renders it as a table rather than line-wrapped plain text from print().
train_data.head()

   time                  address  storey_avg  floor_area_sqm  \
0    29   664C JURONG WEST ST 64           5            91.0   
1    12  524 CHOA CHU KANG ST 51           5           144.0   
2    43             87 DAWSON RD          23            65.0   
3     5      168 LOR 1 TOA PAYOH          11            65.0   
4    80  249 BT BATOK EAST AVE 5          11            73.0   

   flat_type_encoded         flat_model  remaining_lease_months  resale_price  
0                  4            Model A                     963      380000.0  
1                  6          Apartment                     913      437000.0  
2                  3  Premium Apartment                    1140      642888.0  
3                  3           Improved                     650      320000.0  
4                  3            Model A                     726      400000.0  


In [5]:
def evaluate_model(y_true, y_pred, model_name):
    """Print a short regression report and return the metrics behind it.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.
    model_name : str
        Label placed in the report header.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2)``, in that order.
    """
    mean_sq_err = mean_squared_error(y_true, y_pred)
    root_mean_sq_err = np.sqrt(mean_sq_err)
    mean_abs_err = mean_absolute_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    # Emit the whole report in one write; the text is the same as printing
    # each line separately.
    report_lines = (
        f"\n{model_name} Evaluation Metrics:",
        f"MSE: {mean_sq_err:.2f}",
        f"RMSE: {root_mean_sq_err:.2f}",
        f"MAE: {mean_abs_err:.2f}",
        f"R²: {r_squared:.4f}",
    )
    print("\n".join(report_lines))

    return mean_sq_err, root_mean_sq_err, mean_abs_err, r_squared

In [6]:
# Gradient-boosting regressor. random_state is pinned so that the row
# subsampling (subsample=0.8) gives the same model on every run.
gbm_regressor = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    subsample=0.8,
    random_state=42,
)

# Chain preprocessing and the regressor so both are fitted together and the
# same transformation is applied at prediction time.
gbm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gbm_regressor),
])

# Train the model
gbm_pipeline.fit(X_train, y_train)

# Make predictions for the training split first, then the test split
gbm_pred_train, gbm_pred_test = (
    gbm_pipeline.predict(features) for features in (X_train, X_test)
)

In [7]:
# Evaluate the model on both splits; each call prints a report and returns
# the tuple (mse, rmse, mae, r2).
print("\nTraining set evaluation:")
gbm_train_metrics = evaluate_model(y_train, gbm_pred_train, "Gradient Boosting")
print("\nTest set evaluation:")
gbm_test_metrics = evaluate_model(y_test, gbm_pred_test, "Gradient Boosting")

# Feature importance for GBM: recover the fitted steps from the pipeline.
gbm_model = gbm_pipeline.named_steps['regressor']
preprocessor_obj = gbm_pipeline.named_steps['preprocessor']

# Rebuild the transformed column names in the order the ColumnTransformer
# emits them: passthrough numerical columns first, then the one-hot columns.
# The one-hot names come from `categorical_cols` rather than a hard-coded
# list, so they stay correct if the categorical features are changed.
feature_names = (numerical_cols +
                 list(preprocessor_obj.named_transformers_['cat'].get_feature_names_out(categorical_cols)))


Training set evaluation:

Gradient Boosting Evaluation Metrics:
MSE: 6448629016.44
RMSE: 80303.36
MAE: 56383.70
R²: 0.8013

Test set evaluation:

Gradient Boosting Evaluation Metrics:
MSE: 6605149534.54
RMSE: 81272.07
MAE: 56953.99
R²: 0.7965


In [8]:
# Get feature importances (one score per transformed input column)
importances = gbm_model.feature_importances_

# The names and scores are matched purely by position, so a length mismatch
# means the labels are wrong. Fail loudly instead of printing a misleading or
# truncated ranking.
if len(feature_names) != len(importances):
    raise ValueError(
        f"feature_names has {len(feature_names)} entries but the model reports "
        f"{len(importances)} feature importances"
    )

# Column positions sorted from most to least important
indices = np.argsort(importances)[::-1]

# Print feature ranking
print("\nFeature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. {feature_names[feature_idx]} ({importances[feature_idx]:.4f})")


Feature ranking:
1. floor_area_sqm (0.4050)
2. time (0.2028)
3. storey_avg (0.1281)
4. flat_type_encoded (0.1204)
5. remaining_lease_months (0.0865)
6. flat_model_DBSS (0.0246)
7. flat_model_Type S1 (0.0069)
8. flat_model_Premium Apartment (0.0066)
9. flat_model_Terrace (0.0040)
10. flat_model_Type S2 (0.0031)
11. flat_model_Model A (0.0031)
12. flat_model_Improved (0.0027)
13. flat_model_Maisonette (0.0014)
14. flat_model_Standard (0.0013)
15. flat_model_Model A2 (0.0011)
16. flat_model_Premium Apartment Loft (0.0011)
17. flat_model_Model A-Maisonette (0.0004)
18. flat_model_Apartment (0.0003)
19. flat_model_Adjoined flat (0.0002)
20. flat_model_New Generation (0.0001)
21. flat_model_Simplified (0.0001)
22. flat_model_Improved-Maisonette (0.0001)
23. flat_model_Multi Generation (0.0001)
24. flat_model_3Gen (0.0001)
25. flat_model_Premium Maisonette (0.0000)
26. flat_model_2-room (0.0000)
