In [1]:
import pandas as pd

# Load the already-processed train/test splits (paths are relative to the
# notebook's working directory).
train_data = pd.read_csv('../../data/processed/train.csv')
test_data = pd.read_csv('../../data/processed/test.csv')

# One-hot encode `flat_model` into `model_<value>` indicator columns.
# The dtype is pinned to int because the pandas default differs between
# versions (bool since 2.0, uint8 before) and the columns back-filled below
# are integer zeros; pinning keeps every dummy column the same dtype.
train_data = pd.get_dummies(train_data, columns=['flat_model'], prefix='model', dtype=int)
test_data = pd.get_dummies(test_data, columns=['flat_model'], prefix='model', dtype=int)

# Give the test frame exactly the dummy columns seen in train:
#   * categories present only in train are added, filled with 0;
#   * categories present only in test are dropped.
# All non-dummy test columns are kept untouched. A single reindex replaces the
# previous add-loop + in-place drop, so the cell rebinds `test_data` instead of
# mutating it.
train_model_columns = [col for col in train_data.columns if col.startswith('model_')]
test_other_columns = [col for col in test_data.columns if not col.startswith('model_')]
test_data = test_data.reindex(columns=test_other_columns + train_model_columns, fill_value=0)

# Target to predict, and the predictors used for it: a fixed list of numeric
# columns followed by every one-hot `model_*` column found in the train frame.
target_column = 'resale_price'
numeric_features = [
    'time',
    'storey_avg',
    'floor_area_sqm',
    'flat_type_encoded',
    'remaining_lease_months',
]
model_columns = train_data.columns[train_data.columns.str.startswith('model_')].tolist()
feature_columns = [*numeric_features, *model_columns]

# Split each frame into its feature matrix and target vector; selecting by the
# same `feature_columns` list gives train and test an identical column order.
X_train, y_train = train_data[feature_columns], train_data[target_column]
X_test, y_test = test_data[feature_columns], test_data[target_column]

print("Feature Columns Used:", feature_columns, sep="\n")


Feature Columns Used:
['time', 'storey_avg', 'floor_area_sqm', 'flat_type_encoded', 'remaining_lease_months', 'model_2-room', 'model_3Gen', 'model_Adjoined flat', 'model_Apartment', 'model_DBSS', 'model_Improved', 'model_Improved-Maisonette', 'model_Maisonette', 'model_Model A', 'model_Model A-Maisonette', 'model_Model A2', 'model_Multi Generation', 'model_New Generation', 'model_Premium Apartment', 'model_Premium Apartment Loft', 'model_Premium Maisonette', 'model_Simplified', 'model_Standard', 'model_Terrace', 'model_Type S1', 'model_Type S2']


In [2]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Discrete search space sampled by the randomized search below.
param_distributions = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[3, 4, 5, 6, 8],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
)

# Base regressor; the search supplies the tuned hyper-parameters per candidate.
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Try 20 sampled configurations with 3-fold cross-validation, scored by
# negative MAE (higher is better), running fits on all available cores.
# Both the regressor and the sampler are seeded with 42.
search_options = {
    'n_iter': 20,
    'scoring': 'neg_mean_absolute_error',
    'cv': 3,
    'verbose': 1,
    'random_state': 42,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(xgb_model, param_distributions, **search_options)
random_search.fit(X_train, y_train)

print("Best Parameters from RandomizedSearchCV:", random_search.best_params_, sep="\n")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters from RandomizedSearchCV:
{'subsample': 0.8, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.2, 'colsample_bytree': 1.0}


In [3]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Take the estimator selected by the search and predict on the test features.
best_xgb_model = random_search.best_estimator_
xgb_predictions = best_xgb_model.predict(X_test)

# Error metrics against the true test targets; RMSE is the square root of MSE.
xgb_mae = mean_absolute_error(y_test, xgb_predictions)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xgb_predictions))

# Store the predictions as a new column on the test frame.
test_data['xgb_predicted_price'] = xgb_predictions

print("\nXGBoost Model Evaluation:")
for metric_label, metric_value in (
    ("Mean Absolute Error (MAE):", xgb_mae),
    ("Root Mean Squared Error (RMSE):", xgb_rmse),
):
    print(metric_label, metric_value)



XGBoost Model Evaluation:
Mean Absolute Error (MAE): 53670.33516329242
Root Mean Squared Error (RMSE): 77600.80163564642
