<img style="float: right;" src="img/openhouse_logo.png" width="200" height="200"/><br>

# <center> <ins> House Price Prediction Coding Test <br><br> 5. Train and Test Model <ins> </center>
### <center>by: Daniel Lachner-Piza, PhD <br> for: OpenHouse.ai </center>




In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import optuna

from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, root_mean_squared_error
from xgboost import XGBRegressor

EDA_FIG_SIZE = (3,3)
models_path = Path("models")
os.makedirs(models_path, exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


# <ins> 1.Data Loading <ins> 

In [2]:
clean_scaled_data_df = pd.read_csv("data/clean_scaled_dataset.csv")
y_df = pd.read_csv("data/target.csv")

In [3]:
clean_scaled_data_df.head(5)

Unnamed: 0,LotArea,GrLivArea,YearBuilt,TotalBsmtSF,GarageArea,OverallQuality,OverallCondition,FullBath,HalfBath,GarageCars,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.174981,-0.357075,-0.46958,-0.954556,0.210489,-0.130274,0.389367,-1.056176,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
1,-0.888561,-0.868571,-0.096071,-1.011578,-1.154649,-1.624231,2.230762,-1.056176,-0.785264,-1.382694,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
2,0.3747,0.31313,1.160276,-0.409426,0.906547,-0.130274,-0.531331,0.77161,1.205462,1.80227,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
3,-0.340233,-0.496739,-0.809133,-2.448532,-0.431612,-1.624231,-0.531331,0.77161,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,0.388242,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354
4,-0.253258,-0.026053,-0.843088,1.107358,1.683542,-0.877252,0.389367,-1.056176,-0.785264,0.209788,...,-0.0548,-0.303967,-0.027369,-2.575711,-0.257373,-0.027369,-0.091115,-0.123278,0.45722,-0.308354


In [4]:
y_df.head(5)

Unnamed: 0,SalePrice
0,159500
1,86000
2,214000
3,90000
4,153500


# <ins> 2. Partition Data <ins> 

In [5]:
# X_train_valid, X_test, y_train_valid, y_test = train_test_split(clean_scaled_data_df, y_df, test_size=0.20, stratify=y, random_state=42)
# X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.40, stratify=y_train_valid, random_state=42)

X_train_valid, X_test, y_train_valid, y_test = train_test_split(clean_scaled_data_df, y_df, test_size=0.20, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=0.20, random_state=42)

In [6]:
print(f"Train-set size: {X_train.shape} --- Perecentage: {100*X_train.shape[0]/clean_scaled_data_df.shape[0]:.2f}")
print(f"Validation-set size: {X_valid.shape} --- Perecentage: {100*X_valid.shape[0]/clean_scaled_data_df.shape[0]:.2f}")
print(f"Test-set size: {X_test.shape} --- Perecentage: {100*X_test.shape[0]/clean_scaled_data_df.shape[0]:.2f}")

Train-set size: (854, 61) --- Perecentage: 63.92
Validation-set size: (214, 61) --- Perecentage: 16.02
Test-set size: (268, 61) --- Perecentage: 20.06


# <ins>  3. Train Simple Linear Regressor <ins> 

In [7]:
lr_model = LinearRegression().fit(X_train_valid, y_train_valid)
y_pred_test = lr_model.predict(X_test)
r2_value = r2_score(y_test, y_pred_test)
rmse_value = root_mean_squared_error(y_test, y_pred_test)

In [8]:
print(f"R2 Score: {r2_value}")
print(f"RMSE Score: {rmse_value}")

R2 Score: 0.7551370221609158
RMSE Score: 40124.16442334738


# <center> Using a simple linear regression model,<br> the sale price prediction deviates on average <br> <ins> +/- $40k </ins><br>from the real sale price.

# <ins>  4. Train XGBoost Regressor <ins> 

In [9]:
# Objective function for hyperparameter tuning of XGBRegressor
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):
    
    params = {
    "objective":"reg:squarederror",
    "max_depth": trial.suggest_int("max_depth", 10, 50, step=5),
    "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=50),
    "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1, log=False),
    "subsample": trial.suggest_float("subsample", 0.05, 1.0),
    "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
    "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
    }
   
    model = XGBRegressor(**params,random_state=42).fit(X_train, y_train)
    y_predicted = model.predict(X_valid)
    r2 = r2_score(y_valid, y_predicted)
    
    return r2

In [10]:
# XGBoost
study_xgb = optuna.create_study(direction = "maximize")
func = lambda trial: objective_xgb(trial, X_train, X_valid, y_train, y_valid)
study_xgb.optimize(func, n_trials = 100, timeout=600)
xgb_model = XGBRegressor(**study_xgb.best_trial.params).fit(X_train_valid, y_train_valid)

[I 2025-04-19 15:43:32,997] A new study created in memory with name: no-name-7755fc2c-bda5-4650-a31f-c3634e7b5d2a
[I 2025-04-19 15:43:33,203] Trial 0 finished with value: 0.8268437385559082 and parameters: {'max_depth': 50, 'n_estimators': 250, 'learning_rate': 0.014643219773982287, 'subsample': 0.11596933277453453, 'colsample_bytree': 0.6449368991225286, 'min_child_weight': 16}. Best is trial 0 with value: 0.8268437385559082.
[I 2025-04-19 15:43:33,690] Trial 1 finished with value: 0.8082088828086853 and parameters: {'max_depth': 15, 'n_estimators': 250, 'learning_rate': 0.22798254363102835, 'subsample': 0.24651677238228786, 'colsample_bytree': 0.4428274328796406, 'min_child_weight': 5}. Best is trial 0 with value: 0.8268437385559082.
[I 2025-04-19 15:43:33,871] Trial 2 finished with value: 0.6883785724639893 and parameters: {'max_depth': 15, 'n_estimators': 200, 'learning_rate': 0.8911279494774239, 'subsample': 0.6593359861756388, 'colsample_bytree': 0.7001522968198016, 'min_child_we

In [11]:
y_pred_test = xgb_model.predict(X_test)
r2_value = r2_score(y_test, y_pred_test)
rmse_value = root_mean_squared_error(y_test, y_pred_test)

In [12]:
print(f"R2 Score: {r2_value}")
print(f"RMSE Score: {rmse_value}")

R2 Score: 0.8529456853866577
RMSE Score: 31094.484375


# <center> Using an XGBoost Regressor,<br> the sale price prediction deviates on average <br> <ins> +/- $31k </ins><br>from the real sale price.

# 5. Save Models

In [None]:
lr_path = models_path / "LinearRegressorModel.bin"
joblib.dump(lr_model, lr_path compress=True)

In [None]:
xgb_path = models_path / "XGBRegressorModel.json"
xgb_model.save_model(xgb_path)