In [1]:
'model selection and evaluation'
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

train_data = pd.read_csv('processed_train_data.csv')
test_data = pd.read_csv('processed_test_data.csv')

X_train = train_data.drop('log_price', axis=1)
y_train = train_data['log_price']

X_test = test_data.drop('log_price', axis=1)
y_test = test_data['log_price']

In [2]:
def get_r2(y_true_log, y_pred_log):
    y_true_actual = np.exp(y_true_log)
    y_pred_actual = np.exp(y_pred_log)
    return r2_score(y_true_actual, y_pred_actual)

In [3]:
'''
Trial 1
linear regression model
'''
model = LinearRegression()
model.fit(X_train, y_train)

y_pred_train_log = model.predict(X_train)
r2_train = get_r2(y_train, y_pred_train_log)

y_pred_test_log = model.predict(X_test)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"Training R2: {r2_train:.4f}")
print(f"Test R2: {r2_test:.4f}")

Training R2: 0.9139
Test R2: 0.9025


In [4]:
#Training R2: 0.9139
#Test R2: 0.9025

In [5]:
'''
Trial 2
polynomial regression model with degree of 2 
'''
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

y_pred_train_log = poly_model.predict(X_train_poly)
y_pred_test_log = poly_model.predict(X_test_poly)

r2_train = get_r2(y_train, y_pred_train_log)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"Training R2: {r2_train:.4f}")
print(f"Test R2: {r2_test:.4f}")

Training R2: 0.9425
Test R2: 0.9309


In [None]:
#Training R2: 0.9425
#Test R2: 0.9309

In [7]:
'''
Trial 3
Grid search on ridge regression with polynomial features
and different alpha values
'''
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge())
])

param_grid = {
    'poly__degree': [1, 2],
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='r2',
                           cv=3,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred_train_log = best_model.predict(X_train)
y_pred_test_log = best_model.predict(X_test)

r2_train = get_r2(y_train, y_pred_train_log)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"Best Params: {grid_search.best_params_}")
print(f"Train R2: {r2_train:.4f}")
print(f"Test R2: {r2_test:.4f}")

Best Params: {'poly__degree': 2, 'ridge__alpha': 1}
Train R2: 0.9407
Test R2: 0.9306


In [None]:
#Best Params: {'poly__degree': 2, 'ridge__alpha': 1}
#Train R2: 0.9407
#Test R2: 0.9306

In [9]:
'''
Trial 4
Grid search on XGBoost regression model with different n_estimators and learning_rate.
'''
import xgboost as xgb

param_grid = {
    'n_estimators': [10, 100, 500, 1000, 2000],
    'learning_rate': [0.1, 0.01, 0.001]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(estimator=xgb_model, 
                           param_grid=param_grid, 
                           scoring='r2', 
                           cv=3, 
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred_train_log = best_model.predict(X_train)
y_pred_test_log = best_model.predict(X_test)

r2_train = get_r2(y_train, y_pred_train_log)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"Best Params: {grid_search.best_params_}")
print(f"Train R2: {r2_train:.4f}")
print(f"Test R2: {r2_test:.4f}")

Best Params: {'learning_rate': 0.1, 'n_estimators': 500}
Train R2: 0.9806
Test R2: 0.9459


In [10]:
#Best Params: {'learning_rate': 0.1, 'n_estimators': 500}
#Train R2: 0.9806
#Test R2: 0.9459

In [11]:
'''
Trial 5
Grid search on random forest regression model with different n_estimators.
'''
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [10, 100, 1000]
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf_model, 
                           param_grid=param_grid, 
                           scoring='r2', 
                           cv=3, 
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

y_pred_train_log = best_model.predict(X_train)
y_pred_test_log = best_model.predict(X_test)

r2_train = get_r2(y_train, y_pred_train_log)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"Best Params: {grid_search.best_params_}")
print(f"Train R2: {r2_train:.4f}")
print(f"Test R2: {r2_test:.4f}")

Best Params: {'n_estimators': 1000}
Train R2: 0.9883
Test R2: 0.9361


In [12]:
#Best Params: {'n_estimators': 1000}
#Train R2: 0.9883
#Test R2: 0.9361

In [13]:
'''
Trial 6
Neural Network regression model with 2 hidden layers
'''
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

input_dim = X_train.shape[1]

nn_model = Sequential()
nn_model.add(Dense(64, activation='relu', input_shape=(input_dim,)))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(1)) 

nn_model.compile(optimizer='adam', loss='mse')
nn_model.fit(X_train, y_train, epochs=100, batch_size=64, verbose=0)

y_pred_train_log = nn_model.predict(X_train).flatten()
y_pred_test_log = nn_model.predict(X_test).flatten()

r2_train = get_r2(y_train, y_pred_train_log)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"Neural Network Train R2: {r2_train:.4f}")
print(f"Neural Network Test R2: {r2_test:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m270/270[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395us/step
[1m68/68[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 672us/step
Neural Network Train R2: 0.9446
Neural Network Test R2: 0.9278


In [None]:
#Neural Network Train R2: 0.9446
#Neural Network Test R2: 0.9278

In [15]:
'''
Based on the tested models, the XGBoost regression model with n_estimators=500 and learning_rate=0.1 got the highest R2 score on the test data set.
'''

'\nBased on the tested models, the XGBoost regression model with n_estimators=500 and learning_rate=0.1 got the highest R2 score on the test data set.\n'

In [16]:
#train the XGboost model with the best parameters and save it for deployment.
import pickle

xgb_model = xgb.XGBRegressor(objective='reg:squarederror', 
                             n_estimators=500, 
                             learning_rate=0.1, 
                             random_state=42)

xgb_model.fit(X_train, y_train)

y_pred_train_log = xgb_model.predict(X_train)
y_pred_test_log = xgb_model.predict(X_test)

r2_train = get_r2(y_train, y_pred_train_log)
r2_test = get_r2(y_test, y_pred_test_log)

print(f"XGBoost Train R2: {r2_train:.4f}")
print(f"XGBoost Test R2: {r2_test:.4f}")

model_filename = 'best_model.pkl'

with open(model_filename, 'wb') as file:
    pickle.dump(xgb_model, file)

print(f"XGBoost model successfully saved to {model_filename}")

XGBoost Train R2: 0.9806
XGBoost Test R2: 0.9459
XGBoost model successfully saved to best_model.pkl
