In [1]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import warnings
import statsmodels.api as sm
import math
import seaborn as sns
import time 
import random
from scipy.interpolate import LSQUnivariateSpline as Spline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, KFold
import pickle
import xgboost as xgb

In [2]:
# Load data
train = pd.read_csv("../data/train_data.csv")
test = pd.read_csv("../data/test_data.csv")
train.head()

Unnamed: 0,Bids,Views,Make,Mileage,Exterior Color,Interior Color,Gears,Engine_Displacement_L,2_keys_ind,is_dry_climate_car,...,auction_month_3,auction_month_4,auction_month_5,auction_month_6,auction_month_7,auction_month_8,auction_month_9,auction_month_10,auction_month_11,auction_month_12
0,37.0,13056.0,91,24100.0,13,1,5.0,2.0,0,0,...,False,False,False,False,False,False,True,False,False,False
1,52.0,15252.0,8,121000.0,11,1,6.0,2.7,1,1,...,False,True,False,False,False,False,False,False,False,False
2,30.0,13251.430852,86,35000.0,11,5,5.0,5.4,0,0,...,False,False,False,False,False,False,False,True,False,False
3,17.0,13907.0,52,35800.0,1,1,4.0,2.4,1,1,...,False,True,False,False,False,False,False,False,False,False
4,30.0,10075.0,16,96800.0,2,13,5.0,2.7,0,0,...,False,False,False,False,True,False,False,False,False,False


In [3]:
# Data Splitting
pd.set_option('display.max_columns', None)

X_train = train.drop(columns='Sold_Price')
X_test = test.drop(columns='Sold_Price')
y_train = train['Sold_Price']
y_test = test['Sold_Price']
X_train.head()

del train
del test

In [4]:
# 2. Set up Cross-Validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Scikit-learn GridSearch attempts to *maximize* the score. 
# Therefore, error metrics must be negative (closer to 0 is better).
scoring_metric = 'neg_mean_absolute_percentage_error'

In [5]:
# ==========================================
# 3. Model 1: Random Forest Grid Search
# ==========================================
print("--- Starting Random Forest Grid Search ---")
rf = RandomForestRegressor(random_state=42)

# Define the hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 300],         # Number of trees
    'max_depth': [10, 20, None],        # Depth of the trees (None = fully expanded)
    'min_samples_split': [2, 5, 10]     # Minimum samples required to split an internal node
}

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=kf,
    scoring=scoring_metric,
    n_jobs=2,                          # Use all available CPU cores to speed up training
    verbose=1
)

rf_grid.fit(X_train, y_train)

# Multiply by -100 to convert the negative decimal back to a readable percentage
rf_best_mape = -rf_grid.best_score_ * 100
print(f"Best RF Parameters: {rf_grid.best_params_}")
print(f"Best RF CV MAPE: {rf_best_mape:.2f}%\n")

--- Starting Random Forest Grid Search ---
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best RF Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 300}
Best RF CV MAPE: 26.64%



In [6]:
# ==========================================
# 4. Model 2: XGBoost Grid Search
# ==========================================
print("--- Starting XGBoost Grid Search ---")
xgbr = xgb.XGBRegressor(objective='reg:squaredlogerror', random_state=42)

# Define the hyperparameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8],
    'subsample': [0.8],                 # Kept fixed to prevent overfitting
    'colsample_bytree': [0.8]           # Kept fixed to prevent overfitting
}

xgb_grid = GridSearchCV(
    estimator=xgbr,
    param_grid=xgb_param_grid,
    cv=kf,
    scoring=scoring_metric,
    n_jobs=2,
    verbose=1
)

xgb_grid.fit(X_train, y_train)

xgb_best_mape = -xgb_grid.best_score_ * 100
print(f"Best XGB Parameters: {xgb_grid.best_params_}")
print(f"Best XGB CV MAPE: {xgb_best_mape:.2f}%\n")

--- Starting XGBoost Grid Search ---
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best XGB Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 300, 'subsample': 0.8}
Best XGB CV MAPE: 97.98%



In [7]:
# ==========================================
# 5. Model Selection & Final Evaluation
# ==========================================
print("--- Model Selection ---")
# The lower MAPE wins
if rf_best_mape < xgb_best_mape:
    print(f"Winning Model: Random Forest (Beats XGBoost by {xgb_best_mape - rf_best_mape:.2f}%)")
    best_model = rf_grid.best_estimator_
else:
    print(f"Winning Model: XGBoost (Beats Random Forest by {rf_best_mape - xgb_best_mape:.2f}%)")
    best_model = xgb_grid.best_estimator_

print("\nEvaluating the Winning Model on the Hold-Out Test Set...")
# This is the final check against data leakage. The model has never seen X_test during CV or training.
y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f"Test Set MAE: ${mae:,.2f}")
print(f"Test Set RMSE: ${rmse:,.2f}")
print(f"Test Set MAPE: {mape * 100:.2f}%")

--- Model Selection ---
Winning Model: Random Forest (Beats XGBoost by 71.33%)

Evaluating the Winning Model on the Hold-Out Test Set...
Test Set MAE: $6,287.67
Test Set RMSE: $15,673.25
Test Set MAPE: 28.46%

Winning model successfully saved to ../deployment/best_model_artifacts.pkl


In [8]:
# Saving model and columns
artifacts = {
    "model": best_model,
    "training_columns": X_train.columns.tolist(),
    "model_type": "Random Forest" if best_model == rf_grid.best_estimator_ else "XGBoost"
}

with open('../deployment/model_artifacts_002.pkl', 'wb') as f:
    pickle.dump(artifacts, f)
