In [1]:
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import warnings
import statsmodels.api as sm
import math
import seaborn as sns
import time 
import random
from scipy.interpolate import LSQUnivariateSpline as Spline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, KFold
import pickle
import xgboost as xgb

In [2]:
# Load data
train = pd.read_csv("../data/train_data.csv")
test = pd.read_csv("../data/test_data.csv")
train.head()

Unnamed: 0,Bids,Views,Make,Mileage,Exterior Color,Interior Color,Gears,Engine_Displacement_L,2_keys_ind,is_dry_climate_car,...,auction_month_3,auction_month_4,auction_month_5,auction_month_6,auction_month_7,auction_month_8,auction_month_9,auction_month_10,auction_month_11,auction_month_12
0,37.0,13056.0,91,24100.0,13,1,5.0,2.0,0,0,...,False,False,False,False,False,False,True,False,False,False
1,52.0,15252.0,8,121000.0,11,1,6.0,2.7,1,1,...,False,True,False,False,False,False,False,False,False,False
2,30.0,13251.430852,86,35000.0,11,5,5.0,5.4,0,0,...,False,False,False,False,False,False,False,True,False,False
3,17.0,13907.0,52,35800.0,1,1,4.0,2.4,1,1,...,False,True,False,False,False,False,False,False,False,False
4,30.0,10075.0,16,96800.0,2,13,5.0,2.7,0,0,...,False,False,False,False,True,False,False,False,False,False


In [3]:
# Data Splitting
pd.set_option('display.max_columns', None)

X_train = train.drop(columns='Sold_Price')
X_test = test.drop(columns='Sold_Price')

# Keep original dollars for final evaluation
y_train_dollars = train['Sold_Price']
y_test_dollars = test['Sold_Price']

# Transform target to log space for stable training
y_train_log = np.log1p(y_train_dollars)
y_test_log = np.log1p(y_test_dollars)

del train
del test

X_train.head()

In [4]:
# 2. Set up Time-Series Cross-Validation
# This prevents data leakage by ensuring we only train on past auctions to predict future ones
tscv = TimeSeriesSplit(n_splits=5)

# Because we manually logged the target, we switch to standard MSE
scoring_metric = 'neg_mean_squared_error'

In [5]:
# ==========================================
# 3. Model 1: Random Forest Grid Search
# ==========================================
print("--- Starting Random Forest Grid Search ---")
rf = RandomForestRegressor(random_state=42)

# Define the hyperparameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [100, 250, 400],         # Number of trees
    'max_depth': [10, 20, 50],        # Depth of the trees (None = fully expanded)
    'min_samples_split': [2, 5, 10]     # Minimum samples required to split an internal node
}

rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=tscv,
    scoring=scoring_metric,
    n_jobs=2,
    verbose=1
)

rf_grid.fit(X_train, y_train_log)

# Take the negative to make it positive MSLE, then take the square root for RMSLE
rf_best_rmsle = np.sqrt(-rf_grid.best_score_)
print(f"Best RF Parameters: {rf_grid.best_params_}")
print(f"Best RF CV RMSLE: {rf_best_rmsle:.4f}\n")

--- Starting Random Forest Grid Search ---
Fitting 5 folds for each of 27 candidates, totalling 135 fits




Best RF Parameters: {'max_depth': 50, 'min_samples_split': 2, 'n_estimators': 400}
Best RF CV RMSLE: 0.3149



In [6]:
# ==========================================
# 4. Model 2: XGBoost Grid Search
# ==========================================
print("--- Starting XGBoost Grid Search ---")
xgbr = xgb.XGBRegressor(objective='reg:squaredlogerror', random_state=42)

# Define the hyperparameter grid for XGBoost
xgb_param_grid = {
    'n_estimators': [100, 250],
    'learning_rate': [0.01, 0.1],
    'max_depth': [5, 10],
    'subsample': [0.4, 0.8],                 # Kept fixed to prevent overfitting
    'colsample_bytree': [0.4, 0.8]           # Kept fixed to prevent overfitting
}

xgb_grid = GridSearchCV(
    estimator=xgbr,
    param_grid=xgb_param_grid,
    cv=tscv,
    scoring=scoring_metric,
    n_jobs=2,
    verbose=1
)

xgb_grid.fit(X_train, y_train_log)

# Take the negative to make it positive MSLE, then take the square root for RMSLE
xgb_best_rmsle = np.sqrt(-xgb_grid.best_score_)
print(f"Best XGB Parameters: {xgb_grid.best_params_}")
print(f"Best XGB CV RMSLE: {xgb_best_rmsle:.4f}\n")

--- Starting XGBoost Grid Search ---
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best XGB Parameters: {'colsample_bytree': 0.4, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Best XGB CV RMSLE: 4.2817



In [7]:
# ==========================================
# 5. Model Selection & Final Evaluation
# ==========================================
print("--- Model Selection ---")
if rf_best_rmsle < xgb_best_rmsle:
    print(f"Winning Model: Random Forest (Beats XGBoost by {xgb_best_rmsle - rf_best_rmsle:.4f} RMSLE)")
    best_model = rf_grid.best_estimator_
else:
    print(f"Winning Model: XGBoost (Beats Random Forest by {rf_best_rmsle - xgb_best_rmsle:.4f} RMSLE)")
    best_model = xgb_grid.best_estimator_

print("\nEvaluating the Winning Model on the Hold-Out Test Set...")

# Predict in log space
y_pred_log = best_model.predict(X_test)

# Inverse transform predictions back to dollars
y_pred_dollars = np.expm1(y_pred_log)

mae = mean_absolute_error(y_test_dollars, y_pred_dollars)
rmse = np.sqrt(mean_squared_error(y_test_dollars, y_pred_dollars))
mape = mean_absolute_percentage_error(y_test_dollars, y_pred_dollars)

print(f"Test Set MAE: ${mae:,.2f}")
print(f"Test Set RMSE: ${rmse:,.2f}")
print(f"Test Set Overall MAPE: {mape * 100:.2f}%\n")

# --- SEGMENTED MAPE EVALUATION ---
print("--- Segmented Evaluation ---")
under_75k_mask = y_test_dollars < 75000
over_75k_mask = y_test_dollars >= 75000

if under_75k_mask.sum() > 0:
    mape_under = mean_absolute_percentage_error(y_test_dollars[under_75k_mask], y_pred_dollars[under_75k_mask])
    print(f"MAPE (Under $75k): {mape_under * 100:.2f}% (Count: {under_75k_mask.sum()})")

if over_75k_mask.sum() > 0:
    mape_over = mean_absolute_percentage_error(y_test_dollars[over_75k_mask], y_pred_dollars[over_75k_mask])
    print(f"MAPE ($75k and Over): {mape_over * 100:.2f}% (Count: {over_75k_mask.sum()})")

--- Model Selection ---
Winning Model: Random Forest (Beats XGBoost by 3.9669 RMSLE)

Evaluating the Winning Model on the Hold-Out Test Set...
Test Set MAE: $6,277.65
Test Set RMSE: $15,651.75
Test Set Overall MAPE: 28.40%

--- Segmented Evaluation ---
MAPE (Under $75k): 29.03% (Count: 5729)
MAPE ($75k and Over): 19.94% (Count: 426)


In [None]:
# Saving model and columns
artifacts = {
    "model": best_model,
    "training_columns": X_train.columns.tolist(),
    "model_type": "Random Forest" if best_model == rf_grid.best_estimator_ else "XGBoost"
}

with open('../deployment/model_artifacts_002.pkl', 'wb') as f:
    pickle.dump(artifacts, f)
