In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import learning_curve
from sklearn.inspection import PartialDependenceDisplay

import matplotlib.pyplot as plt
import seaborn as sns

# Gradient Boosting Regressors
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# SHAP for model interpretation
import shap


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Read the four CSV files (paths are relative to the working directory) into
# DataFrames, in the order features-train, features-valid, target-train,
# target-valid.
X_train, X_valid, y_train, y_valid = map(
    pd.read_csv,
    ('X_train.csv', 'X_valid.csv', 'y_train.csv', 'y_valid.csv'),
)

## Basic Linear Regression as a Baseline
We'll start by training a simple Linear Regression model as our baseline.

In [7]:
# Baseline estimator: LinearRegression with its default settings.
lin_reg = LinearRegression()

# Metrics collected on every fold. The two error metrics use scikit-learn's
# negated ("neg_*") scorers, so their sign is flipped back when summarising.
scoring = {
    'MAE': 'neg_mean_absolute_error',
    'RMSE': 'neg_root_mean_squared_error',
    'R2': 'r2'
}

# 5-fold cross-validation of the baseline on X_train / y_train.
cv_results_lin = cross_validate(
    estimator=lin_reg,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
)

# Average each metric over the folds; errors are reported as positive values.
mean_mae = -np.mean(cv_results_lin['test_MAE'])
mean_rmse = -np.mean(cv_results_lin['test_RMSE'])
mean_r2 = np.mean(cv_results_lin['test_R2'])

# Emit the summary as one print call, one metric per line.
print(
    'Linear Regression Cross-Validation Results:',
    f"Mean MAE: {mean_mae:.2f}",
    f"Mean RMSE: {mean_rmse:.2f}",
    f"Mean R²: {mean_r2:.2f}",
    sep='\n',
)


Linear Regression Cross-Validation Results:
Mean MAE: 953171.88
Mean RMSE: 8584109.59
Mean R²: -3913674512731946.50


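Wow, those results are really bad: an R² this far below zero means the linear model does far worse than simply predicting the mean, which may point to a numerical problem (for example, perfectly collinear features) rather than just a weak model. Let's try some more advanced models!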

## Advanced Models
Now let's make it interesting! We will examine the performance of several models on our dataset.

In [6]:
# Random Forest with default hyperparameters; random_state is fixed so that
# repeated runs are reproducible.
rf_reg = RandomForestRegressor(random_state=42)

# y_train was loaded with pd.read_csv and is therefore a DataFrame. When it
# holds a single column, the forest's fit() receives a column-vector and emits
# a DataConversionWarning in every fold. squeeze('columns') hands the estimator
# a 1-D Series in that case and leaves a multi-column target untouched, so the
# scores themselves do not change.
# n_jobs=-1 lets cross_validate use all available processors for the folds.
cv_results_rf = cross_validate(
    rf_reg,
    X_train,
    y_train.squeeze('columns'),
    cv=5,
    scoring=scoring,
    n_jobs=-1,
)

# Average each metric over the folds. The MAE/RMSE scorers are the negated
# ("neg_*") variants, so the sign is flipped back to report positive errors.
mean_mae_rf = -cv_results_rf['test_MAE'].mean()
mean_rmse_rf = -cv_results_rf['test_RMSE'].mean()
mean_r2_rf = cv_results_rf['test_R2'].mean()

print('Random Forest Cross-Validation Results:')
print(f"Mean MAE: {mean_mae_rf:.2f}")
print(f"Mean RMSE: {mean_rmse_rf:.2f}")
print(f"Mean R²: {mean_r2_rf:.2f}")


Random Forest Cross-Validation Results:
Mean MAE: 0.64
Mean RMSE: 0.83
Mean R²: 0.61


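This is far, far better, which is great! It means we aren't adding complexity for nothing. Let's see if XGBoost is even better. (When comparing models, make sure every score comes from the same kernel session and the same version of the data — e.g. after Restart & Run All — otherwise the metrics are not on a comparable scale.)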

In [8]:
# XGBoost regressor: library output silenced (verbosity=0) and a fixed seed.
xgb_reg = XGBRegressor(verbosity=0, random_state=42)

# 5-fold cross-validation on X_train / y_train with the shared `scoring`
# dict; n_jobs=-1 lets cross_validate use all available processors.
cv_results_xgb = cross_validate(
    estimator=xgb_reg,
    X=X_train,
    y=y_train,
    scoring=scoring,
    cv=5,
    n_jobs=-1,
)

# Fold averages. The error scorers are negated by scikit-learn, hence the
# leading minus to report MAE/RMSE as positive numbers.
mean_mae_xgb = -np.mean(cv_results_xgb['test_MAE'])
mean_rmse_xgb = -np.mean(cv_results_xgb['test_RMSE'])
mean_r2_xgb = np.mean(cv_results_xgb['test_R2'])

# Emit the summary as one print call, one metric per line.
print(
    'XGBoost Cross-Validation Results:',
    f"Mean MAE: {mean_mae_xgb:.2f}",
    f"Mean RMSE: {mean_rmse_xgb:.2f}",
    f"Mean R²: {mean_r2_xgb:.2f}",
    sep='\n',
)


XGBoost Cross-Validation Results:
Mean MAE: 2.32
Mean RMSE: 3.09
Mean R²: 0.93


In [None]:
# Comparison table: one record per model, holding the cross-validated mean
# MAE, RMSE and R² computed in the cells above.
results = pd.DataFrame([
    {'Model': 'Linear Regression', 'MAE': mean_mae, 'RMSE': mean_rmse, 'R²': mean_r2},
    {'Model': 'Random Forest', 'MAE': mean_mae_rf, 'RMSE': mean_rmse_rf, 'R²': mean_r2_rf},
    {'Model': 'XGBoost', 'MAE': mean_mae_xgb, 'RMSE': mean_rmse_xgb, 'R²': mean_r2_xgb},
])

# Bare last expression so the notebook renders the table.
results


Unnamed: 0,Model,MAE,RMSE,R²
0,Linear Regression,953171.877966,8584110.0,-3913675000000000.0
1,Random Forest,0.6377,0.8298656,0.6133422
2,XGBoost,2.323017,3.085638,0.9307691
