Step 4: Chossing a Regression Model

In [1]:
import pandas as pd
import smogn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.svm import SVR
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv('/home/lola/machine-learning-project/famd_data.csv')

try:
    data_smogn = smogn.smoter(data=data, y='MSRP')
except ValueError:

    print("Oops! Synthetic data contains missing values.")
   
    data = data.dropna()
    data_smogn = smogn.smoter(data=data, y='MSRP')
   

feature_columns = ['0', '1']
target_variable = 'MSRP'

X = data_smogn[feature_columns]
y = data_smogn[target_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

dist_matrix: 100%|##########| 1455/1455 [01:41<00:00, 14.36it/s]
synth_matrix: 100%|##########| 1455/1455 [00:01<00:00, 1116.58it/s]
r_index: 100%|##########| 127/127 [00:00<00:00, 1797.99it/s]


In [4]:
# define models to train
models = {
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosted Trees": GradientBoostingRegressor(),
    "Linear Regression": LinearRegression(),
    "Polynomial Regression": make_pipeline(PolynomialFeatures(), LinearRegression()),
    "Bayesian Regression": BayesianRidge(),
    "Support Vector Regression": SVR()
}

# define parameter grid for each model
param_grids = {
    "Random Forest": {'n_estimators': [50, 100, 200, 300, 400],
                      'max_depth': [None, 10, 20, 30],
                      'min_samples_split': [2, 5, 10]},
    "Gradient Boosted Trees": {'n_estimators': [100, 200, 300],
                               'learning_rate': [0.05, 0.1, 0.2],
                               'max_depth': [3, 5, 7]},
    "Linear Regression": {},
    "Polynomial Regression": {'polynomialfeatures__degree': [2, 3, 4, 5]},
    "Bayesian Regression": {'alpha_1': [1e-6, 1e-5, 1e-4],
                            'alpha_2': [1e-6, 1e-5, 1e-4],
                            'lambda_1': [1e-6, 1e-5, 1e-4],
                            'lambda_2': [1e-6, 1e-5, 1e-4]},
    "Support Vector Regression": {'C': [0.1, 1, 10, 100, 1000],
                                  'gamma': ['scale', 'auto']}
}

# compare best models of all regression models 
best_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    grid_search = GridSearchCV(model, param_grid=param_grids[name], scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best MSE on cross-validation set: {-grid_search.best_score_}")
    print()

print("Evaluating models on test data:")
for name, model in best_models.items():
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{name}: MSE = {mse}")

best_model_name = min(best_models, key=lambda x: mean_squared_error(y_test, best_models[x].predict(X_test)))
best_model = best_models[best_model_name]
print(f"\nBest model: {best_model_name}")

Training Random Forest...
Best parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 300}
Best MSE on cross-validation set: 3767135018.761902

Training Gradient Boosted Trees...
Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100}
Best MSE on cross-validation set: 3862117642.7219605

Training Linear Regression...
Best parameters: {}
Best MSE on cross-validation set: 9621853421.331833

Training Polynomial Regression...
Best parameters: {'polynomialfeatures__degree': 5}
Best MSE on cross-validation set: 4789420049.303801

Training Bayesian Regression...
Best parameters: {'alpha_1': 1e-06, 'alpha_2': 1e-06, 'lambda_1': 0.0001, 'lambda_2': 1e-06}
Best MSE on cross-validation set: 9621821448.69897

Training Support Vector Regression...
Best parameters: {'C': 1000, 'gamma': 'scale'}
Best MSE on cross-validation set: 8437766651.425189

Evaluating models on test data:
Random Forest: MSE = 5866334866.671805
Gradient Boosted Trees: MSE = 6011181059.46182