# Hyperparameter Tuning
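This notebook performs hyperparameter tuning using `GridSearchCV` on the best-performing models from the previous notebook:
 - Ridge Regression (`alpha` - regularization strength)
 - Gradient Boosting Regressor (`learning_rate` - how much each tree adjusts predictions; `n_estimators` - number of boosting rounds (trees); `max_depth` - depth of each tree)
 - Random Forest (`n_estimators` - number of trees in the forest; `max_depth` - depth of each tree; `max_features` - number of features considered at each split; `min_samples_split` - minimum samples needed to split a node; `min_samples_leaf` - minimum samples required at a leaf)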

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor

# Read the processed training set, then separate the target column from the features.
df = pd.read_csv('../data/processed_train.csv')

y = df['SalePrice']
X = df.drop(columns='SalePrice')


## RMSE Scorer

In [2]:
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# Scorer form of `rmse` for use in GridSearchCV. It is created with
# greater_is_better=False, so the scores it reports are negated RMSE values
# (the tuning cells negate `best_score_` again before printing).
rmse_scorer = make_scorer(rmse, greater_is_better=False)


## Ridge Regression Tuning

In [3]:
# Candidate regularization strengths for Ridge.
ridge_params = dict(alpha=[0.1, 1, 10, 50, 100, 200])
ridge = Ridge()

# 5-fold grid search scored with the custom RMSE scorer.
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring=rmse_scorer,
    cv=5,
)
ridge_grid.fit(X, y)

print("Best Ridge Params:", ridge_grid.best_params_)
# best_score_ is the negated RMSE; flip the sign for reporting.
print("Best Ridge RMSE:", -ridge_grid.best_score_)


Best Ridge Params: {'alpha': 10}
Best Ridge RMSE: 0.11366688333139538


## Gradient Boosting Tuning

In [4]:
# Search space: 2 x 3 x 3 = 18 candidate configurations.
gb_params = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5]
}

# Seed the estimator so the search gives the same result after a kernel
# restart (the Random Forest search in this notebook is seeded the same way).
gbr = GradientBoostingRegressor(random_state=42)

# 5-fold grid search with the custom RMSE scorer; n_jobs=-1 runs the
# candidate/fold fits in parallel on all available cores.
gbr_grid = GridSearchCV(gbr, gb_params, cv=5, scoring=rmse_scorer, n_jobs=-1)
gbr_grid.fit(X, y)

print("Best GBR Params:", gbr_grid.best_params_)
# rmse_scorer is built with greater_is_better=False, so best_score_ holds the
# negated RMSE; flip the sign for reporting.
print("Best GBR RMSE:", -gbr_grid.best_score_)


Best GBR Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Best GBR RMSE: 0.1212803752415049


## Random Forest Hyperparameter Tuning

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Search space for the forest: 2 x 3 x 2 x 2 x 2 = 48 candidate configurations.
rf_params = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Seeded estimator; 5-fold grid search with the custom RMSE scorer,
# running fits in parallel (n_jobs=-1).
rf = RandomForestRegressor(random_state=42)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    scoring=rmse_scorer,
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X, y)

print("Best RF Params:", rf_grid.best_params_)
# best_score_ is the negated RMSE; flip the sign for reporting.
print("Best RF RMSE:", -rf_grid.best_score_)


Best RF Params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best RF RMSE: 0.14244856496705519


## Compare All Final Tuned Results

In [9]:
# Each grid's best_score_ is a negated RMSE; flip the sign once for all three searches.
ridge_best_rmse, gbr_best_rmse, rf_best_rmse = (
    -search.best_score_ for search in (ridge_grid, gbr_grid, rf_grid)
)

# One line per model, rounded to four decimals.
print(
    f"Ridge Best RMSE: {ridge_best_rmse:.4f}",
    f"Gradient Boosting Best RMSE: {gbr_best_rmse:.4f}",
    f"Random Forest Best RMSE: {rf_best_rmse:.4f}",
    sep="\n",
)


Ridge Best RMSE: 0.1137
Gradient Boosting Best RMSE: 0.1213
Random Forest Best RMSE: 0.1424


## Summary
 - All three models were tuned using GridSearchCV
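 - Best cross-validated RMSEs (5-fold, lower is better), from the recorded run above:
   - Ridge: 0.1137 (`alpha=10`)
   - Gradient Boosting: 0.1213 (`learning_rate=0.1`, `max_depth=3`, `n_estimators=200`)
   - Random Forest: 0.1424 (`max_depth=None`, `max_features='sqrt'`, `min_samples_leaf=1`, `min_samples_split=2`, `n_estimators=200`)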
 - What's Next: Using