In [1]:
from GridSearchHelper import EstimatorSelectionHelper
import numpy as np
import pandas as pd
from collections import defaultdict
# import matplotlib
# import matplotlib.pyplot as plt
# import scipy.stats as st
# from scipy.stats import norm
# import seaborn as sns
import datetime
# import pandas_profiling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, power_transform
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, median_absolute_error
# from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import KFold, StratifiedKFold

# Comparing models using GridSearchHelper

We will perform the same exercise as we did manually but this time using GridSearchHelper().

We still need to do loading and (if any) data cleaning and feature engineering manually

In [2]:
from sklearn.datasets import load_diabetes

X, y = load_diabetes().data, load_diabetes().target

X.shape # (442, 10)
y.shape # (442,)

(442,)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 18)

We instantiate the EstimatorSelectionHelper() class and pass the list of models and parameters to optimize over

In [4]:
models = {'regression': LinearRegression(),\
             'ridge': Ridge(),\
             'Stochastic_Descent': SGDRegressor(),\
             'Decision_Tree': DecisionTreeRegressor(),\
             'Random_Forest': RandomForestRegressor()}

params = {'regression': {},\
             'ridge': [{'alpha': [0.01, 0.05, 0.1, 0.3, 0.6]}],\
             'Stochastic_Descent': [{'penalty': ['l1', 'l2', 'elasticnet'], 'alpha': [0.00005, 0.0001, 0.0003, 0.0006, 0.001]}],\
             'Decision_Tree': [{'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3], 'max_features': ['auto', 'sqrt', 'log2']}],\
             'Random_Forest': [{'n_estimators': [100, 150, 200, 250], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3], 'max_features': ['auto', 'sqrt', 'log2']}]}

In [5]:
helper = EstimatorSelectionHelper(models, params)

You can access the stored models and parameters anytime:

In [6]:
helper.models

{'regression': LinearRegression(),
 'ridge': Ridge(),
 'Stochastic_Descent': SGDRegressor(),
 'Decision_Tree': DecisionTreeRegressor(),
 'Random_Forest': RandomForestRegressor()}

In [7]:
helper.params

{'regression': {},
 'ridge': [{'alpha': [0.01, 0.05, 0.1, 0.3, 0.6]}],
 'Stochastic_Descent': [{'penalty': ['l1', 'l2', 'elasticnet'],
   'alpha': [5e-05, 0.0001, 0.0003, 0.0006, 0.001]}],
 'Decision_Tree': [{'min_samples_split': [2, 3, 4, 5],
   'min_samples_leaf': [1, 2, 3],
   'max_features': ['auto', 'sqrt', 'log2']}],
 'Random_Forest': [{'n_estimators': [100, 150, 200, 250],
   'min_samples_split': [2, 3, 4, 5],
   'min_samples_leaf': [1, 2, 3],
   'max_features': ['auto', 'sqrt', 'log2']}]}

After that, just call a .fit method. Refer to the manual to see what parameters you can pass.

In [8]:
helper.fit(X_train, y_train, scoring = 'r2', n_jobs = -1)

Running GridSearchCV for regression
Running GridSearchCV for ridge
Running GridSearchCV for Stochastic_Descent




Running GridSearchCV for Decision_Tree
Running GridSearchCV for Random_Forest
Completed


defaultdict(list,
            {'regression': [{}],
             'ridge': [{'alpha': 0.01}],
             'Stochastic_Descent': [{'alpha': 5e-05, 'penalty': 'l1'}],
             'Decision_Tree': [{'max_features': 'log2',
               'min_samples_leaf': 3,
               'min_samples_split': 3}],
             'Random_Forest': [{'max_features': 'sqrt',
               'min_samples_leaf': 3,
               'min_samples_split': 3,
               'n_estimators': 250}]})

This method returns a defaultdict with the best parameters for every model. You can either:
* copy-paste them
* access later by calling .best_params
* directly envoke fit_with_best() instead

In [11]:
helper.fit_with_best(X_train, y_train, X_test, y_test)

Fitting model regression with its best parameters
Fitting model ridge with its best parameters
Fitting model Stochastic_Descent with its best parameters
Fitting model Decision_Tree with its best parameters
Fitting model Random_Forest with its best parameters




neg_root_mean_squared_error of the model regression with the best parameters is 3006.07
neg_root_mean_squared_error of the model ridge with the best parameters is 3019.46
neg_root_mean_squared_error of the model Stochastic_Descent with the best parameters is 3959.83
neg_root_mean_squared_error of the model Decision_Tree with the best parameters is 7597.33
neg_root_mean_squared_error of the model Random_Forest with the best parameters is 3771.44


{'regression': 3006.0733911079724,
 'ridge': 3019.4609723576123,
 'Stochastic_Descent': 3959.8291344139766,
 'Decision_Tree': 7597.3254129129145,
 'Random_Forest': 3771.4427324489006}

In [13]:
helper.d_errors()

AttributeError: 'EstimatorSelectionHelper' object has no attribute 'd_errors'