# Perform grid search on model hyperparameters

All Rights Reserved © <a href="http://www.louisdorard.com" style="color: #6D00FF;">Louis Dorard</a>

<img src="http://s3.louisdorard.com.s3.amazonaws.com/ML_icon.png">

## Load data

In [48]:
import pandas as pd

data = pd.read_csv("/data/kaggle-give-me-credit-nomissing.csv", index_col=0)
target_column = 'SeriousDlqin2yrs'
features = data.drop(target_column, axis=1)
outputs = data[target_column]
X = features.values.astype(float)
y = outputs.values

## Grid search from scratch

Let's implement a procedure to tune 1 hyperparameter — here, `max_features` in Random Forest.

In [15]:
FOLDS = 10
SEED = 8
verbose = 1

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from numpy import arange
params = arange(0.1, 1.0, 0.5)
means = []
stdevs = []
#for i in params:
s = cross_val_score(RandomForestClassifier(random_state=SEED, n_estimators=10, max_features=0.1), X, y, scoring="r2", cv=FOLDS, verbose=verbose)
m = s.mean()
st = s.std()
means.append(m)
stdevs.append(st)
print("Param " + str(0.1) + ": " + str(m) + " +/- " + str(st))

Param 0.1: -0.06837322564777848 +/- 0.02148674992362238


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   10.2s finished


## Grid search with scikit's `GridSearchCV`

### Example with 1 hyperparameter

Define a grid search task:

In [33]:
from sklearn.model_selection import GridSearchCV

params = arange(0.1, 1.0, 0.2)
print(params)

grid = {"max_depth": [3, 9, None],
        "max_features": [0.5, 0.75]}

grid_search = GridSearchCV(RandomForestClassifier(n_estimators=10, random_state=SEED),
                           grid,
                           scoring="roc_auc",
                           cv=5, # cv is the number of folds; smaller values will make the evaluation quicker; recommended values are between 5 and 10
                           n_jobs=-1,
                           verbose=verbose)

[0.1 0.3 0.5 0.7 0.9]


Run the search:

In [34]:
grid_search.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   46.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=8, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 9, None], 'max_features': [0.5, 0.75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [43]:
print(grid_search.best_params_)

{'max_depth': 9, 'max_features': 0.5}


Create model from `X` and `y` using the best hyperparameters found during this search:

In [44]:
model = RandomForestClassifier(**grid_search.best_params_)
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features=0.5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:
idx = 25
print(X[idx])
print(y[idx])
prediction = model.predict([X[idx]])
print(prediction)
print(prediction == y[idx])

[4.52515828e-01 2.40000000e+01 0.00000000e+00 1.17612470e-02
 3.40000000e+03 1.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00]
0
[0]
[ True]


In [18]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
k_fold_grid_search = GridSearchCV(RandomForestRegressor(n_estimators=10, random_state=SEED),
                           grid,
                           scoring="r2",
                           cv=kfold,
                           n_jobs=-1)
k_fold_grid_search.fit(X, y)

GridSearchCV(cv=KFold(n_splits=10, random_state=8, shuffle=True),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=8, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_depth': [3, 9, None], 'max_features': [0.5, 0.75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=0)