<a href="https://colab.research.google.com/github/mvince33/Coding-Dojo/blob/main/week07/GridSearchCV_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GridSearchCV

In this notebook we will:
1. Load the Iris dataset and prepare it for modeling.
2. Demonstrate hyperparameter tuning using nested for loops.
3. Introduce GridSearchCV as a simpler way to explore various values for multiple hyperparameters.
4. Use a pipeline in place of a base estimator in GridSearchCV to prevent data leakage between cross-validation folds.

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV

# Load Data

We will be using the Iris dataset for this demonstration.

In [2]:
# Fetch the bundled Iris dataset and pull out the feature matrix and labels.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out a test split; the fixed seed keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Hyperparameter Tuning With For Loops

In [3]:
# Candidate values to try for each decision-tree hyperparameter.
param_grid = dict(
    max_depth=[1, 2, 3],
    min_samples_leaf=[3, 15, 20],
    min_samples_split=[2, 10, 100],
)

# Show the grid as a table: one row per hyperparameter, one column per value.
pd.DataFrame(param_grid).transpose()

Unnamed: 0,0,1,2
max_depth,1,2,3
min_samples_leaf,3,15,20
min_samples_split,2,10,100


In [4]:
#accuracy (in percent) of every hyperparameter combination, keyed by a
#readable description of the settings
scores = {}

#loop through all combinations of values for all hyperparameters
for d in param_grid['max_depth']:
  for l in param_grid['min_samples_leaf']:
    for s in param_grid['min_samples_split']:
      #fit a model for each combination of hyperparameter values.
      #random_state pins the tie-breaking between equally good splits so the
      #scores are the same on every run.
      model = DecisionTreeClassifier(max_depth=d,
                                     min_samples_leaf=l,
                                     min_samples_split=s,
                                     random_state=42)
      model.fit(X_train, y_train)
      #NOTE(review): scoring on the test split keeps this demo short, but
      #choosing hyperparameters from these numbers would leak test data into
      #model selection.
      score = model.score(X_test, y_test)
      #add the model accuracy to a dictionary with the parameter settings as the
      #keys and the accuracies as the values.
      #builtin round() works whether score() returns a Python float or a NumPy
      #scalar; the .round() method only exists on the latter.
      scores[f'depth {d}, min_samples_leaf {l}, min_samples_split {s} accuracy'] = round(score * 100, 4)

#display dictionary of scores
scores

{'depth 1, min_samples_leaf 15, min_samples_split 10 accuracy': 68.4211,
 'depth 1, min_samples_leaf 15, min_samples_split 100 accuracy': 68.4211,
 'depth 1, min_samples_leaf 15, min_samples_split 2 accuracy': 68.4211,
 'depth 1, min_samples_leaf 20, min_samples_split 10 accuracy': 68.4211,
 'depth 1, min_samples_leaf 20, min_samples_split 100 accuracy': 68.4211,
 'depth 1, min_samples_leaf 20, min_samples_split 2 accuracy': 68.4211,
 'depth 1, min_samples_leaf 3, min_samples_split 10 accuracy': 68.4211,
 'depth 1, min_samples_leaf 3, min_samples_split 100 accuracy': 68.4211,
 'depth 1, min_samples_leaf 3, min_samples_split 2 accuracy': 68.4211,
 'depth 2, min_samples_leaf 15, min_samples_split 10 accuracy': 97.3684,
 'depth 2, min_samples_leaf 15, min_samples_split 100 accuracy': 68.4211,
 'depth 2, min_samples_leaf 15, min_samples_split 2 accuracy': 97.3684,
 'depth 2, min_samples_leaf 20, min_samples_split 10 accuracy': 97.3684,
 'depth 2, min_samples_leaf 20, min_samples_split 100 

# Using GridSearchCV

In [5]:
# Base estimator handed to GridSearchCV. The fixed random_state makes the
# tie-breaking between equally good splits repeatable, so the search selects
# the same hyperparameters on every run.
model = DecisionTreeClassifier(random_state=42)

# Same search space as the nested-loop version above.
param_grid = {'max_depth': [1,2,3],
              'min_samples_leaf': [3,15,20],
              'min_samples_split': [2, 10, 100]}

In [6]:
# Pair the base estimator with its search space; nothing is fitted yet.
dt_grid_search = GridSearchCV(estimator=model, param_grid=param_grid)


In [7]:
# Run the search on the training split only; as the cell's last expression,
# the object returned by fit() is displayed.
dt_grid_search.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3],
                         'min_samples_leaf': [3, 15, 20],
                         'min_samples_split': [2, 10, 100]})

In [8]:
# Hyperparameter combination selected by the fitted search.
dt_grid_search.best_params_

{'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}

## Adjusting the Ranges of Hyperparameter Values

In [None]:
# Second search space with different candidate values for the same three
# hyperparameters.
param_grid2 = {
    'max_depth': [3, 5, 10],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 4, 7],
}

# Search the new grid with the same base estimator and show the winner.
dt_grid_search2 = GridSearchCV(estimator=model, param_grid=param_grid2)
dt_grid_search2.fit(X_train, y_train)
dt_grid_search2.best_params_

{'max_depth': 10, 'min_samples_leaf': 3, 'min_samples_split': 2}

In [None]:
#retrieve the best version of the model.
#dt_grid_search2 was built with the default refit=True, so GridSearchCV has
#already retrained this estimator on all of X_train using the winning
#hyperparameters; fitting it again is unnecessary.
best_model = dt_grid_search2.best_estimator_

#score the model on the test set
best_model.score(X_test, y_test)

1.0

# GridSearchCV With a Pipeline

In [9]:
# Chain scaling and KNN into one estimator.
knn_pipe = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(),
)

# List every tunable parameter; nested ones are addressed as '<step>__<param>'.
knn_pipe.get_params(deep=True)

{'kneighborsclassifier': KNeighborsClassifier(),
 'kneighborsclassifier__algorithm': 'auto',
 'kneighborsclassifier__leaf_size': 30,
 'kneighborsclassifier__metric': 'minkowski',
 'kneighborsclassifier__metric_params': None,
 'kneighborsclassifier__n_jobs': None,
 'kneighborsclassifier__n_neighbors': 5,
 'kneighborsclassifier__p': 2,
 'kneighborsclassifier__weights': 'uniform',
 'memory': None,
 'standardscaler': StandardScaler(),
 'standardscaler__copy': True,
 'standardscaler__with_mean': True,
 'standardscaler__with_std': True,
 'steps': [('standardscaler', StandardScaler()),
  ('kneighborsclassifier', KNeighborsClassifier())],
 'verbose': False}

In [10]:
# Search space for the KNN step of the pipeline. Each key uses the
# '<step>__<param>' form shown by knn_pipe.get_params().
pipe_param_grid = {
    'kneighborsclassifier__n_neighbors': range(1, 10),
    'kneighborsclassifier__p': range(1, 5),
    'kneighborsclassifier__weights': ['distance', 'uniform'],
}

In [11]:
# Grid-search the whole pipeline rather than the bare classifier.
knn_pipe_gs = GridSearchCV(estimator=knn_pipe, param_grid=pipe_param_grid)
knn_pipe_gs.fit(X_train, y_train)

# Take the winning pipeline and evaluate it once on the held-out test split.
best_pipe = knn_pipe_gs.best_estimator_
knn_test_accuracy = best_pipe.score(X_test, y_test)

print('Best KNN Parameters:')
print(knn_pipe_gs.best_params_)
print(f'Accuracy of best KNN model is: {knn_test_accuracy}')

Best KNN Parameters:
{'kneighborsclassifier__n_neighbors': 9, 'kneighborsclassifier__p': 2, 'kneighborsclassifier__weights': 'distance'}
Accuracy of best KNN model is: 1.0
