# Hyper Parameter Tuning(GridSearchCV)
Once we have selected the appropriate model for the data, how do we tune the parameter for the best model performance?

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

## Data

In [4]:
breast_cancer = pd.read_csv('../Data/breast_cancer.csv')
breast_cancer.drop(['id'], axis=1, inplace=True)
breast_cancer_dummies = pd.get_dummies(breast_cancer, columns=['diagnosis'], drop_first=True)
breast_cancer_dummies.rename(columns={'diagnosis_M': 'Malignant'}, inplace=True)
breast_cancer_dummies.head()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Malignant
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


In [5]:
inputs = breast_cancer_dummies.drop(['Malignant'],axis=1)
target = breast_cancer_dummies.Malignant

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.2)

## Model Selected: Random Forrest

In [7]:
from sklearn.ensemble import RandomForestClassifier
randomForrest = RandomForestClassifier()
randomForrest.fit(X_train, y_train)
randomForrest.score(X_test, y_test)

0.956140350877193

## Hyperparameters in Random Forrest

In [8]:
randomForrest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Important Hyperparameters
Number of Trees(n_estimators): Any = 100,
Number of features considered for splitting at each leaf node(max_features): Any = "auto",
Maximum number of levels in each decision tree(max_depth): Any = None,
Min number of data points allowed in a leaf node(min_samples_split): Any = 2,
Sampling method with or without repalcement(bootstrap): Any = True,

# Random Grid Search
We will first use random grid search(computationally mmore efficient) to randomnly find the best hyperparameters. From there we will hone into the best parameters using a sequential selection using Grid Search CV

In [17]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [21]:
rf_randomGridSearch = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=random_grid, n_iter=100, cv=3, verbose=3,
                                         n_jobs=-1)
rf_randomGridSearch.fit(X_train, y_train)
rf_randomGridSearch.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 50,
 'bootstrap': True}

In [22]:
rf_randomGridSearch.score(X_test, y_test)

0.9736842105263158

## Grid Search CV

In [24]:
from sklearn.model_selection import GridSearchCV
params = {'n_estimators': [800,900,1000,1100, 1200],
          'min_samples_split': [1,2,3],
          'min_samples_leaf': [1,2,3],
          'max_features': [2,3],
          'max_depth': [30,40,50,60,70],
          'bootstrap': [True]}
rf_gridSearch = GridSearchCV(RandomForestClassifier(), param_grid=params, cv=3, return_train_score=False, verbose=3)
rf_gridSearch.fit(X_train, y_train)
df = pd.DataFrame(rf_gridSearch.cv_results_)
df

Fitting 3 folds for each of 450 candidates, totalling 1350 fits
[CV 1/3] END bootstrap=True, max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=1, n_estimators=800;, score=nan total time=   0.2s
[CV 2/3] END bootstrap=True, max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=1, n_estimators=800;, score=nan total time=   0.2s
[CV 3/3] END bootstrap=True, max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=1, n_estimators=800;, score=nan total time=   0.2s
[CV 1/3] END bootstrap=True, max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=1, n_estimators=900;, score=nan total time=   0.2s
[CV 2/3] END bootstrap=True, max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=1, n_estimators=900;, score=nan total time=   0.2s
[CV 3/3] END bootstrap=True, max_depth=30, max_features=2, min_samples_leaf=1, min_samples_split=1, n_estimators=900;, score=nan total time=   0.2s
[CV 1/3] END bootstrap=True, max_depth=30, max_f

450 fits failed out of a total of 1350.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
450 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/muhammedmafawalla/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/muhammedmafawalla/lib/python3.7/site-packages/sklearn/ensemble/_forest.py", line 467, in fit
    for i, t in enumerate(trees)
  File "/Users/muhammedmafawalla/lib/python3.7/site-packages/joblib/parallel.py", line 1085, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/muhammedmafawalla/lib/python3.7/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.216596,0.001091,0.000000,0.000000,True,30,2,1,1,800,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",,,,,,450
1,0.241624,0.000251,0.000000,0.000000,True,30,2,1,1,900,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",,,,,,339
2,0.267052,0.000347,0.000000,0.000000,True,30,2,1,1,1000,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",,,,,,337
3,0.293965,0.000476,0.000000,0.000000,True,30,2,1,1,1100,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",,,,,,336
4,0.322271,0.002548,0.000000,0.000000,True,30,2,1,1,1200,"{'bootstrap': True, 'max_depth': 30, 'max_feat...",,,,,,335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.760257,0.006038,0.056330,0.000575,True,70,3,3,3,800,"{'bootstrap': True, 'max_depth': 70, 'max_feat...",0.973684,0.927632,0.973510,0.958275,0.021668,108
446,0.861687,0.003725,0.062850,0.000255,True,70,3,3,3,900,"{'bootstrap': True, 'max_depth': 70, 'max_feat...",0.960526,0.927632,0.973510,0.953889,0.019309,284
447,0.951820,0.010196,0.070365,0.001702,True,70,3,3,3,1000,"{'bootstrap': True, 'max_depth': 70, 'max_feat...",0.980263,0.921053,0.966887,0.956068,0.025354,211
448,1.050758,0.001452,0.076247,0.000114,True,70,3,3,3,1100,"{'bootstrap': True, 'max_depth': 70, 'max_feat...",0.980263,0.921053,0.966887,0.956068,0.025354,211


# How to select a model for a dataset

We will select between 3 models to classify observations in our breast cancer dataset:
* Logistic Regression
* Random Forrest
* Support Vector Machine

In [50]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Initialise model params dictionary
model_params = {
    'random_forrest': {
        'model': RandomForestClassifier(),
        'params': {'bootstrap': [True], 'max_depth': [60], 'max_features': [3], 'min_samples_leaf': [1], 'min_samples_split': [2], 'n_estimators': [1200]}
    },
    'svm': {
        'model': SVC(gamma='auto'),
        'params': {'kernel': ['linear', 'rbf'], 'C': [1,10,20]}
    },
    'logistic_regression': {
        'model': LogisticRegression(max_iter=10000),
        'params': {'C': [1,5,10]}
    }
}

In [51]:
scores = []

for model_name, mp in model_params.items():
    classifier = GridSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False, verbose=3)
    classifier.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best_score': classifier.best_score_,
        'best_params': classifier.best_params_
    })

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END bootstrap=True, max_depth=60, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=1200;, score=0.967 total time=   1.4s
[CV 2/3] END bootstrap=True, max_depth=60, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=1200;, score=0.954 total time=   1.3s
[CV 3/3] END bootstrap=True, max_depth=60, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=1200;, score=0.940 total time=   1.2s
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ................C=1, kernel=linear;, score=0.941 total time=   0.4s
[CV 2/3] END ................C=1, kernel=linear;, score=0.954 total time=   0.2s
[CV 3/3] END ................C=1, kernel=linear;, score=0.934 total time=   2.4s
[CV 1/3] END ...................C=1, kernel=rbf;, score=0.638 total time=   0.0s
[CV 2/3] END ...................C=1, kernel=rbf;, score=0.638 total time=   0.0s
[CV 3/3] END ..........

In [53]:
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,random_forrest,0.953817,"{'bootstrap': True, 'max_depth': 60, 'max_feat..."
1,svm,0.949445,"{'C': 10, 'kernel': 'linear'}"
2,logistic_regression,0.951638,{'C': 10}


## Scale Data

In [54]:
from sklearn.preprocessing import StandardScaler
inputs_transformed = StandardScaler().fit_transform(inputs)
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(inputs_transformed, target, test_size=0.2)

In [56]:
scores_scaled = []

for model_name, mp in model_params.items():
    classifier = GridSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False, verbose=3)
    classifier.fit(X_train_scaled, y_train_scaled)
    scores_scaled.append({
        'model': model_name,
        'best_score': classifier.best_score_,
        'best_params': classifier.best_params_
    })
df = pd.DataFrame(scores_scaled, columns=['model', 'best_score', 'best_params'])
df

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3] END bootstrap=True, max_depth=60, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=1200;, score=0.974 total time=   1.3s
[CV 2/3] END bootstrap=True, max_depth=60, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=1200;, score=0.967 total time=   1.2s
[CV 3/3] END bootstrap=True, max_depth=60, max_features=3, min_samples_leaf=1, min_samples_split=2, n_estimators=1200;, score=0.954 total time=   1.2s
Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ................C=1, kernel=linear;, score=0.961 total time=   0.0s
[CV 2/3] END ................C=1, kernel=linear;, score=0.980 total time=   0.0s
[CV 3/3] END ................C=1, kernel=linear;, score=0.980 total time=   0.0s
[CV 1/3] END ...................C=1, kernel=rbf;, score=0.967 total time=   0.0s
[CV 2/3] END ...................C=1, kernel=rbf;, score=0.980 total time=   0.0s
[CV 3/3] END ..........

Unnamed: 0,model,best_score,best_params
0,random_forrest,0.964811,"{'bootstrap': True, 'max_depth': 60, 'max_feat..."
1,svm,0.978027,"{'C': 10, 'kernel': 'rbf'}"
2,logistic_regression,0.973655,{'C': 1}
