# **Hyperparameter tuning: selecting the best model and the best hyperparameters with GridSearchCV / RandomizedSearchCV**

In [1]:
from sklearn import svm, datasets
import pandas as pd
import numpy as np

iris = datasets.load_iris()

# Build a single frame with the measurements plus the label in two forms:
# `flower` (species name) and `target` (integer class code).
# Indexing `target_names` with the whole `target` array translates every code
# to its name in one vectorized step, so no placeholder column and no
# row-by-row `apply` are needed. Column order (flower, then target) is kept.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    flower=iris.target_names[iris.target],
    target=iris.target,
)

# Positional slice picked to include rows where the species changes
# (e.g. setosa -> versicolor at row 50).
df.iloc[47:105]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower,target
47,4.6,3.2,1.4,0.2,setosa,0
48,5.3,3.7,1.5,0.2,setosa,0
49,5.0,3.3,1.4,0.2,setosa,0
50,7.0,3.2,4.7,1.4,versicolor,1
51,6.4,3.2,4.5,1.5,versicolor,1
52,6.9,3.1,4.9,1.5,versicolor,1
53,5.5,2.3,4.0,1.3,versicolor,1
54,6.5,2.8,4.6,1.5,versicolor,1
55,5.7,2.8,4.5,1.3,versicolor,1
56,6.3,3.3,4.7,1.6,versicolor,1


**Use train_test_split and manual tuning of hyperparameters**

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing.
# `random_state` pins the shuffle so the split, and every score computed from
# it, is identical after a kernel restart. Without it each run draws a
# different split and the reported accuracy cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
)

In [3]:
# Example model: a support vector classifier with hand-picked hyperparameters,
# trained on the training split (`fit` returns the estimator itself).
model = svm.SVC(C=30, gamma='auto', kernel='rbf').fit(X_train, y_train)

# Score the fitted model on the held-out split.
model.score(X_test, y_test)

0.9555555555555556

**Use K-fold cross-validation to compare different models**

In [4]:
from sklearn.model_selection import cross_val_score

In [5]:
import numpy as np

# 5-fold cross-validation of a linear-kernel SVC (C=10) on the whole dataset.
cv_1 = cross_val_score(
    estimator=svm.SVC(C=10, kernel='linear', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

# Collapse the per-fold scores into a single average.
np.mean(cv_1)

0.9733333333333334

In [6]:
# Same 5-fold evaluation, now with an RBF kernel at C=10; shows the per-fold scores.
cross_val_score(
    estimator=svm.SVC(C=10, kernel='rbf', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [7]:
# RBF kernel again, with C raised to 20, to compare per-fold scores against C=10.
cross_val_score(
    estimator=svm.SVC(C=20, kernel='rbf', gamma='auto'),
    X=iris.data,
    y=iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

**GridSearchCV**

In [8]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over every C x kernel combination (3 x 2 = 6 candidates),
# each one evaluated with 5-fold cross-validation.
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid={
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
)
clf.fit(X=iris.data, y=iris.target)

# List every attribute and method available on the fitted search object.
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_required_parameters',
 '_run_search',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'iid',
 'inverse_transform',
 'multimetric_',
 'n_jobs',
 'n_splits_',
 'param_grid',
 'pre_dispatch',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'refit',
 'refit_time_',
 'return_train_score',
 'score

In [9]:
# Parameter combination the fitted grid search selected as best.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [10]:
# Best score found by the grid search (belongs to the `best_params_` combination).
clf.best_score_

0.9800000000000001

In [11]:
# Tabulate the raw grid-search results: one row per candidate combination.
df_GSCV = pd.DataFrame(clf.cv_results_)

# Show only the parameter settings next to their mean cross-validated score.
df_GSCV.loc[:, ['params', 'mean_test_score']]

Unnamed: 0,params,mean_test_score
0,"{'C': 1, 'kernel': 'rbf'}",0.98
1,"{'C': 1, 'kernel': 'linear'}",0.98
2,"{'C': 10, 'kernel': 'rbf'}",0.98
3,"{'C': 10, 'kernel': 'linear'}",0.973333
4,"{'C': 20, 'kernel': 'rbf'}",0.966667
5,"{'C': 20, 'kernel': 'linear'}",0.966667


**RandomizedSearchCV reduces the number of iterations by evaluating only a random subset of the parameter combinations. This is useful when there are too many combinations to try and training takes a long time; it helps reduce the cost of computation.**



In [12]:
from sklearn.model_selection import RandomizedSearchCV

# Evaluate only `n_iter` randomly drawn candidates out of the 6 possible
# C x kernel combinations instead of trying them all.
# `random_state` pins which candidates are drawn, so the results table is the
# same on every run; without it the sampled combinations change per execution.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)
rs.fit(iris.data, iris.target)

# One row per sampled candidate: its parameters and mean cross-validated score.
df_RSCV = pd.DataFrame(rs.cv_results_)
df_RSCV[['params', 'mean_test_score']]

Unnamed: 0,params,mean_test_score
0,"{'kernel': 'rbf', 'C': 10}",0.98
1,"{'kernel': 'linear', 'C': 10}",0.973333


**Comparison of different models with different hyperparameters**



In [13]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Candidate model families for the comparison. Each entry pairs an estimator
# ('model') with the hyperparameter grid to search for it ('params').
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Seeded so the forest's internal randomness is the same on every fit;
        # otherwise its score in the comparison changes from run to run.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [14]:
scores = []

# Run one grid search per candidate model and record its best result.
# The per-model search is bound to `search`, not `clf`, so the fitted `clf`
# from the GridSearchCV section is not overwritten: re-running the earlier
# `clf.best_params_` / `clf.cv_results_` cells after this one still reports the
# SVM search rather than whichever model happened to be fitted last.
for model_name, mp in model_params.items():
    search = GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    search.fit(iris.data, iris.target)
    scores.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best_params': search.best_params_
    })

# One row per model family: its best cross-validated score and the parameters
# that achieved it.
# NOTE(review): `df` is rebound here and no longer refers to the iris frame
# built in the first cell; the name is kept so any later cell that reads `df`
# keeps working.
df = pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.953333,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}
