<a href="https://colab.research.google.com/github/ngzhiwei517/Machine_Learning_Self-Learn/blob/main/Grid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Finding best model and hyper parameter tunning using GridSearchCV

In [1]:

from sklearn import svm, datasets
iris = datasets.load_iris()

In [3]:
import pandas as pd
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [5]:
df['flower']=iris.target
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [6]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [7]:
df['flower']=df['flower'].apply(lambda x:iris.target_names[x])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Approach 1: Use train_test_split and manually tune parameters by trial and error

In [8]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [9]:
from sklearn.svm import  SVC
model=SVC(kernel='rbf',gamma='auto',C=50)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9555555555555556

The score is changing based on our sample

## Approach 2: Use K Fold Cross validation

In [31]:
from sklearn.model_selection import cross_val_score
linear=cross_val_score(SVC(kernel='linear',gamma='auto',C=10),iris.data,iris.target,cv=5)

In [33]:
import numpy as np
avg_score=np.average(linear)
avg_score

np.float64(0.9733333333333334)

In [36]:
rbf_10=cross_val_score(SVC(kernel='rbf',gamma='auto',C=10),iris.data,iris.target,cv=5)

In [37]:
avg_rbf=np.average(rbf_10)
avg_rbf

np.float64(0.9800000000000001)

In [34]:
rbf_20=cross_val_score(SVC(kernel='rbf',gamma='auto',C=20),iris.data,iris.target,cv=5)

In [35]:
avg_rbf=np.average(rbf_20)
avg_rbf

np.float64(0.9666666666666668)

In [56]:
kernel=['rbf','linear']
c_value=[1,10,20]
avg_score={}
for k in kernel:
  for c_v in c_value:
    score=cross_val_score(SVC(kernel=k,gamma='auto',C=c_v),iris.data,iris.target,cv=5)
    avg_score[k+"_"+str(c_v)]=np.average(score)
avg_score

{'rbf_1': np.float64(0.9800000000000001),
 'rbf_10': np.float64(0.9800000000000001),
 'rbf_20': np.float64(0.9666666666666668),
 'linear_1': np.float64(0.9800000000000001),
 'linear_10': np.float64(0.9733333333333334),
 'linear_20': np.float64(0.9666666666666666)}

Find optimal score by using loop but not conveninet if we hv many C, then we need to run the loop many times

## Approach 3: Use GridSearchCV

In [57]:
from sklearn.model_selection import GridSearchCV
clf=GridSearchCV(SVC(gamma='auto'),{  #The second parameter of GridSearchCV is the parameter grid
    'C':[1,10,20],
    'kernel':['linear','rbf']
},cv=5,return_train_score=False)
clf.fit(iris.data,iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00239701, 0.00181093, 0.00163851, 0.00167031, 0.00161963,
        0.00483723]),
 'std_fit_time': array([1.57703936e-03, 3.33685860e-05, 4.90708298e-05, 1.51408105e-04,
        4.32092878e-05, 3.27992562e-03]),
 'mean_score_time': array([0.00132818, 0.00130725, 0.00126953, 0.00128016, 0.00117555,
        0.00286431]),
 'std_score_time': array([2.27830399e-04, 4.33405370e-05, 1.19015670e-04, 1.62080983e-04,
        2.30601920e-05, 1.65427919e-03]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['linear', 'rbf', 'linear', 'rbf', 'linear', 'rbf'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'params': [{'C': 1, 'kernel': 'linear'},
  {'C': 1, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}

In [58]:
df=pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002397,0.001577,0.001328,0.000228,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001811,3.3e-05,0.001307,4.3e-05,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001639,4.9e-05,0.00127,0.000119,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
3,0.00167,0.000151,0.00128,0.000162,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.00162,4.3e-05,0.001176,2.3e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
5,0.004837,0.00328,0.002864,0.001654,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5


In [59]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,1,rbf,0.98
2,10,linear,0.973333
3,10,rbf,0.98
4,20,linear,0.966667
5,20,rbf,0.966667


# Example (Random Forest)

In [None]:
clf=GridSearchCV(RandomForestClassifier(),{
    'n_estimator':[100,200,300],

})

In [65]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
clf=GridSearchCV(RandomForestClassifier(),{
  'n_estimators':[50,100,150],
  'max_depth':[None,5,10]
})
clf.fit(iris.data,iris.target)


In [68]:
clf_results=clf.cv_results_
clf_results
df=pd.DataFrame(clf_results)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.278692,0.042446,0.014113,0.003512,,50,"{'max_depth': None, 'n_estimators': 50}",0.966667,0.966667,0.933333,0.966667,1.0,0.966667,0.021082,1
1,0.361099,0.084404,0.017268,0.007784,,100,"{'max_depth': None, 'n_estimators': 100}",0.966667,0.966667,0.933333,0.966667,1.0,0.966667,0.021082,1
2,0.35146,0.05805,0.018911,0.001565,,150,"{'max_depth': None, 'n_estimators': 150}",0.966667,0.966667,0.933333,0.933333,1.0,0.96,0.024944,5
3,0.140896,0.028701,0.007798,0.000208,5.0,50,"{'max_depth': 5, 'n_estimators': 50}",0.966667,0.966667,0.9,0.933333,1.0,0.953333,0.033993,9
4,0.264787,0.02008,0.0141,0.000868,5.0,100,"{'max_depth': 5, 'n_estimators': 100}",0.966667,0.966667,0.933333,0.966667,1.0,0.966667,0.021082,1
5,0.402229,0.018594,0.021701,0.00197,5.0,150,"{'max_depth': 5, 'n_estimators': 150}",0.966667,0.966667,0.933333,0.933333,1.0,0.96,0.024944,5
6,0.106599,0.034713,0.006474,0.00303,10.0,50,"{'max_depth': 10, 'n_estimators': 50}",0.966667,0.966667,0.933333,0.933333,1.0,0.96,0.024944,5
7,0.143253,0.004715,0.008308,0.002197,10.0,100,"{'max_depth': 10, 'n_estimators': 100}",0.966667,0.966667,0.933333,0.966667,1.0,0.966667,0.021082,1
8,0.212176,0.003946,0.014114,0.004509,10.0,150,"{'max_depth': 10, 'n_estimators': 150}",0.966667,0.966667,0.933333,0.933333,1.0,0.96,0.024944,5


In [69]:
df[['param_max_depth','param_n_estimators','mean_test_score']]

Unnamed: 0,param_max_depth,param_n_estimators,mean_test_score
0,,50,0.966667
1,,100,0.966667
2,,150,0.96
3,5.0,50,0.953333
4,5.0,100,0.966667
5,5.0,150,0.96
6,10.0,50,0.96
7,10.0,100,0.966667
8,10.0,150,0.96


In [71]:
print(f'Best parameters:',clf.best_params_)
print(f'Best score:',clf.best_score_)

Best parameters: {'max_depth': None, 'n_estimators': 50}
Best score: 0.9666666666666668


If we need to try large number of C value, then computational cost will increase


Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters.

In [20]:
from sklearn.model_selection import RandomizedSearchCV
rs=RandomizedSearchCV(SVC(gamma='auto'),{
    'C':[1,10,20],
    'kernel':['linear','rbf']
},cv=5,n_iter=2) #randomly try only 2 kernel with 2 C value
rs.fit(iris.data,iris.target)
df=pd.DataFrame(rs.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001427,0.000297,0.001138,0.000329,rbf,20,"{'kernel': 'rbf', 'C': 20}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,2
1,0.000985,3.6e-05,0.000817,2.8e-05,linear,1,"{'kernel': 'linear', 'C': 1}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1


In [21]:
df[['param_kernel','param_C','mean_test_score']]

Unnamed: 0,param_kernel,param_C,mean_test_score
0,rbf,20,0.966667
1,linear,1,0.98


Works well in practical life if we do not have too much computational power

# How about different models with different hyperparameters?

In [22]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [29]:
model_parameter={
    'SVM':{
        'model':SVC(gamma='auto'),
        'parameter':{
            'C':[1,10,20],
            'kernel':['linear','rbf']
        }
    },
    'Random Forest':{
        'model':RandomForestClassifier(),
        'parameter':{
            'n_estimators':[1,5,10]
        }
    }
}

In [39]:
score=[]
for model_n,model_p in model_parameter.items():
  clf=GridSearchCV(model_p['model'],model_p['parameter'],cv=5)
  clf.fit(iris.data,iris.target)
  score.append({
      'model':model_n,
      'best_score':clf.best_score_,
      'best_parameter':clf.best_params_
  })
df=pd.DataFrame(score)
df

Unnamed: 0,model,best_score,best_parameter
0,SVM,0.98,"{'C': 1, 'kernel': 'linear'}"
1,Random Forest,0.966667,{'n_estimators': 5}




---



---



---

#Exercise

In [53]:
from sklearn.model_selection import GridSearchCV
clf=GridSearchCV(SVC(gamma='auto'),{
    'C':[1,10,20],
    'kernel':['rbf','linear']
},cv=5)
clf.fit(iris.data,iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00449991, 0.0021225 , 0.00187964, 0.00158224, 0.00175743,
        0.00238342]),
 'std_fit_time': array([8.19318265e-04, 8.84308074e-04, 1.21328533e-04, 5.17726590e-05,
        4.28388231e-05, 8.04104462e-04]),
 'mean_score_time': array([0.00523543, 0.00152845, 0.00137239, 0.00120449, 0.00129051,
        0.00140896]),
 'std_score_time': array([1.63667695e-03, 5.06276683e-04, 1.24434893e-04, 1.02025325e-05,
        5.22408567e-05, 1.53602979e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
 

In [54]:
df=pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.0045,0.000819,0.005235,0.001637,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002122,0.000884,0.001528,0.000506,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.00188,0.000121,0.001372,0.000124,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001582,5.2e-05,0.001204,1e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.001757,4.3e-05,0.001291,5.2e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.002383,0.000804,0.001409,0.000154,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [55]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


We have supply the first three row of parameter to get the best performance

Analyze cv_results_: Using the same iris dataset and GridSearchCV from Exercise 1, convert cv_results_ to a DataFrame and display only the columns: param_C, param_kernel, and mean_test_score.

In [1]:
import pandas as pd
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

iris = datasets.load_iris()
param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear']
}

clf = GridSearchCV(svm.SVC(gamma='auto'), param_grid, cv=5)
clf.fit(iris.data, iris.target)

In [3]:
df=pd.DataFrame(clf.cv_results_)
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [5]:
print('Best:',clf.best_params_)

Best: {'C': 1, 'kernel': 'rbf'}


RandomizedSearchCV: Use RandomizedSearchCV instead of GridSearchCV with the same parameters, but set n_iter=2 to test only 2 random combinations. Compare the speed difference.

In [40]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn import svm, datasets
import pandas as pd
iris = datasets.load_iris()
param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear']
}

In [48]:
rs=RandomizedSearchCV(SVC(gamma='auto'),param_grid,cv=5,n_iter=3)
rs.fit(iris.data,iris.target)


In [49]:
df=pd.DataFrame(rs.cv_results_)
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.98
1,20,linear,0.966667
2,20,rbf,0.966667


In [50]:
print('Best score:',rs.best_score_)
print('Best parameter:',rs.best_params_)

Best score: 0.9800000000000001
Best parameter: {'kernel': 'rbf', 'C': 10}


Manual approach vs GridSearchCV: First, manually test different parameters using train_test_split. Then use GridSearchCV with cross-validation. Compare which approach gives more reliable results.

In [55]:
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split, GridSearchCV

iris = datasets.load_iris()

# Part 1: Manual approach
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3)


In [71]:
score={}
kernel=['linear','rbf']
c=[10,20,30]
for k in kernel:
  for c_v in c:
    model=SVC(gamma='auto',kernel=k,C=c_v)
    model.fit(X_train,y_train)
    score['Kernel:'+k+" C:"+str(c_v)]=model.score(X_test,y_test)
score

{'Kernel:linear C:10': 0.9777777777777777,
 'Kernel:linear C:20': 0.9777777777777777,
 'Kernel:linear C:30': 0.9777777777777777,
 'Kernel:rbf C:10': 0.9777777777777777,
 'Kernel:rbf C:20': 0.9777777777777777,
 'Kernel:rbf C:30': 0.9777777777777777}

manual score

In [73]:
model=SVC(kernel='rbf',C=30,gamma='auto')
model.fit(X_train,y_train)
print('Model score(manual):',model.score(X_test,y_test))

Model score(manual): 0.9777777777777777


In [75]:
clf=GridSearchCV(SVC(gamma='auto'),{
    'C':[1,10,20,30],
    'kernel':['linear','rbf']}
                 ,cv=5)



In [76]:
clf.fit(iris.data,iris.target)


In [77]:
df=pd.DataFrame(clf.cv_results_)
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,linear,0.98
1,1,rbf,0.98
2,10,linear,0.973333
3,10,rbf,0.98
4,20,linear,0.966667
5,20,rbf,0.966667
6,30,linear,0.96
7,30,rbf,0.96


In [79]:
print('Best parameter:',clf.best_params_)
print('Best score:',clf.best_score_)

Best parameter: {'C': 1, 'kernel': 'linear'}
Best score: 0.9800000000000001


For digits dataset in sklearn.dataset, please try following classifiers and find out the one that gives best performance. Also find the optimal parameters for that classifier.

In [81]:

from sklearn import datasets
digits = datasets.load_digits()

In [80]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [94]:
model_parameter={
    'SVM':{
        'model':SVC(gamma='auto'),
        'parameter':{
            'C':[1,10,20],
            'kernel':['linear','rbf']
        }
    },
    'Random Forest':{
        'model':RandomForestClassifier(),
        'parameter':{
            'n_estimators':[1,5,10]
        }
    },
    'Logistic Regression':{
        'model':LogisticRegression(solver='liblinear'),
        'parameter':{
          'C':[1,5,10]
        }
    },
    'naive_bayes_gaussian':{
        'model':GaussianNB(),
        'parameter':{}

    },
    'naive_bayes_multinomial':{
        'model':MultinomialNB(),
        'parameter':{}
    }
}

In [97]:
scores=[]
for model_n,model_p in model_parameter.items():
  clf=GridSearchCV(model_p['model'],model_p['parameter'],cv=5)
  clf.fit(digits.data,digits.target)
  scores.append({
      'model':model_n,
      'best_score':clf.best_score_,
      'best_parameter':clf.best_params_
  }
  )
scores

[{'model': 'SVM',
  'best_score': np.float64(0.9476973073351903),
  'best_parameter': {'C': 1, 'kernel': 'linear'}},
 {'model': 'Random Forest',
  'best_score': np.float64(0.9037480656143609),
  'best_parameter': {'n_estimators': 10}},
 {'model': 'Logistic Regression',
  'best_score': np.float64(0.9221138966264315),
  'best_parameter': {'C': 1}},
 {'model': 'naive_bayes_gaussian',
  'best_score': np.float64(0.8069281956050759),
  'best_parameter': {}},
 {'model': 'naive_bayes_multinomial',
  'best_score': np.float64(0.8703497369235531),
  'best_parameter': {}}]

In [98]:
df=pd.DataFrame(scores)
df

Unnamed: 0,model,best_score,best_parameter
0,SVM,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,Random Forest,0.903748,{'n_estimators': 10}
2,Logistic Regression,0.922114,{'C': 1}
3,naive_bayes_gaussian,0.806928,{}
4,naive_bayes_multinomial,0.87035,{}



For me the winner is svm (C=1, kernel=linear) with 94.77% score