In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC

In [2]:
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
Y = data.target

In [3]:
dataset = X.copy()
dataset['result'] = Y
dataset.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),result
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
dataset.result = dataset.result.apply(lambda x: data.target_names[x])
dataset.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),result
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, stratify=Y, test_size=0.25)

## Find Best Hyper Parameters

In [6]:
cross_val_score(SVC(kernel='linear',C=10, gamma='auto'), X,Y, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [7]:
kernels = ['rbf','linear']
C = [5,10,15,20]

avg_scores = {}

for kval in kernels:
    for cval in C:
        scores = cross_val_score(SVC(kernel=kval,C=10, gamma='auto'), X,Y, cv=5)
        avg_scores[f'{kval}_{cval}'] = np.average(scores)
        
avg_scores

{'rbf_5': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_15': 0.9800000000000001,
 'rbf_20': 0.9800000000000001,
 'linear_5': 0.9733333333333334,
 'linear_10': 0.9733333333333334,
 'linear_15': 0.9733333333333334,
 'linear_20': 0.9733333333333334}

## Doing the the same thing withb GridSearchCV

In [8]:
from sklearn.model_selection import GridSearchCV

In [15]:
clf = GridSearchCV(SVC(gamma='auto'),{
       'kernel':['rbf','linear'],
       'C': [5,10,15,20]},
       cv=5, 
       return_train_score = False)

In [16]:
clf.fit(X,Y)

In [17]:
clf.cv_results_

{'mean_fit_time': array([0.00473819, 0.00407524, 0.0029675 , 0.00315552, 0.0030786 ,
        0.00293589, 0.00295987, 0.00294027]),
 'std_fit_time': array([7.78193912e-04, 2.00981462e-03, 7.38743113e-05, 8.93455528e-04,
        9.80029395e-04, 1.84905068e-04, 1.72857620e-04, 1.92756466e-04]),
 'mean_score_time': array([0.00315113, 0.00266137, 0.00213962, 0.0019949 , 0.00247064,
        0.00186052, 0.00219579, 0.00240598]),
 'std_score_time': array([0.000808  , 0.00087135, 0.00064278, 0.00011494, 0.00041351,
        0.00044687, 0.00040245, 0.0004945 ]),
 'param_C': masked_array(data=[5, 5, 10, 10, 15, 15, 20, 20],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear',
                    'rbf', 'linear'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object

In [18]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.004738,0.000778,0.003151,0.000808,5,rbf,"{'C': 5, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.004075,0.00201,0.002661,0.000871,5,linear,"{'C': 5, 'kernel': 'linear'}",1.0,1.0,0.933333,0.966667,1.0,0.98,0.026667,1
2,0.002968,7.4e-05,0.00214,0.000643,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.003156,0.000893,0.001995,0.000115,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.003079,0.00098,0.002471,0.000414,15,rbf,"{'C': 15, 'kernel': 'rbf'}",0.966667,1.0,0.933333,0.966667,1.0,0.973333,0.024944,4
5,0.002936,0.000185,0.001861,0.000447,15,linear,"{'C': 15, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,7
6,0.00296,0.000173,0.002196,0.000402,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,6
7,0.00294,0.000193,0.002406,0.000494,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,7


In [19]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,5,rbf,0.98
1,5,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,15,rbf,0.973333
5,15,linear,0.966667
6,20,rbf,0.966667
7,20,linear,0.966667


In [20]:
clf.best_params_

{'C': 5, 'kernel': 'rbf'}

## Random GridSearchCV to Reduce Computation Cost

In [21]:
from sklearn.model_selection import RandomizedSearchCV

In [33]:
rs = RandomizedSearchCV(SVC(gamma='auto'),{
       'kernel':['rbf','linear'],
       'C': [5,10,15,20]},
       cv=5, 
       return_train_score = False,
       n_iter=3)

rs.fit(X,Y)

In [34]:
df = pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]
df

Unnamed: 0,param_C,param_kernel,mean_test_score
0,5,rbf,0.98
1,5,linear,0.98
2,10,rbf,0.98


## Finding Best Model for The Problem

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

classifier_models = {
    'Logistic Regression':{
        "model": LogisticRegression(solver='liblinear', multi_class='auto'),
        "params":{'C': [1,5,10,20]},
},
    'Random Forest':{
        "model": RandomForestClassifier(),
        "params":{'n_estimators': [1,5,10,20]},
},
    'SVC':{
        "model": SVC(gamma='auto'),
        "params":{
            'kernel':['rbf','linear'],
            'C': [5,10,15,20]},
},
    'Decision Tree':{
        "model": DecisionTreeClassifier(),
        "params":{'criterion': ['gini','entropy']},
}}

In [36]:
best_model = []
for model, config in classifier_models.items():
    clf = GridSearchCV(config['model'], config['params'], cv=5, return_train_score=False)
    clf.fit(X,Y)
    best_model.append({'model':model, 'score': clf.best_score_, 'params': clf.best_params_})

In [37]:
pd.DataFrame(best_model)

Unnamed: 0,model,score,params
0,Logistic Regression,0.966667,{'C': 5}
1,Random Forest,0.973333,{'n_estimators': 1}
2,SVC,0.98,"{'C': 5, 'kernel': 'rbf'}"
3,Decision Tree,0.966667,{'criterion': 'gini'}
