In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
iris = load_iris()
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [4]:
df = pd.DataFrame(iris.data,columns=iris.feature_names)
df['target'] = iris.target
df['target_names'] = df.target.apply(lambda x : iris.target_names[x])
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_names
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica


In [5]:
X = iris.data
Y = iris.target

# From Scratch

In [6]:
kernels = ['rbf', 'linear']
Cs = [1,10,20]
gammas = ['auto', 'scale']

avg_scores = {}
for k in kernels:
    for g in gammas:
        for c in Cs:
            cv_scores = cross_val_score(SVC(kernel=k, C=c, gamma=g), X, Y, cv=5)
            # add to dict
            avg_scores[f'Kernel = {k:<6}, Gamma = {g:<5}, C = {str(c):<2}'] = np.average(cv_scores)

In [7]:
# read dict
avg_scores

{'Kernel = rbf   , Gamma = auto , C = 1 ': 0.9800000000000001,
 'Kernel = rbf   , Gamma = auto , C = 10': 0.9800000000000001,
 'Kernel = rbf   , Gamma = auto , C = 20': 0.9666666666666668,
 'Kernel = rbf   , Gamma = scale, C = 1 ': 0.9666666666666666,
 'Kernel = rbf   , Gamma = scale, C = 10': 0.9800000000000001,
 'Kernel = rbf   , Gamma = scale, C = 20': 0.9800000000000001,
 'Kernel = linear, Gamma = auto , C = 1 ': 0.9800000000000001,
 'Kernel = linear, Gamma = auto , C = 10': 0.9733333333333334,
 'Kernel = linear, Gamma = auto , C = 20': 0.9666666666666666,
 'Kernel = linear, Gamma = scale, C = 1 ': 0.9800000000000001,
 'Kernel = linear, Gamma = scale, C = 10': 0.9733333333333334,
 'Kernel = linear, Gamma = scale, C = 20': 0.9666666666666666}

# Grid Search CV

In [8]:
params = {
    'kernel':['rbf', 'linear'],
    'C':[1, 10, 20],
    'gamma':['auto', 'scale']
}

In [9]:
model = SVC()

In [10]:
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, verbose=1, return_train_score=False)
grid_search.fit(X,Y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [11]:
grid_search.best_score_

0.9800000000000001

In [12]:
grid_search.best_params_

{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}

In [13]:
df = pd.DataFrame(grid_search.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000999,3.234067e-07,0.0006,0.0004896297,1,auto,rbf,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.001199,0.0003995657,0.000399,0.0004891617,1,auto,linear,"{'C': 1, 'gamma': 'auto', 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.001199,0.000400138,0.000999,0.0006323345,1,scale,rbf,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.966667,0.966667,0.966667,0.933333,1.0,0.966667,0.021082,10
3,0.000999,9.246216e-07,0.000799,0.0003994706,1,scale,linear,"{'C': 1, 'gamma': 'scale', 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
4,0.001198,0.000399304,0.001199,0.0003993751,10,auto,rbf,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
5,0.001199,0.0004005194,0.0004,0.000490214,10,auto,linear,"{'C': 10, 'gamma': 'auto', 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,7
6,0.001198,0.0003989462,0.0006,0.0004901355,10,scale,rbf,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
7,0.000999,3.814697e-07,0.000999,1.907349e-07,10,scale,linear,"{'C': 10, 'gamma': 'scale', 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,7
8,0.001399,0.0007999898,0.000998,8.31394e-07,20,auto,rbf,"{'C': 20, 'gamma': 'auto', 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,9
9,0.001199,0.0004003765,0.000999,0.001094147,20,auto,linear,"{'C': 20, 'gamma': 'auto', 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,10


In [14]:
df = df[['param_C', 'param_gamma', 'param_kernel', 'mean_test_score', 'std_test_score', 'rank_test_score']]
df = df.sort_values('rank_test_score', ascending=True)
df

Unnamed: 0,param_C,param_gamma,param_kernel,mean_test_score,std_test_score,rank_test_score
0,1,auto,rbf,0.98,0.01633,1
1,1,auto,linear,0.98,0.01633,1
3,1,scale,linear,0.98,0.01633,1
4,10,auto,rbf,0.98,0.01633,1
6,10,scale,rbf,0.98,0.01633,1
10,20,scale,rbf,0.98,0.01633,1
5,10,auto,linear,0.973333,0.038873,7
7,10,scale,linear,0.973333,0.038873,7
8,20,auto,rbf,0.966667,0.036515,9
2,1,scale,rbf,0.966667,0.021082,10


# Randomized Search CV

In [15]:
params = {
    'kernel':['rbf', 'linear'],
    'C':[1, 10, 20],
    'gamma':['auto', 'scale']
}

In [16]:
model = SVC()

In [17]:
random_search = RandomizedSearchCV(estimator=model, param_distributions=params, cv=5, n_iter=5, verbose=1, return_train_score=False)
random_search.fit(X,Y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [18]:
random_search.best_score_

0.9800000000000001

In [19]:
random_search.best_params_

{'kernel': 'rbf', 'gamma': 'scale', 'C': 20}

In [20]:
df = pd.DataFrame(random_search.cv_results_)
df = df[['param_C', 'param_gamma', 'param_kernel', 'mean_test_score', 'std_test_score', 'rank_test_score']]
df = df.sort_values('rank_test_score', ascending=True)
df

Unnamed: 0,param_C,param_gamma,param_kernel,mean_test_score,std_test_score,rank_test_score
0,20,scale,rbf,0.98,0.01633,1
1,1,scale,linear,0.98,0.01633,1
2,1,auto,rbf,0.98,0.01633,1
3,20,scale,linear,0.966667,0.042164,4
4,20,auto,linear,0.966667,0.042164,4


# Multiple models + Multiple Hyper-parameters of Each Model

In [21]:
models_params = {
    'SVM':{
        'model':SVC(),
        'params':{
            'kernel':['rbf', 'linear'], 'C':[1, 10, 20], 'gamma':['auto', 'scale']
        }
    },
    'ensemble':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[24, 48, 64]
        }
    },
    'linear_model':{
        'model':LogisticRegression(),
        'params':{
            'C':[1, 10, 20]
        }
    }
}

In [22]:
for module_name, module_info in models_params.items():
    print("Module:", module_name)
    print("Model:", module_info['model'])
    
    for param_name, param_values in module_info['params'].items():
        print("--- Parameter:", param_name, param_values)
        
    print('~'*100)
    

Module: SVM
Model: SVC()
--- Parameter: kernel ['rbf', 'linear']
--- Parameter: C [1, 10, 20]
--- Parameter: gamma ['auto', 'scale']
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Module: ensemble
Model: RandomForestClassifier()
--- Parameter: n_estimators [24, 48, 64]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Module: linear_model
Model: LogisticRegression()
--- Parameter: C [1, 10, 20]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [24]:
best_scores_of_each_model = []
for _, module_info in models_params.items():
    model = module_info['model']
    params = module_info['params']
    grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, return_train_score=False)
    grid_search.fit(X,Y)
    
    best_scores_of_each_model.append({'model': model, 'best_params': grid_search.best_params_, 'best_score': grid_search.best_score_})

In [25]:
best_scores_of_each_model

[{'model': SVC(),
  'best_params': {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'},
  'best_score': 0.9800000000000001},
 {'model': RandomForestClassifier(),
  'best_params': {'n_estimators': 48},
  'best_score': 0.9533333333333334},
 {'model': LogisticRegression(),
  'best_params': {'C': 10},
  'best_score': 0.9800000000000001}]

In [26]:
df = pd.DataFrame(best_scores_of_each_model)
df

Unnamed: 0,model,best_params,best_score
0,SVC(),"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.98
1,RandomForestClassifier(),{'n_estimators': 48},0.953333
2,LogisticRegression(),{'C': 10},0.98
