In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
from sklearn.datasets import load_digits
digits = load_digits()

# Approach 1: Use train_test_split and manually tune parameters by trial and error

In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(digits.data,digits.target,test_size =0.3)

In [4]:
from sklearn import svm, datasets
model = svm.SVC(kernel='rbf',C=30,gamma='auto')
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.4

Approach 2: Use K Fold Cross validation


Manually try suppling models with different parameters to cross_val_score function with 5 fold cross validation

In [5]:
from sklearn.model_selection import cross_val_score

In [6]:
cross_val_score(svm.SVC(kernel='linear', C=10,gamma='auto'),digits.data,digits.target,cv=5)

array([0.96388889, 0.91944444, 0.96657382, 0.9637883 , 0.92479109])

In [7]:
cross_val_score(svm.SVC(kernel='rbf', C=10,gamma='auto'),digits.data,digits.target,cv=5)

array([0.45277778, 0.46944444, 0.47910864, 0.47910864, 0.50139276])

In [8]:
cross_val_score(svm.SVC(kernel='rbf', C=20,gamma='auto'),digits.data,digits.target,cv=5)

array([0.45277778, 0.46944444, 0.47910864, 0.47910864, 0.50139276])

Above approach is tiresome and very manual. We can use for loop as an alternative

In [9]:
kernels = ['rbf','linear']
C=[1,10,20]
avg_scores = {}
for kval in kernels:
    for cval in C:
        cv_scores = cross_val_score(svm.SVC(kernel=kval,C=cval,gamma='auto'),digits.data,digits.target,cv=5)
        avg_scores[kval + '_' + str(cval)] = np.average(cv_scores)

avg_scores

{'rbf_1': 0.448545341999381,
 'rbf_10': 0.47636645001547506,
 'rbf_20': 0.47636645001547506,
 'linear_1': 0.9476973073351903,
 'linear_10': 0.9476973073351903,
 'linear_20': 0.9476973073351903}

From above results we can say that rbf with C=1 or 10 or linear with C=1 will give best performance

Approach 3: Use GridSearchCV

In [10]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(svm.SVC(gamma='auto'),{
    'C' :[1,10,20],
    'kernel' : ['rbf','linear']   
},cv=5, return_train_score=False)
clf.fit(digits.data,digits.target)
clf.cv_results_

{'mean_fit_time': array([0.50785074, 0.04489007, 0.74965272, 0.08642168, 0.76298084,
        0.09402208]),
 'std_fit_time': array([0.19368699, 0.00231225, 0.0535002 , 0.01845128, 0.43554311,
        0.01349082]),
 'mean_score_time': array([0.11803007, 0.01040258, 0.21373425, 0.01960406, 0.2242425 ,
        0.02000318]),
 'std_score_time': array([0.00957417, 0.00080185, 0.05541916, 0.00492711, 0.08848462,
        0.00268504]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20, 'kernel': 'linear'}],


In [11]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.507851,0.193687,0.11803,0.009574,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.411111,0.45,0.454039,0.448468,0.479109,0.448545,0.021761,6
1,0.04489,0.002312,0.010403,0.000802,1,linear,"{'C': 1, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1
2,0.749653,0.0535,0.213734,0.055419,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.452778,0.469444,0.479109,0.479109,0.501393,0.476366,0.015784,4
3,0.086422,0.018451,0.019604,0.004927,10,linear,"{'C': 10, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1
4,0.762981,0.435543,0.224242,0.088485,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.452778,0.469444,0.479109,0.479109,0.501393,0.476366,0.015784,4
5,0.094022,0.013491,0.020003,0.002685,20,linear,"{'C': 20, 'kernel': 'linear'}",0.963889,0.919444,0.966574,0.963788,0.924791,0.947697,0.020978,1


In [12]:
df[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.448545
1,1,linear,0.947697
2,10,rbf,0.476366
3,10,linear,0.947697
4,20,rbf,0.476366
5,20,linear,0.947697


In [13]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [14]:
clf.best_score_

0.9476973073351903

In [15]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

Use RandomizedSearchCV to reduce number of iterations and with random combination of parameters. This is useful when you have too many parameters to try and your training time is longer. It helps reduce the cost of computation

In [16]:
from sklearn.model_selection import RandomizedSearchCV
rs = RandomizedSearchCV(svm.SVC(gamma='auto'), {
        'C': [1,10,20],
        'kernel': ['rbf','linear']
    }, 
    cv=5, 
    return_train_score=False, 
    n_iter=2
)
rs.fit(digits.data, digits.target)
pd.DataFrame(rs.cv_results_)[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.947697
1,10,rbf,0.476366


# GaussianNB

In [17]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(digits.data, digits.target)

GaussianNB()

In [18]:
model.fit(digits.data,digits.target)

GaussianNB()

In [19]:
model.score(digits.data, digits.target)

0.8580968280467446

# DecisionTreeClassifier

In [28]:
from sklearn import tree

In [29]:
model_dtc = tree.DecisionTreeClassifier()

In [30]:
model_dtc.fit(digits.data,digits.target)

DecisionTreeClassifier()

In [32]:
model_dtc.score(digits.data,digits.target)

1.0


# LogisticRegression,RandomForestClassifier,svm

In [39]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    }
}

In [38]:
scores = []

for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.89987,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
