In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
from sklearn import svm, datasets
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the iris dataset into a DataFrame: one row per flower, one column per
# measured feature, plus a human-readable species label.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# iris.target holds integer class codes; indexing target_names with the whole
# code array maps them to species names in one vectorized step instead of
# calling a Python lambda once per row.
df['target'] = iris.target_names[iris.target]
# Show the rows on either side of the setosa/versicolor boundary.
df.iloc[47:52]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor


# Hyperparameter Tuning

In [None]:
# Hold out 30% of the iris samples for evaluation. No random_state is passed,
# so every run produces a different split.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
)

In [None]:
# Fit an RBF-kernel SVM on the training split (fit returns the estimator itself),
# then report its score on the held-out split.
model = svm.SVC(kernel='rbf', C=30, gamma='auto').fit(X_train, y_train)
model.score(X_test, y_test)

0.9777777777777777

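Every time we split the data into a training and a testing set we get a different split, so the score changes from run to run and a single split is an unreliable estimate. To get a more stable estimate we will use K-fold cross-validation.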

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of a linear-kernel SVM (C=10) on the full iris data.
# Labels are taken from `iris.target` rather than the notebook variable `df`,
# which is rebound to a different table further down; the fold scores are the
# same either way, but this cell now survives being re-run later.
cross_val_score(svm.SVC(kernel='linear', C=10, gamma='auto'), iris.data, iris.target, cv=5)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [None]:
# 5-fold cross-validation of an RBF-kernel SVM (C=10) on the full iris data.
# Labels are taken from `iris.target` rather than the notebook variable `df`,
# which is rebound to a different table further down; the fold scores are the
# same either way, but this cell now survives being re-run later.
cross_val_score(svm.SVC(kernel='rbf', C=10, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [None]:
# 5-fold cross-validation of an RBF-kernel SVM (C=20) on the full iris data.
# Labels are taken from `iris.target` rather than the notebook variable `df`,
# which is rebound to a different table further down; the fold scores are the
# same either way, but this cell now survives being re-run later.
cross_val_score(svm.SVC(kernel='rbf', C=20, gamma='auto'), iris.data, iris.target, cv=5)

array([0.96666667, 1.        , 0.9       , 0.96666667, 1.        ])

We are doing hyperparameter tuning, but we keep writing the same line again and again with different parameters. Instead, we will define a list of values for each hyperparameter and run the code in a loop.

In [None]:
# Cross-validate every (kernel, C) combination by hand and keep the mean of the
# five fold scores, keyed as "<kernel>_<C>".
kernels = ['rbf', 'linear']
C = [1, 10, 20]
avg_scores = {}
for ker_val in kernels:
    for c_val in C:
        # Labels come from iris.target (not `df`, which a later cell rebinds),
        # so this cell can be re-run at any point in the notebook.
        cv_scores = cross_val_score(
            svm.SVC(kernel=ker_val, C=c_val, gamma='auto'),
            iris.data,
            iris.target,
            cv=5,
        )
        avg_scores[f'{ker_val}_{c_val}'] = cv_scores.mean()
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

Here we have only two hyperparameters, the kernel and C, but what if we had 4, 5, or 6? We would need that many nested loops, which quickly becomes confusing and inconvenient to write. sklearn provides a class called GridSearchCV that does exactly what we did in the last cell, but in a single call.

In [None]:
from sklearn.model_selection import GridSearchCV

In the following cell we create the grid search. The first argument is the model we want to tune, and the second is a dictionary of the hyperparameter values we want to try with it. GridSearchCV evaluates every combination with k-fold cross-validation, and the `cv` argument sets how many folds to use. Finally, `return_train_score=False` tells it not to compute scores on the training folds, which we don't need here.

In [None]:
# Exhaustive search over every (C, kernel) pair, scored with 5-fold cross-validation.
param_grid = {
    'C': [1, 10, 20],
    'kernel': ['rbf', 'linear'],
}
clf = GridSearchCV(
    estimator=svm.SVC(gamma='auto'),
    param_grid=param_grid,
    cv=5,
    return_train_score=False,
)

In [None]:
# Run the grid search on the full iris data (fit returns the search object itself)
# and display the raw per-candidate results dictionary.
clf.fit(iris.data, iris.target).cv_results_

{'mean_fit_time': array([0.00106101, 0.00065289, 0.00073485, 0.00064607, 0.00070987,
        0.00069342]),
 'std_fit_time': array([2.86276423e-04, 2.19073335e-05, 4.28526208e-05, 1.89841084e-05,
        1.72075461e-05, 9.03422833e-05]),
 'mean_score_time': array([0.00071549, 0.00051699, 0.00055919, 0.00049777, 0.0005167 ,
        0.00057425]),
 'std_score_time': array([1.95175280e-04, 1.54451081e-05, 7.90550279e-05, 1.16483032e-05,
        1.90576249e-05, 1.23286024e-04]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

These raw results are not easy to read, but the dictionary can be loaded directly into a pandas DataFrame, which gives us a nice tabular format.

In [None]:
# Turn the per-candidate results dictionary into a table, one row per
# hyperparameter combination.
# NOTE: this rebinds `df`, which held the iris table until now.
df = pd.DataFrame.from_dict(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001061,0.000286,0.000715,0.000195,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000653,2.2e-05,0.000517,1.5e-05,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000735,4.3e-05,0.000559,7.9e-05,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000646,1.9e-05,0.000498,1.2e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.00071,1.7e-05,0.000517,1.9e-05,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000693,9e-05,0.000574,0.000123,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


Only a few of these columns are useful here, so let's trim the table down.

In [None]:
# Keep every row but only the columns that identify a candidate and its mean CV score.
df.loc[:, ['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


To see the properties of our classifier, use the following line.

In [None]:
# List only the public attributes and methods of the fitted search object.
# A bare dir(clf) starts with a long run of dunder and private names that
# buries the useful ones (best_score_, best_params_, cv_results_, ...).
[attr for attr in dir(clf) if not attr.startswith('_')]

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__sklearn_tags__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_routed_params_for_fit',
 '_get_scorers',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_run_sea

In [None]:
# Highest mean cross-validated score among the candidates tried by the grid search.
clf.best_score_

0.9800000000000001

In [None]:
# Hyperparameter combination of the top-ranked candidate in the grid search.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

Right now C only takes the values 1, 10, and 20, but what if we wanted to search a range from 1 to 50? The computation cost would grow quickly, because grid search tries every combination of hyperparameter values. To tackle this problem, sklearn provides another class called RandomizedSearchCV: it does not try every single combination, but a random sample of them, and we choose how many combinations to try with `n_iter`.

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Evaluate only n_iter=2 randomly drawn (C, kernel) combinations instead of all six.
# random_state pins which combinations are drawn, so restarting the kernel and
# running every cell reproduces the same table.
rs = RandomizedSearchCV(
    svm.SVC(gamma='auto'),
    {
        'C': [1, 10, 20],
        'kernel': ['rbf', 'linear'],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    random_state=42,
)

rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[['param_C', 'param_kernel', 'mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,rbf,0.98
1,1,linear,0.98


# Choosing the best model

So far we have tuned the hyperparameters of a single model; now we will see how to choose the best model.

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Candidate models for the iris comparison. Each entry pairs an estimator with
# the hyperparameter grid that GridSearchCV should explore for it.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Seeded so the forest (and therefore its best n_estimators) is
        # reproducible from one run to the next.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # `multi_class='auto'` was dropped: 'auto' is already the default and the
        # parameter itself is deprecated in recent scikit-learn releases, so
        # omitting it keeps the same behavior without relying on a keyword that
        # is scheduled for removal.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    }
}

In [None]:
# Grid-search every candidate model on the iris data and record, per model,
# the best mean CV score and the hyperparameters that produced it.
scores = []

for model_name, mp in model_params.items():
    estimator, grid = mp['model'], mp['params']
    clf = GridSearchCV(estimator, grid, cv=5, return_train_score=False)
    clf.fit(iris.data, iris.target)
    summary = dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    )
    scores.append(summary)

In [None]:
# Tabulate the per-model results with a fixed column order.
result_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(scores, columns=result_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.966667,{'n_estimators': 5}
2,logistic_regression,0.966667,{'C': 5}


# Exercise: Finding the Optimal Model and Hyperparameters

For the digits dataset in sklearn.datasets, try the following classifiers and find the one that gives the best performance. Also find the optimal parameters for that classifier.

In [None]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
import sklearn.datasets as datasets

# Load the digits dataset and build a table with one column per pixel feature
# plus the digit label in a trailing `target` column.
digits = datasets.load_digits()
df = pd.DataFrame(digits.data, columns=digits.feature_names).assign(target=digits.target)
df

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


In [None]:
# Candidate models for the digits exercise. Each entry pairs an estimator with
# the hyperparameter grid that GridSearchCV should explore for it.
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        # Seeded so the forest (and therefore its best n_estimators) is
        # reproducible from one run to the next.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        # `multi_class='auto'` was dropped: 'auto' is already the default and the
        # parameter itself is deprecated in recent scikit-learn releases, so
        # omitting it keeps the same behavior without relying on a keyword that
        # is scheduled for removal.
        'model': LogisticRegression(solver='liblinear'),
        'params': {
            'C': [1, 5, 10]
        }
    },
    'gaussian_nb': {
        'model': GaussianNB(),
        'params': {
            'var_smoothing': [1e-2, 1e-3, 1e-4, 1e-5]
        }
    },
    'multinomial_nb': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]
        }
    },
    'decision_tree': {
        # Seeded for the same reason as the random forest: reproducible results.
        'model': DecisionTreeClassifier(random_state=42),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        }
    }
}

In [None]:
# Grid-search every candidate model on the digits data and record, per model,
# the best mean CV score and the hyperparameters that produced it.
scores = []

for model_name, mp in model_params.items():
    estimator, grid = mp['model'], mp['params']
    clf = GridSearchCV(estimator, grid, cv=5, return_train_score=False)
    clf.fit(digits.data, digits.target)
    summary = dict(
        model=model_name,
        best_score=clf.best_score_,
        best_params=clf.best_params_,
    )
    scores.append(summary)

In [None]:
# Tabulate the per-model results with a fixed column order.
result_columns = ['model', 'best_score', 'best_params']
df = pd.DataFrame(scores, columns=result_columns)
df

Unnamed: 0,model,best_score,best_params
0,svm,0.947697,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.899297,{'n_estimators': 10}
2,logistic_regression,0.922114,{'C': 1}
3,gaussian_nb,0.87926,{'var_smoothing': 0.01}
4,multinomial_nb,0.876476,{'alpha': 1000}
5,decision_tree,0.811371,"{'criterion': 'entropy', 'max_depth': 10}"
