Creative Commons CC BY 4.0 Lynd Bacon & Associates, Ltd. Not warranted to be suitable for any particular purpose. (You're on your own!)

#  More Support Vector Classification: Grid Searching

Here we'll try grid searching to find good parameter settings.

# Get Packages

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, Markdown
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn import linear_model  
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, precision_score, recall_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import os
import shelve
%matplotlib inline

# Get Data and Munge It

In [4]:
cervical=pd.read_csv('../DATA/ML/risk_factors_cervical_cancer.csv',na_values='?')

In [5]:
cervicalFeats=cervical.loc[:,'Age':'Hormonal Contraceptives (years)'].copy()

In [6]:
cervical2=pd.concat([cervical.Biopsy,cervicalFeats],axis=1).dropna(axis=0)

In [7]:
X=cervical2.iloc[:,1:].to_numpy()  # features
y=cervical2.Biopsy.to_numpy()
X.shape
y.shape

(676, 9)

(676,)

# Set Up Train/Test Split, Pipeline, Grid Search Parameters



In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
            random_state=99,stratify=y)

In [9]:
pipe = Pipeline([("scaler", preprocessing.StandardScaler()), 
                 ("polynom",PolynomialFeatures()),
                 ("svm", LinearSVC(max_iter=100000,random_state=99))])


In [10]:
param_grid={"polynom__degree":[1,2],
           "svm__C":[0.001,0.01,0.1,1.0]}

In [11]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=10,
                       return_train_score=True,iid=False)

In [12]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('polynom', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('svm', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid={'polynom__degree': [1, 2], 'svm__C': [0.001, 0.01, 0.1, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [16]:
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'polynom__degree': 1, 'svm__C': 0.001}

Best cross-validation score: 0.93
Test-set score: 0.93


# Multiple Learners in a Grid Search

Here we try training both logistic regression and a support vector classifier as part of our grid search.

In [20]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', LinearSVC())])

In [21]:
param_grid = [
    {'classifier': [LinearSVC(max_iter=1000000,random_state=99)],
     'preprocessing': [StandardScaler(), MinMaxScaler(), None],
     'classifier__C': [0.001, 0.01, 0.1, 1]},
    {'classifier': [linear_model.LogisticRegression(max_iter=1000000,
                            solver='lbfgs',random_state=99)],
     'preprocessing': [StandardScaler(), MinMaxScaler(), None],
     'classifier__C': [0.001, 0.01, 0.1, 1]}
    ]


In [22]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=10,
                   return_train_score=True,iid=False)

In [23]:
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid=[{'classifier': [LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0)], 'preprocessing': [StandardScaler(copy=True, with_mea...rue), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'classifier__C': [0.001, 0.01, 0.1, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [24]:
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'classifier': LinearSVC(C=0.001, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0), 'classifier__C': 0.001, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score: 0.93
Test-set score: 0.93


In [29]:
cvresDF=pd.DataFrame(grid.cv_results_)
cvresDF

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier,param_classifier__C,param_preprocessing,params,split0_test_score,split1_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.002646,0.00028,0.00069,6.5e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.001,"StandardScaler(copy=True, with_mean=True, with...","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
1,0.002851,0.00039,0.000886,0.00014,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.001,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
2,0.007557,0.000866,0.000813,4.5e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.001,,"{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
3,0.004921,0.000589,0.000955,0.000178,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.01,"StandardScaler(copy=True, with_mean=True, with...","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
4,0.002006,6.3e-05,0.000648,2.3e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.01,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
5,0.064351,0.012594,0.000652,9.9e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.01,,"{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
6,0.011545,0.002099,0.000657,2.6e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.1,"StandardScaler(copy=True, with_mean=True, with...","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
7,0.002144,4.9e-05,0.000642,2.2e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.1,"MinMaxScaler(copy=True, feature_range=(0, 1))","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
8,0.75532,0.113329,0.000793,0.000126,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",0.1,,"{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949
9,0.120757,0.027962,0.000713,8.1e-05,"LinearSVC(C=0.001, class_weight=None, dual=Tru...",1.0,"StandardScaler(copy=True, with_mean=True, with...","{'classifier': LinearSVC(C=0.001, class_weight...",0.923077,0.923077,...,0.934066,0.934211,0.932166,0.932166,0.932166,0.932166,0.932166,0.932166,0.932941,0.000949


In [33]:
pd.DataFrame(cvresDF.iloc[cvresDF.mean_test_score.idxmax(),])

Unnamed: 0,0
mean_fit_time,0.00264649
std_fit_time,0.000279545
mean_score_time,0.000689602
std_score_time,6.47631e-05
param_classifier,"LinearSVC(C=0.001, class_weight=None, dual=Tru..."
param_classifier__C,0.001
param_preprocessing,"StandardScaler(copy=True, with_mean=True, with..."
params,"{'classifier': LinearSVC(C=0.001, class_weight..."
split0_test_score,0.923077
split1_test_score,0.923077


## Apply multiple models include SVC in the BreastCA data

In [37]:
from sklearn import datasets
breastCA=datasets.load_breast_cancer()
target_values, value_counts = np.unique(breastCA['target'], return_counts=True)
X=breastCA['data']  # features
y=breastCA['target'] # labels: 0=malignancy, 1=benign
y=1-y                # relabelled: 0=benign, 1=malignancy

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
            random_state=99,stratify=y)

In [43]:
pipe = Pipeline([('preprocessing', StandardScaler()), ('classifier', LinearSVC())])

In [44]:
param_grid = [
    {'classifier': [LinearSVC(max_iter=1000000,random_state=99)],
     'preprocessing': [StandardScaler(), MinMaxScaler(), None],
     'classifier__C': [0.001, 0.01, 0.1, 1]},
    {'classifier': [linear_model.LogisticRegression(max_iter=1000000,
                            solver='lbfgs',random_state=99)],
     'preprocessing': [StandardScaler(), MinMaxScaler(), None],
     'classifier__C': [0.001, 0.01, 0.1, 1]}
    ]


In [45]:
grid = GridSearchCV(pipe, param_grid=param_grid, cv=10,
                   return_train_score=True,iid=False)

In [47]:
grid.fit(X_train, y_train)



GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('preprocessing', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=False, n_jobs=None,
       param_grid=[{'classifier': [LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0)], 'preprocessing': [StandardScaler(copy=True, with_mean...rue), MinMaxScaler(copy=True, feature_range=(0, 1)), None], 'classifier__C': [0.001, 0.01, 0.1, 1]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [48]:
print("Best params:\n{}\n".format(grid.best_params_))
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Test-set score: {:.2f}".format(grid.score(X_test, y_test)))

Best params:
{'classifier': LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000000,
     multi_class='ovr', penalty='l2', random_state=99, tol=0.0001,
     verbose=0), 'classifier__C': 0.01, 'preprocessing': StandardScaler(copy=True, with_mean=True, with_std=True)}

Best cross-validation score: 0.98
Test-set score: 0.97
