In [None]:
### TO DO ###
# 1. how to choose the correct calibration for C and gamma?


In [1]:
import numpy as np
import os

# Load the data
with np.load('cifar4-train.npz', allow_pickle=False) as npz_file:
    # Load items into a dictionary
    cifar = dict(npz_file.items())

print(cifar.keys())

pixels = cifar['pixels']
overfeat = cifar['overfeat']
labels = cifar['labels']
names = cifar['names']

print('pixels shape :',pixels.shape, ', dtype:', pixels.dtype)
print('overfeat shape :',overfeat.shape, ', dtype:', overfeat.dtype)
print('labels shape :',labels.shape, ', dtype:', labels.dtype)
print('Categories:', names)

# split the Overfeat data into train/test sets w/ same proportion of classes in each subset

import pandas as pd
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    overfeat, labels, train_size=4000, test_size=1000, 
    stratify=labels , # same class distribution
    random_state=0)

print('Train:', X_tr.shape, y_tr.shape)
print('Test:', X_te.shape, y_te.shape)

dict_keys(['pixels', 'overfeat', 'labels', 'names', 'allow_pickle'])
pixels shape : (5000, 3072) , dtype: uint8
overfeat shape : (5000, 4096) , dtype: float32
labels shape : (5000,) , dtype: int64
Categories: ['truck' 'car' 'airplane' 'ship']
Train: (4000, 4096) (4000,)
Test: (1000, 4096) (1000,)


### SVM classifier with a linear kernel

In [44]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Create decision tree classifier
pipe = Pipeline([
    #('scaler', StandardScaler()), #  Standardization
    ('pca', PCA(n_components=172)), # PCA preprocessing, retain 90% of the variance explained.
    ('linear_svc', LinearSVC()) # SVM with linear kernel
])

# grid search with cross validation
from sklearn.model_selection import GridSearchCV

# Create cross-validation object
grid_cv = GridSearchCV(pipe, {
    #'scaler': [None,StandardScaler()],
    #'pca' : [None, PCA(n_components=172)],
    'linear_svc__C': list(np.linspace(0.0001,0.3,15)) # list of C values
    }, cv=5, n_jobs = -1,verbose = 5) # stratified 5-fold strategy

In [45]:
%%time
# Fit estimator on train set
grid_cv.fit(X_tr, y_tr)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] linear_svc__C=0.0001 ............................................
[CV] linear_svc__C=0.0001 ............................................
[CV] linear_svc__C=0.0001 ............................................
[CV] linear_svc__C=0.0001 ............................................
[CV] .............. linear_svc__C=0.0001, score=0.82125, total=   5.1s
[CV] linear_svc__C=0.0001 ............................................
[CV] .............. linear_svc__C=0.0001, score=0.83125, total=   5.1s
[CV] .............. linear_svc__C=0.0001, score=0.83625, total=   5.0s
[CV] .................. linear_svc__C=0.0001, score=0.8, total=   4.8s
[CV] linear_svc__C=0.02152142857142857 ...............................
[CV] linear_svc__C=0.02152142857142857 ...............................
[CV] linear_svc__C=0.02152142857142857 ...............................
[CV] .............. linear_svc__C=0.0001, score=0.84125, total=   4.6s
[CV] linear_svc_

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   24.6s


[CV] . linear_svc__C=0.042942857142857144, score=0.8375, total=   9.3s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV]  linear_svc__C=0.042942857142857144, score=0.84625, total=  10.4s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV] .. linear_svc__C=0.042942857142857144, score=0.805, total=  11.2s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV]  linear_svc__C=0.042942857142857144, score=0.83625, total=  11.1s
[CV] . linear_svc__C=0.06436428571428572, score=0.81125, total=  10.9s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV] . linear_svc__C=0.06436428571428572, score=0.84125, total=  12.3s
[CV] linear_svc__C=0.08578571428571428 ...............................
[CV] . linear_svc__C=0.06436428571428572, score=0.79875, total=   9.9s
[CV] linear_svc__C=0.08578571428571428 ...............................
[CV] .

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  2.6min


[CV] .. linear_svc__C=0.25715714285714286, score=0.8225, total=  10.6s
[CV] ... linear_svc__C=0.2785785714285714, score=0.8075, total=  10.4s
[CV] linear_svc__C=0.2785785714285714 ................................
[CV] linear_svc__C=0.2785785714285714 ................................
[CV] .. linear_svc__C=0.2785785714285714, score=0.80625, total=  10.3s
[CV] linear_svc__C=0.3 ...............................................
[CV] .... linear_svc__C=0.2785785714285714, score=0.825, total=  10.3s
[CV] linear_svc__C=0.3 ...............................................
[CV] .. linear_svc__C=0.2785785714285714, score=0.79875, total=  10.0s
[CV] linear_svc__C=0.3 ...............................................
[CV] .. linear_svc__C=0.2785785714285714, score=0.82375, total=   9.9s
[CV] .................. linear_svc__C=0.3, score=0.8075, total=   9.7s
[CV] linear_svc__C=0.3 ...............................................
[CV] linear_svc__C=0.3 ...............................................
[CV] .

[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  3.1min finished


CPU times: user 18.6 s, sys: 688 ms, total: 19.2 s
Wall time: 3min 15s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=172, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linear_svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'linear_svc__C': [0.0001, 0.02152142857142857, 0.042942857142857144, 0.06436428571428572, 0.08578571428571428, 0.10720714285714285, 0.12862857142857143, 0.15005, 0.17147142857142855, 0.19289285714285712, 0.21431428571428568, 0.23573571428571427, 0.25715714285714286, 0.2785785714285714, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=5)

In [46]:
import pandas as pd

# Collect results in a DataFrame
df = pd.DataFrame.from_items([
    ('C', grid_cv.cv_results_['param_linear_svc__C']),
    ('mean_te', grid_cv.cv_results_['mean_test_score']),
    ('std_te', grid_cv.cv_results_['std_test_score'])
])
df.sort_values(by='mean_te', ascending=False)

Unnamed: 0,C,mean_te,std_te
5,0.107207,0.83225,0.01377
1,0.0215214,0.831,0.016401
4,0.0857857,0.831,0.015819
6,0.128629,0.829,0.01552
2,0.0429429,0.82825,0.015219
3,0.0643643,0.82825,0.019695
7,0.15005,0.827,0.018038
0,0.0001,0.826,0.014586
8,0.171471,0.8235,0.018698
10,0.214314,0.8235,0.018276


In [48]:
best = df.sort_values(by='mean_te', ascending=False)[0:1]
print('Linear SVM - top accuracy across folds {:.3f}'.format(best.iloc[0,1]),
      'std: {:.3f}'.format(best.iloc[0,2]), ' with C: {:.3f}'.format(best.iloc[0,0]))


Linear SVM - top accuracy across folds 0.832 std: 0.014  with C: 0.107


### SVM classifier with an RBF kernel

In [32]:
from sklearn.svm import SVC

# Create decision tree classifier
pipe = Pipeline([
    #('scaler', StandardScaler()), #  Standardization
    ('pca', PCA(n_components=172)), # PCA preprocessing, retain 90% of the variance explained.
    ('svc_rbf', SVC(kernel='rbf')) # SVM with RBF kernel
])

# grid search with cross validation
from sklearn.model_selection import GridSearchCV

# Create cross-validation object
grid_cv_svm_rbf = GridSearchCV(pipe, {
    #'scaler': [None,StandardScaler()],
    #'pca' : [None, PCA(n_components=172)],
    'svc_rbf__C': list(np.linspace(0.001,0.3,10)), # list of C values
    'svc_rbf__gamma' : list(np.linspace(0.1,10,8)) # list of gamma values
    }, cv=5, n_jobs = -1) # stratified 5-fold strategy

print(grid_cv.get_params)

<bound method BaseEstimator.get_params of GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=172, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc_rbf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'SVC__C': [0.001, 0.022357142857142857, 0.04371428571428571, 0.06507142857142857, 0.08642857142857142, 0.10778571428571428, 0.12914285714285714, 0.1505, 0.17185714285714285, 0.1932142857142857, 0.21457142857142855, 0.2359285714285714, 0.2572857142857143, 0.27864285714285714, 0.3], 'SVC__...6315789473684, 15.810526315789472, 16.857894736842105, 17.905263157894737, 18.95263157894737, 20.0]},
       pre_dispatch='2*n_jobs', refit=Tr

In [33]:
%%time
# Fit estimator on train set
grid_cv_svm_rbf.fit(X_tr, y_tr)

CPU times: user 58.9 s, sys: 703 ms, total: 59.7 s
Wall time: 20min 31s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=172, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc_rbf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc_rbf__C': [0.001, 0.03422222222222222, 0.06744444444444445, 0.10066666666666667, 0.1338888888888889, 0.1671111111111111, 0.20033333333333334, 0.23355555555555557, 0.2667777777777778, 0.3], 'svc_rbf__gamma': [0.1, 1.5142857142857145, 2.928571428571429, 4.3428571428571425, 5.757142857142857, 7.171428571428572, 8.585714285714285, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [36]:
import pandas as pd

# Collect results in a DataFrame
df_rbf = pd.DataFrame.from_items([
    ('C', grid_cv_svm_rbf.cv_results_['param_svc_rbf__C']),
    ('gamma', grid_cv_svm_rbf.cv_results_['param_svc_rbf__gamma']),
    ('mean_te', grid_cv_svm_rbf.cv_results_['mean_test_score']),
    ('std_te', grid_cv_svm_rbf.cv_results_['std_test_score'])
])
df_rbf.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,C,gamma,mean_te,std_te
0,0.001,0.1,0.697,0.008718
32,0.133889,0.1,0.697,0.006919
48,0.200333,0.1,0.697,0.007969
72,0.3,0.1,0.69675,0.007441
40,0.167111,0.1,0.69675,0.006052
24,0.100667,0.1,0.6965,0.007921
16,0.0674444,0.1,0.6965,0.007089
56,0.233556,0.1,0.6955,0.005948
8,0.0342222,0.1,0.695,0.006614
64,0.266778,0.1,0.6935,0.008419


In [38]:
best_rbf = df_rbf.sort_values(by='mean_te', ascending=False)[0:1]
print('RBF SVM - top accuracy across folds', best_rbf.iloc[0,2],
      '(std:)', best_rbf.iloc[0,3], ' with C:', best_rbf.iloc[0,0], ' and gamma:', best_rbf.iloc[0,1])

RBF SVM - top accuracy across folds 0.697 (std:) 0.00871779788708131  with C: 0.001  and gamma: 0.1


### Make predictions

In [49]:
# Compute predictions with the "best_estimator_" attribute
grid_cv.best_estimator_.predict(X_te)

# evaluate its accuracy on the test set
accuracy_linear = grid_cv.best_estimator_.score(X_te, y_te)
print ('Linear SVM accuracy on the test set: {:.3f}'.format(accuracy_linear))

Linear SVM accuracy on the test set: 0.809


In [40]:
# Compute predictions with the "best_estimator_" attribute
grid_cv_svm_rbf.best_estimator_.predict(X_te)

# evaluate its accuracy on the test set
accuracy = grid_cv_svm_rbf.best_estimator_.score(X_te, y_te)
print ('RBF SVM accuracy on the test set: {:.3f}'.format(accuracy))

RBF SVM accuracy on the test set: 0.706
