In [1]:
import numpy as np
import os

# Load the data
with np.load('cifar4-train.npz', allow_pickle=False) as npz_file:
    # Load items into a dictionary
    cifar = dict(npz_file.items())

print(cifar.keys())

pixels = cifar['pixels']
overfeat = cifar['overfeat']
labels = cifar['labels']
names = cifar['names']

print('pixels shape :',pixels.shape, ', dtype:', pixels.dtype)
print('overfeat shape :',overfeat.shape, ', dtype:', overfeat.dtype)
print('labels shape :',labels.shape, ', dtype:', labels.dtype)
print('Categories:', names)

# split the Overfeat data into train/test sets w/ same proportion of classes in each subset

import pandas as pd
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    overfeat, labels, train_size=4000, test_size=1000, 
    stratify=labels , # same class distribution
    random_state=0)

print('Train:', X_tr.shape, y_tr.shape)
print('Test:', X_te.shape, y_te.shape)

dict_keys(['pixels', 'overfeat', 'labels', 'names', 'allow_pickle'])
pixels shape : (5000, 3072) , dtype: uint8
overfeat shape : (5000, 4096) , dtype: float32
labels shape : (5000,) , dtype: int64
Categories: ['truck' 'car' 'airplane' 'ship']
Train: (4000, 4096) (4000,)
Test: (1000, 4096) (1000,)


### SVM classifier with a linear kernel

In [2]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold

# Create decision tree classifier
pipe = Pipeline([
    #('scaler', StandardScaler()), #  Standardization
    ('pca', PCA(n_components=172)), # PCA preprocessing, retain 90% of the variance explained.
    ('linear_svc', LinearSVC()) # SVM with linear kernel
])

# grid search with cross validation
from sklearn.model_selection import GridSearchCV

# Create cross-validation object, with stratified split strategy
grid_cv = GridSearchCV(pipe, {
    #'scaler': [None,StandardScaler()],
    #'pca' : [None, PCA(n_components=172)],
    'linear_svc__C': list(np.linspace(0.0001,0.3,15)) # list of C values
    }, cv=StratifiedKFold(n_splits=5), # stratified folds: each set contains approx the same % of target class
                       n_jobs = -1,verbose = 5) 

In [3]:
%%time
# Fit estimator on train set
grid_cv.fit(X_tr, y_tr)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV] linear_svc__C=0.0001 ............................................
[CV] linear_svc__C=0.0001 ............................................
[CV] linear_svc__C=0.0001 ............................................
[CV] linear_svc__C=0.0001 ............................................
[CV] ............... linear_svc__C=0.0001, score=0.8225, total=   6.1s
[CV] linear_svc__C=0.0001 ............................................
[CV] .............. linear_svc__C=0.0001, score=0.82375, total=   6.9s
[CV] linear_svc__C=0.02152142857142857 ...............................
[CV] .............. linear_svc__C=0.0001, score=0.84125, total=   7.0s
[CV] .............. linear_svc__C=0.0001, score=0.80375, total=   6.8s
[CV] linear_svc__C=0.02152142857142857 ...............................
[CV] linear_svc__C=0.02152142857142857 ...............................
[CV] .............. linear_svc__C=0.0001, score=0.83875, total=   5.6s
[CV] linear_svc_

[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   28.2s


[CV] ... linear_svc__C=0.042942857142857144, score=0.81, total=  10.2s
[CV] linear_svc__C=0.042942857142857144 ..............................
[CV] . linear_svc__C=0.042942857142857144, score=0.8425, total=  10.7s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV]  linear_svc__C=0.042942857142857144, score=0.85125, total=  10.8s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV]  linear_svc__C=0.042942857142857144, score=0.81125, total=   9.5s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV] . linear_svc__C=0.042942857142857144, score=0.8425, total=   9.5s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV] . linear_svc__C=0.06436428571428572, score=0.80625, total=   9.2s
[CV] linear_svc__C=0.06436428571428572 ...............................
[CV] . linear_svc__C=0.06436428571428572, score=0.83375, total=  10.3s
[CV] linear_svc__C=0.08578571428571428 ...............................
[CV] .

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.0min


[CV] linear_svc__C=0.2785785714285714 ................................
[CV] . linear_svc__C=0.25715714285714286, score=0.81875, total=  12.4s
[CV] linear_svc__C=0.2785785714285714 ................................
[CV] ... linear_svc__C=0.2785785714285714, score=0.7975, total=  12.3s
[CV] linear_svc__C=0.2785785714285714 ................................
[CV] ... linear_svc__C=0.2785785714285714, score=0.8225, total=  12.5s
[CV] linear_svc__C=0.3 ...............................................
[CV] .. linear_svc__C=0.2785785714285714, score=0.82875, total=  12.4s
[CV] linear_svc__C=0.3 ...............................................
[CV] .. linear_svc__C=0.2785785714285714, score=0.81875, total=   9.9s
[CV] linear_svc__C=0.3 ...............................................
[CV] .. linear_svc__C=0.2785785714285714, score=0.81625, total=  10.2s
[CV] linear_svc__C=0.3 ...............................................
[CV] .................. linear_svc__C=0.3, score=0.8125, total=  10.0s
[CV] l

[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  3.5min finished


CPU times: user 18.3 s, sys: 516 ms, total: 18.8 s
Wall time: 3min 37s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=172, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linear_svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'linear_svc__C': [0.0001, 0.02152142857142857, 0.042942857142857144, 0.06436428571428572, 0.08578571428571428, 0.10720714285714285, 0.12862857142857143, 0.15005, 0.17147142857142855, 0.19289285714285712, 0.21431428571428568, 0.23573571428571427, 0.25715714285714286, 0.2785785714285714, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=5)

In [4]:
import pandas as pd

# Collect results in a DataFrame
df = pd.DataFrame.from_items([
    ('C', grid_cv.cv_results_['param_linear_svc__C']),
    ('mean_te', grid_cv.cv_results_['mean_test_score']),
    ('std_te', grid_cv.cv_results_['std_test_score'])
])
df.sort_values(by='mean_te', ascending=False)

Unnamed: 0,C,mean_te,std_te
1,0.0215214,0.8335,0.017436
2,0.0429429,0.8315,0.017346
6,0.128629,0.829,0.012585
5,0.107207,0.8285,0.016035
10,0.214314,0.82775,0.01819
4,0.0857857,0.82725,0.01958
7,0.15005,0.82725,0.013072
0,0.0001,0.826,0.013472
8,0.171471,0.825,0.016583
3,0.0643643,0.824,0.014967


In [5]:
best = df.sort_values(by='mean_te', ascending=False)[0:1]
print('Linear SVM - top accuracy across folds {:.3f}'.format(best.iloc[0,1]),
      'std: {:.3f}'.format(best.iloc[0,2]), ' with C: {:.3f}'.format(best.iloc[0,0]))


Linear SVM - top accuracy across folds 0.834 std: 0.017  with C: 0.022


### SVM classifier with an RBF kernel

In [6]:
from sklearn.svm import SVC

# Create decision tree classifier
pipe = Pipeline([
    #('scaler', StandardScaler()), #  Standardization
    ('pca', PCA(n_components=172)), # PCA preprocessing, retain 90% of the variance explained.
    ('svc_rbf', SVC(kernel='rbf')) # SVM with RBF kernel
])

# grid search with cross validation
from sklearn.model_selection import GridSearchCV

# Create cross-validation object
grid_cv_svm_rbf = GridSearchCV(pipe, {
    'svc_rbf__C': list(np.linspace(0.001,0.3,10)), # list of C values
    'svc_rbf__gamma' : list(np.linspace(0.1,10,8)) # list of gamma values
    }, cv=StratifiedKFold(n_splits=5), # stratified folds: each set contains approx the same % of target class
                            n_jobs = -1) 

print(grid_cv.get_params)

<bound method BaseEstimator.get_params of GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=172, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('linear_svc', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'linear_svc__C': [0.0001, 0.02152142857142857, 0.042942857142857144, 0.06436428571428572, 0.08578571428571428, 0.10720714285714285, 0.12862857142857143, 0.15005, 0.17147142857142855, 0.19289285714285712, 0.21431428571428568, 0.23573571428571427, 0.25715714285714286, 0.2785785714285714, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       sco

In [11]:
%%time
# Fit estimator on train set
grid_cv_svm_rbf.fit(X_tr, y_tr)

CPU times: user 59.2 s, sys: 698 ms, total: 59.9 s
Wall time: 21min 13s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=172, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svc_rbf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc_rbf__C': [0.001, 0.03422222222222222, 0.06744444444444445, 0.10066666666666667, 0.1338888888888889, 0.1671111111111111, 0.20033333333333334, 0.23355555555555557, 0.2667777777777778, 0.3], 'svc_rbf__gamma': [0.1, 1.5142857142857145, 2.928571428571429, 4.3428571428571425, 5.757142857142857, 7.171428571428572, 8.585714285714285, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True

In [12]:
import pandas as pd

# Collect results in a DataFrame
df_rbf = pd.DataFrame.from_items([
    ('C', grid_cv_svm_rbf.cv_results_['param_svc_rbf__C']),
    ('gamma', grid_cv_svm_rbf.cv_results_['param_svc_rbf__gamma']),
    ('mean_te', grid_cv_svm_rbf.cv_results_['mean_test_score']),
    ('std_te', grid_cv_svm_rbf.cv_results_['std_test_score'])
])
df_rbf.sort_values(by='mean_te', ascending=False).head(10)

Unnamed: 0,C,gamma,mean_te,std_te
72,0.3,0.1,0.6965,0.007599
56,0.233556,0.1,0.6965,0.006773
24,0.100667,0.1,0.6965,0.007517
48,0.200333,0.1,0.696,0.007802
40,0.167111,0.1,0.69575,0.005339
8,0.0342222,0.1,0.6955,0.006052
32,0.133889,0.1,0.69525,0.01029
0,0.001,0.1,0.69475,0.005612
64,0.266778,0.1,0.6945,0.006643
16,0.0674444,0.1,0.6945,0.007185


In [13]:
best_rbf = df_rbf.sort_values(by='mean_te', ascending=False)[0:1]
print('RBF SVM - top accuracy across folds', best_rbf.iloc[0,2],
      '(std:)', best_rbf.iloc[0,3], ' with C:', best_rbf.iloc[0,0], ' and gamma:', best_rbf.iloc[0,1])

RBF SVM - top accuracy across folds 0.6965 (std:) 0.007599342076785336  with C: 0.3  and gamma: 0.1


### Compute predictions

In [14]:
# Compute predictions with the "best_estimator_" attribute
grid_cv.best_estimator_.predict(X_te)

# evaluate its accuracy on the test set
accuracy_linearSVM = grid_cv.best_estimator_.score(X_te, y_te)
print ('Linear SVM accuracy on the test set: {:.3f}'.format(accuracy_linearSVM))

# save results
import csv
results = ['Linear SVM', accuracy_linearSVM]
with open(r'results.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(results)

Linear SVM accuracy on the test set: 0.815


In [15]:
# Compute predictions with the "best_estimator_" attribute
grid_cv_svm_rbf.best_estimator_.predict(X_te)

# evaluate its accuracy on the test set
accuracy_RBFSVM = grid_cv_svm_rbf.best_estimator_.score(X_te, y_te)
print ('RBF SVM accuracy on the test set: {:.3f}'.format(accuracy_RBFSVM))

# save results
import csv
results = ['RBF SVM', accuracy_RBFSVM]
with open(r'results.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(results)

RBF SVM accuracy on the test set: 0.708
