In [1]:
#Load the usual suspects

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [2]:
# Load the data

with np.load('/Users/David/Desktop/EPFL Applied ML/cifar4-train.npz', allow_pickle=False) as npz_file:
    pixels = npz_file['pixels'].astype('float32')
    overfeat = npz_file['overfeat']
    labels = npz_file['labels']
    names = npz_file['names']
    allow = npz_file['allow_pickle']
    
print('Pixels : {:}'.format(pixels.shape),
      'Overfeat: {:}'.format(overfeat.shape),
      'Labels: {:}'.format(labels.shape),
      'Names: {:}'.format(names.shape), 
      'Allow pickle: {:}'.format(allow.shape))

Pixels : (5000, 3072) Overfeat: (5000, 4096) Labels: (5000,) Names: (4,) Allow pickle: ()


In [3]:
# Rename the data and split into train (4000) and test (1000) sets
X_ov = overfeat
X_px = pixels
y = labels

X_ov_tr, X_ov_te, X_px_tr, X_px_te, y_tr, y_te = train_test_split(X_ov, X_px, y, test_size=1000, stratify=y, random_state=0)

# Double check dimensions for the overfeat data:
print('Train set: {:}'.format(X_ov_tr.shape), 'Test set: {:}'.format(X_ov_te.shape))

Train set: (4000, 4096) Test set: (1000, 4096)


In [4]:
# Create a support vector machine classifier using a pipeline:

svm_pipe = Pipeline([
    ('scaler', StandardScaler()), # scaling step - helps with algorithm convergence, especially if gradient descent is used
    ('pca', PCA(n_components=176)), # pca step with 176 to get 90% of the proportion of variance explained. It also reduces dimensionality, which increases algorithm speed and accuracy
    ('svm', SVC()) # leave default settings to be able to create a custom parameter grid
])

# Create a list with the iterable values to feed the svm
C = [0.1, 1, 10] # regularization parameter
gamma = [0.01, 0.1, 1, 10, 100, 1000] # rbf distribution strength

# Create cross-validation object with 5 a stratified cross-validation strategy and custom parameter grid. 
grid_svm = GridSearchCV(svm_pipe, [{
    'svm__kernel': ['rbf'], # include an rbf kernel with tunable C and gamma paremeters
    'svm__C' : C,
    'svm__gamma' : gamma},
    {'svm__kernel': ['linear'], # include a linear kernel with tunable C parameter
    'svm__C' : C
}], cv=5, return_train_score=True, n_jobs=-1)
# in this case, the choice of SVC VS SVClinear or SGD with hinge loss is just a matter of code simplification, as it allows to run a pipeline only once being able to save on preprocessing. 
# To save also time, a custom grid search is defined to separate between linear and rbf kernels. They could have been defined altogether calling kernel = ['rbf', 'linear'] and keeping C and gamma as inputs ('linear' kernel ignores the gamma parameter anyway), but it would have meant fitting the same linear kernel several times, thus wasting time.

# Fit the cross-validation grid
grid_svm.fit(X_ov_tr, y_tr)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=176, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('svm', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'svm__kernel': ['rbf'], 'svm__C': [0.1, 1, 10], 'svm__gamma': [0.01, 0.1, 1, 10, 100, 1000]}, {'svm__kernel': ['linear'], 'svm__C': [0.1, 1, 10]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [24]:
# Put everything nicely into a dataframe and print the best accuracy across the five folds for the rbf kernel:
svm_rbf = pd.DataFrame(grid_svm.cv_results_)[pd.DataFrame(grid_svm.cv_results_)['param_svm__kernel'] == 'rbf'][['param_svm__kernel', 'param_svm__C', 'param_svm__gamma', 'mean_test_score', 'std_test_score']] # create a dataframe from the results dictionary, selecting the columns corresponding to the mean and std deviation of the test accuracy, and rows corresponding to the rbf kernel
svm_rbf.columns = ['kernel', 'C', 'gamma', 'mean test accuracy', 'standard deviation test accuracy'] # column names

print('Top rbf accuracy on training set across 5 folds: {:.5f}'.format(svm_rbf['mean test accuracy'].max()), 
      '(std: {:.5f})'.format(svm_rbf['standard deviation test accuracy'][svm_rbf['mean test accuracy'].idxmax()]),
      'for C: {:.1f}'.format(svm_rbf['C'][svm_rbf['mean test accuracy'].idxmax()]),
      'and gamma: {:.3f}'.format(svm_rbf['gamma'][svm_rbf['mean test accuracy'].idxmax()]))

svm_rbf.drop(columns=['kernel']).sort_values(by='mean test accuracy', ascending=False) # dataframe showing all the rbf svm cases

Top rbf accuracy on training set across 5 folds: 0.68875 (std: 0.01214) for C: 0.1 and gamma: 0.100


Unnamed: 0,C,gamma,mean test accuracy,standard deviation test accuracy
1,0.1,0.1,0.68875,0.012145
12,10.0,0.01,0.275,0.005184
6,1.0,0.01,0.27475,0.009199
2,0.1,1.0,0.2515,0.000935
14,10.0,1.0,0.2515,0.000935
8,1.0,1.0,0.2515,0.000935
0,0.1,0.01,0.25025,0.0005
7,1.0,0.1,0.25025,0.0005
13,10.0,0.1,0.25025,0.0005
16,10.0,100.0,0.25,0.0


In [25]:
# Put everything nicely into a dataframe and print the best accuracy across the five folds for the linear kernel:
svm_lin = pd.DataFrame(grid_svm.cv_results_)[pd.DataFrame(grid_svm.cv_results_)['param_svm__kernel'] == 'linear'][['param_svm__kernel', 'param_svm__C', 'mean_test_score', 'std_test_score']] # create a dataframe from the results dictionary, selecting the columns corresponding to the mean and std deviation of the test accuracy, and rows corresponding to the linear kernel
svm_lin.columns = ['kernel', 'C', 'mean test accuracy', 'standard deviation test accuracy'] # column names

print('Top linear accuracy on training set across 5 folds: {:.5f}'.format(svm_lin['mean test accuracy'].max()),
      '(std: {:.5f})'.format(svm_lin['standard deviation test accuracy'][svm_lin['mean test accuracy'].idxmax()]),
      'for C: {:.1f}'.format(svm_lin['C'][svm_lin['mean test accuracy'].idxmax()]))

svm_lin.drop(columns=['kernel']).sort_values(by='mean test accuracy', ascending=False) # dataframe showing all the linear svm cases

Top linear accuracy on training set across 5 folds: 0.80050 (std: 0.00545) for C: 0.1


Unnamed: 0,C,mean test accuracy,standard deviation test accuracy
18,0.1,0.8005,0.005454
20,10.0,0.79675,0.012465
19,1.0,0.794,0.013024


In [26]:
# Finally, fit your (tuned) rbf and linear estimators on the entire train set with 4,000 data points and evaluate them on the test set. Print the accuracy values.

# For the rbf kernel
rbf_pipe = svm_pipe.set_params(svm__kernel='rbf',
                               svm__C=svm_rbf['C'][svm_rbf['mean test accuracy'].idxmax()],
                               svm__gamma=svm_rbf['gamma'][svm_rbf['mean test accuracy'].idxmax()]).fit(X_ov_tr, y_tr)

print('RBF SVM accuracy (test set): {:.5f}'.format(rbf_pipe.score(X_ov_te, y_te)))

# For the linear kernel
lin_pipe = svm_pipe.set_params(svm__kernel='linear',
                               svm__C=svm_lin['C'][svm_lin['mean test accuracy'].idxmax()]).fit(X_ov_tr, y_tr)

print('Linear SVM accuracy (test set): {:.5f}'.format(lin_pipe.score(X_ov_te, y_te)))

RBF SVM accuracy (test set): 0.71000
Linear SVM accuracy (test set): 0.80600
