In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set()

In [2]:
# Load data
with np.load('cifar4-train.npz', allow_pickle=False) as data:
    X = data['overfeat']
    y = data['labels']
    
print(X.shape)
print(y.shape)

(5000, 4096)
(5000,)


In [3]:
# split data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, 
                                                    stratify=y, test_size=0.2)


In [4]:
# For both kernels (linear and RBF) we perform PCA for 
# 400 PC's (>95% explained variance) for computational efficiency.

from sklearn.decomposition import PCA
pca = PCA(n_components=400)

# transform data sets:
X_tr = pca.fit_transform(X_train)
X_te = pca.transform(X_test)

# Since SVM's measure distances/similarities, we should scale the data 
# before feeding them to the SVM classifier. After some inspection, we see
# that all features are already more or less in the same scale.

# SVM - Linear Kernel

In [5]:
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import ParameterGrid, cross_validate

linear_svc = LinearSVC() # for the linear kernel we use linearSVC because its faster

In [6]:
grid = ParameterGrid({
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
})

val_scores = []
for params_dict in grid:
    
    linear_svc.set_params(**params_dict)
    
    # cross validate uses stratified Kfold by default
    cv_results = cross_validate(linear_svc, X_tr, y_train, cv=5, n_jobs=-1,
                                return_train_score=False)
    
    params_dict['mean val accuracy'] = cv_results['test_score'].mean()
    params_dict['std of val accuracy'] = cv_results['test_score'].std()
    val_scores.append(params_dict)

In [7]:
scores_lin = pd.DataFrame(val_scores)
scores_lin = scores_lin.sort_values(by='mean val accuracy', ascending=False)
scores_lin

Unnamed: 0,C,mean val accuracy,std of val accuracy
1,0.001,0.84075,0.017776
0,0.0001,0.83725,0.01564
2,0.01,0.82425,0.012212
3,0.1,0.81125,0.011673
5,10.0,0.7925,0.010488
4,1.0,0.7905,0.009172
6,100.0,0.789,0.01404


In [8]:
# the best parameter C lies around 0.001. I do a refined search
# to see if we get better results

grid = ParameterGrid({
    'C': np.arange(0.0007, 0.0013, 0.0001)
})

val_scores = []
for params_dict in grid:
    
    linear_svc.set_params(**params_dict)
    
    # cross validate uses stratified Kfold by default
    cv_results = cross_validate(linear_svc, X_tr, y_train, cv=5, n_jobs=-1,
                                return_train_score=False)
    
    params_dict['mean val accuracy'] = cv_results['test_score'].mean()
    params_dict['std of val accuracy'] = cv_results['test_score'].std()
    val_scores.append(params_dict)

In [9]:
scores_lin = pd.DataFrame(val_scores)
scores_lin = scores_lin.sort_values(by='mean val accuracy', ascending=False)
scores_lin

# Indeed, I got slightly better validation scores.

Unnamed: 0,C,mean val accuracy,std of val accuracy
0,0.0007,0.84225,0.017219
1,0.0008,0.842,0.018003
2,0.0009,0.8415,0.018327
3,0.001,0.84075,0.017776
4,0.0011,0.84025,0.017878
5,0.0012,0.839,0.016945


# RBF Kernel

In [10]:
rbf_svc = SVC(kernel='rbf')

grid = ParameterGrid({
    'C': [0.01, 0.03, 0.1, 0.3, 1],
    'gamma': [0.01, 0.03, 0.1, 0.3, 1]
})

val_scores = []
for params_dict in grid:
    
    rbf_svc.set_params(**params_dict)
    
    # cross validate uses stratified Kfold by default
    cv_results = cross_validate(rbf_svc, X_tr, y_train, cv=5, n_jobs=-1,
                                return_train_score=False)
    
    params_dict['mean val accuracy'] = cv_results['test_score'].mean()
    params_dict['std of val accuracy'] = cv_results['test_score'].std()
    val_scores.append(params_dict)

In [11]:
scores_rbf = pd.DataFrame(val_scores)
scores_rbf = scores_rbf.sort_values(by='mean val accuracy', ascending=False)
scores_rbf.head(10)

Unnamed: 0,C,gamma,mean val accuracy,std of val accuracy
12,0.1,0.1,0.6915,0.011303
2,0.01,0.1,0.6915,0.011303
7,0.03,0.1,0.6915,0.011303
17,0.3,0.1,0.6915,0.011303
23,1.0,0.3,0.529,0.013167
18,0.3,0.3,0.529,0.013167
13,0.1,0.3,0.5285,0.013072
8,0.03,0.3,0.528,0.012762
3,0.01,0.3,0.52775,0.012684
20,1.0,0.01,0.284,0.012684


## validation scores for both models

In [12]:
print('Linear SVM - top ccuracy across folds: {:.4f} (std: {:.4f}) '
     'with C={}'.format(scores_lin.iloc[0,1], scores_lin.iloc[0,2], scores_lin.iloc[0,0]))

print('RBF SVM - top ccuracy across folds: {:.4f} (std: {:.4f}) '
     'with C={} and gamma={}.'.format(scores_rbf.iloc[0,2], scores_rbf.iloc[0,3], 
                                     scores_rbf.iloc[0,0], scores_rbf.iloc[0,1]))

Linear SVM - top ccuracy across folds: 0.8422 (std: 0.0172) with C=0.0007
RBF SVM - top ccuracy across folds: 0.6915 (std: 0.0113) with C=0.1 and gamma=0.1.


## Test scores

In [13]:
linear_svc = LinearSVC(C=0.0008)
linear_svc.fit(X_tr, y_train)
lin_score = linear_svc.score(X_te, y_test)

rbf_svc = SVC(kernel='rbf', C=0.1, gamma=0.1)
rbf_svc.fit(X_tr, y_train)
rbf_score = rbf_svc.score(X_te, y_test)

print('Linear SVM accuracy (test set): {:.4f}'.format(lin_score))
print('RBF SVM accuracy (test set): {:.4f}'.format(rbf_score))

Linear SVM accuracy (test set): 0.8280
RBF SVM accuracy (test set): 0.7020
