In [25]:
# Scientific libraries
import numpy as np
import pandas as pd
from numpy import linalg
from numpy import random
import scipy
from scipy.spatial.distance import pdist, cdist, squareform
from scipy.stats import mode

# cvxopt QP solver
import cvxopt
from cvxopt import solvers, matrix
solvers.options['show_progress'] = False # Silence the cvxopt solver's progress output

from itertools import product


import matplotlib.pyplot as plt
%matplotlib inline


# Import data_augmentation and SVM class
import sys
sys.path.append('src')

from data_augmentation import *
from SVM import *


In [21]:
def submit_solution(y_pred, filename='Yte.csv'):
    """Write predicted labels to a submission CSV with columns ``Id`` and ``Prediction``.

    Parameters
    ----------
    y_pred : array-like
        One predicted label per test sample, given as a 1-D sequence or an
        ``(n, 1)`` column. Values are cast to ``int``.
    filename : str, optional
        Destination path of the CSV file. Defaults to ``'Yte.csv'``.

    Raises
    ------
    ValueError
        If ``y_pred`` does not hold exactly one value per sample, e.g. when a
        2-D feature matrix is passed instead of the predictions.
    """
    labels = np.asarray(y_pred)
    # Reject scalars and matrices early: silently writing features instead of
    # labels would produce an invalid submission.
    if labels.ndim == 0 or labels.size != labels.shape[0]:
        raise ValueError(
            'y_pred must contain one label per sample, got shape %s' % (labels.shape,)
        )
    labels = labels.reshape(-1)

    # Ids are 1-based, in the same order as the predictions.
    df = pd.DataFrame({
        'Id': np.arange(1, labels.shape[0] + 1),
        'Prediction': labels.astype(int),
    })
    df.to_csv(filename, index=False)

In [2]:
# Read the train/test feature tables (no header row, first 3072 columns only)
# and the training labels.
df_X_train, df_X_test = (
    pd.read_csv(csv_path, header=None, usecols=np.arange(3072))
    for csv_path in ('data/Xtr.csv', 'data/Xte.csv')
)
df_y_train = pd.read_csv('data/Ytr.csv')


In [3]:
# Splitting train, val, test

n_train = 4500
n_val = 500

# Convert each table to a float array once and slice it, instead of
# rebuilding the full array for every split.
X_all = np.array(df_X_train, dtype=float)
y_all = np.array(df_y_train['Prediction'], dtype=float)

# Training rows are taken from the head and validation rows from the tail.
# If the file held fewer than n_train + n_val rows the two slices would
# overlap and leak training samples into the validation set.
assert X_all.shape[0] == y_all.shape[0], 'features and labels have different lengths'
assert n_train + n_val <= X_all.shape[0], 'train and validation slices would overlap'

X_train, y_train = X_all[:n_train], y_all[:n_train]
X_val, y_val = X_all[-n_val:], y_all[-n_val:]
X_test = np.array(df_X_test, dtype=float)

In [None]:
# Flip the images of every split with flip_lr (imported from data_augmentation.py)

X_train_flip, X_val_flip, X_test_flip = (
    flip_lr(images) for images in (X_train, X_val, X_test)
)

In [None]:
# Data Preprocessing

# HOG transform with 12 bins, window size 8 and step size 2 (might take 20 min)
hog_train, hog_val, hog_test = (
    histogram_of_gradients(images, 12, 8, 2)
    for images in (X_train, X_val, X_test)
)

# Same transform applied to the flipped images
hog_train_flip, hog_val_flip, hog_test_flip = (
    histogram_of_gradients(images, 12, 8, 2)
    for images in (X_train_flip, X_val_flip, X_test_flip)
)

In [4]:
# Alternative to recomputing: read the HOG features back from saved .npy files
(hog_train, hog_val, hog_test,
 hog_train_flip, hog_val_flip, hog_test_flip) = map(np.load, (
    'hog12_train.npy',
    'hog12_val.npy',
    'hog12_test.npy',
    'hog12_train_flip.npy',
    'hog12_val_flip.npy',
    'hog12_test_flip.npy',
))

In [27]:
%%time
# Fit the SVM with the tuned hyperparameters in OVA mode (might take 5 min)
param = dict(C=4, kernel='rbf', gamma=0.008, loss='hinge', mode='OVA')
svm = SVM(**param)
svm.fit(hog_train, y_train)

rbf choosen
CPU times: user 59.2 s, sys: 88 ms, total: 59.3 s
Wall time: 52.2 s


In [28]:
# Predict on the validation features and report the fraction of correct labels
pred = svm.predict(hog_val)
accuracy = np.mean(y_val == pred)
print('Accuracy: ', accuracy)

Accuracy:  0.61


In [None]:
# Prediction on the test set and saving the Yte.csv file
test_pred = svm.predict(hog_test)
# Submit the predicted labels, not the HOG feature matrix they were computed from
submit_solution(test_pred)


In [24]:
# For information hyperparameters were tuned using this function from SVM.py

def tune_parameters(X, y, X_val, y_val, param_grid, n_train, X_test = None, verbose = True):
    """Grid-search SVM2 hyperparameters against a fixed validation set.

    For every parameter combination an ``SVM2`` model is fitted on a random
    subset of ``(X, y)`` drawn without replacement, then scored by accuracy
    on ``(X_val, y_val)``. A fresh subset is drawn for each combination.

    Parameters
    ----------
    X, y : arrays
        Pool of training samples and their labels (same number of rows).
    X_val, y_val : arrays
        Validation samples and labels. They must be fixed beforehand: if the
        validation set were split off at random and a flipped copy of a
        training image ended up in it, the accuracy would be abnormally high.
    param_grid : dict or list of dict
        Maps each ``SVM2`` keyword argument to the list of values to try,
        e.g. ``{'loss': ['hinge']}``. A bare string or scalar value is
        treated as a one-element list. A list of such dicts is searched one
        grid after the other.
    n_train : int or float
        Size of the training subset. Values ``<= 1`` are read as a fraction
        of the pool (so ``1`` means the whole pool); larger values are an
        absolute number of samples.
    X_test : array, optional
        If given, each fitted model also predicts on it.
    verbose : bool, optional
        Print each parameter combination with its validation score.

    Returns
    -------
    dict
        Keys ``'scores'``, ``'preds'``, ``'estimators'`` and ``'preds_test'``.
        Each is a dict keyed by ``str(params)`` holding, respectively, the
        validation accuracy, the validation predictions, the fitted model's
        ``alphas_`` and the test predictions (empty when ``X_test`` is None).

    Raises
    ------
    ValueError
        If ``X``/``y`` or ``X_val``/``y_val`` differ in length, or if
        ``n_train`` does not select between 1 and ``len(X)`` samples.
    """
    n_total = X.shape[0]
    if n_total != y.shape[0]:
        raise ValueError('X and y have different size')
    # A length mismatch would otherwise surface as a meaningless score
    # (or a cryptic broadcasting error) when comparing predictions to labels.
    if len(X_val) != len(y_val):
        raise ValueError('X_val and y_val have different size')

    # n_train is either a fraction of the pool or an absolute sample count.
    if n_train <= 1:
        n_fit = int(n_train * n_total)
    else:
        n_fit = int(n_train)
    if not 1 <= n_fit <= n_total:
        raise ValueError(
            'n_train selects %d samples, expected between 1 and %d' % (n_fit, n_total)
        )

    # Accept a single grid (mapping) or a sequence of grids.
    grids = [param_grid] if hasattr(param_grid, 'items') else list(param_grid)

    # Storing results
    scores = {}
    preds = {}
    preds_test = {}
    estimators = {}
    for grid in grids:
        # Sort the keys for a reproducible enumeration order.
        items = sorted(grid.items())
        keys = [key for key, _ in items]
        # Wrap bare strings/scalars so that e.g. 'hinge' is one candidate
        # rather than five single-character candidates.
        candidates = [
            [value] if isinstance(value, str) or not hasattr(value, '__iter__') else list(value)
            for _, value in items
        ]
        for combination in product(*candidates):
            params = dict(zip(keys, combination))
            label = str(params)
            model = SVM2(**params)

            # Draw the training subset for this combination.
            idx_train = random.choice(np.arange(n_total), n_fit, replace=False)

            # Fitting and storing results
            model.fit(X[idx_train], y[idx_train])
            pred = model.predict(X_val)
            score = np.mean(pred == y_val)
            estimators[label] = model.alphas_
            preds[label] = pred
            scores[label] = score

            if X_test is not None:
                preds_test[label] = model.predict(X_test)

            if verbose:
                print(params)
                print('SCORE : ', score)

    return {'scores' : scores, 'preds' : preds, 'estimators' : estimators, 'preds_test' : preds_test}