### Hyper-Parameter Tuning Methodology in Task A2 (Model 2)

This Jupyter Notebook shows the methodology used in task A2 to pick the best hyper-parameters for model 2. This model uses face landmarks (provided in lab 2) as features for a Support Vector Machine (SVM).

To observe the impact of the model's hyper-parameters, Grid Search Cross-Validation was performed over a variety of candidate values. This method carries out an exhaustive search over the given parameter settings to find the combination of parameters that performs best.

In [1]:
# Import statements
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GridSearchCV
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import glob, os
import matplotlib.pyplot as plt 
from matplotlib import image
import numpy as np
import time
import sys
sys.path.append("..")
import Datasets.LandmarksFT.landmarksA2 as landmarks

Using TensorFlow backend.


### Importing & pre-processing data

The steps taken when importing & pre-processing the data are the same as the ones performed in the final model in A2.py, and are described in the report.

In [2]:
def mainA2():
    '''
    Loads the landmark data, flattens every sample into a single
    feature vector and runs scaling + PCA on the result.

    Returns, in this order: the reduced training features, the
    reduced test features, the training labels and the test labels.
    '''
    train_raw, train_labels, test_raw, test_labels = landmark_computation()

    # Merge the two trailing axes of each sample into one feature axis
    train_width = train_raw.shape[1] * train_raw.shape[2]
    train_flat = train_raw.reshape(train_raw.shape[0], train_width)
    test_width = test_raw.shape[1] * test_raw.shape[2]
    test_flat = test_raw.reshape(test_raw.shape[0], test_width)

    reduced_train, reduced_test = dimensionality_reduction(train_flat, test_flat)
    return reduced_train, reduced_test, train_labels, test_labels

def landmark_computation(test_size=0.2, random_state=None):
    '''
    Extracts the landmark features with their labels and splits
    them into a training and a test set.

    test_size:    forwarded to train_test_split; the default of 0.2
                  reproduces the original hard-coded split.
    random_state: seed forwarded to train_test_split. The default
                  (None) keeps the original unseeded behaviour, so
                  every call yields a different split; pass an int
                  to make the split reproducible between runs.

    Returns (tr_data, tr_lbs, te_data, te_lbs). Note the order:
    both training items come first, then both test items.
    '''
    imgs, lbs = landmarks.extract_features_labels()
    tr_data, te_data, tr_lbs, te_lbs = train_test_split(
        imgs, lbs, test_size=test_size, random_state=random_state
    )
    return tr_data, tr_lbs, te_data, te_lbs

def dimensionality_reduction(train_dataset, test_dataset):
    '''
    Standardises both datasets and projects them with Principal
    Component Analysis (PCA).

    The scaler and the PCA are fitted on the training set only and
    are then applied to both sets. A start message and a finish
    message (with the elapsed time in minutes) are printed.

    Returns the transformed training set and the transformed test set.
    '''

    print("Dimensionality reduction started!")
    started_at = time.time()

    # Scaling statistics come from the training data alone
    scaler = StandardScaler().fit(train_dataset)
    scaled_train = scaler.transform(train_dataset)
    scaled_test = scaler.transform(test_dataset)

    # n_components='mle' lets PCA pick the number of components itself
    pca = PCA(n_components='mle', svd_solver='full').fit(scaled_train)
    reduced_train = pca.transform(scaled_train)
    reduced_test = pca.transform(scaled_test)

    elapsed_minutes = (time.time() - started_at) / 60
    print("Dimensionality reduction finished, it took: ", elapsed_minutes, " min")
    return reduced_train, reduced_test

In [3]:
# Build the PCA-reduced train/test features together with their labels
data_train, data_test, lbs_train, lbs_test = mainA2()

Dimensionality reduction started!
Dimensionality reduction finished, it took:  0.01735790173212687  min


### Grid Search Cross-Validation Implementation & Results

In [4]:
# Hyper-parameter grid handed to the grid search
param_dist = dict(
    # SVM kernel type
    kernel=('linear', 'rbf'),

    # Regularization parameter
    # (alternative range kept for reference: [30,50,70,100,150,200])
    C=[0.1, 0.3, 1, 3, 10, 30],

    # Kernel coefficient, relevant when the kernel is 'rbf'
    gamma=['scale', 0.001, 0.01, 0.1, 0.3, 1],

    # Single fixed seed, so it adds no extra combinations to the grid
    random_state=[42],
)

In [5]:
def report(results, n_top=3):
    '''
    Prints the best-ranked parameter settings of a search result.

    results: mapping with the entries 'rank_test_score',
             'mean_test_score', 'std_test_score' and 'params'
             (the function is called with GridSearchCV.cv_results_).
    n_top:   highest rank to print; ranks 1..n_top are listed and
             every candidate sharing a rank gets its own entry.
    '''

    score_line = "Mean validation score: {0:.3f} (std: {1:.3f})"

    for rank in range(1, n_top + 1):
        # Indices of all candidates holding this rank (ties included)
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print(score_line.format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [6]:
# Running Grid Search

clf = SVC()
# n_jobs=-1 spreads the independent candidate/fold fits over all CPU
# cores; only the wall time changes, not which parameters are evaluated.
grid_search = GridSearchCV(clf, param_grid=param_dist, cv=5, n_jobs=-1)

# perf_counter is a monotonic clock intended for measuring elapsed time
start = time.perf_counter()
grid_search.fit(data_train, lbs_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
    % (time.perf_counter() - start, len(grid_search.cv_results_['params'])))
print("")

# Print the three best-ranked parameter settings
report(grid_search.cv_results_)

GridSearchCV took 1495.18 seconds for 72 candidate parameter settings.

Model with rank: 1
Mean validation score: 0.896 (std: 0.012)
Parameters: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 42}

Model with rank: 2
Mean validation score: 0.893 (std: 0.011)
Parameters: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf', 'random_state': 42}

Model with rank: 3
Mean validation score: 0.893 (std: 0.012)
Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf', 'random_state': 42}

