# Homework #3 

Matriculation number: A0153347A
Email: e0025553@nus.edu.sg

### General Notes about this assignment 

Data:
X_train.npy: training data numpy array. Each row in the array corresponds to an image unrolled into a vector (50 x 37 = 1850 dimensions)

y_train.npy: labels (0-6) of the training data; each label corresponds to the image in the same row of X_train.npy

X_test.npy: testing data numpy array for evaluation of your models. The prediction outputs on this test data are to be submitted for the scoreboard

Ideas:
- multiclass SVM (see Multiclass classification)
- kNN with PCA 
- CNN  
- Viola-Jones face detector


### Files included with this submission

My submission contains this IPython notebook file with the code and a PDF with my essay.

## Building and evaluating models

In [None]:
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd 
%matplotlib inline

from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from time import time
import os

# Default figure size (width, height) used by every plot in this notebook,
# chosen so the figures are large enough for online display
pl.rcParams["figure.figsize"] = 12.0, 10.0

In [None]:
# Load the dataset from the working directory, in this order:
# training images (xin), training labels (yin) and test images (xout)

xin, yin, xout = (
    np.load(path)
    for path in ("./X_train.npy", "./y_train.npy", "./X_test.npy")
)

In [None]:
def dimReduction(n_components, X_train):
    """Project X_train onto its top principal components.

    Fits a whitened PCA (randomized SVD solver) on X_train, transforms
    X_train with it, and prints progress messages plus the elapsed time.

    Args:
        n_components: number of principal components to keep.
        X_train: 2-D array with one sample per row.

    Returns:
        The transformed data returned by ``pca.transform(X_train)``.
    """
    t0 = time()
    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    pca = PCA(n_components=n_components, svd_solver='randomized',
              whiten=True).fit(X_train)

    # The components used to be reshaped into 50x37 "eigenface" images that
    # were never used; that reshape also made the function fail on any input
    # whose rows were not exactly 1850 values long, so it has been dropped.

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_pca = pca.transform(X_train)
    print("done in %0.3fs" % (time() - t0))
    return X_pca

In [None]:
def svmModel(ker, xin,yin, c, g, d, coef):
    """Train two SVC classifiers that differ only in decision_function_shape.

    Args:
        ker: kernel passed to ``svm.SVC``.
        xin: training samples.
        yin: training labels.
        c: value for the SVC ``C`` parameter.
        g: value for the SVC ``gamma`` parameter.
        d: value for the SVC ``degree`` parameter.
        coef: value for the SVC ``coef0`` parameter.

    Returns:
        Tuple ``(ovo_model, ovr_model)`` of fitted classifiers.
    """
    # NOTE(review): the scikit-learn docs say SVC always trains one-vs-one
    # internally and only reshapes the decision function for 'ovr', so the
    # two models may predict identically -- confirm before relying on a
    # difference between them.
    shared_settings = dict(kernel=ker, C=c, gamma=g, degree=d, coef0=coef)
    fitted = []
    for shape in ('ovo', 'ovr'):
        classifier = svm.SVC(decision_function_shape=shape, **shared_settings)
        classifier.fit(xin, yin)
        fitted.append(classifier)
    return tuple(fitted)

In [None]:
def svmPredict(model,x,y):
    """Return the weighted F1 score of ``model``'s predictions on ``x``.

    Args:
        model: fitted estimator exposing ``predict``.
        x: samples to classify.
        y: true labels for ``x``.
    """
    return f1_score(y, model.predict(x), average='weighted')

In [None]:
def createSubmission(filename, y):
    """Write predictions to ``filename`` as a two-column CSV submission.

    The file starts with the header ``ImageId,PredictedClass`` followed by one
    ``index,label`` line per element of ``y``, with indices starting at 0.
    An existing file is overwritten.

    Args:
        filename: path of the CSV file to create.
        y: predicted labels; any iterable works (numpy array, list, ...).
    """
    # The context manager closes the file even if a write fails part-way.
    with open(filename, 'w') as fo:
        fo.write("ImageId,PredictedClass\n")
        fo.writelines("%d,%s\n" % (i, label) for i, label in enumerate(y))

In [None]:
def clean(filename):
    """Delete ``filename`` so a fresh log can be started.

    A file that does not exist is not an error: this is called before every
    tuning run, including the very first one when no log has been written yet.

    Args:
        filename: path of the file to remove.
    """
    try:
        os.remove(filename)
    except FileNotFoundError:
        # Nothing to clean up.
        pass

In [None]:
def appendFile(filename, text):
    """Append ``text`` to ``filename``, creating the file if it is missing.

    Args:
        filename: path of the file to append to.
        text: string written verbatim (no newline is added).
    """
    # The context manager closes the file even if the write fails.
    with open(filename, 'a') as fo:
        fo.write(text)

In [None]:
def evaluateModel(kern, C, gamma, degree, coef, xin, yin, xout, yout, filename):
    """Train OVO and OVR SVMs, score them on held-out data and log the result.

    Args:
        kern: SVC kernel.
        C: SVC ``C`` value.
        gamma: SVC ``gamma`` value.
        degree: SVC ``degree`` value.
        coef: SVC ``coef0`` value.
        xin: training samples.
        yin: training labels.
        xout: evaluation samples.
        yout: true labels for ``xout``.
        filename: log file the settings and both scores are appended to.

    Returns:
        Tuple ``(scoreO, scoreR)``: the weighted F1 scores of the
        one-vs-one and one-vs-rest models on ``xout``.
    """
    modelO, modelR= svmModel(kern, xin, yin, C, gamma, degree, coef)
    appendFile(filename,"Kernel= "+ str(kern) + " C = "+str(C)+" and gamma= "+str(gamma)+
               " degree= "+str(degree)+" coef= "+ str(coef)+"\n")
    scoreO= svmPredict(modelO, xout, yout)
    scoreR= svmPredict(modelR, xout, yout)
    # The second line used to be labelled "OVO" as well, which made the two
    # scores indistinguishable in the log; the metric is F1, not "t1".
    appendFile(filename, "OVO model f1 score: "+str(scoreO)+"\n")
    appendFile(filename, "OVR model f1 score: "+str(scoreR)+"\n"+"\n")
    return (scoreO,scoreR)

In [None]:
def tuneParams(kernel, cost, gamma,degree, coef, dim, filename):
    """Grid-search SVM settings and PCA sizes with 4-fold stratified CV.

    The training data is read from the module-level ``xin`` / ``yin``. Every
    combination of the supplied values is trained and scored on each fold
    through ``evaluateModel``, which appends the per-fold scores to
    ``filename``. For the one-vs-one and the one-vs-rest model separately,
    the combination with the highest score averaged over the folds is
    printed and appended to ``filename`` together with the elapsed time.

    Args:
        kernel: iterable of SVC kernels to try.
        cost: iterable of SVC ``C`` values to try.
        gamma: iterable of SVC ``gamma`` values to try.
        degree: iterable of SVC ``degree`` values to try.
        coef: iterable of SVC ``coef0`` values to try.
        dim: iterable of PCA component counts to try.
        filename: log file that results are appended to.

    The reported parameter tuples are ordered
    ``(kernel, C, gamma, dim, degree, coef)``.
    """
    from itertools import product

    skf = StratifiedKFold(n_splits=4)
    t0= time()

    # The PCA projection depends only on the component count, not on the fold
    # or on any SVM setting, so compute it once per count instead of refitting
    # it inside the innermost loop.
    # NOTE(review): PCA is fitted on all of xin, including the rows that are
    # later held out, so the scores are somewhat optimistic. Fitting per
    # training fold would require every dim value to fit within a fold.
    dims = list(dim)
    projections = []
    for n_components in dims:
        pca = PCA(n_components=n_components)
        pca.fit(xin)
        projections.append(pca.transform(xin))

    # The dim index varies fastest, matching the original evaluation order.
    grid = list(product(kernel, cost, gamma, degree, coef, range(len(dims))))
    fold_scores = {combo: [] for combo in grid}

    for train, test in skf.split(xin, yin):
        for combo in grid:
            kern, c, g, deg, cf, h = combo
            reduced = projections[h]
            fold_scores[combo].append(
                evaluateModel(kern, c, g, deg, cf,
                              reduced[train], yin[train],
                              reduced[test], yin[test], filename))

    # Rank combinations by their mean score over the folds; the maximum of a
    # single fold would favour lucky splits.
    resultO= 0.0
    resultR= 0.0
    paramsO= ('null', 0.0,0.0, 0, 0, 0.0)
    paramsR= ('null', 0.0,0.0, 0, 0, 0.0)
    for combo in grid:
        kern, c, g, deg, cf, h = combo
        params = (kern, c, g, dims[h], deg, cf)
        meanO, meanR = np.mean(fold_scores[combo], axis=0)
        if resultO < meanO:
            resultO = meanO
            paramsO = params
        if resultR < meanR:
            resultR = meanR
            paramsR = params

    elapsed = time() - t0
    print("Best obtained for O: "+str(resultO) + " with "+ str(paramsO))
    print("Best obtained for R: " +str(resultR) + " with " +str(paramsR))
    print("Time "+ str(elapsed))
    appendFile(filename,"Best obtained for O: "+str(resultO) + " with "+ str(paramsO)+"\n")
    appendFile(filename,"Best obtained for R: " +str(resultR) + " with " +str(paramsR)+"\n")
    appendFile(filename,"Time "+ str(elapsed)+"\n")

In [None]:
# Hyper-parameter grids for the tuning cells.
kernel = ['linear', 'rbf', 'poly', 'sigmoid']
C = np.array([1, 1e-1, 1e-3, 1e-4])
gamma = np.array([1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6])
degree = np.arange(7)
coef = np.array([0, 1, 10, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5])
# PCA component counts to try
dim = np.array([966, 400, 350, 250, 200, 150, 100, 50])
# Single-value grid passed for parameters that a given run does not vary
empty = np.array([1])

In [None]:
# Linear kernel: only C and the PCA size are varied; gamma, degree and coef0
# each get a single value.
clean("svmLinear.txt")
tuneParams(kernel[:1], C, gamma[:1], empty, empty, dim, "svmLinear.txt")

In [None]:
# RBF kernel: C, gamma and the PCA size are varied; degree and coef0 each get
# a single value.
clean("svmRBF.txt")
tuneParams(kernel[1:2], C, gamma, empty, empty, dim, "svmRBF.txt")

In [None]:
# Polynomial kernel: every grid (C, gamma, degree, coef0, PCA size) is varied.
clean("svmPoly.txt")
tuneParams(kernel[2:3], C, gamma, degree, coef, dim, "svmPoly.txt")

In [None]:
# Sigmoid kernel: C, gamma, coef0 and the PCA size are varied; degree gets a
# single value.
clean("svmSigmoid.txt")
tuneParams(kernel[3:4], C, gamma, empty, coef, dim, "svmSigmoid.txt")

## Statement of Individual Work

Please initial (between the square brackets) one of the following statements.

[FP] I, A0153447A , certify that I have followed the CS 3244 Machine Learning class guidelines for homework assignments.  In particular, I expressly vow that I have followed the Facebook rule in discussing with others in doing the assignment and did not take notes (digital or printed) from the discussions.  

I suggest that I should be graded as follows:

100 marks

### References

I have referred to the following list of people and websites in preparing my homework submission:

http://docs.scipy.org/doc/numpy/reference/,
http://stackoverflow.com/,
http://matplotlib.org/api/pyplot_summary.html,
Textbook: Learning From Data,
Lecture slides