In [58]:
# -*- coding: utf-8 -*-

"""
Uses the model to predict known values in order to evaluate the model (with AUC scoring).

@author(s): Martin Guy

Last modified: July 9, 2016

"""

'\nUses the model to predict known values in order to evaluate the model (with AUC scoring).\n\n@author(s): Martin Guy\n\nLast modified: July 9, 2016\n\n'

In [59]:
#imports
import numpy as np
import pandas as pd

from pandas import DataFrame

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics, cross_validation

from glob import glob

import matplotlib.pyplot as plt

In [60]:
# Experiment-wide parameters
subjects = range(1, 13)  # subject identifiers 1 to 12
training_size = 0.55     # training fraction (process() takes its own training_size argument)

# Class names, in the order used to index the label rows and the per-class classifiers
cols = [
    'HandStart',
    'FirstDigitTouch',
    'BothStartLoadPhase',
    'LiftOff',
    'Replace',
    'BothReleased',
]

In [78]:
#Process on a fixed subject, classifier and training size
#returns the average auc score of the subject
def process(subject,clf, training_size, show=False, export=True, verbose=False):
    """
    Train one binary classifier per class for a single subject and score it with ROC AUC.

    The already-filtered signal of the subject is read from
    "train_filtered/filtered_<subject>.npz" ('arr_0' holds the features, 'arr_1' the
    labels; both are transposed before splitting). The data is split into a training
    and a test part, six classifiers of the requested kind are fitted (one per entry
    of `cols`), and each one is evaluated on the test part.

    Parameters
    ----------
    subject : int
        Subject number, used to build the input file name and the output names.
    clf : str
        One of "LogisticRegression", "GaussianNB" or "6Neighbor".
    training_size : float
        Fraction of the examples used for training; the test fraction is
        1.0 - training_size.
    show : bool
        Display the ROC figure with plt.show().
    export : bool
        Save the ROC figure under 'evaluation/' and append the average score to
        'evaluation/avg_auc.txt'.
    verbose : bool
        Print progress messages.

    Returns
    -------
    float
        The AUC averaged over the six classes, or 0 (after printing an error
        message) when `clf` is not a supported classifier name.
    """
    n_classes = 6  # one binary classifier per class name in `cols`

    ################ Select classifiers ########################################
    # Done before any file access so that an unsupported name fails fast.
    clf_name = clf
    clf_names = ["LogisticRegression", "GaussianNB", "6Neighbor"]

    if clf == "LogisticRegression":
        clf_list = [LogisticRegression() for _ in range(n_classes)]
        clf_name_info = "Logistic Regression"
    elif clf == "GaussianNB":
        clf_list = [GaussianNB() for _ in range(n_classes)]
        clf_name_info = "Gaussian Naive Bayesian"
    elif clf == "6Neighbor":
        clf_list = [KNeighborsClassifier(6) for _ in range(n_classes)]
        # Was assigned to a misspelled name, which left the label empty in the
        # plot title and in avg_auc.txt.
        clf_name_info = "6-Nearest Neighbors"
    else:
        print("Error: you must select an appropriate classifier :", clf_names)
        return 0

    ################ Load data ########################################
    if verbose:
        print("# Begin process of subject %d" % (subject))
        print("Chargement du signal d'entrainement...")

    #We use here already filtered signals.
    #Signals were filtered using Alexandre Barachant's "Beat the benchmark" code
    filename = "train_filtered/filtered_"+str(subject)+".npz"
    datas = np.load(filename)
    feattr = datas['arr_0']
    labels = datas['arr_1']

    if verbose:
        print("Done")

    # random_state is fixed so that the split is reproducible between runs
    X_train, X_test, labels_train, labels_test = cross_validation.train_test_split(feattr.T, labels.T, test_size=(1.0-training_size), random_state=0)

    nbTrain = len(X_train)

    ################ Train classifiers ########################################
    for i in range(n_classes):
        if verbose:
            print('Train subject %d, class %s(%d)' % (subject, cols[i], i))
        clf_list[i].fit(X_train,labels_train.T[i])

    ################ Evaluate classifier ########################################
    plotting = export or show  # the curves are only needed when a figure is produced
    avg_auc = 0 #average auc

    #for each class
    for i in range(n_classes):
        # probability of the positive class
        preds = clf_list[i].predict_proba(X_test)[:,1]
        fpr, tpr, _ = metrics.roc_curve(labels_test.T[i], preds)
        auc = metrics.auc(fpr,tpr)
        avg_auc += auc

        if plotting:
            label_name = cols[i] + " - AUC : " + str(auc)
            plt.plot(fpr,tpr, label=label_name, lw=2)

    avg_auc /= n_classes

    if verbose:
        print("Average auc :", avg_auc)

    ################ Plot Configuration ########################################
    if plotting:
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")

        title = "Subject " + str(subject) + " ROC, average auc : " + str(avg_auc) + " using " + clf_name_info + " and learning " +\
        str(nbTrain) + " examples"

        plt.title(title)
        plt.plot([0,1],[0,1])  # diagonal reference line

        fig = plt.gcf()
        fig.set_size_inches(15.5, 10.5)
        plt.legend(loc=4, borderaxespad=0.)

    ################ Plot printing ########################################
    if export:
        savepath = 'evaluation/'+ clf_name +'_subject_'+str(subject) +'_different_classifier_' + str(training_size) + '.png'
        plt.savefig(savepath)
    if show:
        plt.show()
    # always reset the current figure so successive calls do not accumulate curves
    plt.clf()

    ################ Score writing ########################################
    if export:
        with open('evaluation/avg_auc.txt', 'a') as f:
            f.write('(' + str(subject) + ')' + clf_name_info + " : " + str(avg_auc) + " (learning = " + str(nbTrain) + ")\n")

    return avg_auc

# Number of subjects; used as the row count of the score matrices built below
nbSubjects = len(subjects)

def process_all_tr(name,training_size_list, verbose = False):
    """
    Evaluate one classifier on every subject for each requested training size.

    `name` is the classifier name forwarded to process(); `training_size_list`
    holds the training fractions to try. When `verbose` is true a banner is
    printed for each training size (process() itself is always run silently
    and without exporting anything).

    Returns a (nbSubjects, len(training_size_list)) array whose entry
    [subject-1][k] is the average AUC returned by process() for that subject
    and the k-th training size.
    """
    scores = np.zeros((nbSubjects, len(training_size_list)))

    for column, rate in enumerate(training_size_list):
        if verbose:
            print("\n**** Begin with training rate : %0.2f ****" % (rate))

        # subjects are numbered from 1, rows of the matrix from 0
        for subject in subjects:
            row = subject - 1
            scores[row][column] = process(subject, name, training_size = rate, export = False, verbose=False)

    return scores



def arrondi(val):
    """Return `val` formatted as a string with three digits after the decimal point."""
    pattern = "%.3f"
    return pattern % val

#Do not forget to use:
#\usepackage{array}

#\begin{tabular}{|l|c|r|}
#  \hline
#  colonne 1 & colonne 2 & colonne 3 \\
#  \hline
#  1.1 & 1.2 & 1.3 \\
#  2.1 & 2.2 & 2.3 \\
#  \hline
#\end{tabular}

def exportToLatex(results, colonnes, filename, clfname, mode='w'):
    """
    Write a score matrix as a LaTeX tabular into 'export/<filename>.txt'.

    One table line is written per row of `results`, labelled 1, 2, ... in row
    order. Every score is formatted with arrondi() and the best score of each
    row is wrapped in a LaTeX textbf command (scores that tie after rounding
    are all emphasised).

    Parameters
    ----------
    results : 2-D sequence of numbers
        Scores, one row per subject and one column per training size.
        It is only read, never modified.
    colonnes : sequence of float
        Training sizes as fractions; written as percentages in the header line.
    filename : str
        Base name of the output file (the 'export/' directory is not created here).
    clfname : str
        Classifier name written in the first line of the file.
    mode : str
        Mode passed to open(), e.g. 'w' to overwrite or 'a' to append.
    """
    # Format into a separate list of strings instead of assigning back into
    # `results`: the caller's data stays untouched and no lazy `map` object is
    # stored into an array row.
    formatted_rows = [[arrondi(value) for value in row] for row in results]

    def writeLine(numSubject, f):
        """Write the table line of row `numSubject` (0-based) to the open file `f`."""
        cells = formatted_rows[numSubject]
        # The best score is chosen among the rounded values; an empty row has none.
        best = max(cells, key=float) if cells else None

        f.write(str(numSubject+1))
        for cell in cells:
            f.write(" & ")
            if cell == best:
                f.write("\\textbf{" + cell + "}")
            else:
                f.write(cell)
        f.write(" \\\\ \n")
        f.write("\\hline \n")

    with open('export/'+filename+'.txt', mode) as f:
        f.write("Results for "+clfname+"\n\n")

        f.write("\\begin{tabular}{|")
        # one column for the subject number plus one per training size
        f.write("c|"*(len(colonnes)+1) + "}\n")
        f.write("\\hline \n")

        f.write("Subjects/Training size(\\%)")

        for col in colonnes:
            f.write(" & ")
            # round() before int(): 0.29*100 is 28.999999999999996 in floating
            # point and would otherwise be truncated to 28
            f.write(str(int(round(col*100))))
        f.write(" \\\\ \n")

        f.write("\\hline \n")

        for numSubject in range(len(formatted_rows)):
            writeLine(numSubject, f)

        f.write("\\end{tabular}")
        

def exportToNPZ(filename, results):
    """Save `results` as a compressed archive at 'export/<filename>.npz'."""
    target = "".join(["export/", filename, ".npz"])
    np.savez_compressed(target, results)

In [None]:
# Training fractions to evaluate; each one becomes a column of the score matrices
training_size_list = [0.05, 0.10, 0.25, 0.50]

# Classifier names passed to process_all_tr(), and the export file name used for
# each of them (both lists are indexed together in the export cell)
clf_names = ["6Neighbor"]
latexFilenames = ['6Neighbor']
# Alternative selection covering every classifier name accepted by process():
#clf_names = ["LogisticRegression", "GaussianNB", "6Neighbor"]

nbClassifier = len(clf_names)
n = len(training_size_list)


# results[classifier, subject, training size]; np.empty leaves it uninitialised,
# every classifier slice is assigned in the loop below
results = np.empty((nbClassifier, nbSubjects, n))

# Run the whole evaluation for each selected classifier
for (i,clf) in enumerate(clf_names):
    print("Starting with %s" % (clf))
    results[i] = process_all_tr(clf, training_size_list, verbose = True)
    print("")
print("Done")
    

Starting with 6Neighbor

**** Begin with training rate : 0.05 ****


In [None]:

# Export the score matrix of each classifier twice: as a LaTeX table
# (export/<name>.txt) and as a compressed numpy archive (export/<name>.npz)
for i in range(nbClassifier):
    exportToLatex(results[i],training_size_list, latexFilenames[i], clf_names[i])
    exportToNPZ(latexFilenames[i], results[i])

In [59]:
# Column labels are the training sizes, row labels are the subject numbers
colonnes = training_size_list
lignes = subjects

# One DataFrame per classifier, in the same order as clf_names
df_array = []

for i in range(nbClassifier):
    df_array.append(pd.DataFrame(results[i], index=lignes, columns=colonnes))
    





In [60]:
# Display the score table of the first classifier (rows: subjects, columns: training sizes)
print("Results for ", clf_names[0])
df_array[0]

Results for  LogisticRegression


Unnamed: 0,0.01,0.02,0.03,0.04,0.05
1,0.761778,0.766674,0.769916,0.771192,0.772545
2,0.698553,0.705801,0.709761,0.7118,0.713977
3,0.685777,0.693347,0.697662,0.699113,0.700601
4,0.815895,0.81986,0.821326,0.822059,0.822913
5,0.699108,0.706284,0.71031,0.711814,0.712819
6,0.678286,0.689552,0.693962,0.695973,0.696979
7,0.773746,0.782086,0.786129,0.787994,0.789903
8,0.793802,0.796224,0.797722,0.798432,0.798819
9,0.690122,0.692788,0.693744,0.694724,0.694648
10,0.732787,0.736358,0.737762,0.739009,0.739721


In [61]:
# Display the score table of the second classifier.
# NOTE(review): this needs at least two entries in clf_names and df_array, while the
# run cell currently sets clf_names = ["6Neighbor"] only - confirm which
# classifier list this cell is meant to be executed with.
print("Results for ", clf_names[1])
df_array[1]

Results for  GaussianNB


Unnamed: 0,0.01,0.02,0.03,0.04,0.05
1,0.780274,0.781843,0.782289,0.78201,0.78272
2,0.727767,0.7269,0.727107,0.726944,0.725785
3,0.700206,0.702529,0.703325,0.703582,0.703465
4,0.821848,0.823131,0.823002,0.823847,0.823803
5,0.715015,0.71366,0.713203,0.713049,0.712685
6,0.700827,0.702089,0.702736,0.704917,0.704253
7,0.781035,0.782244,0.782291,0.781915,0.782519
8,0.805163,0.805249,0.805321,0.806124,0.806227
9,0.685313,0.69,0.692095,0.692504,0.692707
10,0.739375,0.739707,0.740322,0.740931,0.741466


In [62]:
# Save the complete results array (all classifiers) as a compressed .npz archive.
# NOTE(review): the file name is hard-coded (training sizes 0.01 to 0.05, LR and
# GaussianNB) and does not follow training_size_list / clf_names - confirm it
# matches the run being saved before executing.
np.savez_compressed("evaluation/resultats (0.01to0.05_LR_and_GaussianNB).npz", results)