In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from glob import glob
import os

from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')






from IPython.display import display, HTML

#for splitting data into training and testing
from sklearn.model_selection import train_test_split

#for Logistic Regression
from sklearn.linear_model import LogisticRegression
#for Support Vector Machine
from sklearn.svm import SVC
#for KNN
from sklearn.neighbors import KNeighborsClassifier
#for decision tree
from sklearn.tree import DecisionTreeClassifier
#for random forest and boosting
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#for LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#for metrics
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score

#PCA for Dimension Reduction
from sklearn.decomposition import PCA

#to perform cross validation
from sklearn.model_selection import GridSearchCV

# NOTE: 
1. **change to `subjects = range(1, 13)`** to loop through all the subjects
2. **In order to run:** Move series 7 and 8 for each subject into a new directory called `test`. The `train` directory should contain series 1 through 6 for each subject. Also ensure the file path is correct; currently it assumes the data is one directory up from this notebook.

### Functions

In [2]:
# read data csv
def read_data(fname):
    """Load one series' data CSV together with its matching events CSV.

    The events file name is derived from `fname` by replacing '_data'
    with '_events'. The 'id' column is dropped from both frames.

    Returns:
        (clean, labels): the data frame and the events frame, both without 'id'.
    """
    events_fname = fname.replace('_data', '_events')

    # Read both files first, then strip the identifier column from each.
    raw_frames = [pd.read_csv(path) for path in (fname, events_fname)]
    clean, labels = (frame.drop(columns=['id']) for frame in raw_frames)
    return clean, labels

# standardise features in preprocessesing
# Single module-level scaler: fitted on "train" calls and reused by all other calls.
scaler = StandardScaler()


def preprocess(X, t):
    """Standardise `X` with the shared module-level `scaler`.

    When `t` is "train" the scaler is (re)fitted on `X` and `X` is transformed;
    for any other value of `t` the scaler is only applied, so a "train" call
    must come first.
    """
    apply_scaler = scaler.fit_transform if t == "train" else scaler.transform
    return apply_scaler(X)

# train classifier
def myfit(X, y, model):
    """Fit `model` in place on (X, y) and hand the same object back.

    The return value of `model.fit` is ignored; the caller always receives
    the `model` instance it passed in.
    """
    model.fit(X, y)
    return model

# predict
# def predict(clf,X):
#     preds = clf.predict_proba(X)
#     return np.atleast_2d(preds[:,clf.classes_ == 1])


In [11]:
# Hyper-parameter grids handed to GridSearchCV as `param_grid`.
# Every value must be a list of candidate settings, including options that
# are fixed to a single value.

# Candidate lists shared by more than one grid.
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
Criterion_list = ['gini', 'entropy']

#for log Reg
Penalty_list = ['l1', 'l2']
# 'l1' is not supported by LogisticRegression's default solver, so pin a
# solver that accepts both penalties in this grid.
Solver_list = ['liblinear']
logReg_param_grid = {'penalty': Penalty_list, 'C': C_list, 'solver': Solver_list}

#for Linear SVM
Kernel_list = ['linear']
# probability must be wrapped in a list: a bare True is not a valid grid entry.
# It stays enabled because the training loop calls predict_proba.
svm_param_grid = {'C': C_list, 'kernel': Kernel_list, 'probability': [True]}

#KNN
K_list = [1, 3, 5, 7, 9]
Algorithm_list = ['auto', 'ball_tree', 'kd_tree', 'brute']
knn_param_grid = {'n_neighbors': K_list, 'algorithm': Algorithm_list}

#Decision Tree
D_list = [1, 2, 3, 4, 5]
decisionTree_param_grid = {'max_depth': D_list, 'criterion': Criterion_list}

#Random Forest
numberOfTree_list = [5, 10, 50, 100, 500, 1000, 5000, 10000]
randomForest_param_grid = {'n_estimators': numberOfTree_list, 'criterion': Criterion_list}

#AdaBoost
n_estimators_list = [10, 50, 100, 500]
learningRate_list = [0.0001, 0.001, 0.01, 0.1, 1.0]
adaBoost_param_grid = {'n_estimators': n_estimators_list, 'learning_rate': learningRate_list}

In [15]:
# Registry of classifiers to evaluate.
# Each entry is a 3-tuple: (display name, estimator instance, hyper-parameter
# grid used for cross-validation). Uncomment an entry to include it in the run.
models = [
    # ('Logistic Regression', LogisticRegression(), logReg_param_grid),
    # ('Support Vector Machine', SVC(), svm_param_grid),
    ('K-nearest Neighbors', KNeighborsClassifier(), knn_param_grid),
    # ('Decision Tree', DecisionTreeClassifier(), decisionTree_param_grid),
    # ('Random Forest', RandomForestClassifier(), randomForest_param_grid),
    # ('AdaBoost', AdaBoostClassifier(), adaBoost_param_grid),
    # ('LDA', LinearDiscriminantAnalysis()),  # NOTE: only two fields; the loops unpack three per entry
]
# print(models)

### Getting data and training and running classifiers

In [None]:
subsample = 100 # keep every 100th training row when fitting the grid search

subjects = range(1,2) # subjects to process
pred_tot = []         # per (subject, model): predicted positive-class probabilities on the test series
label_tot = []        # per (subject, model): the test labels those predictions are scored against
auc_tot = []          # per (subject, model): list of per-movement AUC scores
cols = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase','LiftOff', 'Replace', 'BothReleased'] # hand movements


def _load_series(file_pattern):
    """Read every series matching `file_pattern` and stack them into arrays.

    Files are processed in sorted order so that the strided subsample taken
    later does not depend on the order the filesystem happens to list them in.

    Returns:
        (X, y): numpy arrays of the concatenated data and event frames.

    Raises:
        FileNotFoundError: if no file matches the pattern.
    """
    files = sorted(glob(file_pattern))
    if not files:
        raise FileNotFoundError('No series files match %r' % file_pattern)
    frames = [read_data(f) for f in files]
    X = np.array(pd.concat([data for data, _ in frames]))
    y = np.array(pd.concat([labels for _, labels in frames]))
    return X, y


# looping through the subjects to get train and test data for each subject's series (trials)
# for each subject, train data is series 1 to 6, test data is series 7 and 8
for subject in subjects:

    X_train, y_train = _load_series('../train/subj%d_series*_data.csv' % (subject))
    X_test, y_test = _load_series('../test/subj%d_series*_data.csv' % (subject))

    # Standardise once per subject, outside the model loop: the scaler is fitted
    # on the training data only and then applied unchanged to the test data.
    X_train = preprocess(X_train, "train")
    X_test = preprocess(X_test, "test")

    for name, model, hyperparam in models:
        print(name)
        roc_prob = np.empty((X_test.shape[0], len(cols)))

        # one independent binary classifier per movement
        for i, movement in enumerate(cols):
            y = y_train[:,i]
            print('Training subject %d, class %s' % (subject, movement))

            # cross-validated grid search; refits on the subsample with the best parameters
            grid_search = GridSearchCV(estimator=model,param_grid=hyperparam,cv=10,scoring = 'roc_auc')
            grid_search.fit(X_train[::subsample,:], y[::subsample])

            # predict_proba has one column per class of this binary problem;
            # column 1 is the positive class, regardless of the movement index i.
            roc_prob[:,i] = grid_search.predict_proba(X_test)[:,1]

        # keep predictions together with the test labels they correspond to
        pred_tot.append(roc_prob)
        label_tot.append(y_test)

        # get auc score for each class for this subject
        auc = [roc_auc_score(y_test[:,i],roc_prob[:,i]) for i in range(len(cols))]

        # append all auc scores of the subjects
        auc_tot.append(auc)

        print("AUC of Subject ", subject, auc)
        print("\n")

K-nearest Neighbors
Training subject 1, class HandStart
Training subject 1, class FirstDigitTouch


# TODO
1. make df or something to store all the AUC arrays for each class for each subject
2. average over the subjects to get the average AUC score per class over all the subjects
3. use other classifiers
4. graphs

# NOTE: I am assuming that the data are in `X` and the targets are in `Y`

# LDA

In [None]:
# Fit LDA on (X, Y), then overwrite X with its LDA-transformed version.
# X and Y are not defined in this cell; they must already exist in the kernel.
lda = LinearDiscriminantAnalysis().fit(X, Y)
X = lda.transform(X)

# Hyperparameters

# Prepare Models

# Train models

In [None]:
%%time 
def train_model(result, name, model, hyperparam):
    """Grid-search `model` three times and append one summary row to `result`.

    Reads the notebook-level variables X_train, Y_train, X_test and Y_test.
    Each repeat runs a 10-fold GridSearchCV scored with 'roc_auc', then scores
    the fitted search on the training and the test data.

    The appended row is [name, best_params, train, test], where `best_params`
    comes from the last repeat and `train` / `test` are "mean±std" strings
    over the three repeats.
    """
    def _mean_pm_std(scores):
        # Render a list of scores as "<mean>±<std>".
        return str(np.mean(scores)) + u"\u00B1" + str(np.std(scores))

    train_scores, test_scores = [], []

    for _ in range(3):
        # cross-validate over the grid, then refit with the best combination
        search = GridSearchCV(estimator=model, param_grid=hyperparam, cv=10, scoring='roc_auc')
        search.fit(X_train, Y_train)
        best_hyperparam = search.best_params_

        train_scores.append(search.score(X_train, Y_train))
        test_scores.append(search.score(X_test, Y_test))

    result.append([name, best_hyperparam, _mean_pm_std(train_scores), _mean_pm_std(test_scores)])

# REAL MEAT

In [None]:
# Evaluate every registered model at several train/test proportions.
result = []                   # rows appended by train_model, grouped by train size in order
train_size = [0.2, 0.5, 0.8]  # passed to train_test_split as train_size

for size in train_size:
    # These four names are deliberately notebook-level: train_model reads them as globals.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=size)

    for name, model, hyperparam in models:
        train_model(result, name, model, hyperparam)
        

# Training/testing error visualization for 3 different train/test splits

In [None]:
# Build one results table from `result` and show it per training size.
# Each row of `result` is [name, best hyper-parameters, train score, test score].

# Display options must be set before the tables are rendered to have any effect.
pd.set_option('display.max_rows', 500000)
pd.set_option('display.max_columns', 5000000)
pd.set_option('display.width', 1000)

# split the rows into one list per table column
modelNameResult = [row[0] for row in result]
bestHyperparamaterResult = [row[1] for row in result]
trainingErrorResult = [row[2] for row in result]
testingErrorResult = [row[3] for row in result]

# The scores were produced with scoring='roc_auc', so label them as ROC AUC.
tempDataframe = pd.DataFrame({'Classifier':modelNameResult,
                          'Optimal Hyper-paramater':bestHyperparamaterResult,
                          'Training ROC AUC':trainingErrorResult,
                          'Testing ROC AUC':testingErrorResult })

# Rows were appended train size by train size (20%, 50%, 80%), so each size owns
# an equal-sized contiguous run of rows. Derive the run length from the data
# instead of hard-coding a model count, so no row is skipped.
N_TRAIN_SIZES = 3
rowsPerSplit = len(result) // N_TRAIN_SIZES

train20Dataframe = tempDataframe[0:rowsPerSplit]
train50Dataframe = tempDataframe[rowsPerSplit:2 * rowsPerSplit]
train80Dataframe = tempDataframe[2 * rowsPerSplit:3 * rowsPerSplit]

# `display` comes from IPython.display (imported in the first cell) and renders
# each frame as a rich table.
print("training size: 20%")
display(train20Dataframe)
print("training size: 50%")
display(train50Dataframe)
print("training size: 80%")
display(train80Dataframe)

