In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from glob import glob
import os

from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')



#for splitting data into training and testing
from sklearn.model_selection import train_test_split

#for Logistic Regression
from sklearn.linear_model import LogisticRegression
#for Support Vector Machine
from sklearn.svm import SVC
#for random forest and boosting
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#for LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#for metrics
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score

#PCA for Dimension Reduction
from sklearn.decomposition import PCA

#to perform cross validation
from sklearn.model_selection import GridSearchCV

# NOTE: 
1. **Change `subjects` to `range(1, 13)`** in the training cell to loop through all 12 subjects (it is currently `range(1, 2)`, i.e. subject 1 only).
2. **In order to run:** move series 7 and 8 for each subject into a new directory called `test`; the `train` directory should contain series 1 through 6 for each subject. Also make sure the file paths are correct: the notebook currently assumes the `train` and `test` directories sit one level above this notebook (`../train`, `../test`).

### Functions

In [2]:
# read data csv
def read_data(fname):
    """Read one series' data CSV together with its matching events CSV.

    The events file is expected next to the data file, with ``_data`` in the
    file name replaced by ``_events``. Only the file name is rewritten, so a
    directory whose name happens to contain ``_data`` is left untouched.

    Parameters
    ----------
    fname : str
        Path to a ``*_data*`` CSV file.

    Returns
    -------
    clean : pandas.DataFrame
        Contents of ``fname`` without the ``id`` column.
    labels : pandas.DataFrame
        Contents of the events file without the ``id`` column.

    Raises
    ------
    ValueError
        If the file name does not contain ``_data``; without it the events
        path would equal the data path and the data would be returned as its
        own labels.
    """
    directory, basename = os.path.split(fname)
    if '_data' not in basename:
        raise ValueError("expected '_data' in the file name, got %r" % fname)
    # rewrite the file name only, never the directory part of the path
    events_fname = os.path.join(directory, basename.replace('_data', '_events'))

    data = pd.read_csv(fname)
    labels = pd.read_csv(events_fname)

    # the id column is a row identifier, not a feature or a label
    clean = data.drop(columns=['id'])
    labels = labels.drop(columns=['id'])
    return clean, labels

# Shared scaler: fitted by a "train" call to preprocess and reused, unchanged,
# by every other call.
scaler = StandardScaler()


def preprocess(X, t):
    """Standardise ``X`` with the module-level ``scaler``.

    When ``t == "train"`` the scaler is (re)fitted on ``X`` before
    transforming it. For any other value of ``t`` the scaler is applied as it
    currently stands, so a "train" call has to come first.
    """
    apply_scaler = scaler.fit_transform if t == "train" else scaler.transform
    return apply_scaler(X)

# train classifier
def fit(X,y):
    """Fit a default-configured LogisticRegression on (X, y) and return it."""
    # fit() returns the estimator itself, so construction and training chain
    return LogisticRegression().fit(X, y)

# predict
def predict(clf,X):
    """Return the predicted probability of class ``1`` for every row of X.

    The result is always 2-D: the column(s) of ``clf.predict_proba(X)`` whose
    entry in ``clf.classes_`` equals 1.
    """
    probabilities = clf.predict_proba(X)
    # boolean mask over the classifier's classes, true where the label is 1
    is_positive = clf.classes_ == 1
    return np.atleast_2d(probabilities[:, is_positive])


### Getting data and training and running classifiers

In [6]:
subsample = 100 # keep every 100th training row to speed up fitting

subjects = range(1,2) # subjects to process; use range(1, 13) for all of them
pred_tot = []         # per-subject test-set predictions
label_tot = []        # per-subject test-set labels (row-aligned with pred_tot)
auc_tot = []          # per-subject list of per-class AUC scores
cols = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase','LiftOff', 'Replace', 'BothReleased'] # hand movements


def _load_series(pattern):
    """Load every data/events file pair matching ``pattern`` and stack them.

    Files are read in sorted order so the row order, and therefore the
    ``[::subsample]`` selection below, does not depend on the OS directory
    listing.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        Row-wise concatenation of the feature frames and of the label frames.

    Raises
    ------
    ValueError
        If no file matches ``pattern``.
    """
    files = sorted(glob(pattern))
    if not files:
        raise ValueError('No files match %r - check the train/test directory layout' % pattern)
    pairs = [read_data(f) for f in files]
    X = np.array(pd.concat([data for data, _ in pairs]))
    y = np.array(pd.concat([labels for _, labels in pairs]))
    return X, y


# for each subject, train data is series 1 to 6, test data is series 7 and 8
for subject in subjects:

    X_train, y_train = _load_series('../train/subj%d_series*_data.csv' % (subject))
    X_test, y_test = _load_series('../test/subj%d_series*_data.csv' % (subject))

    # scaling statistics come from the training rows only and are reused for the test rows
    X_train = preprocess(X_train, "train")
    X_test = preprocess(X_test, "test")

    # one binary classifier per movement; the same estimator object is refitted each time
    lr = LogisticRegression()
    pred = np.empty((X_test.shape[0], len(cols)))

    for i, movement in enumerate(cols):
        y = y_train[:,i]
        print('Training subject %d, class %s' % (subject, movement))
        lr.fit(X_train[::subsample,:], y[::subsample])
        pred[:,i] = lr.predict_proba(X_test)[:,1]

    # store the predictions together with the labels they are scored against
    # (the test labels, not the training labels)
    pred_tot.append(pred)
    label_tot.append(y_test)

    # get auc score for each class for this subject
    auc = [roc_auc_score(y_test[:,i], pred[:,i]) for i in range(len(cols))]

    # append all auc scores of the subjects
    auc_tot.append(auc)

    print("\033[1m AUC of Subject ", subject, auc)
    print("\n")

Training subject 1, class HandStart
Training subject 1, class FirstDigitTouch
Training subject 1, class BothStartLoadPhase
Training subject 1, class LiftOff
Training subject 1, class Replace
Training subject 1, class BothReleased
[1m AUC of Subject  1 [0.7429219447931625, 0.7010134876469049, 0.7154205634718444, 0.778910967829658, 0.8570135564967338, 0.8113825798169321]




# TODO
1. Build a DataFrame (or similar structure) to store the per-class AUC scores for each subject
2. Average over the subjects to get the mean AUC score per class across all subjects
3. Try other classifiers
4. Add graphs

# LDA

In [None]:
# Fit LDA on (X, Y) and replace X with its LDA projection.
# NOTE(review): X and Y are not defined in any cell visible here - confirm which
# cell creates them before relying on Restart & Run All.
# NOTE(review): the projection is fitted on all of X before the train/test split
# in the following cell, so held-out rows influence it - confirm this is intended.
# Re-running this cell projects the already-projected X again.
lda = LinearDiscriminantAnalysis()
X = lda.fit(X, Y).transform(X)

In [None]:
 # Split X/Y into train and test parts; `size` is the training share.
 # NOTE(review): `size` is not defined in any cell visible here - confirm its source.
 # A fixed random_state keeps the split, and every score computed from it,
 # identical across kernel restarts.
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=size, random_state=0)

# Logistic Regression

In [None]:
%%time 

# Hyperparameters
Penalty_list = [ 'l1', 'l2','elasticnet']
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# 'elasticnet' also needs an l1_ratio, so it gets its own sub-grid
# (0.5 = equal mix of the l1 and l2 penalties).
param_grid = [
    {'penalty': [p for p in Penalty_list if p != 'elasticnet'], 'C': C_list},
    {'penalty': ['elasticnet'], 'C': C_list, 'l1_ratio': [0.5]},
]

# The default lbfgs solver only supports the l2 penalty, which made every
# 'l1' and 'elasticnet' candidate fail and score NaN. saga supports all three;
# max_iter is raised because saga usually needs more iterations to converge.
model = LogisticRegression(solver='saga', max_iter=1000)
grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=10,scoring = 'f1')
# runs the 10-fold CV and then retrains on all of X_train with the best setting
grid_search.fit(X_train,Y_train)

# Find optimal hyper-parameter
best_hyperparam = grid_search.best_params_

# 1 - mean cross-validated F1 per candidate (scoring is 'f1', so this is not
# a misclassification rate)
cross_val_errors = (1- grid_search.cv_results_['mean_test_score'])
# fraction of training rows the refitted best model gets wrong
train_error = np.mean(grid_search.best_estimator_.predict(X_train) != Y_train)

# fraction of held-out rows the refitted best model gets wrong
test_error = np.mean(grid_search.best_estimator_.predict(X_test) != Y_test)

print(cross_val_errors)
print(best_hyperparam)
print(1-train_error)   # training accuracy
print(1-test_error)    # test accuracy

# Linear SVM

In [None]:
%%time 

# Search space: regularisation strength C for a linear-kernel SVM
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
Kernel_list = ['linear']
param_grid = dict(C=C_list, kernel=Kernel_list)

# 10-fold CV scored by F1; fit() also retrains on all of X_train with the best setting
model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='f1')
grid_search.fit(X_train, Y_train)

best_hyperparam = grid_search.best_params_
best_model = grid_search.best_estimator_

# 1 - mean cross-validated F1 for every candidate in the grid
cross_val_errors = 1 - grid_search.cv_results_['mean_test_score']

# share of rows the refitted best model misclassifies, on train then on test
train_mismatch = best_model.predict(X_train) != Y_train
train_error = sum(train_mismatch) / len(Y_train)
test_mismatch = best_model.predict(X_test) != Y_test
test_error = sum(test_mismatch) / len(Y_test)

# CV errors, chosen hyperparameters, training accuracy, test accuracy
for result in (cross_val_errors, best_hyperparam, 1 - train_error, 1 - test_error):
    print(result)

# Random Forest

In [None]:
%%time 

# Hyperparameters
# NOTE: 8 forest sizes x 2 criteria x 10 folds = 160 fits, the largest with
# 10000 trees each - expect this cell to be slow.
numberOfTree_list = [5,10,50,100,500,1000,5000,10000]
Criterion_list = ['gini', 'entropy']

param_grid = {'n_estimators': numberOfTree_list, 'criterion': Criterion_list}
# Random forests are stochastic; a fixed random_state makes the CV scores and
# the selected hyperparameters reproducible across runs.
model = RandomForestClassifier(random_state=0)
grid_search = GridSearchCV(estimator=model,param_grid=param_grid,cv=10,scoring = 'f1')
# runs the 10-fold CV and then retrains on all of X_train with the best setting
grid_search.fit(X_train,Y_train)

# Find optimal hyper-parameter
best_hyperparam = grid_search.best_params_

# 1 - mean cross-validated F1 per candidate (scoring is 'f1', so this is not
# a misclassification rate)
cross_val_errors = (1- grid_search.cv_results_['mean_test_score'])
# fraction of training rows the refitted best model gets wrong
train_error = np.mean(grid_search.best_estimator_.predict(X_train) != Y_train)

# fraction of held-out rows the refitted best model gets wrong
test_error = np.mean(grid_search.best_estimator_.predict(X_test) != Y_test)

print(cross_val_errors)
print(best_hyperparam)
print(1-train_error)   # training accuracy
print(1-test_error)    # test accuracy

# AdaBoost

In [None]:
%%time 

# Search space: ensemble size and learning rate for AdaBoost
n_estimators_list = [10,50,100,500]
learningRate_list = [0.0001,0.001,0.01,0.1,1.0]
param_grid = dict(n_estimators=n_estimators_list, learning_rate=learningRate_list)

# 10-fold CV scored by F1; fit() also retrains on all of X_train with the best setting
model = AdaBoostClassifier()
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='f1')
grid_search.fit(X_train, Y_train)

best_hyperparam = grid_search.best_params_
best_model = grid_search.best_estimator_

# 1 - mean cross-validated F1 for every candidate in the grid
cross_val_errors = 1 - grid_search.cv_results_['mean_test_score']

# share of rows the refitted best model misclassifies, on train then on test
train_mismatch = best_model.predict(X_train) != Y_train
train_error = sum(train_mismatch) / len(Y_train)
test_mismatch = best_model.predict(X_test) != Y_test
test_error = sum(test_mismatch) / len(Y_test)

# CV errors, chosen hyperparameters, training accuracy, test accuracy
for result in (cross_val_errors, best_hyperparam, 1 - train_error, 1 - test_error):
    print(result)