In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from glob import glob
import os

from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings('ignore')

from IPython.display import display, HTML

#for spliting data into traning and testing
from sklearn.model_selection import train_test_split

#for Logistic Regression
from sklearn.linear_model import LogisticRegression
#for Support Vector Machine
from sklearn.svm import SVC
#for KNN
from sklearn.neighbors import KNeighborsClassifier
#for decision tree
from sklearn.tree import DecisionTreeClassifier
#for random forest and boosting
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
#for LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#for metrics
from sklearn.metrics import accuracy_score, roc_auc_score,f1_score

#PCA for Dimension Reduction
from sklearn.decomposition import PCA

#to perform cross validation
from sklearn.model_selection import GridSearchCV

from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

# NOTE: 
1. **In order to run:** Move series 7 and 8 for each subject into new directory called `test`. The `train` directory should contain series 1 through 6 for each subject. Also ensure file path for the data is correct for your machine.

### Functions

In [3]:
# read data csv
def read_data(fname):
    """ read and prepare training data """
    # Read data
    data = pd.read_csv(fname)
    # events file
    events_fname = fname.replace('_data','_events')
    # read event file
    labels = pd.read_csv(events_fname)
    clean = data.drop(['id' ], axis=1)
    labels =labels.drop(['id' ], axis=1) 
    return  clean,labels

# standardise features in preprocessesing
scaler= StandardScaler()
def preprocess(X, t):
    if (t == "train"):
        X_prep = scaler.fit_transform(X)
    else:
         X_prep = scaler.transform(X)
    return X_prep

In [0]:
#for log Reg
Penalty_list = [ 'l1', 'l2']
C_list = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
logReg_param_grid = {'penalty': Penalty_list, 'C': C_list}

#for Linear SVM
C_list = [0.01, 0.1, 1, 10, 100]
Kernel_list= ['linear']
svm_param_grid = {'C':C_list, 'kernel': Kernel_list}

#KNN 
K_list = [1, 3, 5, 7, 9]
Algorithm_list = ['auto', 'ball_tree', 'kd_tree', 'brute']
knn_param_grid = {'n_neighbors': K_list, 'algorithm': Algorithm_list}

#Decision Tree 
Criterion_list = ['gini', 'entropy']
D_list = [1,2,3,4,5]
decisionTree_param_grid = {'max_depth': D_list, 'criterion': Criterion_list}

#Random Forest 
numberOfTree_list = [5,10,50,100,500,1000,5000,10000]
Criterion_list = ['gini', 'entropy']
randomForest_param_grid = {'n_estimators': numberOfTree_list, 'criterion': Criterion_list}

#AdaBoost
n_estimators_list = [10,50,100,500]
learningRate_list = [0.0001,0.001,0.01,0.1,1.0]
adaBoost_param_grid = {'n_estimators': n_estimators_list, 'learning_rate': learningRate_list}

#LDA
solver_list = ['svd','lsqr','eigen']
lda_param_grid = {'solver': solver_list}

In [0]:
#prepare models
models = [id name, classifier, hypermater of the classifier for Cross Validation]
models = []
models.append(('Logistic Regression', LogisticRegression(),logReg_param_grid))
models.append(('LDA', LinearDiscriminantAnalysis(),lda_param_grid))
models.append(('Support Vector Machine', SVC(max_iter=1000, probability=True),svm_param_grid ))
models.append(('K-nearest Neighbors', KNeighborsClassifier(),knn_param_grid ))
models.append(('Decision Tree', DecisionTreeClassifier(),decisionTree_param_grid))
models.append(('Random Forest', RandomForestClassifier(),randomForest_param_grid))
 models.append(('AdaBoost', AdaBoostClassifier(),adaBoost_param_grid))
#print(models)

### Getting data and training and running classifiers

In [0]:
subsample = 100 # training subsample
resultDfRows = []
subjects = range(1,13) # total number of subjects 
pred_tot = []         # all the predictions
label_tot = []        # all the labels
auc_tot = []          # all the auc scors
cols = ['HandStart', 'FirstDigitTouch', 'BothStartLoadPhase','LiftOff', 'Replace', 'BothReleased'] # hand movements

# looping through the subjects to get train and test data for each subject's series (trials)
# for each subject, train data is series 1 to 6, test data is series 7 and 8
for subject in subjects:
    
    # get train data
    train_files = glob('/content/drive/My Drive/189/train/subj%d_series*_data.csv' % (subject))
    
    label_raw = []
    data_raw = []

    for f in train_files:
        data, labels = read_data(f)
        data_raw.append(data)
        label_raw.append(labels)

    X_train = np.array(pd.concat(data_raw))
    y_train = np.array(pd.concat(label_raw))

    # get test data
    test_files =  glob('/content/drive/My Drive/189/test/subj%d_series*_data.csv' % (subject))
    label_raw_test = []
    data_raw_test = []
    
    for f in test_files:
        data, labels = read_data(f)
        data_raw_test.append(data)
        label_raw_test.append(labels)

    X_test = np.array(pd.concat(data_raw_test))
    y_test = np.array(pd.concat(label_raw_test))
    
    for name, model, hyperparam in models:
        print(name)
       
        roc_prob = np.empty((X_test.shape[0],6))
    
        X_train = preprocess(X_train, "train")
        X_test = preprocess(X_test, "test")
    
        # train for each movement 
        for i in range(6):
            y = y_train[:,i]
            print('Training subject %d, class %s' % (subject, cols[i]))
            
            # run grid search
            grid_search = GridSearchCV(estimator=model,param_grid=hyperparam,cv=5,scoring = 'roc_auc')
            # this does cv and also train using best hyperparamater
            print("Done grid search...")
            grid_search.fit(X_train[::subsample,:], y[::subsample])
            print("Fitting...")
            roc_prob[:,i] = grid_search.predict_proba(X_test)[:,1]

        # append all predictions and labels of the subjects
        pred_tot.append(roc_prob)
        label_tot.append(y_train)
    
        # get auc score for each class for this subject
        auc = [roc_auc_score(y_test[:,i],roc_prob[:,i]) for i in range(6)] 
        
        # append all auc scores of the subjects 
        auc_tot.append(auc)
        
        subAndAuc = [subject,name,auc]
        resultDfRows.append(subAndAuc)
        
        print("AUC of Subject ", subject, auc)
        print("\n")

Support Vector Machine
Training subject 1, class HandStart
Done grid search...
Fitting...
Training subject 1, class FirstDigitTouch
Done grid search...
Fitting...
Training subject 1, class BothStartLoadPhase
Done grid search...
Fitting...
Training subject 1, class LiftOff
Done grid search...
Fitting...
Training subject 1, class Replace
Done grid search...
Fitting...
Training subject 1, class BothReleased
Done grid search...
Fitting...
AUC of Subject  1 [0.5722606322386383, 0.6158322915039484, 0.554077743566217, 0.5619185839527656, 0.7013619672506619, 0.625621354137745]


Support Vector Machine
Training subject 2, class HandStart
Done grid search...
Fitting...
Training subject 2, class FirstDigitTouch
Done grid search...
Fitting...
Training subject 2, class BothStartLoadPhase
Done grid search...
Fitting...
Training subject 2, class LiftOff
Done grid search...
Fitting...
Training subject 2, class Replace
Done grid search...
Fitting...
Training subject 2, class BothReleased
Done grid sear

In [0]:
df = pd.DataFrame(resultDfRows)
df.columns = ['Subject', 'Classifier', 'AUC Score']
df.to_csv('/content/drive/My Drive/189/results.csv')
df

Unnamed: 0,Subject,Classifier,AUC Score
0,1,Support Vector Machine,"[0.5722606322386383, 0.6158322915039484, 0.554..."
1,2,Support Vector Machine,"[0.5021307133878637, 0.48478690884151043, 0.41..."
2,3,Support Vector Machine,"[0.4864048721911146, 0.6594008188331627, 0.627..."
3,4,Support Vector Machine,"[0.6272045575213985, 0.6445992425570446, 0.630..."
4,5,Support Vector Machine,"[0.5736658549751039, 0.550228745204026, 0.5380..."
5,6,Support Vector Machine,"[0.5103727706626917, 0.5186512204600789, 0.537..."
6,7,Support Vector Machine,"[0.36861991169518216, 0.6517320674803264, 0.33..."
7,8,Support Vector Machine,"[0.5424488699952497, 0.6127146762777642, 0.630..."
8,9,Support Vector Machine,"[0.6454040942905447, 0.514280407275821, 0.5505..."
9,10,Support Vector Machine,"[0.5947261444802182, 0.5009145906188842, 0.713..."


In [6]:
#from google.colab import drive
#drive.mount('/content/drive')