# Final Report

## 1. Executive Summary

## 2. Aim and Background

### 2.1. Problem

### 2.2. Aim

## 3. Methods


### 3.1. Data Collection

In [1]:
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
from data_plot import plot_samples_events_individual

print(1)

1


In [3]:
EVENT_ID_MAP = {
    None: 0,
    "L": 1,
    "R": 2,
    "S": 3,
}
EVENT_ID_LETTER_MAP = {EVENT_ID_MAP[i]:i for i in EVENT_ID_MAP}

EVENT_COLOR_MAP = {
    None: "black",
    "L": "red",
    "R": "blue",
    "S": "green",
}

EVENT_ID_NAME_MAP = {
    0: "Nothing",
    1: "Left Wink",
    2: "Right Wink",
    3: "Dbl Blink",
}

BRAINBOX_SAMPLE_RATE = 10000
DOWNSAMPLE_RATE = 100

EVENT_LENGTH = 2 # length of a given event sequence in seconds
EVENT_SAMPLE_COUNT = int(EVENT_LENGTH * BRAINBOX_SAMPLE_RATE / DOWNSAMPLE_RATE) # size of event in samples

EVENT_START = -0.75
EVENT_START_OFFSET = int(EVENT_START * BRAINBOX_SAMPLE_RATE / DOWNSAMPLE_RATE)

EVENT_END = -0.25
EVENT_END_OFFSET = int(EVENT_END * BRAINBOX_SAMPLE_RATE / DOWNSAMPLE_RATE)

INPUT_SHAPE = (EVENT_SAMPLE_COUNT,)
OUTPUT_SHAPE = len(EVENT_ID_MAP)  # number of categories (including None)


EVENTS_PATH = "../src/data_collection/data/events/"
SAMPLES_PATH = "../src/data_collection/data/waves/"

In [4]:
FILE_NAMES_ALL = [
    "DATA_2022-05-13_Josh_0001_3_1652400625",
#     "DATA_2022-05-13_Josh_0001_3_1652400939",
    "DATA_2022-05-13_Josh_0001_4_1652401267",
#     "DATA_2022-05-13_Josh_0001_4_1652401740",
    "DATA_2022-05-13_Josh_0001_5_1652405337",
#     "DATA_2022-05-13_Josh_0001_5_1652405637",
#     "DATA_2022-05-13_Josh_0001_6_1652406023",
#     "DATA_2022-05-13_Josh_0001_6_1652406202",
#     "DATA_2022-05-13_Josh_0001_7_1652406589",
#     "DATA_2022-05-13_Josh_0001_7_1652406788",
#     "DATA_2022-05-13_Josh_0001_8_1652407331",
#     "DATA_2022-05-13_Josh_0001_8_1652407508",
]



### 3.2. Data Preprocessing

In [5]:
from data_ml import get_ml_data

print("Data Preprocessing")
files_data_all = []
files_labels_all = []

for file_name in FILE_NAMES_ALL:
    f_data_all, f_labels_all = get_ml_data(
        events_path = EVENTS_PATH,
        samples_path = SAMPLES_PATH,
        file_names = [file_name],

        event_id_map = EVENT_ID_MAP,
        event_color_map = EVENT_COLOR_MAP,

        event_sample_count = EVENT_SAMPLE_COUNT,
        event_start = EVENT_START,
        event_end = EVENT_END,

        downsample_rate = DOWNSAMPLE_RATE,
        shuffle_data = False,
        filter_data = True,
    )

    files_data_all.append(f_data_all)
    files_labels_all.append(f_labels_all)


Data Preprocessing
Loading ML data from 1 files

Transforming data into individual sequences...
Transformed into 19047 sequences of size 200

Combined 1 files into 19047 sequences of size 200
Loading ML data from 1 files

Transforming data into individual sequences...
Transformed into 21609 sequences of size 200

Combined 1 files into 21609 sequences of size 200
Loading ML data from 1 files

Transforming data into individual sequences...
Transformed into 24467 sequences of size 200

Combined 1 files into 24467 sequences of size 200


### 3.3. Models

In [6]:
import tensorflow as tf

def get_model_ann():
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=INPUT_SHAPE),
        tf.keras.layers.Dropout(.50, input_shape=INPUT_SHAPE),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SHAPE)
    ])
    return model
get_model_ann().summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 200)               0         
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense (Dense)               (None, 64)                12864     
                                                                 
 dense_1 (Dense)             (None, 16)                1040      
                                                                 
 dense_2 (Dense)             (None, 4)                 68        
                                                                 
Total params: 13,972
Trainable params: 13,972
Non-trainable params: 0
_________________________________________________________________


#### 3.3.2. SVM

In [7]:
from sklearn import svm

def get_model_svm():
    clf = svm.SVC()
    return clf

#### 3.3.3 Random Forest

In [12]:
from numpy import mean
from numpy import std
from numpy import arange
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from xgboost import XGBRFClassifier

#Hyper-Parameters
n_trees = [10,50,100,500,1000,5000]
n_features = [x for x in arange(0.1, 1.1, 0.1)]


def get_model_rf(num_trees=50, num_features=0.1):
    return XGBRFClassifier(n_estimators=num_trees, subsample=0.9, colsample_bynode=num_features)

### 3.4. Evaluation Strategy

In [13]:
ANN_EPOCHS = 3
ANN_OPTIMIZER = 'adam'

def train_model_ann(model, train_data, train_labels, test_data, test_labels):
    print("Training ANN with", len(train_labels), "samples")
    start_time = time.time()
    
    # Trains model and returns history dict
    model.compile(
        optimizer=ANN_OPTIMIZER,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )
    history = model.fit(
        train_data,
        train_labels,
        epochs=ANN_EPOCHS,
        validation_data=(test_data, test_labels)
    #     batch_size=16
    )

    print(f"Completed training ANN in {time.time()-start_time:.2f}s")

    print(f"Generating predictions for test set", len(test_labels))
    start_time = time.time()
    
    test_out = model.predict(test_data)
    test_probs = tf.nn.softmax(test_out, axis=1)
    test_pred = np.argmax(test_probs, axis=1)
    
    print(f"Generated predictions in {time.time()-start_time:.2f}s")

    # Return predictions for test set
    return test_pred

In [14]:
def train_model_svm(model, train_data, train_labels, test_data, test_labels):
    subset = np.random.random(len(train_data)) < 0.1
    print("Training SVM with", sum(subset), "samples")
    start_time = time.time()
    
    model.fit(train_data[subset], train_labels[subset])
    
    print(f"Completed training SVM in {time.time()-start_time:.2f}s")

    print(f"Generating predictions for test set", len(test_labels))
    start_time = time.time()
    
    test_pred = model.predict(test_data)

    print(f"Generated predictions in {time.time()-start_time:.2f}s")
    
    # Return predicions for test set
    return test_pred

In [15]:
#Random Forest Training and Testing
def train_model_xgbrf(model, train_data, train_labels, test_data, test_labels):
    subset = np.random.random(len(train_data)) < 0.1
    print("Training XGBRandomForest with", sum(subset), "samples")
    start_time = time.time()
    
    model.fit(train_data[subset], train_labels[subset])
    
    print(f"Completed training XGBRandomForest in {time.time()-start_time:.2f}s")
    
    print(f"Generating predictions for test set", len(test_labels))
    
    start_time = time.time()
    
    test_pred = model.predict(test_data)
    
    print(f"Generated predictions in {time.time()-start_time:.2f}s")
        
    # Return predicions for test set
    return test_pred

In [16]:
def confusion_matrix(label_pred, label_true):
    # Rows are "real" labels
    # Columns are "predicted" labels
    conf = tf.math.confusion_matrix(
        label_pred,
        label_true
    )

    return conf

In [19]:
CV_K = len(FILE_NAMES_ALL)

cv_conf_ann = []
cv_conf_svm = []
cv_conf_rf = []

for k in range(CV_K):
    print(f"Fold #{k+1}/{CV_K}")
    start_time = time.time()
    
    # Get training data/labels
    train_data = np.concatenate(files_data_all[0:k] + files_data_all[k+1:])
    train_labels = np.concatenate(files_labels_all[0:k] + files_labels_all[k+1:])
    print(train_data.shape, train_labels.shape)
    
    test_data = files_data_all[k]
    test_labels = files_labels_all[k]
    print(test_data.shape, test_labels.shape)

    # Train and test ANN
    model_ann = get_model_ann()
    test_pred_ann = train_model_ann(model_ann, train_data, train_labels, test_data, test_labels)
    conf_ann = confusion_matrix(test_pred_ann, test_labels)
    cv_conf_ann.append(conf_ann)
    
    # Train and test SVM
    model_svm = get_model_svm()
    test_pred_svm = train_model_svm(model_svm, train_data, train_labels, test_data[:10000], test_labels[:10000])
    conf_svm = confusion_matrix(test_pred_svm, test_labels[:10000])
    cv_conf_svm.append(conf_svm)
    
    # Train and Test XGBRandomForest
    model_xgbrf = get_model_rf()
    test_pred_xgbrf = train_model_xgbrf(model_xgbrf, train_data, train_labels, test_data[:10000], test_labels[:10000])
    conf_xgbrf = confusion_matrix(test_pred_xgbrf, test_labels[:10000])
    cv_conf_rf.append(conf_xgbrf)

    print(f"Done fold in {time.time() - start_time:.2f}s\n")

Fold #1/3
(46076, 200) (46076,)
(19047, 200) (19047,)
Training ANN with 46076 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Completed training ANN in 5.01s
Generating predictions for test set 19047
Generated predictions in 0.45s
Training SVM with 4574 samples
Completed training SVM in 0.40s
Generating predictions for test set 10000
Generated predictions in 1.48s
Training XGBRandomForest with 4657 samples
Completed training XGBRandomForest in 65.82s
Generating predictions for test set 10000
Generated predictions in 0.62s
Done fold in 73.98s

Fold #2/3
(43514, 200) (43514,)
(21609, 200) (21609,)
Training ANN with 43514 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Completed training ANN in 5.10s
Generating predictions for test set 21609
Generated predictions in 0.50s
Training SVM with 4369 samples
Completed training SVM in 0.32s
Generating predictions for test set 10000
Generated predictions in 1.27s
Training XGBRandomForest with 4327 samples
Completed training XGBRandomForest in 61.21s
Generating predi

In [20]:
print(cv_conf_ann)
print(cv_conf_svm)
print(cv_conf_rf)

[<tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[16014,   225,   135,   246],
       [  207,   523,     0,     9],
       [  149,     0,   767,     2],
       [   72,     2,     0,   696]])>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[18397,   208,   190,   172],
       [  273,   994,     0,    13],
       [  227,     0,   459,     0],
       [   76,     0,     0,   600]])>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[20787,   170,   146,   458],
       [  185,   742,     0,    69],
       [  286,     0,  1307,     0],
       [   53,    40,     0,   224]])>]
[<tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[8587,  149,   88,  264],
       [  33,  151,    0,    0],
       [  77,    0,  463,    0],
       [   0,    0,    0,  188]])>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[8616,  116,  111,   83],
       [ 126,  336,    0,    0],
       [  48,    0,  288,    0],
       [   8,    0,    0,  268]])>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=

## 4. Results

### 4.1. Part A

In [21]:
def print_confusion_matrix(conf):
    print(conf)
    
    overall_total = sum(sum(conf))
    overall_correct = sum([conf[i][i] for i in range(len(conf))])
    print(f"Overall accuracy: {100*overall_correct/overall_total:.2f}% ({overall_correct}/{overall_total})")
    
    fp = sum(conf[0][1:])
    tp = sum(sum(conf[1:]))
    print(f"False positives {fp}")
    print(f"True positives {tp}")
    print(f"False discovery (fp/(tp+fp)): {fp/(fp+tp):.4f} ({fp}/{fp+tp})")
    
    for i in range(len(conf)):
        letter = str(EVENT_ID_LETTER_MAP[i])[0]
        
        total = sum(conf[i])
        correct = conf[i][i]
        acc_total = 100*correct/total
        
        s = f"Event {letter} ({i}) accuracy: {correct:6}/{total: <6} (t_acc {acc_total:5.2f}%)"
        if i > 0:
            acc_event = 100*correct/(total - conf[i][0])
            s += f" (e_acc {acc_event:5.2f}%)"
        print(s)
        
    print("")
        
    return conf

In [24]:
total_conf_ann = np.sum(cv_conf_ann, axis=0)
print_confusion_matrix(total_conf_ann)

total_conf_svm = np.sum(cv_conf_svm, axis=0)
print_confusion_matrix(total_conf_svm)

total_conf_rf = np.sum(cv_conf_rf, axis=0)
print_confusion_matrix(total_conf_rf)

[[55198   603   471   876]
 [  665  2259     0    91]
 [  662     0  2533     2]
 [  201    42     0  1520]]
Overall accuracy: 94.45% (61510/65123)
False positives 1950
True positives 7975
False discovery (fp/(tp+fp)): 0.1965 (1950/9925)
Event N (0) accuracy:  55198/57148  (t_acc 96.59%)
Event L (1) accuracy:   2259/3015   (t_acc 74.93%) (e_acc 96.13%)
Event R (2) accuracy:   2533/3197   (t_acc 79.23%) (e_acc 99.92%)
Event S (3) accuracy:   1520/1763   (t_acc 86.22%) (e_acc 97.31%)

[[25853   550   328   449]
 [  174   751     0    11]
 [  155     0  1173     0]
 [   11     1     0   544]]
Overall accuracy: 94.40% (28321/30000)
False positives 1327
True positives 2820
False discovery (fp/(tp+fp)): 0.3200 (1327/4147)
Event N (0) accuracy:  25853/27180  (t_acc 95.12%)
Event L (1) accuracy:    751/936    (t_acc 80.24%) (e_acc 98.56%)
Event R (2) accuracy:   1173/1328   (t_acc 88.33%) (e_acc 100.00%)
Event S (3) accuracy:    544/556    (t_acc 97.84%) (e_acc 99.82%)

[[25905   656   385   4

array([[25905,   656,   385,   471],
       [  124,   620,     0,    13],
       [  138,     0,  1105,     0],
       [   26,    26,    11,   520]])

### 4.2. Part B

## 5. Discussion and Conclusion

### 5.1. Issues Addressed

### 5.2. Conclusion

## 6. Student Contribution

### 6.1. Matty

### 6.2. Ashwin

### 6.3. Marcus

### 6.4. Alex

### 6.5. Jingyu

### 6.9. Josh
