# Final Report

## 1. Executive Summary

## 2. Aim and Background

### 2.1. Problem

### 2.2. Aim

## 3. Methods


### 3.1. Data Collection

In [30]:
import time
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

In [1]:
from data_plot import plot_samples_events_individual

print(1)

1


In [140]:
EVENT_ID_MAP = {
    None: 0,
    "L": 1,
    "R": 2,
    "S": 3,
}
EVENT_ID_LETTER_MAP = {EVENT_ID_MAP[i]:i for i in EVENT_ID_MAP}

EVENT_COLOR_MAP = {
    None: "black",
    "L": "red",
    "R": "blue",
    "S": "green",
}

EVENT_ID_NAME_MAP = {
    0: "Nothing",
    1: "Left Wink",
    2: "Right Wink",
    3: "Dbl Blink",
}

BRAINBOX_SAMPLE_RATE = 10000
DOWNSAMPLE_RATE = 100

EVENT_LENGTH = 2 # length of a given event sequence in seconds
EVENT_SAMPLE_COUNT = int(EVENT_LENGTH * BRAINBOX_SAMPLE_RATE / DOWNSAMPLE_RATE) # size of event in samples

EVENT_START = -0.75
EVENT_START_OFFSET = int(EVENT_START * BRAINBOX_SAMPLE_RATE / DOWNSAMPLE_RATE)

EVENT_END = -0.25
EVENT_END_OFFSET = int(EVENT_END * BRAINBOX_SAMPLE_RATE / DOWNSAMPLE_RATE)

INPUT_SHAPE = (EVENT_SAMPLE_COUNT,)
OUTPUT_SHAPE = len(EVENT_ID_MAP)  # number of categories (including None)


EVENTS_PATH = "../src/data_collection/data/events/"
SAMPLES_PATH = "../src/data_collection/data/waves/"

In [141]:
FILE_NAMES_ALL = [
    "DATA_2022-05-13_Josh_0001_3_1652400625",
#     "DATA_2022-05-13_Josh_0001_3_1652400939",
    "DATA_2022-05-13_Josh_0001_4_1652401267",
#     "DATA_2022-05-13_Josh_0001_4_1652401740",
    "DATA_2022-05-13_Josh_0001_5_1652405337",
#     "DATA_2022-05-13_Josh_0001_5_1652405637",
#     "DATA_2022-05-13_Josh_0001_6_1652406023",
#     "DATA_2022-05-13_Josh_0001_6_1652406202",
#     "DATA_2022-05-13_Josh_0001_7_1652406589",
#     "DATA_2022-05-13_Josh_0001_7_1652406788",
#     "DATA_2022-05-13_Josh_0001_8_1652407331",
#     "DATA_2022-05-13_Josh_0001_8_1652407508",
]



### 3.2. Data Preprocessing

In [142]:
from data_ml import get_ml_data

print("Data Preprocessing")
files_data_all = []
files_labels_all = []

for file_name in FILE_NAMES_ALL:
    f_data_all, f_labels_all = get_ml_data(
        events_path = EVENTS_PATH,
        samples_path = SAMPLES_PATH,
        file_names = [file_name],

        event_id_map = EVENT_ID_MAP,
        event_color_map = EVENT_COLOR_MAP,

        event_sample_count = EVENT_SAMPLE_COUNT,
        event_start = EVENT_START,
        event_end = EVENT_END,

        downsample_rate = DOWNSAMPLE_RATE,
        shuffle_data = False,
        filter_data = True,
    )

    files_data_all.append(f_data_all)
    files_labels_all.append(f_labels_all)


Data Preprocessing
Loading ML data from 1 files

Transforming data into individual sequences...
Transformed into 19047 sequences of size 200

Combined 1 files into 19047 sequences of size 200
Loading ML data from 1 files

Transforming data into individual sequences...
Transformed into 21609 sequences of size 200

Combined 1 files into 21609 sequences of size 200
Loading ML data from 1 files

Transforming data into individual sequences...
Transformed into 24467 sequences of size 200

Combined 1 files into 24467 sequences of size 200


### 3.3. Models

In [143]:
import tensorflow as tf

def get_model_ann():
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=INPUT_SHAPE),
        tf.keras.layers.Dropout(.50, input_shape=INPUT_SHAPE),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(OUTPUT_SHAPE)
    ])
    return model
get_model_ann().summary()

Model: "sequential_52"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_52 (Flatten)        (None, 200)               0         
                                                                 
 dropout_52 (Dropout)        (None, 200)               0         
                                                                 
 dense_156 (Dense)           (None, 64)                12864     
                                                                 
 dense_157 (Dense)           (None, 16)                1040      
                                                                 
 dense_158 (Dense)           (None, 4)                 68        
                                                                 
Total params: 13,972
Trainable params: 13,972
Non-trainable params: 0
_________________________________________________________________


#### 3.3.2. SVM

In [144]:
from sklearn import svm

def get_model_svm():
    clf = svm.SVC()
    return clf

#### 3.3.3 Random Forest

In [145]:
def get_model_rf():
    pass

### 3.4. Evaluation Strategy

In [146]:
ANN_EPOCHS = 3
ANN_OPTIMIZER = 'adam'

def train_model_ann(model, train_data, train_labels, test_data, test_labels):
    print("Training ANN with", len(train_labels), "samples")
    start_time = time.time()
    
    # Trains model and returns history dict
    model.compile(
        optimizer=ANN_OPTIMIZER,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )
    history = model.fit(
        train_data,
        train_labels,
        epochs=ANN_EPOCHS,
        validation_data=(test_data, test_labels)
    #     batch_size=16
    )

    print(f"Completed training ANN in {time.time()-start_time:.2f}s")

    print(f"Generating predictions for test set", len(test_labels))
    start_time = time.time()
    
    test_out = model.predict(test_data)
    test_probs = tf.nn.softmax(test_out, axis=1)
    test_pred = np.argmax(test_probs, axis=1)
    
    print(f"Generated predictions in {time.time()-start_time:.2f}s")

    # Return predictions for test set
    return test_pred

In [147]:
def train_model_svm(model, train_data, train_labels, test_data, test_labels):
    subset = np.random.random(len(train_data)) < 0.1
    print("Training SVM with", sum(subset), "samples")
    start_time = time.time()
    
    model.fit(train_data[subset], train_labels[subset])
    
    print(f"Completed training SVM in {time.time()-start_time:.2f}s")

    print(f"Generating predictions for test set", len(test_labels))
    start_time = time.time()
    
    test_pred = model.predict(test_data)

    print(f"Generated predictions in {time.time()-start_time:.2f}s")
    
    # Return predicions for test set
    return test_pred

In [148]:
# Insert Random forest thing

In [149]:
def confusion_matrix(label_pred, label_true):
    # Rows are "real" labels
    # Columns are "predicted" labels
    conf = tf.math.confusion_matrix(
        label_pred,
        label_true
    )

    return conf

In [150]:
CV_K = len(FILE_NAMES_ALL)

cv_conf_ann = []
cv_conf_svm = []
# cv_conf_rf = []

for k in range(CV_K):
    print(f"Fold #{k+1}/{CV_K}")
    start_time = time.time()
    
    # Get training data/labels
    train_data = np.concatenate(files_data_all[0:k] + files_data_all[k+1:])
    train_labels = np.concatenate(files_labels_all[0:k] + files_labels_all[k+1:])
    print(train_data.shape, train_labels.shape)
    
    test_data = files_data_all[k]
    test_labels = files_labels_all[k]
    print(test_data.shape, test_labels.shape)

    # Train and test ANN
    model_ann = get_model_ann()
    test_pred_ann = train_model_ann(model_ann, train_data, train_labels, test_data, test_labels)
    conf_ann = confusion_matrix(test_pred_ann, test_labels)
    cv_conf_ann.append(conf_ann)
    
    # Train and test SVM
    model_svm = get_model_svm()
    test_pred_svm = train_model_svm(model_svm, train_data, train_labels, test_data[:10000], test_labels[:10000])
    conf_svm = confusion_matrix(test_pred_svm, test_labels[:10000])
    cv_conf_svm.append(conf_svm)

    print(f"Done fold in {time.time() - start_time:.2f}s\n")

Fold #1/3
(46076, 200) (46076,)
(19047, 200) (19047,)
Training ANN with 46076 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Completed training ANN in 5.40s
Generating predictions for test set 19047
Generated predictions in 0.33s
Training SVM with 4559 samples
Completed training SVM in 0.83s
Generating predictions for test set 10000
Generated predictions in 2.52s
Done fold in 9.17s

Fold #2/3
(43514, 200) (43514,)
(21609, 200) (21609,)
Training ANN with 43514 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Completed training ANN in 5.08s
Generating predictions for test set 21609
Generated predictions in 0.33s
Training SVM with 4400 samples
Completed training SVM in 0.77s
Generating predictions for test set 10000
Generated predictions in 2.41s
Done fold in 8.67s

Fold #3/3
(40656, 200) (40656,)
(24467, 200) (24467,)
Training ANN with 40656 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Completed training ANN in 4.99s
Generating predictions for test set 24467
Generated predictions in 0.38s
Training SVM with 3996 sa

In [154]:

print(cv_conf_ann)
print(cv_conf_svm)

[<tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[16092,   277,   118,   215],
       [   76,   471,     0,     9],
       [  172,     0,   784,     0],
       [  102,     2,     0,   729]], dtype=int32)>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[18520,   275,   179,   125],
       [  229,   927,     0,     0],
       [  141,     0,   470,     0],
       [   83,     0,     0,   660]], dtype=int32)>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[20910,   204,   301,   481],
       [  239,   692,     0,    64],
       [  132,     0,  1152,     0],
       [   30,    56,     0,   206]], dtype=int32)>]
[<tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[8597,  166,   94,  244],
       [  29,  134,    0,    0],
       [  69,    0,  457,    0],
       [   2,    0,    0,  208]], dtype=int32)>, <tf.Tensor: shape=(4, 4), dtype=int32, numpy=
array([[8642,  131,  135,   80],
       [ 108,  321,    0,    0],
       [  37,    0,  264,    0],
       [  11,    0,    0,  27

## 4. Results

### 4.1. Part A

In [155]:
def print_confusion_matrix(conf):
    print(conf)
    
    overall_total = sum(sum(conf))
    overall_correct = sum([conf[i][i] for i in range(len(conf))])
    print(f"Overall accuracy: {100*overall_correct/overall_total:.2f}% ({overall_correct}/{overall_total})")
    
    fp = sum(conf[0][1:])
    tp = sum(sum(conf[1:]))
    print(f"False positives {fp}")
    print(f"True positives {tp}")
    print(f"False discovery (fp/(tp+fp)): {fp/(fp+tp):.4f} ({fp}/{fp+tp})")
    
    for i in range(len(conf)):
        letter = str(EVENT_ID_LETTER_MAP[i])[0]
        
        total = sum(conf[i])
        correct = conf[i][i]
        acc_total = 100*correct/total
        
        s = f"Event {letter} ({i}) accuracy: {correct:6}/{total: <6} (t_acc {acc_total:5.2f}%)"
        if i > 0:
            acc_event = 100*correct/(total - conf[i][0])
            s += f" (e_acc {acc_event:5.2f}%)"
        print(s)
        
    print("")
        
    return conf

In [156]:
total_conf_ann = np.sum(cv_conf_ann, axis=0)
print_confusion_matrix(total_conf_ann)

total_conf_svm = np.sum(cv_conf_svm, axis=0)
print_confusion_matrix(total_conf_svm)

[[55522   756   598   821]
 [  544  2090     0    73]
 [  445     0  2406     0]
 [  215    58     0  1595]]
Overall accuracy: 94.61% (61613/65123)
False positives 2175
True positives 7426
False discovery (fp/(tp+fp)): 0.2265 (2175/9601)
Event N (0) accuracy:  55522/57697  (t_acc 96.23%)
Event L (1) accuracy:   2090/2707   (t_acc 77.21%) (e_acc 96.63%)
Event R (2) accuracy:   2406/2851   (t_acc 84.39%) (e_acc 100.00%)
Event S (3) accuracy:   1595/1868   (t_acc 85.39%) (e_acc 96.49%)

[[25875   550   312   434]
 [  152   752     0    10]
 [  153     0  1189     0]
 [   13     0     0   560]]
Overall accuracy: 94.59% (28376/30000)
False positives 1296
True positives 2829
False discovery (fp/(tp+fp)): 0.3142 (1296/4125)
Event N (0) accuracy:  25875/27171  (t_acc 95.23%)
Event L (1) accuracy:    752/914    (t_acc 82.28%) (e_acc 98.69%)
Event R (2) accuracy:   1189/1342   (t_acc 88.60%) (e_acc 100.00%)
Event S (3) accuracy:    560/573    (t_acc 97.73%) (e_acc 100.00%)



array([[25875,   550,   312,   434],
       [  152,   752,     0,    10],
       [  153,     0,  1189,     0],
       [   13,     0,     0,   560]])

### 4.2. Part B

## 5. Discussion and Conclusion

### 5.1. Issues Addressed

### 5.2. Conclusion

## 6. Student Contribution

### 6.1. Matty

### 6.2. Ashwin

### 6.3. Marcus

### 6.4. Alex

### 6.5. Jingyu

### 6.9. Josh
