In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn import svm
from sklearn.naive_bayes import GaussianNB

In [2]:
# It creates joined result from bci file and labels file. FFT is already done.
def connectCollectedData(BCI_filename, labels_filename, result_filename):
    # reads in te data from txt files
    with open("gui_data/" + labels_filename) as f:
        labels_content = f.readlines()

    data = pd.read_csv('Open BCI Saved Data/' + BCI_filename, skiprows=6, sep=", ", header=None, engine='python')
    data = data.drop(columns=[0, 17, 18, 19])
    
    # Now we start adding labels
    # If we record our data at midnight then the code probably won't work
    # label values 0 stands for none
    # label value 1 stands for right hand
    # label value 2 stands for left hand
    # label value 5 stands for the period when our program was not working

    label_row = labels_content[0].strip().split("\t")  # taking the row where we started program
    label_time = datetime.strptime(label_row[0], '%Y-%m-%d %H:%M:%S.%f').time()
    labels = []
    
    for index, row in data.iterrows():
        row_time = datetime.strptime(row[20], '%H:%M:%S.%f').time()
        if row_time >= label_time:
            break
        labels.append(5)
    
    for i in range(250):   # break before start
        labels.append(5)

    for i in range(len(labels_content) - 1):
        label_row = labels_content[i + 1].strip().split("\t")
        label_value = getNewLabel(label_row[1])
        for j in range(500):                     # hand raises and breaks
            labels.append(label_value)

    for i in range(data.shape[0] - len(labels)):    # filling not needed data with 5
        labels.append(5)
        
    data['label'] = pd.Series(labels, index=data.index)
    data = data[data.label != 5]
    data = data.reset_index()
    data = data.drop(columns=['index', 20])
    data_abs = doSpectral(data)
    
    new_df_abs = []
    for i in range(len(data_abs)):
        l = []
        for j in range(len(data_abs[i])):
            for n in range(len(data_abs[i][j])):
                l.append(data_abs[i][j][n])
        new_df_abs.append(l)

    tt = pd.DataFrame(new_df_abs)
    tt['label'] = label_vector

    tt = tt.dropna()
    tt.to_csv("gui_BCI_joined/" + result_filename)


def doSpectral(data):
    # spectral analyses
    last_index = 0
    label_vector = [] # train labels
    fft_array_rows = [] # train data prepared
    time_period = 500

    # 1/250 is sampling frequency and it probably should be changed for our own data
    frequencies = np.fft.rfftfreq(500, 1/250)
    wanted_frequencies = frequencies[(frequencies < 15) & (frequencies > 6)]

    for i in range(int(len(data)/time_period)):
        current_matrix = data[last_index:last_index+time_period]
        fft_array_columns = []
        label_vector.append(current_matrix["label"][last_index])
        last_index = last_index + time_period
        
        for i in range(15):  # fft on row
            fft_array_columns.append(np.fft.rfft(current_matrix[i + 1])[0:len(wanted_frequencies)])
        fft_array_rows.append(fft_array_columns)

    return np.abs(fft_array_rows) 
    
def getNewLabel(message):
    if message == "end" or message == "start":
        return 0
    if "right" in message:
        return 1
    if "left" in message:
        return 2
    print("problem with the following message:")
    print(message)

In [261]:
# connecting actual result files
connectCollectedData("OpenBCI-RAW-2018-05-30_11-16-05.txt","2018-05-30T11_37.txt", "s2_run6.csv")

In [3]:
# reads in all the data about one person
def getSubjectData(subjects, scale_labels = True):
    data = pd.DataFrame()    
    for subject_nr in subjects:
        if data.empty:   #adding first value
            data = pd.read_csv('gui_BCI_joined/s' + str(subject_nr)+ '_run1.csv', sep=",")
        else:
            data = data.append(pd.read_csv('gui_BCI_joined/s' + str(subject_nr)+ '_run1.csv', sep=","))
            
        for i in range(5):          # adding other values
            data = data.append(pd.read_csv('gui_BCI_joined/s1_run' + str(i + 2) + '.csv', sep=","))
        data = data.reset_index()
        data = data.drop(columns=['index', 'Unnamed: 0'])
        if scale_labels:
            np.random.seed(0)
            drop_indices = np.random.choice(data[data.label == 0].index, 72, replace=False)
            data = data.drop(drop_indices)
    return data

# reads in all the data about one person, where the person actually raised the hand.
def getSubjectRaisaData(subjects, scale_labels = True):
    data = pd.DataFrame()    
    for subject_nr in subjects:
        if data.empty:   #adding first value
            data = pd.read_csv('gui_BCI_joined/s' + str(subject_nr)+ '_run2.csv', sep=",")
        else:
            data = data.append(pd.read_csv('gui_BCI_joined/s' + str(subject_nr)+ '_run2.csv', sep=","))

        for i in range(2):
            index = i * 2 + 4
            data = data.append(pd.read_csv('gui_BCI_joined/s1_run' + str(index) + '.csv', sep=","))
        
        data = data.reset_index()
        data = data.drop(columns=['index', 'Unnamed: 0'])
        if scale_labels:
            np.random.seed(0)
            drop_indices = np.random.choice(data[data.label == 0].index, 36, replace=False)
            data = data.drop(drop_indices)
    return data

def getSubjectThinkData(subjects, scale_labels = True):
    data = pd.DataFrame()    
    for subject_nr in subjects:
        if data.empty:   #adding first value
            data = pd.read_csv('gui_BCI_joined/s' + str(subject_nr)+ '_run1.csv', sep=",")
        else:
            data = data.append(pd.read_csv('gui_BCI_joined/s' + str(subject_nr)+ '_run1.csv', sep=","))

    for i in range(2):
        index = i * 2 + 3
        data = data.append(pd.read_csv('gui_BCI_joined/s1_run' + str(index) + '.csv', sep=","))
    data = data.reset_index()
    data = data.drop(columns=['index', 'Unnamed: 0'])
    if scale_labels:
        np.random.seed(0)
        drop_indices = np.random.choice(data[data.label == 0].index, 36, replace=False)
        data = data.drop(drop_indices)
    return data


In [20]:
#data = getSubjectData([1,2])
data = getSubjectThinkData([1,2])
#data = getSubjectRaisaData([1, 2])
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,label
0,75053815.48,5107.185365,1663.165071,4187.034986,2170.235685,3020.505846,469.463643,1852.051940,823.511623,1593.942460,...,1405.793586,255.484940,1222.811660,354.083519,940.392245,513.144884,392.100581,944.862630,386.990398,2
1,75034178.21,3014.591977,2283.704363,622.187933,1204.384529,348.575565,1602.845624,1216.647305,883.775061,963.627810,...,2424.992556,1523.928495,2029.949588,306.302544,1098.721946,854.538296,481.608409,357.287740,375.730406,0
2,75024829.61,2903.109581,585.697519,2017.439948,640.192497,746.258622,301.428414,573.961508,886.204961,47.883575,...,142.179359,1524.790857,1217.738722,677.858292,729.393266,1205.128309,513.179418,1092.616095,569.248981,1
3,75001068.42,3120.277139,2021.535193,1740.383081,1199.630377,1452.705040,508.021476,798.011669,478.436034,121.574922,...,715.699382,535.916246,1294.757065,578.936387,558.573266,286.987945,361.263616,523.185757,462.910046,0
4,75011783.02,2857.214560,1921.269880,1150.467351,78.668874,1066.123460,1412.574747,293.021784,1078.492217,586.416809,...,583.308746,858.758936,584.358711,383.634252,182.593475,265.545930,335.874824,445.332392,414.508444,1
6,75005288.22,8309.553035,5874.332708,4248.574353,1778.221293,918.441865,1963.636217,1760.311185,1095.588367,1789.469572,...,547.298099,1295.491585,658.720884,672.147716,703.799618,404.716323,993.947572,332.348781,497.868373,1
8,74986148.10,1688.641858,1200.274521,1447.276313,467.169465,693.071919,681.967455,482.217606,258.960877,246.006549,...,662.481828,702.856261,982.409806,1716.155543,1512.275798,1438.558520,550.016944,309.359100,726.242538,1
9,74975735.89,2251.497630,136.729774,2745.901974,1634.309387,565.104652,1413.437370,430.439430,652.576356,896.405293,...,1481.950915,1594.000928,1179.584224,864.250380,1012.571306,797.550760,795.299595,718.648818,531.453522,0
10,74953080.46,2795.997058,1024.818549,1463.141514,967.004775,1482.081358,1583.501637,1300.871512,574.669212,1710.943481,...,1428.698467,661.388254,825.504591,1679.061583,957.643899,634.682277,393.191770,249.431711,580.984497,1
11,74942097.26,2366.784298,2504.916898,1278.214828,490.683213,335.362399,1154.260364,408.319222,1146.585494,432.601024,...,1476.022093,482.013777,853.865633,979.859216,348.951114,350.141475,728.099846,347.074670,1043.050449,0


In [21]:
print(data[data.label == 0].shape)
print(data[data.label == 1].shape)
print(data[data.label == 2].shape)

(60, 256)
(56, 256)
(40, 256)


In [22]:
# trains the model with 3 classes and shows results
data_x = pd.DataFrame(data.iloc[:,0:data.shape[1] - 1]) # Instances of the data
data_y = pd.DataFrame(data.iloc[:,data.shape[1] - 1]) # Labels of the data
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=0)

# Train the LDA model
clf = LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.4)
clf.fit(X_train, y_train)

# Accuracy of prediction on train data
y_pred = clf.predict(X_train)
print("Accuracy on train data ", 100*accuracy_score(y_train, y_pred))

# Accuracy of prediction on test data
y_pred = clf.predict(X_test)
print("Accuracy on test data ", 100*accuracy_score(y_test, y_pred))

y_pred

Accuracy on train data  64.5161290323
Accuracy on test data  40.625


  y = column_or_1d(y, warn=True)


array([2, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 2, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 2, 2, 2, 0, 1, 1, 0, 0], dtype=int64)

In [23]:
# trains the model with 2 classes and shows results

data2 = data[data.label != 0]
data_x = pd.DataFrame(data2.iloc[:,0:data2.shape[1] - 1]) # Instances of the data
data_y = pd.DataFrame(data2.iloc[:,data2.shape[1] - 1]) # Labels of the data
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=0)


# Train the LDA model
clf = LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.4)
clf.fit(X_train, y_train)

# Accuracy of prediction on train data
y_pred = clf.predict(X_train)
print("Accuracy on train data ", 100*accuracy_score(y_train, y_pred))

# Accuracy of prediction on test data
y_pred = clf.predict(X_test)
print("Accuracy on test data ", 100*accuracy_score(y_test, y_pred))

y_pred

Accuracy on train data  81.5789473684
Accuracy on test data  60.0


  y = column_or_1d(y, warn=True)


array([1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1], dtype=int64)

In [24]:
data_x = pd.DataFrame(data.iloc[:,0:data.shape[1] - 1]) # Instances of the data
data_y = pd.DataFrame(data.iloc[:,data.shape[1] - 1]) # Labels of the data
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=0)
# Train the Naive Bayes model
clf = GaussianNB()
clf.fit(X_train, y_train)

# Accuracy of prediction on train data
y_pred = clf.predict(X_train)
print("Accuracy on train data ", 100*accuracy_score(y_train, y_pred))

# Accuracy of prediction on test data
y_pred = clf.predict(X_test)
print("Accuracy on test data ", 100*accuracy_score(y_test, y_pred))

y_pred


# trains the model with 2 classes and shows results
data2 = data[data.label != 0]
data_x = pd.DataFrame(data2.iloc[:,0:data2.shape[1] - 1]) # Instances of the data
data_y = pd.DataFrame(data2.iloc[:,data2.shape[1] - 1]) # Labels of the data
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2, random_state=0)

# Train the LDA model
clf = GaussianNB()
clf.fit(X_train, y_train)

# Accuracy of prediction on train data
y_pred = clf.predict(X_train)
print("Accuracy on train data ", 100*accuracy_score(y_train, y_pred))

# Accuracy of prediction on test data
y_pred = clf.predict(X_test)
print("Accuracy on test data ", 100*accuracy_score(y_test, y_pred))

y_pred

Accuracy on train data  50.0
Accuracy on test data  37.5
Accuracy on train data  63.1578947368
Accuracy on test data  65.0


  y = column_or_1d(y, warn=True)


array([2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1], dtype=int64)