In [None]:
### Preprocessing and Support Vector Machine (SVM) Notebook for EEG Analysis
## By: Nancy Shen and Karla Juego

In [None]:
### Preprocessing of EEG Data Steps ###

In [1]:
# Import Statements
import os
import pickle
import pandas as pd 

In [2]:
# Preprocessing to Change Attention States to a Numerical Value
# Every attention state is encoded as an integer class label so it can be used
# as a classification target.
DATA_DIR = '/content/data'
STATE_LABELS = {'focussed': 3, 'unfocussed': 2, 'drowsed': 1}
TRIAL_NAMES = ['trial_' + str(i) for i in range(1, 5)]


def load_labelled_trials(data_dir, trial_names=TRIAL_NAMES, state_labels=STATE_LABELS):
    """Load every pickled recording under ``data_dir`` into two aligned frames.

    Parameters
    ----------
    data_dir : str
        Directory that is walked recursively; every file found is unpickled.
    trial_names : sequence of str
        Keys of the trials to read from each unpickled recording.
    state_labels : dict
        Maps each attention-state key to its integer class label. The
        insertion order fixes the row order inside a trial.

    Returns
    -------
    (features, labels) : tuple of pandas.DataFrame
        ``features`` holds the samples and ``labels`` is a one-column frame
        with one label per sample. Both are empty when no file is found.

    Raises
    ------
    KeyError
        If a recording lacks one of the requested trials or states.
    """
    feature_groups = []
    label_groups = []
    for dirname, _, filenames in os.walk(data_dir):
        for filename in filenames:
            # NOTE(security): unpickling can execute arbitrary code, so only
            # point data_dir at recordings from a trusted source.
            with open(os.path.join(dirname, filename), 'rb') as handle:
                data = pickle.load(handle)
            for trial in trial_names:
                trial_features = []
                trial_labels = []
                for state, label in state_labels.items():
                    samples = pd.DataFrame(data[trial][state])
                    trial_features.append(samples)
                    trial_labels.append(pd.DataFrame([label] * len(samples)))
                feature_groups.append(trial_features)
                label_groups.append(trial_labels)

    if not feature_groups:
        # pd.concat rejects an empty list; mirror the "nothing loaded" result.
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once instead of re-copying the growing frame on every trial.
    # Groups are emitted newest-first, matching the previous behaviour of
    # prepending each trial to the accumulated frame.
    features = pd.concat(
        [frame for group in reversed(feature_groups) for frame in group], axis=0
    )
    labels = pd.concat(
        [frame for group in reversed(label_groups) for frame in group], axis=0
    )
    return features, labels


# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the cells below refer to it.
input, output = load_labelled_trials(DATA_DIR)

In [3]:
# Move into a single dataframe.
# `output` is the one-column frame of class labels built alongside `input`;
# this assignment attaches it as the "output" column.
# NOTE(review): both frames carry repeated index labels from the concatenation
# step, so this presumably relies on the two indexes being identical — confirm
# before reordering or filtering either frame.
input["output"] = output

# Check file with EEG data and concentration values.
# As the cell's last expression, the combined frame is rendered below.
input

Unnamed: 0,0,1,2,3,4,5,6,output
0,3992.307692,5018.974359,4315.897436,4384.615385,4033.846154,4161.538462,4081.538462,3
1,3991.794872,5017.948718,4320.000000,4384.102564,4032.820513,4157.948718,4083.076923,3
2,3992.820513,5014.871795,4319.487179,4386.666667,4035.384615,4151.794872,4078.974359,3
3,3990.769231,5011.282051,4315.384615,4384.615385,4029.230769,4150.769231,4073.333333,3
4,3989.230769,5015.384615,4315.384615,4379.487179,4020.000000,4152.820513,4076.923077,3
...,...,...,...,...,...,...,...,...
275243,4102.564103,5057.948718,4458.461538,4232.307692,4121.538462,4376.923077,4040.000000,1
275244,4097.435897,5052.307692,4452.307692,4232.307692,4116.923077,4380.512821,4036.923077,1
275245,4092.307692,5039.487179,4447.179487,4234.871795,4108.717949,4384.102564,4032.820513,1
275246,4088.717949,5031.794872,4442.564103,4231.282051,4097.948718,4378.461538,4023.589744,1


In [None]:
### Machine Learning with SVM Steps ###

In [4]:
# Import Statements
import matplotlib.pyplot as plt
%matplotlib inline 
# TODO: matplotlib is not used in the cells below so far; remove the import and the magic above unless plots are added.

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [5]:
# Check the properties of the dataframe.
# `shape` is (rows, columns); the columns are the feature columns plus "output".
input.shape

(7708184, 8)

In [6]:
# Let's divide the data into the 'x' and 'y' variables.

# X will contain all of the features. In this case, it will include all of the channels/nodes.
# drop() returns a new frame, so `input` keeps its "output" column.
X = input.drop(columns=['output'])
# Only a cell's last expression is rendered, so this preview is not displayed.
X.head(5)

# Y will contain the target variable, which is the level of concentration.
# Labels were encoded during preprocessing as 3 = focussed, 2 = unfocussed, 1 = drowsed.
Y = input['output']
Y.head(5)

0    3
1    3
2    3
3    3
4    3
Name: output, dtype: int64

In [7]:
# Scaling (for Easier Training)
# Keep the fitted scaler: any new recording has to be transformed with this
# same scaler before it is passed to a model trained on the scaled features.
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, so test-set statistics leak into training — consider fitting it on the
# training portion only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))

X.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,0.292743,-0.33195,-0.025786,1.405936,0.08018,-0.072995,-0.772793
1,0.283083,-0.345186,0.017198,1.398873,0.061899,-0.134863,-0.747703
2,0.302403,-0.384894,0.011825,1.434189,0.107601,-0.240924,-0.814608
3,0.263764,-0.431219,-0.031159,1.405936,-0.002083,-0.258601,-0.906603
4,0.234784,-0.378276,-0.031159,1.335304,-0.166608,-0.223247,-0.848061


In [8]:
# Split into Training and Testing
# For this project, the test size will be 30% of the total data set.
# A fixed random_state makes the shuffle, and therefore every metric reported
# below, reproducible across kernel restarts.
SPLIT_SEED = 42

X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X, Y, test_size=0.30, random_state=SPLIT_SEED
)

In [None]:
### Training and Testing Steps ###

In [9]:
# One-call helper: train an SVC with the chosen kernel, then score it on the test split.
# Only the kernel is configurable here; gamma and regularization (C) stay at the SVC defaults.
def runSVC(myKernel, X_Train, Y_Train, X_Test, Y_Test):
    """Fit an SVC with ``myKernel``, print its test metrics, and return it.

    Parameters
    ----------
    myKernel : str
        Kernel name forwarded to ``SVC(kernel=...)``.
    X_Train, Y_Train
        Training features and labels passed to ``fit``.
    X_Test, Y_Test
        Held-out features and labels used for the printed evaluation.

    Returns
    -------
    The fitted classifier.
    """
    model = SVC(kernel=myKernel)
    model.fit(X_Train, Y_Train)
    print("Model has been trained successfully.")

    predictions = model.predict(X_Test)
    print("Model has made its predictions.")

    # Confusion matrix, per-class report, then overall accuracy — in that order.
    for metric in (confusion_matrix, classification_report, accuracy_score):
        print(metric(Y_Test, predictions))
    return model

In [None]:
# To Run with Linear Kernel
# Fits and scores a linear-kernel SVC through runSVC; the fitted model is kept
# in `linear_model`.
linear_model = runSVC("linear", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# To Run with RBF Kernel
# Fits and scores an RBF-kernel SVC through runSVC; the fitted model is kept
# in `rbf_model`.
rbf_model = runSVC("rbf", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# To Run with Polynomial Kernel
# Fits and scores a polynomial-kernel SVC through runSVC; the fitted model is
# kept in `poly_model`.
poly_model = runSVC("poly", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# Now, look at all of the results to determine which of the kernels is performing the best.
# We can improve on that specific kernel-type by tuning the Gamma and Regularization/C parameters.

In [None]:
# Gamma is the simpler parameter to vary, so it is tuned first.
# Intended to be run ONCE, with the BEST kernel type found in the previous step.
def runSVCwithGamma(myKernel, X_Train, Y_Train, X_Test, Y_Test):
    """Fit an SVC with ``gamma='auto'``, print its test metrics, and return it.

    The earlier runs left gamma at its default; this variant switches it to
    'auto' so the two settings can be compared for the same kernel.

    Parameters
    ----------
    myKernel : str
        Kernel name forwarded to ``SVC(kernel=...)``.
    X_Train, Y_Train
        Training features and labels passed to ``fit``.
    X_Test, Y_Test
        Held-out features and labels used for the printed evaluation.

    Returns
    -------
    The fitted classifier.
    """
    # --- train ---
    auto_gamma_model = SVC(kernel=myKernel, gamma='auto')
    auto_gamma_model.fit(X_Train, Y_Train)
    print("Model has been trained successfully.")

    # --- predict ---
    predicted_labels = auto_gamma_model.predict(X_Test)
    print("Model has made its predictions.")

    # --- evaluate: confusion matrix, per-class report, overall accuracy ---
    evaluators = [confusion_matrix, classification_report, accuracy_score]
    for evaluate in evaluators:
        print(evaluate(Y_Test, predicted_labels))

    return auto_gamma_model

In [None]:
# To Run with Linear Kernel
# NOTE(review): per the scikit-learn SVC documentation, gamma applies only to
# the rbf, poly and sigmoid kernels, so this run presumably repeats the linear
# result obtained with runSVC — confirm before spending the training time.
linear_gamma_model = runSVCwithGamma("linear", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# To Run with RBF Kernel
# Fits and scores an RBF-kernel SVC with gamma='auto'; the fitted model is kept
# in `rbf_gamma_model`.
rbf_gamma_model = runSVCwithGamma("rbf", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# To Run with Polynomial Kernel
# SVC names the polynomial kernel "poly"; "polynomial" is rejected as an
# invalid kernel.
poly_gamma_model = runSVCwithGamma("poly", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# Now, let's compare your newest results (with Gamma on Auto) to the first time where Gamma was set to Scale.
# Decide which kernel-gamma combo performed the best.
# If necessary, vary the regularization (C) parameter.

In [None]:
def runSVCwithGammaAndC(myKernel, myGamma, X_Train, Y_Train, X_Test, Y_Test, c_values=(0.5, 5)):
    """Train and evaluate one SVC per regularization value in ``c_values``.

    SVC uses C=1 by default; this helper tries other values (any number > 0)
    for a fixed kernel/gamma pair so their test metrics can be compared.

    Parameters
    ----------
    myKernel : str
        Kernel name forwarded to ``SVC(kernel=...)``.
    myGamma : str or float
        Gamma setting forwarded to ``SVC(gamma=...)``.
    X_Train, Y_Train
        Training features and labels passed to ``fit``.
    X_Test, Y_Test
        Held-out features and labels used for the printed evaluation.
    c_values : iterable of numbers, optional
        Regularization values to try, in order. Defaults to ``(0.5, 5)``.

    Returns
    -------
    tuple
        The fitted classifiers, one per entry of ``c_values`` and in the same
        order (two models with the default values).
    """
    fitted_models = []
    for c_value in c_values:
        print("Trying with C={}".format(c_value))

        # Train the Algorithm
        svclassifier = SVC(kernel=myKernel, gamma=myGamma, C=c_value)
        svclassifier.fit(X_Train, Y_Train)
        print("Model has been trained successfully.")

        # Make Predictions
        Y_Pred = svclassifier.predict(X_Test)
        print("Model has made its predictions.")

        # Evaluate Predictions
        print(confusion_matrix(Y_Test, Y_Pred))
        print(classification_report(Y_Test, Y_Pred))
        print(accuracy_score(Y_Test, Y_Pred))

        fitted_models.append(svclassifier)

    return tuple(fitted_models)

In [None]:
# To Run with Linear Kernel
# Each call returns two fitted models: the first trained with C=0.5, the second with C=5.
# NOTE(review): per the scikit-learn SVC documentation, gamma applies only to
# the rbf, poly and sigmoid kernels, so the "auto" call presumably repeats the
# "scale" results for the linear kernel — confirm before running both.
linear_scale_c_model_1, linear_scale_c_model_2 = runSVCwithGammaAndC("linear", "scale", X_Train, Y_Train, X_Test, Y_Test)
linear_auto_c_model_1, linear_auto_c_model_2 = runSVCwithGammaAndC("linear", "auto", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# To Run with RBF Kernel
# Each call returns two fitted models: the first trained with C=0.5, the second with C=5.
# The two calls differ only in the gamma setting ("scale" vs "auto").
rbf_scale_c_model_1, rbf_scale_c_model_2 = runSVCwithGammaAndC("rbf", "scale", X_Train, Y_Train, X_Test, Y_Test)
rbf_auto_c_model_1, rbf_auto_c_model_2 = runSVCwithGammaAndC("rbf", "auto", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
# To Run with Polynomial Kernel
# SVC names the polynomial kernel "poly"; "polynomial" is rejected as an
# invalid kernel.
# Each call returns two fitted models: the first trained with C=0.5, the second with C=5.
poly_scale_c_model_1, poly_scale_c_model_2 = runSVCwithGammaAndC("poly", "scale", X_Train, Y_Train, X_Test, Y_Test)
poly_auto_c_model_1, poly_auto_c_model_2 = runSVCwithGammaAndC("poly", "auto", X_Train, Y_Train, X_Test, Y_Test)

In [None]:
### Exporting the File using Pickle ###
# So that no one has to wait to use the machine learning model.

In [None]:
# Import Statements
import pickle

In [None]:
# Use pickle to save the model.
myModel = linear_model ### Enter your best-performing model.
model_file_name = "Attention_Model.sav"
# The context manager flushes and closes the file even if pickling fails, so a
# partially written handle is never left open.
with open(model_file_name, "wb") as model_file:
    pickle.dump(myModel, model_file)

In [None]:
# Future Code to Load the Model
# Copy and paste it into the desired notebook.
# Only load model files you created yourself: unpickling can execute arbitrary code.
to_load_file_name = "Attention_Model.sav"
# Open the saved model file by its own name (not a leftover `filename` variable
# from an earlier loop) and close the handle once the model is loaded.
with open(to_load_file_name, "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Now, you can do whatever you want with it. (Predict and test with your own data.)

In [None]:
### Future Steps ###

In [None]:
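### Alternative: Using GridSearchCV to vary your parameters.
# TODO: look into sklearn.model_selection.GridSearchCV as a way to search kernel, gamma and C
# systematically instead of the manual runs above.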

In [None]:
### Potential Visualization ###

# I have not yet decided on whether I should visualize the current data as a way to determine which kernel is best.
# Go back to...