Loads the EEG labels and basic per-subject EEG features, then evaluates baseline classifiers on this first round of EEG features using nested cross-validation.

In [38]:
import os

import pandas as pd
import numpy as np

from sklearn.decomposition import KernelPCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold,KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [21]:
# Load the per-subject label table; later cells read its "Subject" and "Late Seizure Label" columns.
label_df=pd.read_csv('../../../../../data/raw/EEG/EEG_labels.csv')

In [22]:
# Target vector: one entry per row of label_df, in the same order as label_df["Subject"].
y = label_df["Late Seizure Label"].to_numpy()

# Only the three event totals are used for the initial classifier. The duration rows
# ("Duration of Seizure", "Total Duration (waxing waning PDs)",
# "Total Duration (waxing waning RDAs)") are left out because 3_17_0007 and 3_17_0012
# don't have the full set of duration measurements.
event_types = ["Seizure", "PDs", "RDAs"]

names = []
# One row per event type, one column per subject. The row count is taken from
# event_types so the matrix stays in sync if events are added or removed.
features = np.zeros((len(event_types), len(label_df["Subject"])))
# NOTE(review): these three lists are never filled in this cell; they are kept only
# in case other cells rely on the names existing.
seizure = []
pds = []
rdas = []

for name_ind, name in enumerate(label_df["Subject"]):
    feature_df = pd.read_csv(f'../../../../../data/processed/EEG/{name}.csv')
    names.append(name)

    for index, event in enumerate(event_types):
        # Rows are identified by the "Unnamed: 0" column; use the "Total" value of
        # the first row whose label matches this event.
        event_totals = feature_df.loc[feature_df["Unnamed: 0"] == event, "Total"]
        if event_totals.empty:
            # Fail with the subject and event named instead of a bare IndexError from .iat[0].
            raise ValueError(f"No '{event}' row found in the feature file for subject {name}")
        features[index, name_ind] = event_totals.iat[0]


In [23]:
# Show the feature matrix: one row per event type, one column per subject.
print(features)

[[0. 0. 2. 0. 0. 0. 0. 4. 0. 0.]
 [0. 0. 0. 1. 0. 6. 0. 6. 5. 7.]
 [0. 0. 3. 2. 0. 7. 0. 1. 0. 0.]]


In [24]:
# Show the label vector read from the "Late Seizure Label" column.
print(y)

[1 1 1 0 0 0 1 1 1 1]


In [44]:
def compare_features(x,y,cv_inner,cv_outer,score_string,feature_string,random_state=42):
    '''Print nested cross-validation scores for three baseline classifiers.

    For each of Logistic Regression, SVC and Random Forest, a pipeline of
    StandardScaler followed by the classifier is tuned with GridSearchCV on
    ``cv_inner`` and that search is scored with cross_val_score on ``cv_outer``.
    The mean and standard deviation of the outer-fold scores are printed.

    Parameters
    ----------
    x : feature matrix passed to cross_val_score (one row per sample).
    y : target labels passed to cross_val_score.
    cv_inner : cross-validation splitter used inside GridSearchCV.
    cv_outer : cross-validation splitter used by cross_val_score.
    score_string : scoring name used for both the grid search and the outer scoring.
    feature_string : label for the feature set, inserted into the printed headers.
    random_state : seed handed to the SVC and Random Forest estimators so that,
        for fixed splitters, repeated runs print the same scores. Pass ``None``
        to leave those estimators unseeded.

    Returns
    -------
    None. Results are only printed.
    '''
    # (printed label, pipeline step name, estimator, hyper-parameter grid).
    # Grid keys are prefixed with the step name, as Pipeline requires.
    candidates = [
        ("Logistic Regression", "log", LogisticRegression(),
         {"log__C": [.1, 1, 10]}),
        ("SVM", "svm", SVC(probability=True, random_state=random_state),
         {"svm__C": [1, 10, 100], "svm__gamma": [.01, .1]}),
        ("RF", "rf", RandomForestClassifier(random_state=random_state),
         {"rf__n_estimators": [10, 50, 100]}),
    ]

    for label, step_name, estimator, param_grid in candidates:
        pipe = Pipeline([("scale", StandardScaler()), (step_name, estimator)])

        # Inner loop: choose hyper-parameters on cv_inner and refit the best setting.
        search = GridSearchCV(estimator=pipe, param_grid=param_grid,
                              scoring=score_string, cv=cv_inner, refit=True)
        # Outer loop: score the whole tuning procedure on cv_outer.
        scores = cross_val_score(search, x, y, scoring=score_string,
                                 cv=cv_outer, n_jobs=-1)

        print(f"Score {feature_string} for {label}")
        print(f"Mean {scores.mean()} and STD {scores.std()}")

In [40]:
# Both splitters use two shuffled folds with the same fixed seed; only the outer
# splitter stratifies on the labels.
# NOTE(review): the inner KFold is not stratified, so with very few samples an inner
# training fold may end up with a single class — confirm this is intended.
_fold_options = {"n_splits": 2, "shuffle": True, "random_state": 42}
cv_outer = StratifiedKFold(**_fold_options)
cv_inner = KFold(**_fold_options)

In [51]:
# `features` is stored as (event types x subjects); transpose it so each row is one subject.
compare_features(features.transpose(),y,cv_inner,cv_outer,"f1","All Events")

Score All Events for Logistic Regression
Mean 0.8194444444444444 and STD 0.06944444444444453
Score All Events for SVM
Mean 0.8194444444444444 and STD 0.06944444444444453
Score All Events for RF
Mean 0.5416666666666666 and STD 0.2083333333333333
