# Imports

In [2]:
import scipy.io
from pyedflib import highlevel
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import groupby
import csv
import pickle
from scipy.signal import butter, sosfilt, sosfiltfilt, sosfreqz
from scipy.signal import freqz, iirnotch, filtfilt
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import TransformerMixin, BaseEstimator
import random
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate

In [25]:
def time_of_seizure(df,column):
    list_of_time=[]
    start_seizure=[]
    end_seizure=[]
    start_seizure_seconds=[]
    end_seizure_seconds=[]
    if df[column][0]==1:
        list_of_time.append(0)
    for i in range(len(df)-1):    
        if df[column][i]!=df[column][i+1]:
            list_of_time.append(i+1)
    for j in range(len(list_of_time)):
        if j%2==0:
            start_seizure.append(list_of_time[j])
            start_seizure_seconds.append(list_of_time[j]//sampling_rate)
        else:
            end_seizure.append(list_of_time[j])
            end_seizure_seconds.append(list_of_time[j]//sampling_rate)
    return start_seizure,start_seizure_seconds, end_seizure, end_seizure_seconds

In [26]:
sampling_rate = 256
window_length = 20*sampling_rate

# Data Preparation

## Load the data

In [27]:
df = pd.read_csv('../data_modeling/data_modeling_patient_5.csv', index_col=[0])
df.head()

Unnamed: 0,EEG Fp1-Ref,EEG Fp2-Ref,EEG F7-Ref,EEG F3-Ref,EEG Fz-Ref,EEG F4-Ref,EEG F8-Ref,EEG T3-Ref,EEG C3-Ref,EEG Cz-Ref,EEG C4-Ref,EEG T4-Ref,EEG T5-Ref,EEG P3-Ref,EEG Pz-Ref,EEG P4-Ref,EEG T6-Ref,EEG O1-Ref,EEG O2-Ref,is_seizure_target
0,-1.419105,17.866398,26.912596,13.849781,0.33456,-8.870927,20.188353,8.470444,-16.098475,10.602408,20.922618,22.936879,-4.219819,-8.557335,-7.74193,16.791965,15.138854,-16.910205,-9.048158,1
1,-3.3182,41.790181,53.524897,22.539651,0.1124,-16.763359,39.730104,20.065797,-32.113944,19.72338,41.087818,46.729833,-7.158927,-15.070688,-13.714219,33.680687,30.321825,-30.656286,-15.350078,1
2,-8.260964,42.491697,48.742226,10.640972,-1.428029,-11.712904,37.064375,18.978916,-29.206768,17.656212,38.100061,43.876514,-5.428389,-14.246795,-12.121809,33.083871,29.851972,-25.687254,-10.224155,1
3,-3.282774,35.59319,52.041205,19.698493,-1.821554,-12.84716,40.719617,19.530675,-32.696911,18.75542,41.574157,48.381576,-6.566801,-14.805329,-12.654931,35.169712,32.560589,-28.299376,-11.631809,1
4,3.50518,32.131439,50.848944,19.133736,-2.570428,-13.868957,37.64464,10.022557,-31.576933,18.048527,39.006783,47.168211,-9.262223,-14.579787,-12.167261,34.712738,31.528349,-28.511808,-12.419453,1


## Train/test split

In [28]:
def split_dataframe(df):
    df_=df.copy()
    a=len(df)
    b=(a*30)//100
    df_train=df_[0:b]
    df_test=df_[b:a-1]
    df_test.reset_index(inplace=True)
    df_test.drop(columns="index",inplace=True)

    return df_train,df_test

In [29]:
df_train, df_test = split_dataframe(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.drop(columns="index",inplace=True)


# Feature Engineering

## Create time windows

### Functions

In [30]:
def create_random_windows(df):
    b = random.randint(0,len(df))
    m= random.randint(0,len(df))
    e= random.randint(0,len(df))
    
    d_f=df.copy()
    d_t_b=d_f[b:b+window_length]
    d_t_m=d_f[m:m+window_length]
    d_t_e=d_f[e-window_length:e]
    
    return d_t_b,d_t_m,d_t_e

In [31]:
def create_healthy_windows(df):
    list_s=time_of_seizure(df,"is_seizure_target")[0]
    list_h=time_of_seizure(df,"is_seizure_target")[2]
    b=list_h[0]
    d_f=df.copy()
    d_h_b=d_f[b:b+window_length]
    return d_h_b

In [32]:
def create_seizure_windows(df):
    list_s=time_of_seizure(df,"is_seizure_target")[0]
    list_h=time_of_seizure(df,"is_seizure_target")[2]
    b=list_s[0]
    d_f=df.copy()
    d_s_b=d_f[b:b+window_length]
    return d_s_b

### Data

In [44]:
d_train = {}
d_train['random_1'] = create_random_windows(df_train)[0]
d_train['random_2'] = create_random_windows(df_train)[1]
d_train['random_3'] = create_random_windows(df_train)[2]
d_train['healthy'] = create_healthy_windows(df_train)
d_train['seizure'] = create_seizure_windows(df_train)

In [45]:
d_test = {}
d_test['random_1'] = create_random_windows(df_test)[0]
d_test['random_2'] = create_random_windows(df_test)[1]
d_test['random_3'] = create_random_windows(df_test)[2]
d_test['healthy'] = create_healthy_windows(df_test)
d_test['seizure'] = create_seizure_windows(df_test)

## Flatten the data

### Function

In [46]:
def flatten_window(window_df):
    if len(np.unique(window_df.iloc[:,-1])) == 1:
        target = window_df.iloc[0,-1]
    else:
        target = 1
    t_df = window_df.drop(columns = "is_seizure_target").transpose()
    flatten = pd.DataFrame(np.array(t_df).reshape(1,t_df.shape[0]*t_df.shape[1]))
    flatten["Target"] = target
    return flatten

### Data

In [47]:
for key in d_train.keys():
    d_train[key] = flatten_window(d_train[key])
    
for key in d_test.keys():
    d_test[key] = flatten_window(d_test[key])

## Full data input

### Function

In [48]:
def concatenate_windows(d):
    full_data = pd.concat([d[key] for key in d.keys()], axis=0)
    return full_data

### Data

In [49]:
data_train = concatenate_windows(d_train)
data_test = concatenate_windows(d_test)

In [50]:
data_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,97271,97272,97273,97274,97275,97276,97277,97278,97279,Target
0,-3.504507,-4.404639,-4.71947,-5.553493,-5.380312,-5.088171,-5.733189,-5.395203,-6.55925,-5.070764,...,-16.932721,-15.82209,-13.34432,-11.781941,-11.080658,-10.042773,-8.223471,-7.741448,-7.507545,1
0,41.926782,43.057444,44.773655,44.428517,44.816103,43.841907,41.639629,40.844914,38.396884,38.266974,...,33.271569,34.409193,36.978954,40.606628,42.896409,43.131918,41.250094,41.440607,39.895724,1
0,-2.39644,-4.154917,-11.0372,-4.105408,1.139535,1.108065,-2.234081,0.352371,1.416465,-0.912776,...,14.670116,12.374467,12.860901,14.572197,18.792738,24.775065,32.16422,38.733359,41.727417,1
0,33.534217,30.291892,27.202703,24.273232,20.273816,16.715424,13.403426,11.146554,8.638855,6.020464,...,8.694963,7.131006,6.649818,6.161754,5.195049,5.343116,6.107329,6.217539,6.439541,0
0,-1.419105,-3.3182,-8.260964,-3.282774,3.50518,0.131259,-1.403005,-1.941339,-2.358277,-12.566565,...,14.39291,16.208991,17.628799,19.663654,21.381785,21.911337,22.270192,22.287,22.556942,1


# Baseline model

The baseline model predicts that all the windows will have the most frequent target values

In [51]:
baseline_data = flatten_window(df[0:window_length])

for i in range(1,int(len(df)/window_length)):
    baseline_i = df[window_length*i:window_length*(i+1)]
    baseline_i_flat = flatten_window(baseline_i)
    baseline_data = pd.concat([baseline_data, baseline_i_flat], axis=0)

In [52]:
baseline_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,97271,97272,97273,97274,97275,97276,97277,97278,97279,Target
0,-1.419105,-3.3182,-8.260964,-3.282774,3.50518,0.131259,-1.403005,-1.941339,-2.358277,-12.566565,...,14.39291,16.208991,17.628799,19.663654,21.381785,21.911337,22.270192,22.287,22.556942,1
0,-14.649929,-15.3247,-14.581644,-14.076143,-13.615366,-12.028638,-9.9038,-9.671645,-8.494336,-6.36307,...,-19.683975,-22.723173,-25.46442,-25.616674,-26.387405,-27.931764,-29.367662,-32.38379,-34.573618,1
0,17.923109,18.250612,18.611758,17.865052,18.993725,19.986866,19.669725,19.476448,19.003185,17.76777,...,3.083907,5.577475,9.187187,12.051445,13.707296,14.99596,12.315302,9.341502,4.091622,1
0,-21.165243,-15.643251,-12.695072,-9.445013,-6.675025,-6.753497,-5.852923,-5.422566,-6.500006,-7.659416,...,-3.354696,-5.287242,-6.514407,-8.963307,-10.799737,-12.882502,-15.531327,-19.126361,-22.488253,1
0,17.907917,15.509094,13.547411,13.013918,12.535053,11.372206,11.47932,9.35801,7.382157,6.051535,...,8.879206,9.495799,8.699131,9.13383,8.999082,8.112023,8.210826,8.202021,8.409693,1


In [53]:
print(f"Baseline score: {round(max(baseline_data['Target'].value_counts())/len(baseline_data)*100,1)}%")

Baseline score: 86.5%


# Modeling

## Implement models

In [54]:
X_train = data_train.drop('Target', axis=1)
y_train = data_train['Target']

# Instantiate the model
log_model = LogisticRegression()
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the model on the data
log_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train);

## Model evaluation

In [55]:
X_test = data_test.drop('Target', axis=1)
y_test = data_test['Target']

In [59]:
# Accuracy
log_score = accuracy_score(y_test, log_model.predict(X_test))
knn_score =  accuracy_score(y_test, knn_model.predict(X_test))

print(f"Accuracy {log_model}: {log_score}")
print(f"Accuracy {knn_model}: {knn_score}")

Accuracy LogisticRegression(): 0.4
Accuracy KNeighborsClassifier(): 0.6


In [60]:
# Confusion matrix - Model 1

y_true = y_test
y_pred = log_model.predict(X_test)

results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})

confusion_matrix_log = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])

confusion_matrix_log

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,2
1,1,2


In [61]:
# Confusion matrix - Model 2

y_true = y_test
y_pred = knn_model.predict(X_test)

results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})

confusion_matrix_knn = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])

confusion_matrix_knn

predicted,1
actual,Unnamed: 1_level_1
0,2
1,3
