# Imports

In [120]:
import scipy.io
from pyedflib import highlevel
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import groupby
import csv
import pickle
from scipy.signal import butter, sosfilt, sosfiltfilt, sosfreqz
from scipy.signal import freqz, iirnotch, filtfilt
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import TransformerMixin, BaseEstimator
import random
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC

# Variables

In [76]:
sample_rate = 256
sec = 10
len_window = sample_rate*sec
overlap = 5

# Data Loading

In [23]:
df = pd.read_csv('../data_modeling/data_modeling_patient_5.csv', index_col=[0])
df.head()

Unnamed: 0,EEG Fp1-Ref,EEG Fp2-Ref,EEG F7-Ref,EEG F3-Ref,EEG Fz-Ref,EEG F4-Ref,EEG F8-Ref,EEG T3-Ref,EEG C3-Ref,EEG Cz-Ref,EEG C4-Ref,EEG T4-Ref,EEG T5-Ref,EEG P3-Ref,EEG Pz-Ref,EEG P4-Ref,EEG T6-Ref,EEG O1-Ref,EEG O2-Ref,is_seizure_target
0,-1.419105,17.866398,26.912596,13.849781,0.33456,-8.870927,20.188353,8.470444,-16.098475,10.602408,20.922618,22.936879,-4.219819,-8.557335,-7.74193,16.791965,15.138854,-16.910205,-9.048158,1
1,-3.3182,41.790181,53.524897,22.539651,0.1124,-16.763359,39.730104,20.065797,-32.113944,19.72338,41.087818,46.729833,-7.158927,-15.070688,-13.714219,33.680687,30.321825,-30.656286,-15.350078,1
2,-8.260964,42.491697,48.742226,10.640972,-1.428029,-11.712904,37.064375,18.978916,-29.206768,17.656212,38.100061,43.876514,-5.428389,-14.246795,-12.121809,33.083871,29.851972,-25.687254,-10.224155,1
3,-3.282774,35.59319,52.041205,19.698493,-1.821554,-12.84716,40.719617,19.530675,-32.696911,18.75542,41.574157,48.381576,-6.566801,-14.805329,-12.654931,35.169712,32.560589,-28.299376,-11.631809,1
4,3.50518,32.131439,50.848944,19.133736,-2.570428,-13.868957,37.64464,10.022557,-31.576933,18.048527,39.006783,47.168211,-9.262223,-14.579787,-12.167261,34.712738,31.528349,-28.511808,-12.419453,1


# Feature Engineering

## Flatten and concatenate the data

In [103]:
def flatten_window(window_df):
    if len(np.unique(window_df.iloc[:,-1])) == 1:
        target = window_df.iloc[0,-1]
    else:
        target = 1
    t_df = window_df.drop(columns = "is_seizure_target").transpose()
    flatten = pd.DataFrame(np.array(t_df).reshape(1,t_df.shape[0]*t_df.shape[1]))
    flatten["Target"] = target
    return flatten

In [104]:
def create_data_input(df):
    data = np.array([flatten_window(df.iloc[i:i+len_window+1]) for i in range(0,len(df)-len_window, overlap*sample_rate)])
    r=data.shape[0]
    c=data.shape[2]
    
    data = pd.DataFrame(data.reshape(r,c))
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    return X, y

In [105]:
X_train, y_train = create_data_input(df_train)
X_test, y_test = create_data_input(df_test)

## Balancing the data

In [106]:
print(y_train.value_counts())

1.0    419
0.0    117
Name: 48659, dtype: int64


In [108]:
# Resample the minority class. You can change the strategy to 'auto' if you are not sure.
sm = SMOTE(sampling_strategy='minority', random_state=7)

# Fit the model to generate the data.
X_train, y_train = sm.fit_resample(X_train, y_train)

# Modeling

## Implement models

In [123]:
# Instantiate the model
log_model = LogisticRegression(max_iter=2000)
knn_model = KNeighborsClassifier(n_neighbors=5)
svc_model = svc = SVC(kernel='linear', C=10)

# Fit the model on the data
log_model.fit(X_train, y_train)
knn_model.fit(X_train, y_train)
svc_model.fit(X_train, y_train);

## Model evaluation

### Accuracy

In [125]:
# Accuracy
log_score = accuracy_score(y_test, log_model.predict(X_test))
knn_score =  accuracy_score(y_test, knn_model.predict(X_test))
svc_score =  accuracy_score(y_test, svc_model.predict(X_test))

print(f"Accuracy {log_model}: {log_score}")
print(f"Accuracy {knn_model}: {knn_score}")
print(f"Accuracy {svc_model}: {svc_score}")

Accuracy LogisticRegression(max_iter=2000): 0.5545851528384279
Accuracy KNeighborsClassifier(): 0.013100436681222707
Accuracy SVC(C=10, kernel='linear'): 0.7117903930131004


### Confusion matrix

In [126]:
# Confusion matrix - Model 1

y_true = y_test
y_pred = log_model.predict(X_test)

results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})

confusion_matrix_log = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])

confusion_matrix_log

predicted,0.0,1.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,102,127


In [127]:
# Confusion matrix - Model 2

y_true = y_test
y_pred = knn_model.predict(X_test)

results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})

confusion_matrix_knn = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])

confusion_matrix_knn

predicted,0.0,1.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,226,3


In [128]:
# Confusion matrix - Model 3

y_true = y_test
y_pred = svc_model.predict(X_test)

results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})

confusion_matrix_svc = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])

confusion_matrix_svc

predicted,0.0,1.0
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,66,163


### Cross-validation

In [129]:
# Cross-validation

cv_results_log = cross_validate(log_model, X_train, y_train, cv=5, 
                            scoring=['recall', 'f1'])

cv_results_knn = cross_validate(knn_model, X_train, y_train, cv=5, 
                            scoring=['recall', 'f1'])

cv_results_svc = cross_validate(svc_model, X_train, y_train, cv=5, 
                            scoring=['recall', 'f1'])

print(f"Recall {log_model}: {round(cv_results_log['test_recall'].mean(),3)}")
print(f"Recall {knn_model}: {round(cv_results_knn['test_recall'].mean(),3)}")
print(f"Recall {svc_model}: {round(cv_results_svc['test_recall'].mean(),3)}")

print(f"F1 score {log_model}: {round(cv_results_log['test_f1'].mean(),3)}")
print(f"F1 score {knn_model}: {round(cv_results_knn['test_f1'].mean(),3)}")
print(f"F1 score {svc_model}: {round(cv_results_svc['test_f1'].mean(),3)}")

Recall LogisticRegression(max_iter=2000): 0.723
Recall KNeighborsClassifier(): 0.0
Recall SVC(C=10, kernel='linear'): 0.792
F1 score LogisticRegression(max_iter=2000): 0.825
F1 score KNeighborsClassifier(): 0.0
F1 score SVC(C=10, kernel='linear'): 0.87


## Check for overfitting