# Imports, variables, functions

In [1]:
import scipy.io
from pyedflib import highlevel
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import groupby
import csv
import pickle
from scipy.signal import butter, sosfilt, sosfiltfilt, sosfreqz
from scipy.signal import freqz, iirnotch, filtfilt
from sklearn.preprocessing import MinMaxScaler
from sklearn.base import TransformerMixin, BaseEstimator
import random
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from scipy import signal

In [2]:
sample_rate = sampling_rate = 256
sec = 10
len_window = sample_rate*sec
overlap = 5
threshold = 2*sample_rate
sample_rate_downsample = int(0.1*sample_rate)
len_window_downsample = sample_rate_downsample*sec

In [3]:
# Load annotation file
annt = scipy.io.loadmat('../raw_data/annotations_2017.mat')

In [4]:
## -- PREPROCESSING FUNCTIONS --

# Highpass filter
def highpass_filter(signals, sampling_rate, hp_frequency = 0.1):
    sos = butter(N = 3, Wn = hp_frequency, btype="highpass",fs=sampling_rate, output="sos")
    filter_hp = sosfiltfilt(sos, signals)
    return filter_hp

# Powerline filter
def notch_filter(signals, sampling_rate, notch_frequency = 50, quality_factor = 30):
    w0 = notch_frequency/(sampling_rate/2)
    b_notch, a_notch = iirnotch(w0, quality_factor)
    filter_notch = filtfilt(b_notch, a_notch, signals, axis = -1)
    return filter_notch

# Create our own scaler
class CustomTranformer(TransformerMixin, BaseEstimator): 
    # BaseEstimator generates the get_params() and set_params() methods that all Pipelines require
    # TransformerMixin creates the fit_transform() method from fit() and transform()
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        self.means = X.mean()
        return self
    
    def transform(self, X, y=None):
        norm_features = X - self.means
        return norm_features

# Combination of all filters and Scaler
def filter_signals(signals, sampling_rate, scaler, hp_frequency = 0.1, notch_frequency = 50, quality_factor = 30):
    filter_hp = highpass_filter(signals, sampling_rate)
    filter_notch = notch_filter(signals, sampling_rate, notch_frequency, quality_factor)
    final_signal = scaler.fit_transform(filter_notch)
    return final_signal

## -- LABEL FUNCTIONS --

# Format the EEG 
def eeg_formated(signals, names_ele):
    data_signals = signals.T # transpose the signals from datapoints
    data_signals = pd.DataFrame(data_signals) # create a pandas dataframe
    
    data_signals.columns = names_ele # rename columns
    
    return data_signals

# Format the annotations
def diagnosis(n):
    patient_A=annt["annotat_new"][0][n-1][0]
    patient_B=annt["annotat_new"][0][n-1][1]
    patient_C=annt["annotat_new"][0][n-1][2]
    
    #converting seconds to datapoints

    patient_A=patient_A.tolist()
    patient_B=patient_B.tolist()
    patient_C=patient_C.tolist()
    
    patient_A_dtp=[]
    patient_B_dtp=[]
    patient_C_dtp=[]  
    for elem in patient_A:
        for i in range(sampling_rate):
            patient_A_dtp.append(elem) 
    for elem in patient_B:
        for i in range(sampling_rate):
            patient_B_dtp.append(elem)
        
    for elem in patient_C:
        for i in range(sampling_rate):
            patient_C_dtp.append(elem)
            
    target_=pd.DataFrame({"Diagnosis A":patient_A_dtp,"Diagnosis B":patient_B_dtp,"Diagnosis C":patient_C_dtp})
    
    return target_  

# Add a time column with the seconds
def add_time(df):
    list_time=[]
    for i in range(len(df)):
        list_time.append(i//sampling_rate)
    df["time"]=list_time
    return df

# Create target variables when seizures lasts at least 10
def is_seizure(df):
    
    threshold = sampling_rate*10
    
    df['is_seizure_A'] = df["Diagnosis A"].groupby((df["Diagnosis A"] != df["Diagnosis A"].shift()).cumsum()).transform('size') * df["Diagnosis A"]
    df['is_seizure_A'] = (df['is_seizure_A'] > threshold).astype(int)
    
    df['is_seizure_B'] = df["Diagnosis B"].groupby((df["Diagnosis B"] != df["Diagnosis B"].shift()).cumsum()).transform('size') * df["Diagnosis B"]
    df['is_seizure_B'] = (df['is_seizure_B'] > threshold).astype(int)
    
    df['is_seizure_C'] = df["Diagnosis C"].groupby((df["Diagnosis C"] != df["Diagnosis C"].shift()).cumsum()).transform('size') * df["Diagnosis C"]
    df['is_seizure_C'] = (df['is_seizure_C'] > threshold).astype(int)
    
    return df 

# Create final target
def create_target(df):
    df['is_seizure_target'] = np.where(df['is_seizure_A'] + df['is_seizure_B'] + df['is_seizure_C'] >= 2, 1, 0)
    return df

# Remove useless
def remove_useless_columns(df):
    df.drop(columns=['Diagnosis A', 'Diagnosis B', 'Diagnosis C', 'is_seizure_A', 'is_seizure_B', 'is_seizure_C', 'ECG EKG', 'Resp Effort', 'time'], inplace=True)
    return df

# Final function to label
def label_data(path_raw_data, signals_preprocessed, n):
    
    signals, signal_headers, header = highlevel.read_edf(path_raw_data)
    
    names_ele = [signal_headers[iele]['label'] for iele in range(signals.shape[0])] # extract electrode names
    
    eeg_patient = eeg_formated(signals_preprocessed, names_ele) # format the ECG
    eeg_patient.rename(columns={'ECG EKG-REF':'ECG EKG', 'Resp Effort-REF':'Resp Effort'}, inplace=True)
    
    diagnosis_patient = diagnosis(n) # format the diagnosis
    
    data_patient = pd.merge(left=eeg_patient, right=diagnosis_patient, how='left', left_index=True, right_index=True) # merge ecg and diagnosis
    
    add_time(data_patient)
    is_seizure(data_patient)
    create_target(data_patient)
    remove_useless_columns(data_patient)
    
    return data_patient

def preprocess_and_label(path_raw_data, scaler, patient_number, Fournier=False):
    
    # Load raw data
    signals, signal_headers, header = highlevel.read_edf(path_raw_data)
    
    # Preprocess data 
    signals_preprocessed = filter_signals(signals, sampling_rate, scaler, hp_frequency = 0.1, notch_frequency = 50, quality_factor = 30)
    
    if Fournier == True:
        signals_preprocessed = pd.DataFrame(np.array([abs(rfft(signals_preprocessed[i])) for i in range(len(signals_preprocessed))]))
        
    # Label data
    df = label_data(path_raw_data, signals_preprocessed, patient_number)
    
    return df

## -- MODEL FUNCTIONS --

def flatten_window(window_df):
    if len(np.unique(window_df.iloc[:,-1])) == 1:
        target = window_df.iloc[0,-1]
    elif np.unique(window_df.iloc[:,-1],return_counts=True)[1][1] >= threshold:
        target = 1
    else:
        target = 0
    t_df = window_df.drop(columns = "is_seizure_target").transpose()
    flatten = pd.DataFrame(np.array(t_df).reshape(1,t_df.shape[0]*t_df.shape[1]))
    flatten["Target"] = target
    return flatten

def create_data_input(df):
    data = np.array([flatten_window(df.iloc[i:i+len_window+1]) for i in range(0,len(df)-len_window, overlap*sample_rate)])
    r=data.shape[0]
    c=data.shape[2]
    
    data = pd.DataFrame(data.reshape(r,c))
    X = data.iloc[:,:-1]
    y = data.iloc[:,-1]
    return X, y

def oversampling(X, y): 
    sm = SMOTE(sampling_strategy='minority', random_state=7)
    X, y = sm.fit_resample(X, y)
    return X, y

def test_new_data(path_raw_new_data, scaler, patient_number, model_fit, Fournier=False):
    
    # Preprocess and label new data
    df_new = preprocess_and_label(path_raw_data, scaler, patient_number, Fournier=Fournier)
    
    X_new, y_new =  create_data_input(df_new)
    
    # Dataframe true vs predict
    y_true = y_new
    y_pred = model_fit.predict(X_new)
    
    results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})
    
    confusion_matrix = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])
    
    return confusion_matrix

# Data Preprocessing

In [5]:
patients = [2, 3, 6, 10, 1, 13, 14, 18, 19, 78]

d = {}
for i in patients:
    df_i = preprocess_and_label(f"../raw_data/eeg{i}.edf", CustomTranformer(), i, Fournier=False)
    d[i] = df_i
    d[i].columns= d[i].columns.str.lower()

In [6]:
d[2].head()

Unnamed: 0,eeg fp1-ref,eeg fp2-ref,eeg f3-ref,eeg f4-ref,eeg f7-ref,eeg f8-ref,eeg fz-ref,eeg c3-ref,eeg c4-ref,eeg cz-ref,eeg t3-ref,eeg t5-ref,eeg t4-ref,eeg t6-ref,eeg p3-ref,eeg p4-ref,eeg pz-ref,eeg o1-ref,eeg o2-ref,is_seizure_target
0,-98.645167,-101.167312,-150.805429,-99.376346,-83.182367,-74.78867,20.013543,-143.138792,-87.463916,-77.354371,-139.180264,-9.668076,-84.621197,-61.668935,-7.265924,-127.848488,-58.116788,-140.756929,-126.694843,0
1,-98.821001,-101.178549,-152.318462,-100.563842,-81.687831,-76.560804,20.228367,-146.428835,-89.349899,-78.247286,-139.842162,-11.709217,-63.372738,-63.000002,-9.963888,-129.14847,-61.498554,-143.333635,-129.172922,0
2,-97.067707,-99.46784,-151.524433,-99.432681,-81.461237,-77.183123,19.782924,-145.09298,-88.052963,-77.63894,-157.247211,-12.517739,-78.761883,-61.846963,-11.261941,-127.605515,-61.65199,-142.162175,-126.79794,0
3,-98.678719,-101.018678,-153.656575,-102.018683,-83.169448,-79.906383,19.882898,-147.323366,-89.776689,-78.882967,-138.599045,-10.345296,-68.253342,-63.357444,-11.514185,-130.261691,-62.997133,-144.038379,-128.631423,0
4,-97.898545,-100.469111,-151.807918,-101.011273,-80.232267,-79.79265,19.633745,-145.928515,-88.713656,-77.980124,-144.491543,-10.629773,-71.803903,-64.094909,-12.26519,-128.8469,-63.588286,-142.847917,-128.00232,0


# Feature Engineering

## Downsampling

In [8]:
def downsampling(df):
    
    df_downsample = pd.DataFrame()
    
    num = int(0.1*len(df))
    t = np.array(range(0, len(df), sample_rate))
    
    for i, column in enumerate(df.columns[:-1]):
        x = np.array(df.iloc[:,i])
        x_resampled = signal.resample(x, num, t=t)
        df_downsample[column] = x_resampled[0]
    
    index = ((x_resampled[1]/10)/sample_rate).astype(int)
    df_downsample['target'] = df.iloc[index, -1]
        
    return df_downsample

In [9]:
downsampling(d[19])['target'].value_counts(normalize=True)

0    0.891183
1    0.108817
Name: target, dtype: float64

In [10]:
d[19]['is_seizure_target'].value_counts(normalize=True)

0    0.746724
1    0.253276
Name: is_seizure_target, dtype: float64

In [11]:
d_downsample = {}

for i in patients:
    d_downsample[i] = downsampling(d[i])

## Flatten

In [12]:
def flatten(window_df):
    if len(np.unique(window_df.iloc[:,-1])) == 1:
        target = window_df.iloc[0,-1]
    elif np.unique(window_df.iloc[:,-1],return_counts=True)[1][1] >= threshold:
        target = 1
    else:
        target = 0
    t_df = window_df.drop(columns = "target").transpose()
    flatten = pd.DataFrame(np.array(t_df).reshape(1,t_df.shape[0]*t_df.shape[1]))
    flatten["Target"] = target
    return flatten

def flatten_dataframe(df):
    data = np.array([flatten(df.iloc[i:i+len_window_downsample]) for i in range(0,len(df)-len_window_downsample, overlap*sample_rate_downsample)])
    r=data.shape[0]
    c=data.shape[2]
    data = pd.DataFrame(data.reshape(r,c))
    return data

In [13]:
d_downsample_flat = {}

for i in patients:
    d_downsample_flat[i] = flatten_dataframe(d_downsample[i])

In [14]:
d_downsample_flat[14]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4760,4761,4762,4763,4764,4765,4766,4767,4768,4769
0,-14.519347,-14.407475,-11.132750,-18.233438,3.179335,32.260799,32.912941,80.638027,107.640260,122.325819,...,-12.989435,-32.138316,-41.430821,-74.010391,-96.553281,-114.767518,-123.114037,-127.014764,-119.919445,0.0
1,-11.715052,-16.833813,-17.577628,-8.003384,-16.656204,-22.829802,-53.247493,-57.343997,-24.927310,-25.728962,...,13.296643,12.963882,30.561974,25.612745,-0.930472,-10.624787,-24.136070,-25.952611,-24.134601,0.0
2,-142.173169,-140.803962,-118.543185,-56.835492,-76.211400,-92.857397,-49.028413,-36.152929,-69.239353,-79.050516,...,-56.752340,-19.370957,-4.904450,-0.797279,16.166860,30.665713,22.727778,-1.830691,-2.126597,0.0
3,-13.354724,-19.094202,0.611412,10.582678,17.395071,33.926745,46.804657,16.052864,-19.752532,-51.833971,...,12.633344,17.557056,30.132587,28.866146,39.203619,65.546300,67.466272,57.443768,57.192405,0.0
4,10.218606,15.151077,19.243511,3.237212,-31.428835,-39.854032,-46.080836,-40.878366,-32.012381,-41.810033,...,-28.859493,-12.713244,4.012261,14.278264,-2.335471,5.139636,3.191670,-1.785818,6.659276,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
757,106.836971,99.049276,113.536903,83.163710,23.544614,-1.856794,-13.577714,-19.644439,0.199050,11.941309,...,-15.577278,6.015080,15.607177,-7.039668,-2.653134,22.793792,44.443933,82.295276,36.969412,1.0
758,26.434818,40.541517,31.220224,16.106791,-1.920567,10.605341,43.431552,31.990120,44.775687,30.457607,...,-18.984085,-19.200240,-18.984658,-19.199676,-18.985213,-19.199130,-18.985748,-19.198606,-18.986261,1.0
759,11.931849,-9.313195,-45.040186,-11.261313,36.510162,55.176546,66.361038,67.129372,55.652300,62.713172,...,-19.192967,-18.991606,-19.193047,-18.991527,-19.193126,-18.991450,-19.193201,-18.991376,-19.193274,1.0
760,-19.015104,-19.168922,-19.016262,-19.167798,-19.017354,-19.166737,-19.018384,-19.165735,-19.019359,-19.164788,...,-10.575840,1.692001,20.358976,20.766966,13.582094,4.611008,17.917299,44.033149,69.464035,1.0


## Concatenate

In [15]:
df = pd.concat([d_downsample_flat[i] for i in patients])
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4760,4761,4762,4763,4764,4765,4766,4767,4768,4769
0,-51.790487,-108.508792,-92.041544,-108.826928,-100.990542,-108.198593,-103.567925,-101.66193,-95.585531,-104.048972,...,76.980718,73.601517,72.33246,81.620938,97.614028,101.950804,117.735185,126.558241,127.614301,0.0
1,16.547354,17.298428,7.587798,15.155318,3.76578,9.679039,10.028958,-5.59268,-7.665993,-10.232078,...,-33.390074,-35.06817,-17.276322,1.167471,13.587604,27.349096,31.310101,37.870772,22.254086,0.0
2,45.284156,31.890903,36.403496,30.405699,16.477934,15.40994,40.087373,53.381421,43.671478,32.200664,...,-11.292246,-26.361058,-38.707241,-31.545073,-42.722679,-52.906327,-49.901224,-65.750416,-79.51369,0.0
3,63.074285,61.859114,53.553447,83.209064,95.865771,69.037228,68.75916,22.428531,38.665883,22.764212,...,6.730311,2.415929,-7.526873,7.44765,7.204598,21.905634,30.744237,31.612091,27.159108,0.0
4,17.77062,14.368514,12.620195,13.836522,12.683923,6.049505,-2.571634,-2.150631,3.986595,7.247816,...,7.059674,1.18586,-1.604514,3.964773,8.127262,22.815604,28.171243,34.704224,34.725902,0.0


In [16]:
def create_x_and_y(df):
    X = df.iloc[:,:-1]
    y = df.iloc[:,-1]
    return X,y

In [17]:
X, y = create_x_and_y(df)

# Modeling

In [18]:
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
    
# Balancing
X_train, y_train = oversampling(X_train, y_train)

In [None]:
# Fit model
model = SVC(kernel='linear', C=10)
model.fit(X_train, y_train)

In [None]:
# Preprocess and label new data
path_raw_new_data = "../raw_data/eeg5.edf"
df_new = preprocess_and_label(path_raw_new_data, CustomTranformer(), 5, Fournier=False)

# Downsampling new data
df_new_downsample = downsampling(df_new)

# Flatten new data
df_new_downsample_flat = flatten_dataframe(df_new_downsample)

In [None]:
# Confusion matrix
y_true = df_new_downsample_flat.iloc[:,-1]
y_pred = model.predict(df_new_downsample_flat.iloc[:,:-1])
    
results_df = pd.DataFrame({"actual": y_true,
                           "predicted": y_pred})
    
confusion_matrix = pd.crosstab(index= results_df['actual'],
                               columns = results_df['predicted'])
    
confusion_matrix