In [1]:
import scipy.io
from pyedflib import highlevel
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from itertools import groupby
import csv
import pickle
from scipy.signal import butter, sosfilt, sosfiltfilt, sosfreqz
from scipy.signal import freqz, iirnotch, filtfilt
from sklearn.preprocessing import MinMaxScaler

# Variables and loading data

In [3]:
sampling_rate = 256

In [5]:
# Load annotation file
annt = scipy.io.loadmat("/root/code/mariaaraujovitoria/SeizurePredict/raw_data/annotations_2017.mat")

In [4]:
# Load the data 
signals, signal_headers, header = highlevel.read_edf('/home/lee/code/mariaaraujovitoria/SeizurePredict/raw_data/eeg6.edf')

In [6]:
signals.shape

(21, 1203968)

In [9]:
signals[:,:100].shape

(21, 100)

# Functions to preprocess

In [7]:
# Highpass filter
def highpass_filter(signals, sampling_rate, hp_frequency = 0.1):
    sos = butter(N = 3, Wn = hp_frequency, btype="highpass",fs=sampling_rate, output="sos")
    filter_hp = sosfiltfilt(sos, signals)
    return filter_hp

In [8]:
# Powerline filter
def notch_filter(signals, sampling_rate, notch_frequency = 50, quality_factor = 30):
    w0 = notch_frequency/(sampling_rate/2)
    b_notch, a_notch = iirnotch(w0, quality_factor)
    filter_notch = filtfilt(b_notch, a_notch, signals, axis = -1)
    return filter_notch

In [9]:
# Combination of all filters and MinMaxScaler
def filter_signals(signals, sampling_rate, hp_frequency = 0.1, notch_frequency = 50, quality_factor = 30):
    filter_hp = highpass_filter(signals, sampling_rate)
    filter_notch = notch_filter(signals, sampling_rate, notch_frequency, quality_factor)
    scaler = MinMaxScaler()
    final_signal = scaler.fit_transform(filter_notch)
    return final_signal

# Functions to label

In [10]:
# Format the EEG 
def eeg_formated(signals, names_ele):
    data_signals = signals.T # transpose the signals from datapoints
    data_signals = pd.DataFrame(data_signals) # create a pandas dataframe
    
    data_signals.columns = names_ele # rename columns
    
    return data_signals

In [11]:
# Format the annotations
def diagnosis(n):
    patient_A=annt["annotat_new"][0][n-1][0]
    patient_B=annt["annotat_new"][0][n-1][1]
    patient_C=annt["annotat_new"][0][n-1][2]
    
    #converting seconds to datapoints

    patient_A=patient_A.tolist()
    patient_B=patient_B.tolist()
    patient_C=patient_C.tolist()
    
    patient_A_dtp=[]
    patient_B_dtp=[]
    patient_C_dtp=[]  
    for elem in patient_A:
        for i in range(sampling_rate):
            patient_A_dtp.append(elem) 
    for elem in patient_B:
        for i in range(sampling_rate):
            patient_B_dtp.append(elem)
        
    for elem in patient_C:
        for i in range(sampling_rate):
            patient_C_dtp.append(elem)
            
    target_=pd.DataFrame({"Diagnosis A":patient_A_dtp,"Diagnosis B":patient_B_dtp,"Diagnosis C":patient_C_dtp})
    
    return target_  

In [12]:
# Add a time column with the seconds
def add_time(df):
    list_time=[]
    for i in range(len(df)):
        list_time.append(i//sampling_rate)
    df["time"]=list_time
    return df

In [13]:
# Create target variables when seizures lasts at least 10
def is_seizure(df):
    
    threshold = sampling_rate*10
    
    df['is_seizure_A'] = df["Diagnosis A"].groupby((df["Diagnosis A"] != df["Diagnosis A"].shift()).cumsum()).transform('size') * df["Diagnosis A"]
    df['is_seizure_A'] = (df['is_seizure_A'] > threshold).astype(int)
    
    df['is_seizure_B'] = df["Diagnosis B"].groupby((df["Diagnosis B"] != df["Diagnosis B"].shift()).cumsum()).transform('size') * df["Diagnosis B"]
    df['is_seizure_B'] = (df['is_seizure_B'] > threshold).astype(int)
    
    df['is_seizure_C'] = df["Diagnosis C"].groupby((df["Diagnosis C"] != df["Diagnosis C"].shift()).cumsum()).transform('size') * df["Diagnosis C"]
    df['is_seizure_C'] = (df['is_seizure_C'] > threshold).astype(int)
    
    return df 

In [14]:
# Create final target
def create_target(df):
    df['is_seizure_target'] = np.where(df['is_seizure_A'] + df['is_seizure_B'] + df['is_seizure_C'] >= 2, 1, 0)
    return df

In [15]:
# Remove useless
def remove_useless_columns(df):
    df.drop(columns=['Diagnosis A', 'Diagnosis B', 'Diagnosis C', 'is_seizure_A', 'is_seizure_B', 'is_seizure_C'], inplace=True)
    return df

In [16]:
# Final function to label
def label_data(path_raw_data, signals_preprocessed, n):
    
    signals, signal_headers, header = highlevel.read_edf(path_raw_data)
    
    names_ele = [signal_headers[iele]['label'] for iele in range(signals.shape[0])] # extract electrode names
    
    eeg_patient = eeg_formated(signals_preprocessed, names_ele) # format the ECG
    diagnosis_patient = diagnosis(n) # format the diagnosis
    
    data_patient = pd.merge(left=eeg_patient, right=diagnosis_patient, how='left', left_index=True, right_index=True) # merge ecg and diagnosis
    
    add_time(data_patient)
    is_seizure(data_patient)
    create_target(data_patient)
    remove_useless_columns(data_patient)
    
    return data_patient

# Final preprocessing & label

In [45]:
signals_preprocessed = filter_signals(signals, sampling_rate, hp_frequency = 0.1, notch_frequency = 50, quality_factor = 30)

In [20]:
d_5 = label_data('eeg5.edf', signals_preprocessed, 5)

In [18]:
d_2=label_data("/root/code/mariaaraujovitoria/SeizurePredict/raw_data/eeg2.edf",signals_preprocessed,2)

In [31]:
d_1=label_data("/root/code/mariaaraujovitoria/SeizurePredict/raw_data/eeg1.edf",signals_preprocessed,1)

In [38]:
d_3=label_data("/root/code/mariaaraujovitoria/SeizurePredict/raw_data/eeg3.edf",signals_preprocessed,3)

In [46]:
d_6=label_data("/root/code/mariaaraujovitoria/SeizurePredict/raw_data/eeg3.edf",signals_preprocessed,6)

In [47]:
d_6

Unnamed: 0,EEG Fp1-Ref,EEG Fp2-Ref,EEG F3-Ref,EEG F4-Ref,EEG F7-Ref,EEG F8-Ref,EEG Fz-Ref,EEG C3-Ref,EEG C4-Ref,EEG Cz-Ref,...,EEG T6-Ref,EEG P3-Ref,EEG P4-Ref,EEG Pz-Ref,EEG O1-Ref,EEG O2-Ref,ECG EKG,Resp Effort,time,is_seizure_target
0,0.590507,0.726594,0.871965,0.728176,0.754922,1.000000,0.565585,0.431106,0.128846,0.455591,...,0.567253,0.708518,0.386171,0.515433,0.507877,0.604322,0.0,0.446013,0,0
1,0.602058,0.765222,0.874056,0.741458,0.827124,1.000000,0.630477,0.525828,0.297665,0.542334,...,0.644377,0.734732,0.499413,0.583477,0.575086,0.663588,0.0,0.558980,0,0
2,0.564586,0.877694,0.865995,0.736234,0.891022,1.000000,0.653592,0.545596,0.347786,0.569440,...,0.628093,0.736504,0.533922,0.608781,0.586320,0.689349,0.0,0.571079,0,0
3,0.577283,0.802207,0.838589,0.705547,0.851305,1.000000,0.588229,0.487214,0.264831,0.503758,...,0.542457,0.699629,0.469806,0.543854,0.534481,0.656229,0.0,0.495736,0,0
4,0.633843,0.638791,0.835949,0.714652,0.902220,1.000000,0.552486,0.466188,0.233661,0.479924,...,0.649914,0.687160,0.442889,0.521489,0.418894,0.653311,0.0,0.458131,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1203963,0.299546,0.294681,0.294384,0.293774,0.292851,0.294086,0.293512,0.293677,0.293893,0.295456,...,0.291449,0.295155,0.294980,0.294454,0.295613,0.294894,1.0,0.000000,4702,0
1203964,0.301231,0.292975,0.291357,0.288180,0.290191,0.291723,0.293715,0.289844,0.291662,0.296582,...,0.300302,0.296588,0.296156,0.298209,0.301208,0.299428,1.0,0.000000,4702,0
1203965,0.256847,0.256236,0.255532,0.254174,0.256197,0.256015,0.257717,0.255256,0.256151,0.257703,...,0.263490,0.257975,0.257871,0.259567,0.260329,0.259904,1.0,0.000000,4702,0
1203966,0.282531,0.289938,0.290968,0.293009,0.292558,0.290970,0.290304,0.292208,0.291125,0.287569,...,0.288209,0.287758,0.288095,0.287367,0.285066,0.286451,1.0,0.000000,4702,0


# Appendix - to remove time and ECG+Resp Effort

In [40]:
def very_clean(df):
    df.drop(columns=['ECG EKG', 'Resp Effort', 'time'], inplace=True)
    return df

In [48]:
d_6 = very_clean(d_6)

In [49]:
d_6.to_csv("/root/code/mariaaraujovitoria/SeizurePredict/raw_data/patient_6.csv")