# Loading the data

Import needed libraries

In [1]:
import pandas as pd
import torch

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

Setup global variables for loading

In [2]:
# when working locally set correct paths from the current directory to

# directory that contains data from kaggle hms
INPUT_DATA_DIR = "data"

# directory in which our npy files are/will be stored
PROCESSED_DATA_DIR = "processed_data"

Load the metadata

In [3]:
train_meta = pd.read_csv(INPUT_DATA_DIR + "/train.csv")
test_meta = pd.read_csv(INPUT_DATA_DIR + "/test.csv")

Create a butterfilter

In [4]:
from scipy.signal import butter, lfilter

def butter_lowpass_filter(data, cutoff_freq=20, sampling_rate=200, order=4):
    nyquist = 0.5 * sampling_rate
    normal_cutoff = cutoff_freq / nyquist
    b, a = butter(order, normal_cutoff, btype="low", analog=False)
    filtered_data = lfilter(b, a, data, axis=0)
    return filtered_data

Extract parquet data

In [5]:
# take a parquet dataframe and compute correct values for each column
# we want columns such as "Fp1-F7" as can be seen in /example_figures
def extract_parquet(parquet_data: torch.tensor):
    parquet_data["Fp1-F7"] = parquet_data["Fp1"] - parquet_data["F7"]
    parquet_data["F7-T3"] = parquet_data["F7"] - parquet_data["T3"]
    parquet_data["T3-T5"] = parquet_data["T3"] - parquet_data["T5"]
    parquet_data["T5-O1"] = parquet_data["T5"] - parquet_data["O1"]

    parquet_data["Fp2-F8"] = parquet_data["Fp2"] - parquet_data["F8"]
    parquet_data["F8-T4"] = parquet_data["F8"] - parquet_data["T4"]
    parquet_data["T4-T6"] = parquet_data["T4"] - parquet_data["T6"]
    parquet_data["T6-O2"] = parquet_data["T6"] - parquet_data["O2"]

    parquet_data["Fp1-F3"] = parquet_data["Fp1"] - parquet_data["F3"]
    parquet_data["F3-C3"] = parquet_data["F3"] - parquet_data["C3"]
    parquet_data["C3-P3"] = parquet_data["C3"] - parquet_data["P3"]
    parquet_data["P3-O1"] = parquet_data["P3"] - parquet_data["O1"]

    parquet_data["Fp2-F4"] = parquet_data["Fp2"] - parquet_data["F4"]
    parquet_data["F4-C4"] = parquet_data["F4"] - parquet_data["C4"]
    parquet_data["C4-P4"] = parquet_data["C4"] - parquet_data["P4"]
    parquet_data["P4-O2"] = parquet_data["P4"] - parquet_data["O2"]

    parquet_data["Fz-Cz"] = parquet_data["Fz"] - parquet_data["Cz"]
    parquet_data["Cz-Pz"] = parquet_data["Cz"] - parquet_data["Pz"]

    parquet_data = parquet_data.drop(
        [
            "Fp1",
            "F3",
            "C3",
            "P3",
            "F7",
            "T3",
            "T5",
            "O1",
            "Fz",
            "Cz",
            "Pz",
            "Fp2",
            "F4",
            "C4",
            "P4",
            "F8",
            "T4",
            "T6",
            "O2",
        ],
        axis=1,
    )
    idx = parquet_data.columns[1:].to_list() + [parquet_data.columns[0]]
    parquet_data = parquet_data[idx].values.T
    parquet_data = butter_lowpass_filter(parquet_data)
    parquet_data = torch.from_numpy(parquet_data).type(torch.float32)
    parquet_data = torch.clip(parquet_data, -1024, 1024)
    return parquet_data

Use only the first 50 seconds of each measurement

In case we need more training data we can use the full dataset

In [6]:
train_meta = train_meta.loc[train_meta["eeg_sub_id"] == 0]
train_meta

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
9,2277392603,0,0.0,924234,0,0.0,1978807404,30539,GPD,0,0,5,0,1,5
11,722738444,0,0.0,999431,0,0.0,557980729,56885,LRDA,0,1,0,14,0,1
22,387987538,0,0.0,1084844,0,0.0,4099147263,4264,LRDA,0,0,0,3,0,0
28,2175806584,0,0.0,1219001,0,0.0,1963161945,23435,Seizure,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106780,3910994355,0,0.0,2146798838,0,0.0,4272062867,28488,LPD,0,9,0,2,0,7
106781,3938393892,0,0.0,2146798838,1,60.0,2587113091,28488,LPD,0,9,0,2,0,7
106783,1850739625,0,0.0,2146798838,3,162.0,2394534310,28488,LPD,0,9,0,2,0,7
106784,1306668185,0,0.0,2147312808,0,0.0,1216355904,57480,LPD,0,3,0,0,0,0


Get the data and labels for training

In [7]:
all_labels = train_meta[
    ["seizure_vote", "lpd_vote", "gpd_vote", "lrda_vote", "grda_vote", "other_vote"]
].values

In [8]:
import numpy as np
eeg_data = []
eeg_labels = []
nan_rows = 0

for eeg_id, label in zip(train_meta["eeg_id"],all_labels):
    parquet_data = pd.read_parquet(INPUT_DATA_DIR + "/train_eegs/" + str(eeg_id) + ".parquet")[:10000]
    eeg = extract_parquet(parquet_data)
    if torch.isnan(eeg).sum()>0:
        nan_rows+=1
        continue
    eeg_data.append(eeg)
    eeg_labels.append(label)

In [9]:
print(f"skipped files : {nan_rows}")
print(f"all files     : {len(all_labels)}")

eeg_labels = torch.tensor(eeg_labels,dtype=torch.float32)
eeg_labels = eeg_labels/eeg_labels.sum(dim=1,keepdims=True)

eeg_data = torch.stack(eeg_data)[:,None,:,:]

skipped files : 575
all files     : 17089


  eeg_labels = torch.tensor(eeg_labels,dtype=torch.float32)


In [10]:
eeg_data.type()

'torch.FloatTensor'

---

# Creating a data loader, preprocessing

Setup global variables for dataloder and preprocessing

In [11]:
SAMPLING_FREQUENCY = 200
SAMPLES_IN_MEASUREMENT = 10000
FOLDS = 5
BATCH_SIZE = 16
NUM_WORKERS = 0

Create a HMS dataset class that will help us load the data during the model training

In [12]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(eeg_data,eeg_labels,test_size=0.1,stratify = eeg_labels.argmax(axis=1))

In [13]:
X_train, y_train = X_train.to(DEVICE), y_train.to(DEVICE)
X_test, y_test = X_test.to(DEVICE), y_test.to(DEVICE)

Create custom dataset and dataloater

In [14]:
from torch.utils.data import DataLoader, Dataset

class CustomImageDataset(Dataset):
    def __init__(self, X,y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_loader = DataLoader(CustomImageDataset(X_train,y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(CustomImageDataset(X_test,y_test), batch_size=BATCH_SIZE, shuffle=True)

---

# Creating a model

In [27]:
import torch.nn as nn

class SimpleEEGModel(nn.Module):
    def __init__(
        self
    ):
        super(SimpleEEGModel, self).__init__()
        
        self.network = nn.Sequential(
            # input 19 x 10000 x 1 hwc
            # nn.Conv2d(1, 32, (1,3), 1, (0,1), bias=False),
            # nn.BatchNorm2d(32),
            # nn.ReLU(inplace=True),
            # nn.MaxPool2d((1,2)),
            # 
            # # input 19 x 5000 x 32 hwc
            # nn.Conv2d(32, 64, (19,4), 2, 3, bias=False),
            # nn.BatchNorm2d(64),
            # nn.ReLU(inplace=True),
            # nn.MaxPool2d(2),
            # 
            # # input 1 x 2500 x 64 hwc
            # nn.Conv2d(64, 64, (1,4), 1, 3, bias=False),
            # nn.BatchNorm2d(64),
            # nn.ReLU(inplace=True),
            # nn.MaxPool2d(2),
            # 
            # # input 1 x 1250 x 64 hwc
            # nn.Conv2d(64, 64, (1,4), 1, 3, bias=False),
            # nn.BatchNorm2d(64),
            # nn.ReLU(inplace=True),
            # nn.MaxPool2d(2),
            
            # nn.Dropout(0.2),
            
            nn.Flatten(),
            nn.Linear(190, 6),
            nn.Softmax(dim=1)
            
        )
    
    def forward(self, x):
        # print(x[:,:,:,:10])
        out = self.network(x[:,:,:,:10])
        out2 = out - torch.min(out,dim=-1,keepdim=True)[0]
        # for param in self.network.parameters():
        #     print(param.data)
        
        # print(out2)
        # print("========================================")
        return out2/out2.sum(dim=-1,keepdims=True)

# Training (TBD)

---

In [28]:
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn import CrossEntropyLoss
from torch.optim import Adam, SGD


model = SimpleEEGModel().to(DEVICE)
optimizer = Adam(model.parameters(), lr=0.0001)
# scheduler = ReduceLROnPlateau(optimizer, 'min')
loss_function = nn.KLDivLoss(reduction="batchmean", log_target=False)

In [29]:
EPOCHS = 10

total_loss=0
for epoch in range(EPOCHS):
    iteration = 0
    for batch in train_loader:
        total_loss = 0
        batch_data = nn.functional.normalize(batch[0],dim=-1)
        batch_labels = batch[1]        
        prediction = model(batch_data)
        loss = loss_function(prediction,batch_labels)
        print(loss.item())
        total_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        iteration+=1
    # scheduler.step()
    print("==========================")
    print(total_loss/iteration)
    print("==========================")

-0.5303043127059937
-0.48181694746017456
-0.3810634911060333
-0.3433395028114319
-0.48251861333847046
-0.5218632817268372
-0.35273268818855286
-0.4114859700202942
-0.5779792070388794
-0.3543991446495056
-0.26464590430259705
-0.4924156665802002
-0.45698055624961853
-0.5364706516265869
-0.4349806010723114
-0.4203234314918518
-0.42205432057380676
-0.7709805965423584
-0.5151702165603638
-0.5214502215385437
-0.5426949262619019
-0.312855064868927
-0.3882192373275757
-0.396544873714447
-0.3866080045700073
-0.4660117030143738
-0.3820114731788635
-0.31551939249038696
-0.4356440305709839
-0.5037174820899963
-0.6499422192573547
-0.5229140520095825
-0.49587613344192505
-0.33627307415008545
-0.4838983416557312
-0.3507453203201294
-0.33813977241516113
-0.5731093287467957
-0.514494776725769
-0.7495653033256531
-0.5085393786430359
-0.3861781358718872
-0.5327043533325195
-0.5423557758331299
-0.528671145439148
-0.4848997890949249
-0.3448852300643921
-0.3051217794418335
-0.5321344137191772
-0.40410372614

---