---

---

# Loading the data

Import needed libraries

In [1]:
import os

import pandas as pd
import numpy as np

Set up global variables for loading

In [4]:
# When working locally, set these paths relative to the current working directory.

# Directory that contains the Kaggle HMS data: train.csv, test.csv and the
# train_eegs/ parquet files are read from here.
INPUT_DATA_DIR = "data"

# Directory in which the processed arrays (data_eeg.npz, labels_eeg.npz)
# are, or will be, stored.
PROCESSED_DATA_DIR = "processed_data"

Load the metadata

In [5]:
# metadata tables that ship next to the raw recordings
train_meta = pd.read_csv(f"{INPUT_DATA_DIR}/train.csv")
test_meta = pd.read_csv(f"{INPUT_DATA_DIR}/test.csv")

In [6]:
# take a parquet dataframe and compute correct values for each column
# we want columns such as "Fp1-F7" as can be seen in /example_figures
def extract_parquet(parquet_data):
    """Convert raw electrode readings into bipolar-montage channels.

    Each output channel is the difference between two neighbouring electrodes
    along one of five chains (e.g. ``"Fp1-F7" = Fp1 - F7``). The 19 raw
    electrode columns are left out of the result; any other column of the
    input is kept.

    The input frame is NOT modified: the derived channels are built in a new
    frame, so the function can safely be re-run on the same DataFrame.

    Args:
        parquet_data (pd.DataFrame): must contain the 19 electrode columns
            named in ``electrode_chains`` below.

    Returns:
        np.ndarray: 2-D array with one row per input row. Columns are the
        remaining non-electrode columns followed by the 18 bipolar channels,
        with the first of those columns moved to the end. For an input that
        holds the 19 electrodes plus one extra column, this gives the 18
        bipolar channels followed by that extra column.
    """
    # consecutive electrodes of a chain form one bipolar channel
    electrode_chains = (
        ("Fp1", "F7", "T3", "T5", "O1"),
        ("Fp2", "F8", "T4", "T6", "O2"),
        ("Fp1", "F3", "C3", "P3", "O1"),
        ("Fp2", "F4", "C4", "P4", "O2"),
        ("Fz", "Cz", "Pz"),
    )
    pairs = [
        (first, second)
        for chain in electrode_chains
        for first, second in zip(chain, chain[1:])
    ]
    # every electrode once, in first-seen order (some appear in two chains)
    electrodes = list(
        dict.fromkeys(name for chain in electrode_chains for name in chain)
    )

    bipolar = pd.DataFrame(
        {
            f"{first}-{second}": parquet_data[first] - parquet_data[second]
            for first, second in pairs
        }
    )
    combined = pd.concat([parquet_data.drop(columns=electrodes), bipolar], axis=1)

    # rotate the first column to the last position
    ordered = combined.columns[1:].to_list() + [combined.columns[0]]
    return combined[ordered].values

Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106795,351917269,6,12.0,2147388374,6,12.0,4195677307,10351,LRDA,0,0,0,3,0,0
106796,351917269,7,14.0,2147388374,7,14.0,290896675,10351,LRDA,0,0,0,3,0,0
106797,351917269,8,16.0,2147388374,8,16.0,461435451,10351,LRDA,0,0,0,3,0,0
106798,351917269,9,18.0,2147388374,9,18.0,3786213131,10351,LRDA,0,0,0,3,0,0


Get the data and labels for training

In [20]:
def get_data_and_labels():
    """Return the EEG training windows and their normalised vote labels.

    On the first call the arrays are extracted from the raw parquet files and
    cached as ``data_eeg.npz`` / ``labels_eeg.npz`` inside
    ``PROCESSED_DATA_DIR``; later calls load the cached arrays instead.

    For every ``eeg_id`` in ``train_meta`` the recording is converted with
    ``extract_parquet`` and the first 50-second window (10000 rows at 200 rows
    per second) that contains no NaN is kept, together with its six vote
    counts divided by their sum.

    Returns:
        tuple: ``(data_eeg, labels_eeg)`` - float32 arrays of shape
        ``(n, 10000, 19)`` and ``(n, 6)``, where ``n`` is the number of
        windows that were actually stored. The cached arrays are the same
        arrays that are returned.

    Note:
        Cache files written by an earlier version of this function may be
        truncated; delete them to force a fresh extraction.
    """
    sampling_rate = 200  # rows per second of recording
    window_len = 50 * sampling_rate  # 50 seconds long eeg data
    vote_columns = [
        "seizure_vote",
        "lpd_vote",
        "gpd_vote",
        "lrda_vote",
        "grda_vote",
        "other_vote",
    ]

    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    data_path = PROCESSED_DATA_DIR + "/data_eeg.npz"
    labels_path = PROCESSED_DATA_DIR + "/labels_eeg.npz"

    if os.path.exists(data_path) and os.path.exists(labels_path):
        print("loading")
        # the cache holds plain float32 arrays, so pickle support is not needed
        with np.load(data_path) as cached:
            data_eeg = cached["arr_0"]
        with np.load(labels_path) as cached:
            labels_eeg = cached["arr_0"]
        return data_eeg, labels_eeg

    # one window per recording is kept by default; if you have enough ram you
    # can extract all metadata rows: use N_ROWS = train_meta.shape[0] here AND
    # remove the `break` at the end of the inner loop
    N_ROWS = train_meta["eeg_id"].nunique()

    data_eeg = np.empty((N_ROWS, window_len, 19), dtype=np.float32)
    labels_eeg = np.empty((N_ROWS, 6), dtype=np.float32)  # labels to predict
    n_stored = 0

    # each eeg id contains several entries; sort=False keeps the ids in their
    # order of first appearance in train_meta
    for eeg_id, sub_train_meta in train_meta.groupby("eeg_id", sort=False):
        parquet_data = pd.read_parquet(
            INPUT_DATA_DIR + "/train_eegs/" + str(eeg_id) + ".parquet"
        )
        signals = extract_parquet(parquet_data)

        offsets = sub_train_meta["eeg_label_offset_seconds"]
        labels = sub_train_meta[vote_columns].values

        # create eeg data and label for each metadata entry
        for offset, label in zip(offsets, labels):
            start = int(offset * sampling_rate)
            window = signals[start : start + window_len]
            if np.isnan(window).any():
                # skip windows with missing samples and try the next entry
                continue
            data_eeg[n_stored] = window
            labels_eeg[n_stored] = label / label.sum()
            n_stored += 1
            break  # keep a single window per recording (see N_ROWS above)

    # Only the first n_stored rows were written; the rest of the np.empty
    # buffers is uninitialised and must not be returned or cached.
    data_eeg = data_eeg[:n_stored]
    labels_eeg = labels_eeg[:n_stored]

    np.savez(labels_path, labels_eeg)
    np.savez(data_path, data_eeg)
    return data_eeg, labels_eeg

---

# Creating a data loader, preprocessing

Set up global variables for the data loader and preprocessing

In [21]:
import torch

# Number of cross-validation splits; HMSDataset builds KFold(n_splits=FOLDS)
# and uses FOLDS - 1 as its default fold number.
FOLDS = 5
# Batch size handed to the DataLoader.
BATCH_SIZE = 32
# Number of DataLoader worker processes (0 loads the data in the main process).
NUM_WORKERS = 0
# CUDA device when torch reports one as available, otherwise the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Create a lowpass filter to cut out the high and noisy frequencies

In [8]:
from scipy.signal import butter, lfilter


def butter_lowpass_filter(data, cutoff_freq=20, sampling_rate=200, order=4):
    """Low-pass filter ``data`` along its first axis with a Butterworth filter.

    Args:
        data: array-like signal; filtering runs along axis 0.
        cutoff_freq: cutoff frequency, in the same unit as ``sampling_rate``.
        sampling_rate: sampling frequency of ``data``.
        order: order of the Butterworth filter.

    Returns:
        The filtered signal, as returned by ``scipy.signal.lfilter``.
    """
    # butter() expects the cutoff as a fraction of the Nyquist frequency
    half_rate = 0.5 * sampling_rate
    coefficients = butter(order, cutoff_freq / half_rate, btype="low", analog=False)
    return lfilter(*coefficients, data, axis=0)

device(type='cuda')

Create an HMS dataset class that will help us load the data during model training

In [23]:
from sklearn.model_selection import KFold
from torch.utils.data import Dataset


class HMSDataset(Dataset):
    """Fold-aware view over the arrays returned by ``get_data_and_labels``.

    The samples are split with ``KFold(n_splits=FOLDS)`` and split number
    ``num_fold`` provides the training and validation indices. ``mode``
    decides which index set ``len()`` and item access use: ``"train"``
    selects the training indices, any other value the validation indices.
    """

    def __init__(
        self,
        num_fold: int = FOLDS - 1,
        mode: str = "train",
    ):
        self.kf = KFold(n_splits=FOLDS)
        self.num_fold = num_fold

        self.X, self.y = get_data_and_labels()
        self.train_ids, self.valid_ids = self.__split_data()

        self.mode = mode

    def __split_data(self):
        """Return the (train, validation) index arrays of the chosen fold."""
        all_folds = list(self.kf.split(self.y))
        train_part, valid_part = all_folds[self.num_fold]
        return train_part, valid_part

    def __active_ids(self):
        """Return the index array that matches the current mode."""
        return self.train_ids if self.mode == "train" else self.valid_ids

    def __len__(self):
        return len(self.__active_ids())

    def __getitem__(self, index):
        features, target = self.__retrieve_data(index)
        return (
            torch.tensor(features, dtype=torch.float32),
            torch.tensor(target, dtype=torch.float32),
        )

    def __retrieve_data(self, index):
        """Fetch one sample, clip it to [-1024, 1024] and low-pass filter it."""
        row = self.__active_ids()[index]
        clipped = np.clip(self.X[row], -1024, 1024)
        return butter_lowpass_filter(clipped), self.y[row]

    def train(self):
        """Serve the training indices from now on."""
        self.mode = "train"

    def eval(self):
        """Serve the validation indices from now on."""
        self.mode = "eval"

Initialize the dataset and dataloader!

In [27]:
from torch.utils.data import DataLoader

# Build the dataset with its defaults: fold number FOLDS - 1, "train" mode.
dataset = HMSDataset()

# Batches are served in index order because shuffle=False.
# NOTE(review): if this loader is meant to feed training, shuffle=True is
# probably wanted - confirm once the training loop exists.
# NOTE(review): pin_memory=True only helps when batches are copied to a CUDA
# device - confirm it is intended on CPU-only setups.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
)

---

# Creating a model

In [None]:
import torch.nn as nn


class ResNet_1D_Block(nn.Module):
    """Residual unit for 1-D signals.

    The main path applies BatchNorm -> ReLU -> Dropout -> Conv1d twice and
    then a max-pool of width 2. The shortcut path passes the block input
    through ``downsampling``. The two paths are summed.

    Args:
        in_channels: channels of the block input.
        out_channels: channels produced by both convolutions.
        kernel_size, stride, padding: shared by both convolutions.
        downsampling: module applied to the input on the shortcut path; its
            output must have the same shape as the main path's output.
    """

    def __init__(
        self, in_channels, out_channels, kernel_size, stride, padding, downsampling
    ):
        super().__init__()
        # both convolutions share everything except their input width
        conv_options = dict(
            kernel_size=kernel_size, stride=stride, padding=padding, bias=False
        )

        self.bn1 = nn.BatchNorm1d(num_features=in_channels)
        self.relu = nn.ReLU(inplace=False)
        self.dropout = nn.Dropout(p=0.2, inplace=False)
        self.conv1 = nn.Conv1d(
            in_channels=in_channels, out_channels=out_channels, **conv_options
        )
        self.bn2 = nn.BatchNorm1d(num_features=out_channels)
        self.conv2 = nn.Conv1d(
            in_channels=out_channels, out_channels=out_channels, **conv_options
        )
        self.maxpool = nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
        self.downsampling = downsampling

    def forward(self, x):
        # main path
        first = self.conv1(self.dropout(self.relu(self.bn1(x))))
        second = self.conv2(self.dropout(self.relu(self.bn2(first))))
        main = self.maxpool(second)

        # shortcut path, added in place onto the pooled main path
        shortcut = self.downsampling(x)
        main += shortcut
        return main


class EEGNet(nn.Module):
    """1-D ResNet + bidirectional GRU classifier for multi-channel EEG.

    Input is expected as ``(batch, time, channels)``. Two feature sets are
    computed and concatenated:

    * convolutional path: parallel Conv1d layers (one per entry of
      ``kernels``) whose outputs are concatenated along time, followed by a
      strided convolution, a stack of ``ResNet_1D_Block`` units and average
      pooling;
    * recurrent path: a bidirectional GRU over the raw input, of which the
      output at the last time step is used.

    Args:
        kernels: kernel sizes of the parallel input convolutions.
        in_channels: number of input channels.
        fixed_kernel_size: kernel size used after the parallel convolutions.
        num_classes: size of the output layer.

    Note:
        ``fc`` has a fixed input width of 424, which matches the default
        configuration with 10000-step inputs: 24 channels x 7 pooled steps
        from the convolutional path plus 2 x 128 from the GRU. Other input
        lengths work with ``extract_features`` but not with ``forward``.
    """

    def __init__(
        self, kernels=(3, 5, 7, 9), in_channels=19, fixed_kernel_size=17, num_classes=6
    ):
        super().__init__()
        # private copy, so neither a shared default nor the caller's sequence
        # is aliased by the model
        self.kernels = list(kernels)
        self.planes = 24
        self.in_channels = in_channels

        self.parallel_conv = nn.ModuleList(
            nn.Conv1d(
                in_channels=in_channels,
                out_channels=self.planes,
                kernel_size=kernel_size,
                stride=1,
                padding=0,
                bias=False,
            )
            for kernel_size in self.kernels
        )

        self.bn1 = nn.BatchNorm1d(num_features=self.planes)
        self.relu = nn.ReLU(inplace=False)
        self.conv1 = nn.Conv1d(
            in_channels=self.planes,
            out_channels=self.planes,
            kernel_size=fixed_kernel_size,
            stride=2,
            padding=2,
            bias=False,
        )
        self.block = self._make_resnet_layer(
            kernel_size=fixed_kernel_size, stride=1, padding=fixed_kernel_size // 2
        )
        self.bn2 = nn.BatchNorm1d(num_features=self.planes)
        self.avgpool = nn.AvgPool1d(kernel_size=6, stride=6, padding=2)
        self.dropout = nn.Dropout(p=0.2, inplace=False)
        # batch_first=True is required: the GRU is fed (batch, time, channels).
        # Without it the batch axis is treated as the time axis, so a sample's
        # features depend on the other samples in its batch.
        self.rnn = nn.GRU(
            input_size=self.in_channels,
            hidden_size=128,
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=424, out_features=num_classes)

    def _make_resnet_layer(self, kernel_size, stride, blocks=9, padding=0):
        """Stack ``blocks`` residual units; each one halves the time axis."""
        layers = [
            ResNet_1D_Block(
                in_channels=self.planes,
                out_channels=self.planes,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                downsampling=nn.Sequential(
                    nn.MaxPool1d(kernel_size=2, stride=2, padding=0)
                ),
            )
            for _ in range(blocks)
        ]
        return nn.Sequential(*layers)

    def extract_features(self, x):
        """Return the concatenated conv and GRU features, one row per sample."""
        sequence = x  # (batch, time, channels), consumed by the GRU below
        x = x.permute(0, 2, 1)  # (batch, channels, time) for Conv1d

        # outputs of the parallel convolutions are concatenated along time
        out = torch.cat([conv(x) for conv in self.parallel_conv], dim=2)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv1(out)

        out = self.block(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.avgpool(out)
        out = out.reshape(out.shape[0], -1)

        rnn_out, _ = self.rnn(sequence)
        new_rnn_h = rnn_out[:, -1, :]  # GRU output at the last time step

        return torch.cat([out, new_rnn_h], dim=1)

    def forward(self, x):
        new_out = self.extract_features(x)
        new_out = self.dropout(new_out)
        return self.fc(new_out)

# Training (TBD)

---

---