In [66]:
import numpy as np
import os
import pandas as pd
import random
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

In [10]:
class paths:
    OUTPUT_DIR = "output"
    PRE_LOADED_EEGS = 'brain-eeg-spectrograms/eeg_specs.npy'
    PRE_LOADED_SPECTOGRAMS = 'brain-spectrograms/specs.npy'
    TRAIN_CSV = "train.csv"
    TRAIN_EEGS = "EEG_Spectrograms/"
    TRAIN_SPECTOGRAMS = "train_spectrograms/"

# load meta data

In [11]:
df = pd.read_csv(paths.TRAIN_CSV)
label_cols = df.columns[-6:]
print(f"Train cataframe shape is: {df.shape}")
print(f"Labels: {list(label_cols)}")
df.head()

Train cataframe shape is: (106800, 15)
Labels: ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']


Unnamed: 0,eeg_id,eeg_sub_id,eeg_label_offset_seconds,spectrogram_id,spectrogram_sub_id,spectrogram_label_offset_seconds,label_id,patient_id,expert_consensus,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote
0,1628180742,0,0.0,353733,0,0.0,127492639,42516,Seizure,3,0,0,0,0,0
1,1628180742,1,6.0,353733,1,6.0,3887563113,42516,Seizure,3,0,0,0,0,0
2,1628180742,2,8.0,353733,2,8.0,1142670488,42516,Seizure,3,0,0,0,0,0
3,1628180742,3,18.0,353733,3,18.0,2718991173,42516,Seizure,3,0,0,0,0,0
4,1628180742,4,24.0,353733,4,24.0,3080632009,42516,Seizure,3,0,0,0,0,0


# deduplicate data

In [12]:
train_df = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg({
    'spectrogram_id':'first',
    'spectrogram_label_offset_seconds':'min'
})
train_df.columns = ['spectogram_id','min']

aux = df.groupby('eeg_id')[['spectrogram_id','spectrogram_label_offset_seconds']].agg({
    'spectrogram_label_offset_seconds':'max'
})
train_df['max'] = aux

aux = df.groupby('eeg_id')[['patient_id']].agg('first')
train_df['patient_id'] = aux

aux = df.groupby('eeg_id')[label_cols].agg('sum')
for label in label_cols:
    train_df[label] = aux[label].values
    
y_data = train_df[label_cols].values
y_data = y_data / y_data.sum(axis=1,keepdims=True)
train_df[label_cols] = y_data

aux = df.groupby('eeg_id')[['expert_consensus']].agg('first')
train_df['target'] = aux

train_df = train_df.reset_index()
print('Train non-overlapp eeg_id shape:', train_df.shape )
train_df.head()

Train non-overlapp eeg_id shape: (17089, 12)


Unnamed: 0,eeg_id,spectogram_id,min,max,patient_id,seizure_vote,lpd_vote,gpd_vote,lrda_vote,grda_vote,other_vote,target
0,568657,789577333,0.0,16.0,20654,0.0,0.0,0.25,0.0,0.166667,0.583333,Other
1,582999,1552638400,0.0,38.0,20230,0.0,0.857143,0.0,0.071429,0.0,0.071429,LPD
2,642382,14960202,1008.0,1032.0,5955,0.0,0.0,0.0,0.0,0.0,1.0,Other
3,751790,618728447,908.0,908.0,38549,0.0,0.0,1.0,0.0,0.0,0.0,GPD
4,778705,52296320,0.0,0.0,40955,0.0,0.0,0.0,0.0,0.0,1.0,Other


# load eeg data

In [30]:
all_eegs = {}
for i, eeg_id in enumerate(train_df.eeg_id):
    eeg = pd.read_parquet(f'train_eegs/{eeg_id}.parquet')
    rows = len(eeg)
    offset = (rows-10000)//2
    eeg = eeg.iloc[offset:offset+10000]
    data = np.zeros((10000,len(eeg.columns)))
    for j,col in enumerate(eeg.columns):
        x = eeg[col].values.astype('float32')
        m = np.nanmean(x)
        if np.isnan(x).mean()<1: 
            x = np.nan_to_num(x,nan=m)
        else: 
            x[:] = 0    
        data[:,j] = x          
    all_eegs[eeg_id] = data

In [68]:
class CustomDataset(Dataset):
    def __init__(self, df, all_eegs):
        self.df = df
        self.all_eegs = all_eegs

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        X = self.all_eegs[row.eeg_id].astype(np.float32)
        y = row[-7:-1].values.astype(np.float32)
        return X, y

In [69]:
train_dataset = CustomDataset(train_df, all_eegs)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=False)

In [70]:
for x, y in train_loader:
    print("x", x.shape)
    print("y", y.shape)
    break

x torch.Size([32, 10000, 20])
y torch.Size([32, 6])


In [71]:
class BaselineModel(nn.Module):
    def __init__(self, input_features, output_features):
        super(BaselineModel, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(input_features, 512) 
        self.fc2 = nn.Linear(512, 128)           
        self.fc3 = nn.Linear(128, output_features) 

    def forward(self, x):
        x = self.flatten(x) 
        x = F.relu(self.fc1(x))  
        x = F.relu(self.fc2(x))
        x = self.fc3(x)      
        return x

input_features = 10000 * 20
output_features = 6
model = BaselineModel(input_features, output_features)

In [76]:
device = torch.device("cuda")
model = model.to(device)

In [83]:
criterion = nn.KLDivLoss(reduction="mean")
optimizer = torch.optim.AdamW(model.parameters())
epoch_loss = 0
n_samples = 0
for epoch in range(30):
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        y_pred = F.log_softmax(y_pred, dim = 1)
        loss = criterion(y_pred, y)
        epoch_loss += loss.item()
        n_samples += len(x)
        loss.backward()
        optimizer.step()
    epoch_loss /= n_samples
    print(f'{epoch} : {epoch_loss}')



0 : 955609.0261011744
1 : 282114.24772865145
2 : 79755.03807132105
3 : 27954.23414983153
4 : 13247.625207401892
5 : 4229.327085445937
6 : 1880.7005282916782
7 : 1999.9430257966803
8 : 1317.1426412751555
9 : 1084.2527030938252
10 : 885.0456998031112
11 : 477.90863583110905
12 : 701.5775856742215
13 : 618.666144184195
14 : 248.87419367688267
15 : 251.67253305811184
16 : 229.39910642688756
17 : 214.77235234224955
18 : 200.96097437517034
19 : 203.237557745816
20 : 108.89665573042369
21 : 202.73429140152322
22 : 118.22734480300701
23 : 83.35274561329551
24 : 144.86221608360896
25 : 158.31522369526235
26 : 74.62058704411349
27 : 122.35523384856816
28 : 101.2292690410336
29 : 115.92663841617
