# imports

In [1]:
import os
from datetime import datetime
from functools import lru_cache

import einops
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import wandb
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from tqdm.notebook import tqdm

device = 'cuda' if t.cuda.is_available() else 'cpu'

# utils

In [2]:
import gc 
def GC():
    """Free memory: run Python's garbage collector, then drop PyTorch's cached CUDA blocks."""
    # Collect first so tensors that are only kept alive by reference cycles
    # are released before the CUDA caching allocator is emptied.
    gc.collect()
    t.cuda.empty_cache()

In [3]:
# validation / inference loss
@t.no_grad()
def eval(model, x, y, do_eval=True):
    """Compute the batch-mean KL divergence between `y` and the model's prediction on `x`.

    NOTE: this name shadows Python's builtin `eval` for the rest of the notebook.

    Args:
        model: module mapping `x` to unnormalised class scores (logits) on the last axis.
        x: input batch; must not contain NaN. Moved to the global `device`.
        y: target probabilities, same leading shape as the model output; must not contain NaN.
        do_eval: if truthy, run the forward pass in eval mode; otherwise run it in
            train mode (dropout active; any BatchNorm layers then also update their
            running statistics even though gradients are disabled).

    Returns:
        0-dim tensor holding `KLDivLoss(reduction="batchmean")` of log-softmax(model(x)) vs `y`.

    The model's train/eval mode is restored to whatever it was on entry, even if
    the forward pass raises (previously the model was always left in train mode).
    """
    assert not x.isnan().any()
    assert not y.isnan().any()
    was_training = model.training
    # train(False) is equivalent to .eval()
    model.train(not do_eval)
    try:
        logs = model(x.to(device)).log_softmax(-1)
        kl_loss = nn.KLDivLoss(reduction="batchmean")
        return kl_loss(logs, y.to(device))
    finally:
        model.train(was_training)

In [4]:
@t.no_grad()
def augment_data(data, alpha=0.01):
    """Return `data` (moved to the global `device`) with scaled Gaussian noise added.

    The noise is standard normal, multiplied by the standard deviation of `data`
    taken along dim 1 and by `alpha`.
    """
    # data → ('batch', 'seq', 'channel')
    on_device = data.to(device)
    spread = on_device.std(dim=1, keepdim=True)
    jitter = t.randn_like(on_device, device=device) * spread * alpha
    return on_device + jitter

def augment_if(data, iter):
    """Return `data` untouched when `iter` equals 0, otherwise a noise-augmented copy."""
    return data if iter == 0 else augment_data(data)

# config

In [5]:
# Samples per batch, passed to both the train and the held-out DataLoader.
batch_size = 45
# DataLoader `prefetch_factor`: batches loaded in advance by each worker.
prefetch_factor = 10
# DataLoader `num_workers`: number of background loading processes.
num_workers = 3

# data

In [6]:
# Locations inside the competition data directory.
test_path = './hms-harmful-brain-activity-classification/test_eegs/'
train_path = './hms-harmful-brain-activity-classification/train_eegs/'
train_spec_path = './hms-harmful-brain-activity-classification/train_spectrograms/'
BASE_PATH = './hms-harmful-brain-activity-classification/'
# Directory holding one `{eeg_id}_{eeg_sub_id}.pt` tensor per training row
# (read by the dataset's __getitem__). Alternative directories are kept
# below, commented out; only the last assignment is active.
# PRE_PROCESSED_PATH = './preprocessed/'
# PRE_PROCESSED_PATH = './eeg-filtered/'
# PRE_PROCESSED_PATH = './eeg-logged/'
PRE_PROCESSED_PATH = './eeg-robust-filter/'

# Channel names; the comment line underneath gives each name's index, which is
# what GROUPS_IDS refers to.
# NOTE(review): presumably this is the channel order of the preprocessed tensors — confirm.
FEATS_FOR_REAL = ['Fp1', 'F3', 'C3', 'P3', 'F7', 'T3', 'T5', 'O1', 'Fz', 'Cz', 'Pz', 'Fp2', 'F4', 'C4', 'P4', 'F8', 'T4', 'T6', 'O2', 'EKG']
#                   0      1     2     3     4     5     6     7     8     9    10     11    12    13    14    15    16    17    18    19
# group by semantic groups LP, LL, RP, RR https://raw.githubusercontent.com/cdeotte/Kaggle_Images/main/Jan-2024/montage.png
# GROUPS = [
#     ['Fp1', 'F3', 'C3', 'P3', 'O1'],
#     ['Fp1', 'F7', 'T3', 'T5', 'O1'],
#     ['Fp2', 'F4', 'C4', 'P4', 'O2'],
#     ['Fp2', 'F8', 'T4', 'T6', 'O2'],
# ]
# Same four groups as GROUPS above, expressed as indices into FEATS_FOR_REAL.
# Each group is an ordered chain of 5 channels; the models index the channel
# axis with these lists.
GROUPS_IDS = [
    [0, 1, 2, 3, 7],
    [0, 4, 5, 6, 7],
    [11, 12, 13, 14, 18],
    [11, 15, 16, 17, 18],
    # [8, 9, 10, 19] # TODO: try with leftovers?
]
# LEFTOVERS = [8, 9, 10]
# EKG = [19]
# TODO: add frequency domain with fourier's transform
# TODO: add spectrogram to process with conv2d
# TODO: merge several models together

# Label columns of train.csv; normalised per row into a probability vector by
# the dataset, and used as the output column names of the submission.
TARGETS = ['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote','other_vote']

In [7]:
# Training metadata: each row is one sample, identified by 'eeg_id' / 'eeg_sub_id'
# and labelled by the TARGETS columns. BASE_PATH already ends with '/', so the
# resulting path contains a harmless double slash.
train_df = pd.read_csv(f'{BASE_PATH}/train.csv')

In [8]:
class Dataset(t.utils.data.Dataset):
    """Map-style dataset yielding `(eeg, label_distribution)` for one metadata row.

    The base class is named explicitly because this class reuses the name
    `Dataset`: with `class Dataset(Dataset)` re-running the cell would make the
    class inherit from its own previous definition.

    Args:
        dataframe: metadata table with 'eeg_id', 'eeg_sub_id' and the TARGETS
            columns. Defaults to the global `train_df`.
    """

    def __init__(self, dataframe=None):
        super().__init__()
        self.dataframe = train_df if dataframe is None else dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx): # preprocessed version
        """Load the preprocessed tensor for positional row `idx` and its normalised labels.

        Returns:
            (eeg, labels): `eeg` is whatever object is stored in
            `{PRE_PROCESSED_PATH}/{eeg_id}_{eeg_sub_id}.pt`; `labels` is a
            float64 tensor with the TARGETS values divided by their sum.
        """
        row = self.dataframe.iloc[idx]
        eeg_id = row['eeg_id']
        eeg_sub_id = row['eeg_sub_id']
        eeg_path = f'{PRE_PROCESSED_PATH}/{eeg_id}_{eeg_sub_id}.pt'
        # NOTE: t.load unpickles the file; only point this at trusted data.
        eeg = t.load(eeg_path)
        labels = row[TARGETS].values.astype(np.float64)
        # Turn raw vote counts into a probability vector. A row whose votes sum
        # to zero would yield NaN here.
        labels = labels/np.sum(labels)
        labels_out = t.tensor(labels, dtype=t.float64)
        return eeg, labels_out

In [9]:
dataset = Dataset()
# Split on unique eeg_id (not on rows) so every row sharing an eeg_id ends up
# on the same side of the train / held-out split.
ids = train_df['eeg_id'].unique()
np.random.shuffle(ids)
split = int(len(ids) * 0.95)

train_ids = ids[:split]
test_ids = ids[split:]

# Persist both halves of the split, tagged with the current time.
now = datetime.now().strftime("%Y-%m-%d_%Hh%M")
t.save(t.tensor(train_ids), f'./splits/{now}_train_ids.pt')
# Fixed: this file previously received train_ids, so the held-out ids were never saved.
t.save(t.tensor(test_ids), f'./splits/{now}_test_ids.pt')

# Index labels are used as positions by Subset -> Dataset.__getitem__ (iloc),
# which is valid because read_csv gives train_df a default 0..n-1 index.
train_indices = train_df[train_df['eeg_id'].isin(train_ids)].index.tolist()
test_indices = train_df[train_df['eeg_id'].isin(test_ids)].index.tolist()

train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, prefetch_factor=prefetch_factor, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, prefetch_factor=prefetch_factor, shuffle=True)

len(train_dataset), len(test_dataset)

(101433, 5367)

# model 👯‍♀️

## conv1d + GRU

In [10]:
class ConvBlock(nn.Module):
    """Three (Conv1d 'same' -> ReLU -> Dropout) stages followed by a stride-2 max-pool.

    Input is (batch, d_in, seq); output is (batch, d_out, seq // 2).
    """

    def __init__(self, d_in, d_out, kernel_size, drop):
        super().__init__()
        layers = []
        # Only the first convolution changes the channel count.
        for n_in in (d_in, d_out, d_out):
            layers += [
                nn.Conv1d(n_in, d_out, kernel_size=kernel_size, padding='same', stride=1),
                nn.ReLU(),
                nn.Dropout(drop),
            ]
        # reduce sequence size by 2
        layers.append(nn.MaxPool1d(kernel_size=2, stride=2, padding=0))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # TODO: add skip for training speed
        return self.model(x)
        
class Model(nn.Module):
    """Conv1d feature extractor -> bidirectional GRU -> MLP head producing 6 logits.

    Args:
        in_channels: number of input channels.
        gru_hidden_size: hidden size of each GRU direction.
        drop: dropout probability used in the conv blocks and in the head.
    """

    def __init__(self, in_channels=20, gru_hidden_size=128, drop=0.2):
        super().__init__()
        self.pre_out = in_channels * 4
        self.gru_hidden_size = gru_hidden_size

        c_in, c_mid, c_out = in_channels, in_channels * 2, self.pre_out
        # use conv1d as a denoiser; every ConvBlock is followed by a BatchNorm.
        # momentum=None makes BatchNorm keep a cumulative average of its statistics.
        stages = [
            nn.BatchNorm1d(c_in, momentum=None),
            ConvBlock(c_in, c_mid, kernel_size=3, drop=drop),    # block 1
            nn.BatchNorm1d(c_mid, momentum=None),
            ConvBlock(c_mid, c_out, kernel_size=5, drop=drop),   # block 2
            nn.BatchNorm1d(c_out, momentum=None),
            ConvBlock(c_out, c_out, kernel_size=7, drop=drop),   # block 3
            nn.BatchNorm1d(c_out, momentum=None),
        ]
        self.pre_process = nn.Sequential(*stages)

        # TODO: add a learnable first state for GRU or check what is the default
        self.gru = nn.GRU(c_out, gru_hidden_size, num_layers=1, batch_first=True, bidirectional=True)

        head_width = gru_hidden_size * 4
        self.head = nn.Sequential(
            nn.Linear(gru_hidden_size * 2, head_width),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(head_width, 6),
        )

    def forward(self, x: ('batch', 'seq', 'channel')):
        """Map a (batch, seq, channel) input to (batch, 6) logits."""
        # pre_process runs channel-first: (batch, channel, seq) → (batch, channel * 4, seq / 8)
        feats = self.pre_process(x.transpose(1, 2)).transpose(1, 2)

        # GRU: (batch, seq, input_size) → (batch, seq, 2 * hidden_size)
        seq_out, _ = self.gru(feats)
        # Keep the output at the final time step only.
        # NOTE(review): for the reverse direction this position has seen a single
        # step; the final hidden state `h_n` may be what is wanted — confirm.
        last = seq_out[:, -1]

        # head: (batch, 2 * hidden_size) → (batch, 6)
        return self.head(last)

def scope():
    """Smoke test: push one training batch through a fresh `Model` and print the output shape."""
    x, _ = next(iter(train_dataloader))
    r = Model().to(device)(x.to(device))
    print(f'{r.shape=}')
    
# scope()

## transformer

In [11]:
class Transformer(nn.Module):
    """Encoder-only transformer classifier over clumped time steps.

    Args:
        d_chan: number of input channels.
        d_model: token embedding width.
        d_clump: number of consecutive time steps merged into one token; the
            sequence length must be divisible by it.
    """

    def __init__(self, d_chan=20, d_model=256, d_clump=4):
        super().__init__()
        self.d_clump = d_clump

        # Learned token prepended to every sequence; its encoding feeds the head.
        self.start = nn.Parameter(t.randn(1, 1, d_model))
        self.bn = nn.BatchNorm1d(d_chan)
        self.emb = nn.Linear(d_chan * d_clump, d_model)
        # Only the encoder stack is used (num_decoder_layers=0).
        self.llm = nn.Transformer(
            d_model=d_model,
            nhead=8,
            num_encoder_layers=3,
            num_decoder_layers=0,
            dim_feedforward=d_model * 2,
            batch_first=True,
        )
        half = d_model // 2
        self.head = nn.Sequential(
            nn.Linear(d_model, half),
            nn.ReLU(),
            nn.Linear(half, 6),
        )

    def forward(self, x):
        """Map a (batch, seq, channel) input to (batch, 6) logits."""
        # BatchNorm1d expects channel-first input.
        normed = self.bn(x.transpose(1, 2)).transpose(1, 2)
        clumped = einops.rearrange(normed, 'batch (seq clump) channels -> batch seq (clump channels)', clump=self.d_clump)
        tokens = self.emb(clumped)
        # add a fake start token
        start = self.start.expand(tokens.shape[0], -1, -1)
        encoded = self.llm.encoder(t.cat([start, tokens], dim=1))
        return self.head(encoded[:, 0])

def scope():
    """Smoke test: run one training batch through a fresh `Transformer`."""
    val, label = next(iter(train_dataloader))
    model = Transformer().to(device)
    output = model(val.to(device))

# scope()

## separated GRU

In [12]:
class ConvBlock(nn.Module):
    """Three (Conv1d 'same' -> ReLU -> Dropout) stages followed by a stride-2 max-pool.

    Input is (batch, d_in, seq); output is (batch, d_out, seq // 2).
    NOTE: duplicates the ConvBlock defined in the 'conv1d + GRU' section.
    """

    def __init__(self, d_in, d_out, kernel_size, drop):
        super().__init__()
        layers = []
        # Only the first convolution changes the channel count.
        for n_in in (d_in, d_out, d_out):
            layers += [
                nn.Conv1d(n_in, d_out, kernel_size=kernel_size, padding='same', stride=1),
                nn.ReLU(),
                nn.Dropout(drop),
            ]
        # reduce sequence size by 2
        layers.append(nn.MaxPool1d(kernel_size=2, stride=2, padding=0))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # TODO: add skip for training speed
        return self.model(x)
        
class SeparatedGRU_old(nn.Module):
    """Run a shared conv + GRU encoder over each channel group, then classify jointly.

    Each list in GROUPS_IDS selects a group of channels; all groups go through
    the same weights by being folded into the batch axis.

    Args:
        in_channels: channels per group (the length of each GROUPS_IDS entry).
        gru_hidden_size: hidden size of each GRU direction.
        drop: dropout probability.
    """

    def __init__(self, in_channels=5, gru_hidden_size=128, drop=0.2):
        super().__init__()
        self.d_split = len(GROUPS_IDS)
        self.pre_out = in_channels * 4
        self.gru_hidden_size = gru_hidden_size
        hidden = gru_hidden_size

        # use conv1d as a denoiser
        self.pre_process = nn.Sequential(
            nn.BatchNorm1d(in_channels, momentum=None),
            ConvBlock(in_channels, in_channels * 2, kernel_size=3, drop=drop),    # block 1
            ConvBlock(in_channels * 2, self.pre_out, kernel_size=5, drop=drop),   # block 2
            ConvBlock(self.pre_out, self.pre_out, kernel_size=7, drop=drop),      # block 3
        )

        # TODO: add a learnable first state for GRU or check what is the default
        self.gru = nn.GRU(self.pre_out, hidden, num_layers=1, batch_first=True, bidirectional=True)

        self.post_gru = nn.Sequential(
            nn.Dropout(drop),
            nn.Linear(hidden * 2, hidden),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(hidden, hidden),
        )

        self.head = nn.Sequential(
            nn.Linear(hidden * self.d_split, hidden * 2),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(hidden * 2, 6),
        )

    def forward(self, x: ('batch', 'seq', 'channel')):
        """Map a (batch, seq, channel) input to (batch, 6) logits."""
        # separate the input into 4 splits (LP, LL, RP, RR)
        per_group = t.stack([x[:, :, ids] for ids in GROUPS_IDS], dim=0)
        # fold it into batch so we can run in parallel
        folded = einops.rearrange(per_group, 'group batch seq channel -> (group batch) seq channel')

        # pre_process runs channel-first:
        # (group * batch, channel, seq) → (group * batch, channel * 4, seq / 8)
        feats = self.pre_process(folded.transpose(1, 2)).transpose(1, 2)

        # GRU: (group * batch, seq, input_size) → (group * batch, seq, 2 * hidden_size)
        seq_out, _ = self.gru(feats)
        # MLP post GRU, applied to the output at the final time step
        summary = self.post_gru(seq_out[:, -1])

        # unfold the splits
        merged = einops.rearrange(summary, '(group batch) hidden -> batch (hidden group)', group=self.d_split)

        # head: (batch, hidden_size * groups) → (batch, 6)
        return self.head(merged)

def scope():
    """Smoke test: push one training batch through a fresh `SeparatedGRU_old` and print the output shape."""
    # Fixed: this used to build `SeparatedGRU`, which is only defined in a later
    # cell and is not the model this section introduces.
    m = SeparatedGRU_old().to(device)
    x, y = next(train_dataloader.__iter__())
    r = m(x.to(device))
    print(f'{r.shape=}')
    
# scope()

## separated GRU w/ montage

In [13]:
class ConvBlock(nn.Module):
    """Three (Conv1d 'same' -> ReLU -> Dropout) stages followed by a stride-2 max-pool.

    Input is (batch, d_in, seq); output is (batch, d_out, seq // 2).
    NOTE: duplicates the ConvBlock defined in the 'conv1d + GRU' section.
    """

    def __init__(self, d_in, d_out, kernel_size, drop):
        super().__init__()
        layers = []
        # Only the first convolution changes the channel count.
        for n_in in (d_in, d_out, d_out):
            layers += [
                nn.Conv1d(n_in, d_out, kernel_size=kernel_size, padding='same', stride=1),
                nn.ReLU(),
                nn.Dropout(drop),
            ]
        # reduce sequence size by 2
        layers.append(nn.MaxPool1d(kernel_size=2, stride=2, padding=0))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # TODO: add skip for training speed
        return self.model(x)
        
class SeparatedGRU(nn.Module):
    """Shared conv + GRU encoder over per-group channel differences, then a joint head.

    Each list in GROUPS_IDS is an ordered chain of channels; `montage` replaces
    it by the differences between neighbouring channels, so a chain of 5
    channels yields the 4 inputs matching the default `in_channels`.

    Args:
        in_channels: difference channels per group (group length minus one).
        gru_hidden_size: hidden size of each GRU direction.
        drop: dropout probability (0 disables dropout).
    """

    def __init__(self, in_channels=4, gru_hidden_size=128, drop=0.):
        super().__init__()
        self.d_split = len(GROUPS_IDS)
        self.pre_out = in_channels * 4
        self.gru_hidden_size = gru_hidden_size
        hidden = gru_hidden_size

        # use conv1d as a denoiser
        self.pre_process = nn.Sequential(
            ConvBlock(in_channels, in_channels * 2, kernel_size=3, drop=drop),    # block 1
            ConvBlock(in_channels * 2, self.pre_out, kernel_size=5, drop=drop),   # block 2
            ConvBlock(self.pre_out, self.pre_out, kernel_size=7, drop=drop),      # block 3
        )

        # TODO: add a learnable first state for GRU or check what is the default
        self.gru = nn.GRU(self.pre_out, hidden, num_layers=1, batch_first=True, bidirectional=True)

        self.post_gru = nn.Sequential(
            nn.Dropout(drop),
            nn.Linear(hidden * 2, hidden),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(hidden, hidden),
        )

        self.head = nn.Sequential(
            nn.Linear(hidden * self.d_split, hidden * 2),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(hidden * 2, 6),
        )

    def montage(self, x):
        """Build neighbouring-channel differences per group, folded into the batch axis.

        (batch, seq, channel) → (groups * batch, seq, group_len - 1)
        """
        diffs = []
        for ids in GROUPS_IDS:
            chain = x[:, :, ids]
            # channel k minus channel k + 1 along the chain
            diffs.append(chain[:, :, :-1] - chain[:, :, 1:])
        stacked = t.stack(diffs, dim=0)
        return einops.rearrange(stacked, 'group batch seq channel -> (group batch) seq channel')

    def forward(self, x: ('batch', 'seq', 'channel')):
        """Map a (batch, seq, channel) input to (batch, 6) logits."""
        # separate the input into 4 montages (LP, LL, RP, RR)
        folded = self.montage(x)
        # pre_process runs channel-first:
        # (group * batch, channel, seq) → (group * batch, channel * 4, seq / 8)
        feats = self.pre_process(folded.transpose(1, 2)).transpose(1, 2)
        # GRU: (group * batch, seq, input_size) → (group * batch, seq, 2 * hidden_size)
        seq_out, _ = self.gru(feats)
        # MLP post GRU, applied to the output at the final time step
        summary = self.post_gru(seq_out[:, -1])
        # unfold the splits
        merged = einops.rearrange(summary, '(group batch) hidden -> batch (hidden group)', group=self.d_split)
        # head: (batch, hidden_size * groups) → (batch, 6)
        return self.head(merged)

def scope():
    """Smoke test: push one training batch through a fresh `SeparatedGRU` and print the output shape."""
    x, _ = next(iter(train_dataloader))
    r = SeparatedGRU().to(device)(x.to(device))
    print(f'{r.shape=}')
    
# scope()

# train

In [14]:
# Release memory that a previously built model may still hold before creating a new one.
GC()
# Pick the architecture to train; the alternatives are kept commented out.
# model = Model().to(device)
# model = Transformer().to(device)
model = SeparatedGRU().to(device)
# TODO: try cranking the weight decay
# TODO: try using a scheduler
opt = t.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)
# Total number of elements over all of the model's parameters.
print(f'model has {sum(p.numel() for p in model.parameters())} params')

model has 303582 params


In [15]:
def train(model, opt, wnb=True, do_eval=True, epochs=1000):
    """Train `model` on `train_dataloader` with a KL-divergence loss.

    Every batch read from disk is kept in a 3-slot replay buffer and each
    buffered batch is trained on again (with fresh noise augmentation) whenever
    a new batch arrives, so the GPU does more work per disk read.

    Args:
        model: network producing 6 logits per sample.
        opt: optimiser over `model.parameters()`.
        wnb: log to Weights & Biases when truthy.
        do_eval: when truthy (and `wnb` is set) log the loss on one fixed
            held-out batch and one fixed training batch after every epoch.
        epochs: number of passes over `train_dataloader`.

    Side effects:
        Saves the model's state dict to `weights/gru-4-splits_<timestamp>.pt`
        after every epoch. The W&B run is closed even if training is
        interrupted or raises.
    """
    model.train()
    # Fixed reference batches for the per-epoch validation numbers.
    validation_test, validation_test_label = next(iter(test_dataloader))
    validation_train, validation_train_label = next(iter(train_dataloader))

    # Stateless loss module: build it once instead of on every step.
    kl_loss = nn.KLDivLoss(reduction="batchmean")
    # Avoid losing a whole epoch because the checkpoint directory is missing.
    os.makedirs('weights', exist_ok=True)

    if wnb: wandb.init(project='kaggle-eeg-rc')
    try:
        for _ in range(epochs):
            replay_buffer, maxi = [], 3
            tq = tqdm(train_dataloader)
            for x_train, y_train in tq:
                replay_buffer.append((x_train, y_train))
                replay_buffer = replay_buffer[-maxi:]
                for x_replay, y_replay in replay_buffer: # burn more GPU otherwise we bottleneck on disk IO
                    x_aug = augment_data(x_replay)
                    logs = model(x_aug.to(device)).log_softmax(-1)
                    loss = kl_loss(logs, y_replay.to(device))
                    opt.zero_grad()
                    loss.backward()
                    opt.step()
                    tq.set_description(f'loss = {loss:.4f}')
                    if wnb: wandb.log({'loss': loss.item()})

            now = datetime.now().strftime("%Y-%m-%d_%Hh%M")
            if wnb and do_eval:
                wandb.log({'val_test':   eval(model, validation_test, validation_test_label, do_eval=True).item(), 'now': f'{now}'})
                wandb.log({'val_train':  eval(model, validation_train, validation_train_label, do_eval=True).item(), 'now': f'{now}'})
                # Validation must not leave the model in eval mode for the next epoch.
                model.train()
            t.save(model.state_dict(), f'weights/gru-4-splits_{now}.pt')
    finally:
        # Runs on KeyboardInterrupt too, so an interrupted cell does not leave the run open.
        if wnb: wandb.finish()

# Start training with Weights & Biases logging; interrupt the cell to stop early.
train(model, opt, wnb=True)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpeluche[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2255 [00:00<?, ?it/s]

KeyboardInterrupt: 

# save / load

In [None]:
# t.save(model.state_dict(),'model-weights4.pt')

In [None]:
# model = SeparatedGRU().to(device)
# model.load_state_dict(t.load('weights/gru-4-splits_2024-03-24_15h03.pt', map_location=device))

In [None]:
# Compare the loss on one training batch and one held-out batch, with the
# forward pass run in eval mode and then in train mode.
x_train, y_train = next(train_dataloader.__iter__())
x_val, y_val = next(test_dataloader.__iter__())

# Fixed: the first label read 'eva.eval()' although it reports the training batch.
print(f'train.eval(): {eval(model, x_train, y_train, do_eval=True)}')
print(f'val.eval():  {eval(model, x_val, y_val, do_eval=True)}')
print('--')
print(f'train.train(): {eval(model, x_train, y_train, do_eval=False)}')
print(f'val.train():  {eval(model, x_val, y_val, do_eval=False)}')

train.eval(): 1.266937549619349
val.eval():  1.5727584185788916
--
train.train(): 0.37932582726234854
val.train():  1.5525816245077804


# submit

In [None]:
@t.no_grad()
def submit(model, test_dataloader, test_df):
    """Predict class probabilities for every test batch and write `submission.csv`.

    Args:
        model: network producing one logit per TARGETS column; it is switched
            to eval mode and left there.
        test_dataloader: iterable yielding input batches (inputs only, no labels).
        test_df: DataFrame with an 'eeg_id' column, one row per predicted sample,
            in the same order as the batches.

    Raises:
        ValueError: if the number of predictions differs from `len(test_df)`.

    Side effects:
        Writes 'submission.csv' to the working directory, prints its shape and
        displays its first rows.
    """
    model.eval()
    res = [model(batch.to(device)).softmax(-1).cpu() for batch in test_dataloader]
    res = t.cat(res, dim=0)
    if len(res) != len(test_df):
        raise ValueError(f'got {len(res)} predictions for {len(test_df)} rows in test_df')

    sub = test_df[["eeg_id"]].copy()
    # Hand pandas a NumPy array instead of relying on implicit tensor conversion.
    sub[TARGETS] = res.numpy()
    sub.to_csv('submission.csv',index=False)
    print('Submission shape',sub.shape)
    display(sub.head())