# Preprocess data
Split each EEG recording into fixed-length chunks, then normalize, band-pass filter, and save each chunk.

In [1]:
from tqdm.notebook import tqdm
from scipy.signal import butter, lfilter
import pandas as pd
import numpy as np
import torch as t

In [2]:
# Root folder of the competition data; the raw EEG parquet files live in
# its `train_eegs/` sub-folder.
BASE_PATH = './hms-harmful-brain-activity-classification/'
DATA_PATH = BASE_PATH + 'train_eegs/'
# Folder (used as a filename prefix) for the preprocessed tensors.
OUT_DIR = './eeg-filtered/'

In [3]:
def normalize_signals(data):
    """Standardize `data` along axis 0 (zero mean, unit standard deviation).

    The mean and standard deviation are computed over axis 0 and broadcast
    back over the input. Wherever the standard deviation is not greater than
    1e-10 (e.g. a constant column) the divisor is replaced by 1, so those
    entries come out as 0 instead of dividing by (almost) zero.

    Args:
        data: NumPy array to normalize.

    Returns:
        Array with the same shape as `data`.
    """
    centered = data - data.mean(axis=0, keepdims=True)
    spread = data.std(axis=0, keepdims=True)
    # Fall back to a divisor of 1 where the spread is negligible.
    divisor = np.where(spread > 1e-10, spread, 1)
    return centered / divisor

def butter_bandpass_filter(data, lowcut=1., highcut=40., fs=200, order=6, axis=-1):
    """Apply a causal Butterworth band-pass filter to `data`.

    Args:
        data: Array-like signal(s) to filter.
        lowcut: Lower cut-off frequency, in the same unit as `fs`.
        highcut: Upper cut-off frequency, in the same unit as `fs`.
        fs: Sampling frequency of `data`.
        order: Order passed to `scipy.signal.butter`.
        axis: Axis of `data` along which to filter. Defaults to the last
            axis (`lfilter`'s own default), so existing callers are
            unaffected; pass `axis=0` for arrays laid out as
            (samples, channels).

    Returns:
        The filtered array, same shape as `data`.

    Raises:
        ValueError: Propagated from `scipy.signal.butter` when the
            normalized cut-offs are not valid.
    """
    # `butter` expects cut-offs normalized by the Nyquist frequency.
    nyquist = 0.5 * fs
    # NOTE(review): the scipy docs recommend second-order sections
    # (output='sos' + sosfilt) over the (b, a) form for better numerical
    # stability; kept as-is here so the filtered values do not change.
    b, a = butter(order, [lowcut / nyquist, highcut / nyquist], btype='band')
    return lfilter(b, a, data, axis=axis)

def filters(data):
    """Normalize `data` and band-pass filter it along axis 0.

    `data` is treated as (samples, channels): `normalize_signals` reduces
    over axis 0, and the band-pass filter is applied along that same axis.
    A 1-D input is handled as a single signal.

    Args:
        data: NumPy array of signal values.

    Returns:
        Array with the same shape as `data`.
    """
    # TODO: clip ?
    # TODO: log scale ?
    data = normalize_signals(data)
    # `butter_bandpass_filter` filters along the LAST axis. Calling it on a
    # (samples, channels) array directly would run the filter across
    # channels instead of through time, so transpose in and out.
    data = butter_bandpass_filter(data.T).T
    return data

In [4]:
def preprocess(df):
    """Cut, filter and save one EEG window per row of `df`.

    For every `eeg_id` the parquet file `{DATA_PATH}{eeg_id}.parquet` is read
    once. For each of its rows, a window of 10,000 samples starting at
    `eeg_label_offset_seconds * 200` is sliced out, gaps are forward-filled
    (remaining NaNs become 0), the window is passed through `filters`, and
    the result is saved as a float tensor to
    `{OUT_DIR}{eeg_id}_{eeg_sub_id}.pt`.

    Args:
        df: DataFrame with the columns `eeg_id`, `eeg_sub_id` and
            `eeg_label_offset_seconds`.

    Returns:
        None. The output is the set of `.pt` files written to disk.
    """
    import os  # local import: only needed to create the output folder

    sample_rate = 200   # samples per second assumed for the recordings
    duration = 10_000   # window length in samples (50 s at 200 samples/s)

    # `t.save` does not create missing folders; make sure the target exists.
    os.makedirs(os.path.dirname(OUT_DIR) or '.', exist_ok=True)

    for eeg_id, group in tqdm(df.groupby('eeg_id')):
        raw_eeg = pd.read_parquet(f'{DATA_PATH}{eeg_id}.parquet')
        # Iterate the two needed columns directly: `iterrows` builds one
        # Series per row and may upcast integer ids to float, which would
        # leak into the output filename.
        windows = zip(group['eeg_sub_id'], group['eeg_label_offset_seconds'])
        for eeg_sub_id, offset_seconds in windows:
            # Round instead of truncating so float noise (e.g. 57.999...)
            # does not shift the window by one sample.
            offset = int(round(offset_seconds * sample_rate))
            eeg = raw_eeg.iloc[offset:offset + duration]
            # Forward-fill gaps; leading NaNs have no predecessor and become 0.
            eeg = eeg.ffill(axis=0).fillna(0)
            filtered_eeg = filters(eeg.values)
            data = t.tensor(filtered_eeg).float()
            t.save(data, f'{OUT_DIR}{eeg_id}_{eeg_sub_id}.pt')

# Load the label table (one row per labelled EEG window) and preprocess
# every window it lists; this reads the raw parquet files and writes the
# resulting tensors to disk.
df = pd.read_csv(f'{BASE_PATH}train.csv')
preprocess(df)

  0%|          | 0/17089 [00:00<?, ?it/s]