In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.preprocessing import normalize, minmax_scale

import os
import gc
import random
from pathlib import Path

## Load all EEGs

In [2]:
PATH = Path('train_eegs/')
files = os.listdir(PATH)
metadata = pd.read_csv('train.csv')
test_metadata = pd.read_csv('test.csv')

In [3]:
eeg_ids = metadata.eeg_id.unique()
label_ids = metadata.label_id.unique()
eeg_data = np.zeros((len(label_ids), 20*50, 20))
eeg_labels = np.zeros((len(label_ids), 6))

In [4]:
num_nans = []
counter = 0  # for determining how many eegs we keep
DOWN_RATE = 5


for i, eeg_id in enumerate(eeg_ids):
    eeg = pd.read_parquet(PATH / (str(eeg_id)+'.parquet')).to_numpy()

    # get all rows with this eeg_id and randomly select one subid
    eeg_selection = metadata[metadata.eeg_id == eeg_id]
    subids = eeg_selection.eeg_sub_id.unique()
    for subid in subids:
        # get the offset for this subid
        eeg_offset = int(eeg_selection[eeg_selection.eeg_sub_id == subid].eeg_label_offset_seconds)

        # get eeg slice and minmax normalize
        eeg_temp = eeg[eeg_offset*200:(eeg_offset+50)*200]
        eeg_temp = minmax_scale(eeg_temp, axis=0)

        # count nans
        num_nans.append(np.sum(np.isnan(eeg_temp)))

        if np.sum(np.isnan(eeg_temp)) == 0:
            # add to eeg_data
            eeg_temp = sp.signal.decimate(eeg_temp, DOWN_RATE, axis=0, zero_phase=True)
            eeg_data[counter,:,:] = eeg_temp
            eeg_label = eeg_selection[eeg_selection.eeg_sub_id == subid][['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']].to_numpy()
            eeg_labels[counter] = normalize(eeg_label, norm='l1')
            counter += 1
        elif np.sum(np.isnan(eeg_temp)) <= 20:
            # interpolate missing values
            eeg_temp = pd.DataFrame(eeg_temp).interpolate().to_numpy()
            eeg_temp = sp.signal.decimate(eeg_temp, DOWN_RATE, axis=0, zero_phase=True)
            eeg_data[counter,:,:] = eeg_temp
            eeg_label = eeg_selection[eeg_selection.eeg_sub_id == subid][['seizure_vote', 'lpd_vote', 'gpd_vote', 'lrda_vote', 'grda_vote', 'other_vote']].to_numpy()
            eeg_labels[counter] = normalize(eeg_label, norm='l1')
            counter += 1
        else:
            print(f'Found {np.sum(np.isnan(eeg_temp))} nans in {eeg_id} at offset {eeg_offset}')

  eeg_offset = int(eeg_selection[eeg_selection.eeg_sub_id == subid].eeg_label_offset_seconds)


ValueError: could not broadcast input array from shape (2000,20) into shape (1000,20)

In [None]:
print(label_ids.shape[0], counter)

106800 106377


In [None]:
# delete all entries in 1st dimension with all zeros
# eeg_data = eeg_data[~np.all(eeg_labels == 0, axis=1)]
# eeg_labels = eeg_labels[~np.all(eeg_labels == 0, axis=1)]

In [None]:
del eeg
gc.collect()

# delete empty rows
eeg_data = eeg_data[:counter,:,:]
eeg_labels = eeg_labels[:counter,:]

In [None]:
np.save(f'scaled_eeg_data_{DOWN_RATE}dsr.npy', eeg_data)
np.save(f'scaled_eeg_labels_{DOWN_RATE}dsr.npy', eeg_labels)

In [None]:
del eeg_data, eeg_labels,
gc.collect()

0