# Dataset

PyTorch의 EEG 데이터를 Dataset class 및 DataLoader class로 처리해보는 노트북

-----

## 환경 구성

In [1]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load some packages
import os
import glob
import json

import matplotlib.pyplot as plt
import pprint

import numpy as np
import random
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

# custom package
from utils.eeg_dataset import *

In [3]:
# Other settings
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # cleaner text

plt.style.use('default') 
# ['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 
#  'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 
#  'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 
#  'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 
#  'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']

plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams["font.family"] = 'NanumGothic' # for Hangul in Windows

In [4]:
print('PyTorch version:', torch.__version__)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if torch.cuda.is_available(): print('cuda is available.')
else: print('cuda is unavailable.')

PyTorch version: 1.9.0
cuda is available.


In [5]:
# Data file path
root_path = r'dataset/02_Curated_Data/'

In [6]:
meta_path = os.path.join(root_path, 'metadata_debug.json')
with open(meta_path, 'r') as json_file:
    metadata = json.load(json_file)

pprint.pprint(metadata[0])

{'age': 78,
 'birth': '1940-06-02',
 'dx1': 'mci_rf',
 'edfname': '00001809_261018',
 'events': [[0, 'Start Recording'],
            [0, 'New Montage - Montage 002'],
            [36396, 'Eyes Open'],
            [72518, 'Eyes Closed'],
            [73862, 'Eyes Open'],
            [75248, 'Eyes Closed'],
            [76728, 'swallowing'],
            [77978, 'Eyes Open'],
            [79406, 'Eyes Closed'],
            [79996, 'Photic On - 3.0 Hz'],
            [80288, 'Eyes Open'],
            [81296, 'Eyes Closed'],
            [82054, 'Photic Off'],
            [84070, 'Photic On - 6.0 Hz'],
            [84488, 'Eyes Open'],
            [85538, 'Eyes Closed'],
            [86086, 'Photic Off'],
            [88144, 'Photic On - 9.0 Hz'],
            [90160, 'Photic Off'],
            [91458, 'Eyes Open'],
            [92218, 'Photic On - 12.0 Hz'],
            [92762, 'Eyes Closed'],
            [94198, 'Photic Off'],
            [94742, 'Eyes Open'],
            [95708, 'Eyes Close

-----

## Data Filtering by Diagnosis

#### Non-Vascular Dementia, Non-Vascular MCI, Normal

In [7]:
diagnosis_filter = [
    # Normal
    {'type': 'Normal',
     'include': ['normal'], 
     'exclude': []},
    # Non-vascular MCI
    {'type': 'Non-vascular MCI',
     'include': ['mci'], 
     'exclude': ['mci_vascular']},
    # Non-vascular dementia
    {'type': 'Non-vascular dementia',
     'include': ['dementia'], 
     'exclude': ['vd']},
]

def generate_class_label(label):
    for c, f in enumerate(diagnosis_filter):
        # inc = set(f['include']) & set(label) == set(f['include'])
        inc = len(set(f['include']) & set(label)) > 0        
        exc = len(set(f['exclude']) & set(label)) == 0
        if  inc and exc:
            return (c, f['type'])
    return (-1, 'The others')

class_label_to_type = [d_f['type'] for d_f in diagnosis_filter]
print('class_label_to_type:', class_label_to_type)

class_label_to_type: ['Normal', 'Non-vascular MCI', 'Non-vascular dementia']


In [8]:
splitted_metadata = [[] for i in diagnosis_filter]

for m in metadata:
    c, n = generate_class_label(m['label'])
    if c >= 0:
        m['class_type'] = n
        m['class_label'] = c
        splitted_metadata[c].append(m)
        
for i, split in enumerate(splitted_metadata):
    if len(split) == 0:
        print(f'(Warning) Split group {i} has no data.')
    else:
        print(f'- There are {len(split):} data belonging to {split[0]["class_type"]}')

- There are 463 data belonging to Normal
- There are 347 data belonging to Non-vascular MCI
- There are 229 data belonging to Non-vascular dementia


-----

## Configure the Train, Validation, and Test Splits

#### Split the filtered dataset and shuffle them

In [9]:
# Train : Val : Test = 8 : 1 : 1
ratio1 = 0.8
ratio2 = 0.1

metadata_train = []
metadata_val = []
metadata_test = []

for split in splitted_metadata:
    random.shuffle(split)
    
    n1 = round(len(split) * ratio1)
    n2 = n1 + round(len(split) * ratio2)

    metadata_train.extend(split[:n1])
    metadata_val.extend(split[n1:n2])
    metadata_test.extend(split[n2:])

random.shuffle(metadata_train)
random.shuffle(metadata_val)
random.shuffle(metadata_test)

print('Train data size\t\t:', len(metadata_train))
print('Validation data size\t:', len(metadata_val))
print('Test data size\t\t:', len(metadata_test))

print('\n', '--- Recheck ---', '\n')
train_class_nums = np.zeros((len(class_label_to_type)), dtype=np.int32)
for m in metadata_train:
    train_class_nums[m['class_label']] += 1

val_class_nums = np.zeros((len(class_label_to_type)), dtype=np.int32)
for m in metadata_val:
    val_class_nums[m['class_label']] += 1

test_class_nums = np.zeros((len(class_label_to_type)), dtype=np.int32)
for m in metadata_test:
    test_class_nums[m['class_label']] += 1

print('Train data label distribution\t:', train_class_nums, train_class_nums.sum())
print('Val data label distribution\t:', val_class_nums, val_class_nums.sum())
print('Test data label distribution\t:', test_class_nums, test_class_nums.sum())

Train data size		: 831
Validation data size	: 104
Test data size		: 104

 --- Recheck --- 

Train data label distribution	: [370 278 183] 831
Val data label distribution	: [46 35 23] 104
Test data label distribution	: [47 34 23] 104


-----

## Test TorchVision Transform

#### Random crop

In [10]:
for i in range(2):
    dataset = EEGDataset(root_path, metadata_train, EEGRandomCrop(3))
    print(dataset[0]['signal'])
    print('\n')
    print('-' * 100)
    print('\n')

[[-22. -25. -26.]
 [ -9.  -9.  -7.]
 [ -5.  -4.  -1.]
 [  3.   3.   4.]
 [  0.   0.  -2.]
 [  4.   9.  10.]
 [  9.   7.   7.]
 [  1.   2.  -1.]
 [  6.   7.   5.]
 [  6.   6.   5.]
 [ -8.  -8.  -7.]
 [-10.  -8.  -3.]
 [ -4.  -3.  -2.]
 [ -5.  -7.  -5.]
 [  5.   8.   7.]
 [  4.   6.   4.]
 [ -3.  -4.  -4.]
 [  1.  -2.  -2.]
 [  9.   7.   3.]
 [-21. -21. -18.]
 [  0.   1.   2.]]


----------------------------------------------------------------------------------------------------


[[ 29.  27.  26.]
 [  3.   2.   0.]
 [ -1.  -1.  -2.]
 [  2.   2.   1.]
 [ -7.  -7.  -5.]
 [-24. -24. -27.]
 [ -6.  -7.  -7.]
 [ -4.  -4.  -2.]
 [ -7.  -6.  -6.]
 [ -8.  -8.  -9.]
 [ 28.  27.  28.]
 [ -1.   1.   0.]
 [ -5.  -5.  -3.]
 [ 19.  18.  20.]
 [ -9.  -8.  -5.]
 [ -3.  -4.  -5.]
 [  6.   4.   1.]
 [ -1.   0.  -2.]
 [ -6.  -4.  -5.]
 [ 58. 177. 197.]
 [ -1.  -1.  -1.]]


----------------------------------------------------------------------------------------------------




#### Normalization per signal

In [11]:
dataset = EEGDataset(root_path, metadata_train, EEGNormalizePerSignal())
print(dataset[0])

print()
print('-' * 100)
print()

print('Mean:', np.mean(dataset[0]['signal'], axis=1))
print('Std:', np.std(dataset[0]['signal'], axis=1))

{'signal': array([[-9.4980073e-01, -7.6441890e-01, -5.1724309e-01, ...,
        -9.9099672e-01, -1.0527906e+00, -1.0321927e+00],
       [ 5.3544533e-01,  1.2499492e+00,  1.2499492e+00, ...,
        -7.1493649e-01, -7.1493649e-01, -5.3631055e-01],
       [ 1.5363599e+00,  1.7070744e+00,  8.5350204e-01, ...,
        -7.0310736e-05,  1.7064416e-01,  5.1207310e-01],
       ...,
       [ 8.6157030e-01,  5.1706815e-01,  1.7256606e-01, ...,
        -1.0331913e+00,  3.1498959e-04,  8.6157030e-01],
       [-4.7229922e-01,  1.2586252e-01,  1.5734471e-01, ...,
         1.6527491e+00,  1.5740435e+00,  1.5110791e+00],
       [ 1.2456863e-03,  1.4145758e+00,  1.4145758e+00, ...,
         1.2456863e-03,  7.0791072e-01, -1.4120845e+00]], dtype=float32), 'age': 56, 'class_label': 0, 'metadata': {'serial': '00774', 'edfname': '01063145_220915', 'birth': '1958-12-09', 'record': '2015-09-22T13:11:12', 'age': 56, 'dx1': 'cb_normal', 'label': ['normal', 'cb_normal'], 'events': [[0, 'Start Recording'], [0, '

#### Age normalization

In [12]:
ages = []
for m in metadata_train:
    ages.append(m['age'])

ages = np.array(ages)
age_mean = np.mean(ages)
age_std = np.std(ages)

print('Age mean and standard deviation:\t', age_mean, ',\t', age_std)

print()
print('-' * 100)
print()

print('before:')
dataset = EEGDataset(root_path, metadata_train, None)
for i in range(5):
    print(dataset[i]['age'])

print()
print('-' * 100)
print()

print('after:')
dataset = EEGDataset(root_path, metadata_train, EEGNormalizeAge(mean=age_mean, std=age_std))
for i in range(5):
    print(dataset[i]['age'])

Age mean and standard deviation:	 70.2202166064982 ,	 9.617642005132124

----------------------------------------------------------------------------------------------------

before:
56
58
76
59
56

----------------------------------------------------------------------------------------------------

after:
-1.4785554072530993
-1.2706042278628438
0.6009563866494568
-1.166628638167716
-1.4785554072530993


#### Drop EKG channel

In [11]:
dataset = EEGDataset(root_path, metadata_train, None)
print('before:', dataset[0]['signal'].shape)
print(dataset[0]['signal'])

print()
print('-' * 100)
print()

dataset = EEGDataset(root_path, metadata_train, EEGDropEKGChannel())
print('after:', dataset[0]['signal'].shape)
print(dataset[0]['signal'])

before: (21, 120200)
[[-39.  -9.  -9. ...   3.   3.   1.]
 [ 36.  40.  40. ...  11.  11.   9.]
 [-32. -34. -33. ...  -6.  -7.  -6.]
 ...
 [ 13.  14.  15. ...  -2.  -1.   0.]
 [-23. -16.   7. ...   6.  35. -27.]
 [  0.   0.  -2. ...   0.   2.   2.]]

----------------------------------------------------------------------------------------------------

after: (20, 120200)
[[-39.  -9.  -9. ...   3.   3.   1.]
 [ 36.  40.  40. ...  11.  11.   9.]
 [-32. -34. -33. ...  -6.  -7.  -6.]
 ...
 [ 26.  25.  26. ...   2.   3.   3.]
 [ 13.  14.  15. ...  -2.  -1.   0.]
 [  0.   0.  -2. ...   0.   2.   2.]]


#### Drop photic stimulation channel

In [13]:
dataset = EEGDataset(root_path, metadata_train, None)
print('before:', dataset[0]['signal'].shape)
print(dataset[0]['signal'])

print()
print('-' * 100)
print()

dataset = EEGDataset(root_path, metadata_train, EEGDropPhoticChannel())
print('after:', dataset[0]['signal'].shape)
print(dataset[0]['signal'])

before: (21, 121400)
[[-46. -37. -25. ... -48. -51. -50.]
 [  6.  14.  14. ...  -8.  -8.  -6.]
 [  9.  10.   5. ...   0.   1.   3.]
 ...
 [  5.   3.   1. ...  -6.   0.   5.]
 [-30.   8.  10. ... 105. 100.  96.]
 [  0.   2.   2. ...   0.   1.  -2.]]

----------------------------------------------------------------------------------------------------

after: (20, 121400)
[[-46. -37. -25. ... -48. -51. -50.]
 [  6.  14.  14. ...  -8.  -8.  -6.]
 [  9.  10.   5. ...   0.   1.   3.]
 ...
 [  4.   7.   5. ...  -6.  -1.  -1.]
 [  5.   3.   1. ...  -6.   0.   5.]
 [-30.   8.  10. ... 105. 100.  96.]]


#### To Tensor

In [14]:
dataset = EEGDataset(root_path, metadata_train, None)
print('before:')
print(dataset[0])

print()
print('-' * 100)
print()

dataset = EEGDataset(root_path, metadata_train, EEGToTensor())
print('before:')
print(dataset[0])

before:
{'signal': array([[-46., -37., -25., ..., -48., -51., -50.],
       [  6.,  14.,  14., ...,  -8.,  -8.,  -6.],
       [  9.,  10.,   5., ...,   0.,   1.,   3.],
       ...,
       [  5.,   3.,   1., ...,  -6.,   0.,   5.],
       [-30.,   8.,  10., ..., 105., 100.,  96.],
       [  0.,   2.,   2., ...,   0.,   1.,  -2.]], dtype=float32), 'age': 56, 'class_label': 0, 'metadata': {'serial': '00774', 'edfname': '01063145_220915', 'birth': '1958-12-09', 'record': '2015-09-22T13:11:12', 'age': 56, 'dx1': 'cb_normal', 'label': ['normal', 'cb_normal'], 'events': [[0, 'Start Recording'], [0, 'New Montage - Pz Montage'], [352, 'Eyes Open'], [5948, 'Eyes Closed'], [13634, 'Eyes Open'], [18212, 'Eyes Closed'], [24386, 'Eyes Open'], [30139, 'Eyes Closed'], [36398, 'Eyes Open'], [42152, 'Eyes Closed'], [48662, 'Eyes Open'], [54541, 'Eyes Closed'], [60254, 'Eyes Open'], [66344, 'Eyes Closed'], [72224, 'Eyes Open'], [79364, 'Eyes Closed'], [84698, 'Eyes Open'], [90074, 'Eyes Closed'], [96290,

#### Short time Fourier transform (STFT or spectrogram)

In [16]:
composed = transforms.Compose([EEGToTensor(), EEGSpectrogram(n_fft=200, complex_mode='as_real')])
dataset = EEGDataset(root_path, metadata_train, composed)
print(dataset[0]['signal'].shape, dataset[0]['signal'].dtype, type(dataset[0]['signal']))
print(dataset[0]['signal'][:, :, 10])

print()
print('-' * 100)
print()

composed = transforms.Compose([EEGToTensor(), EEGSpectrogram(n_fft=200, complex_mode='complex')])
dataset = EEGDataset(root_path, metadata_train, composed)
print(dataset[0]['signal'].shape, dataset[0]['signal'].dtype, type(dataset[0]['signal']))
print(dataset[0]['signal'][:, :, 10])

print()
print('-' * 100)
print()

composed = transforms.Compose([EEGToTensor(), EEGSpectrogram(n_fft=200, complex_mode='remove')])
dataset = EEGDataset(root_path, metadata_train, composed)
print(dataset[0]['signal'].shape, dataset[0]['signal'].dtype, type(dataset[0]['signal']))
print(dataset[0]['signal'][:, :, 10])

torch.Size([42, 101, 2429]) torch.float32 <class 'torch.Tensor'>
tensor([[ 7.9370e+03,  5.7561e+03,  1.6963e+03,  ..., -4.3464e+00,
         -2.4824e+00, -9.0000e+00],
        [ 2.5200e+02,  7.9001e+02,  2.5220e+02,  ...,  6.7221e-01,
         -1.7404e+00,  6.0000e+00],
        [ 1.6000e+02, -8.1314e+01,  1.3739e+01,  ..., -2.4583e+00,
          6.5933e+00,  0.0000e+00],
        ...,
        [ 0.0000e+00,  1.2316e+02,  4.2250e+02,  ..., -3.9330e+00,
          3.8433e+00,  0.0000e+00],
        [ 0.0000e+00, -1.9486e+03,  1.7935e+03,  ...,  3.8528e+00,
          7.7595e+00,  0.0000e+00],
        [ 0.0000e+00, -5.7612e+00,  2.5108e+01,  ..., -3.8516e+00,
          2.0368e+00,  0.0000e+00]])

----------------------------------------------------------------------------------------------------

torch.Size([21, 101, 2429]) torch.complex64 <class 'torch.Tensor'>
tensor([[ 7.9370e+03+0.0000e+00j,  5.7561e+03-4.0513e+03j,
          1.6963e+03-5.5050e+03j,  ...,
         -4.3464e+00-3.1604e-01j, 

In [66]:
composed = transforms.Compose([EEGToTensor(), EEGSpectrogram(n_fft=200, complex_mode='complex')])
dataset1 = EEGDataset(root_path, metadata_train, composed)

composed = transforms.Compose([EEGToTensor()])
dataset2 = EEGDataset(root_path, metadata_train, composed)

diff = torch.istft(dataset1[0]['signal'], n_fft=200) - dataset2[0]['signal']
print(torch.sum(diff > 1e-4))
print(torch.sum(diff > 1e-6))
print(torch.sum(diff > 1e-8))
print(torch.mul(*diff.shape))

tensor(0)
tensor(107175)
tensor(633266)
tensor(636218)
tensor(2549400)


In [67]:
composed = transforms.Compose([EEGNormalizeAge(mean=age_mean, std=age_std),
                               EEGDropPhoticChannel(),
                               EEGRandomCrop(crop_length=200*60), # 1 minutes
                               EEGNormalizePerSignal(),
                               EEGToTensor(),
                               EEGSpectrogram(n_fft=200, complex_mode='complex', hop_length=200 // 2)])
dataset = EEGDataset(root_path, metadata_train, composed)
print(dataset[0]['signal'].shape, dataset[0]['signal'].dtype, type(dataset[0]['signal']))
print(dataset[0]['signal'][:, :, 10])

torch.Size([20, 101, 119]) torch.complex64 <class 'torch.Tensor'>
tensor([[-1.2514e+02+0.0000e+00j, -1.2403e+00-5.1921e+01j,
          5.4556e+00-2.6447e+01j,  ...,
          1.0979e+00+6.1226e-04j,  1.0772e+00-3.6137e-02j,
          1.0872e+00+0.0000e+00j],
        [-1.6511e+02+0.0000e+00j, -2.4113e+00+1.6019e+01j,
         -1.3783e+01-6.9509e-01j,  ...,
          1.1358e+00+5.4370e-02j,  9.1314e-01-5.6526e-01j,
          1.0455e+00+0.0000e+00j],
        [ 2.6962e+01+0.0000e+00j, -2.5124e+00-1.7830e+01j,
         -4.1437e+01-1.7742e+01j,  ...,
          1.5830e+00-2.2078e-02j, -9.3204e-01-4.1704e-01j,
          1.7951e-01+0.0000e+00j],
        ...,
        [-6.1104e+01+0.0000e+00j, -2.2689e+01-2.1311e+01j,
         -2.7893e-01-1.4965e+01j,  ...,
          3.6751e-01+2.7100e-01j, -1.2049e-01+5.2408e-01j,
          7.9562e-01+0.0000e+00j],
        [ 1.7993e+01+0.0000e+00j,  4.1547e+01+6.5779e+00j,
         -1.4255e+01+2.6794e+01j,  ...,
         -1.5551e+00-1.3285e-01j, -1.6725e+00+2.30

#### Compose some at once

In [None]:
composed = transforms.Compose([EEGNormalizeAge(mean=age_mean, std=age_std),
                               EEGDropPhoticChannel(),
                               EEGRandomCrop(crop_length=200*60), # 1 minutes
                               EEGNormalizePerSignal(),
                               EEGToTensor()])

train_dataset = EEGDataset(root_path, metadata_train, composed)
val_dataset = EEGDataset(root_path, metadata_val, composed)
test_dataset = EEGDataset(root_path, metadata_test, composed)

print(train_dataset[0]['signal'].shape)
print(train_dataset[0])

print()
print('-' * 100)
print()

print(val_dataset[0]['signal'].shape)
print(val_dataset[0])

print()
print('-' * 100)
print()

print(test_dataset[0]['signal'].shape)
print(test_dataset[0])

#### Data loader test

In [None]:
print('Current PyTorch device:', device)
if device.type == 'cuda':
    num_workers = 0 # A number other than 0 causes an error
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

train_loader = DataLoader(train_dataset, 
                          batch_size=32, 
                          shuffle=True, 
                          drop_last=True,
                          num_workers=num_workers, 
                          pin_memory=pin_memory,
                          collate_fn=eeg_collate_fn)

for i_batch, sample_batched in enumerate(train_loader):
    sample_batched['signal'].to(device)
    sample_batched['age'].to(device)
    sample_batched['class_label'].to(device)
    
    print(i_batch, 
          sample_batched['signal'].shape, 
          sample_batched['age'].shape, 
          sample_batched['class_label'].shape, 
          len(sample_batched['metadata']))
    
    if i_batch > 3:
        break

#### Train, validation, test dataloaders

In [None]:
train_loader = DataLoader(train_dataset, 
                          batch_size=32, 
                          shuffle=True, 
                          drop_last=True,
                          num_workers=num_workers, 
                          pin_memory=pin_memory,
                          collate_fn=eeg_collate_fn)

val_loader = DataLoader(val_dataset, 
                        batch_size=32, 
                        shuffle=False, 
                        drop_last=False,
                        num_workers=num_workers, 
                        pin_memory=pin_memory,
                        collate_fn=eeg_collate_fn)

test_loader = DataLoader(test_dataset, 
                         batch_size=32, 
                         shuffle=False, 
                         drop_last=False,
                         num_workers=num_workers, 
                         pin_memory=pin_memory,
                         collate_fn=eeg_collate_fn)

In [None]:
for batch_i, sample_batched in enumerate(train_loader):
    # pull up the batch data
    x = sample_batched['signal'].to(device)
    age = sample_batched['age'].to(device)
    target = sample_batched['class_label'].to(device)
    
    print(x)
    print(age)
    print(target)
    
    break