# TUAB

This notebook explores the Temple University Hospital EEG Abnormal Corpus (`TUAB`) dataset and converts its recordings from the `EDF` format to `NumPy memmap` files for faster loading.

-----

## Configure environments

In [3]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Move the working directory up one level; the relative paths used later in
# this notebook (e.g. 'local/dataset/tuab/') are resolved from there.
%cd ..

C:\Users\Minjae\Desktop\EEG_Project


In [4]:
# Load some packages
import os
import glob

import math
import json
import pyedflib
import numpy as np

import pprint
from tqdm.auto import tqdm

# custom package
from datasets.temple_eeg_dataset import *
from datasets.temple_eeg_script import build_dataset_for_tuab_train
from datasets.pipeline import *

---
## `TUAB` dataset

In [3]:
# Data file path
# origin_path: root folder that is searched for the original TUAB '.edf' files
# (machine-specific absolute path -- adjust it for your own setup).
origin_path = r'H:\Other_DB\Temple_EEG\tuh_eeg_abnormal\v2.0.0\edf'
# desired_path: folder that receives the converted '.dat' files; it is a relative
# path, so it is resolved against the current working directory.
desired_path = r'local/dataset/tuab/'

### Age

In [None]:
# Scan the '.txt' files that sit next to the normal training recordings and
# count how many contain the substring 'year' (presumably a proxy for an age
# statement -- confirm). Files without it are printed in full for inspection.
report_pattern = os.path.join(origin_path, 'train/normal/01_tcp_ar/*/*/*/*.txt')
text_files = glob.glob(report_pattern)

count = 0
for report_path in text_files:
    with open(report_path, 'rt', encoding='UTF-8') as report:
        text_script = report.read()

    if 'year' not in text_script:
        print(text_script)
        print('-----' * 4)
        continue

    count += 1

# Number of files containing 'year' versus the total number of files scanned.
print(count, len(text_files))

## File Conversion to MEMMAP

In [4]:
# Convert every TUAB EDF recording found under `origin_path` into a raw float32
# file ('.dat') that can be opened with np.memmap. Each output stores the
# channels listed in `ref_headers`, one per row and in that order, under
# `desired_path/<split>/<pathology>/`.
#
# NOTE(review): the `train_loader.dataset[0]` output shown later in this notebook
# has magnitudes around 1e9, which looks like float32 bytes being read back as
# int32 -- confirm that the loader's memmap dtype matches the float32 written here.
ref_headers = ['EEG FP1-REF', 'EEG FP2-REF', 'EEG F3-REF', 'EEG F4-REF',
               'EEG C3-REF', 'EEG C4-REF', 'EEG P3-REF', 'EEG P4-REF',
               'EEG O1-REF', 'EEG O2-REF', 'EEG F7-REF', 'EEG F8-REF',
               'EEG T3-REF', 'EEG T4-REF', 'EEG T5-REF', 'EEG T6-REF',
               'EEG A1-REF', 'EEG A2-REF', 'EEG FZ-REF', 'EEG CZ-REF',
               'EEG PZ-REF', 'EEG T1-REF', 'EEG T2-REF', 'EEG EKG1-REF']
C = len(ref_headers)

# Occurrence count of every signal label seen in the converted files
# (all labels, not only the ones in `ref_headers`).
signal_labels_dict = dict()

for edf_file in tqdm(glob.glob(os.path.join(origin_path, '*/*/*/*/*/*/*.edf'))):
    # These three recordings are skipped by file name (reason not recorded here).
    if os.path.basename(edf_file) in ['00006563_s010_t000.edf',
                                      '00006904_s004_t000.edf',
                                      '00006904_s005_t000.edf']:
        continue

    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(edf_file)

    for sh in signal_headers:
        signal_labels_dict[sh['label']] = signal_labels_dict.get(sh['label'], 0) + 1

    # Position of each label inside this file. If a label is repeated, the last
    # occurrence wins, which matches a linear scan that keeps overwriting.
    label_to_idx = {sh['label']: kk for kk, sh in enumerate(signal_headers)}

    # Re-order the wanted channels into a (C, n_samples) float32 array.
    signal_array = np.zeros((C, *signals[0].shape), dtype=np.float32)
    for k, ch in enumerate(ref_headers):
        idx = label_to_idx.get(ch, -1)
        if idx < 0:
            raise ValueError(f"{edf_file} \n\t does not have {ch} label.")
        signal_array[k] = signals[idx]
    signals = signal_array

    # The first two folders below `origin_path` are the split (train / eval) and
    # the pathology (normal / abnormal). Deriving them from the relative path
    # avoids hard-coding the Windows separator and searching for the text 'edf'.
    rel_parts = os.path.relpath(edf_file, origin_path).split(os.sep)
    train_or_eval, pathology = rel_parts[0], rel_parts[1]

    memmap_file = os.path.join(desired_path, train_or_eval, pathology,
                               os.path.splitext(os.path.basename(edf_file))[0] + '.dat')
    os.makedirs(os.path.dirname(memmap_file), exist_ok=True)

    fp = np.memmap(memmap_file,
                   dtype='float32', mode='w+', shape=signals.shape)
    fp[:] = signals[:]
    fp.flush()
    del fp  # release the mapping before moving on to the next file

print(signal_labels_dict)

  0%|          | 0/2993 [00:00<?, ?it/s]

{'EEG FP1-REF': 2990, 'EEG FP2-REF': 2990, 'EEG F3-REF': 2990, 'EEG F4-REF': 2990, 'EEG C3-REF': 2990, 'EEG C4-REF': 2990, 'EEG P3-REF': 2990, 'EEG P4-REF': 2990, 'EEG O1-REF': 2990, 'EEG O2-REF': 2990, 'EEG F7-REF': 2990, 'EEG F8-REF': 2990, 'EEG T3-REF': 2990, 'EEG T4-REF': 2990, 'EEG T5-REF': 2990, 'EEG T6-REF': 2990, 'EEG A1-REF': 2990, 'EEG A2-REF': 2990, 'EEG FZ-REF': 2990, 'EEG CZ-REF': 2990, 'EEG PZ-REF': 2990, 'EEG ROC-REF': 2786, 'EEG LOC-REF': 2786, 'EEG EKG1-REF': 2990, 'EEG T1-REF': 2990, 'EEG T2-REF': 2990, 'PHOTIC-REF': 2836, 'IBI': 2990, 'BURSTS': 2990, 'SUPPR': 2990, 'EMG-REF': 1814, 'EEG 26-REF': 1663, 'EEG 27-REF': 1638, 'EEG 28-REF': 1638, 'EEG 29-REF': 1638, 'EEG 30-REF': 1638, 'EEG 31-REF': 111, 'EEG 32-REF': 111, 'EEG C3P-REF': 111, 'EEG C4P-REF': 111, 'EEG SP1-REF': 105, 'EEG SP2-REF': 105, 'EEG PG1-REF': 2, 'EEG PG2-REF': 2}


In [None]:
# Element-wise comparison of two converted files, both opened read-only as int32.
dat_with_suffix = np.memmap(
    r"C:\Users\Minjae\Desktop\EEG_Project\local\dataset\tuab\eval\abnormal\00000768_s003_t000_00.dat",
    dtype="int32", mode="r")
dat_without_suffix = np.memmap(
    r"C:\Users\Minjae\Desktop\EEG_Project\local\dataset\tuab\eval\abnormal\00000768_s003_t000.dat",
    dtype="int32", mode="r")
dat_with_suffix == dat_without_suffix

In [None]:
import os
import glob
import pyedflib
import numpy as np
from tqdm import tqdm

# Variant of the EDF -> '.dat' conversion that writes each array with
# ndarray.tofile. Every output stores the channels listed in `ref_headers`
# as float32, one per row and in that order.
ref_headers = ['EEG FP1-REF', 'EEG FP2-REF', 'EEG F3-REF', 'EEG F4-REF',
               'EEG C3-REF', 'EEG C4-REF', 'EEG P3-REF', 'EEG P4-REF',
               'EEG O1-REF', 'EEG O2-REF', 'EEG F7-REF', 'EEG F8-REF',
               'EEG T3-REF', 'EEG T4-REF', 'EEG T5-REF', 'EEG T6-REF',
               'EEG A1-REF', 'EEG A2-REF', 'EEG FZ-REF', 'EEG CZ-REF',
               'EEG PZ-REF', 'EEG T1-REF', 'EEG T2-REF', 'EEG EKG1-REF']

# Output row of every reference channel. It does not depend on the file being
# converted, so it is built once instead of once per iteration.
label_indices = {label: row for row, label in enumerate(ref_headers)}

# Occurrence count of every signal label seen in the converted files.
signal_labels_dict = {}

for edf_file in tqdm(glob.glob(os.path.join(origin_path, '*/*/*/*/*/*/*.edf'))):
    # These three recordings are skipped by file name (reason not recorded here).
    if os.path.basename(edf_file) in ['00006563_s010_t000.edf',
                                      '00006904_s004_t000.edf',
                                      '00006904_s005_t000.edf']:
        continue

    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(edf_file)

    for sh in signal_headers:
        signal_labels_dict[sh['label']] = signal_labels_dict.get(sh['label'], 0) + 1

    signal_array = np.zeros((len(ref_headers), *signals[0].shape), dtype=np.float32)

    # Copy each wanted channel into its row and remember which ones were found.
    found_labels = set()
    for kk, sh in enumerate(signal_headers):
        idx = label_indices.get(sh['label'], -1)
        if idx >= 0:
            signal_array[idx] = signals[kk]
            found_labels.add(sh['label'])

    # A missing reference channel would otherwise be stored as an all-zero row
    # without any warning; fail loudly instead, as the memmap-based cell does.
    for ch in ref_headers:
        if ch not in found_labels:
            raise ValueError(f"{edf_file} \n\t does not have {ch} label.")

    # The first two folders below `origin_path` are the split (train / eval) and
    # the pathology (normal / abnormal). Deriving them from the relative path
    # avoids hard-coding the Windows separator and searching for the text 'edf'.
    rel_parts = os.path.relpath(edf_file, origin_path).split(os.sep)
    train_or_eval, pathology = rel_parts[0], rel_parts[1]

    memmap_file = os.path.join(desired_path, train_or_eval, pathology,
                               os.path.splitext(os.path.basename(edf_file))[0] + '.dat')
    os.makedirs(os.path.dirname(memmap_file), exist_ok=True)
    signal_array.tofile(memmap_file)

print(signal_labels_dict)


---
## Dataset and loader

In [5]:
# Settings passed to the TUAB dataset builder, which returns the train,
# validation, test and multi-crop test loaders.
config = {
    'dataset_path': './local/dataset/tuab/',
    'file_format': 'memmap',
    'seq_length': 2500,
    'EKG': 'O',
    'device': 'cpu',
    'minibatch': 256,
    'input_norm': 'dataset',
    'use_age': 'no',
}
(train_loader,
 val_loader,
 test_loader,
 multicrop_test_loader) = build_dataset_for_tuab_train(config, verbose=True)



 	==> run_mode is set to "train" automatically.


transform: Compose(
    EegRandomCrop(crop_length=2500, length_limit=10000000, multiple=1, latency=0, segment_simulation=False, return_timing=False, reject_events=False)
    EegToTensor()
)

----------------------------------------------------------------------------------------------------

transform_multicrop: Compose(
    EegRandomCrop(crop_length=2500, length_limit=10000000, multiple=8, latency=0, segment_simulation=False, return_timing=False, reject_events=False)
    EegToTensor()
)

----------------------------------------------------------------------------------------------------


task config:
{'class_label_to_name': ['Normal', 'Abnormal'],
 'class_name_to_label': {'Abnormal': 1, 'Normal': 0},
 'signal_header': ['EEG FP1-REF', 'EEG FP2-REF', 'EEG F3-REF', 'EEG F4-REF',
                   'EEG C3-REF', 'EEG C4-REF', 'EEG P3-REF', 'EEG P4-REF',
                   'EEG O1-REF', 'EEG O2-REF', 'EEG F7-REF', 'EEG F8-REF',
         

In [6]:
# Number of items in the train, validation and test datasets, in that order.
split_sizes = [len(loader.dataset) for loader in (train_loader, val_loader, test_loader)]
print(*split_sizes)

2441 273 276


In [7]:
# Show the first training sample to check its fields.
# NOTE(review): the 'signal' values in the saved output are around 1e9 in
# magnitude, which looks like float32 bytes reinterpreted as int32 -- confirm
# that the dtype used to read the '.dat' files matches the dtype they were written with.
train_loader.dataset[0]

{'serial': '00000016_s005_t001.dat',
 'full_path': './local/dataset/tuab/train/abnormal\\00000016_s005_t001.dat',
 'class_name': 'abnormal',
 'class_label': tensor(1),
 'signal': tensor([[-1.0486e+09, -1.0887e+09,  1.0948e+09,  ..., -1.0329e+09,
          -1.0339e+09, -1.0313e+09],
         [-1.0487e+09, -1.0455e+09, -1.0554e+09,  ..., -1.0356e+09,
          -1.0309e+09, -1.0319e+09],
         [-1.0508e+09, -1.0553e+09, -1.0609e+09,  ..., -1.0398e+09,
          -1.0431e+09, -1.0441e+09],
         ...,
         [-1.0655e+09, -1.0641e+09,  1.0833e+09,  ..., -1.0498e+09,
          -1.0524e+09, -1.0486e+09],
         [-1.0648e+09, -1.0651e+09, -1.0719e+09,  ...,  1.0853e+09,
           1.0762e+09,  1.0891e+09],
         [-1.0569e+09,  1.0782e+09,  1.0505e+09,  ...,  1.1082e+09,
           1.1049e+09,  1.1028e+09]]),
 'age': tensor(0.)}