# Temple EEG

This notebook looks around the Temple University Hospital EEG Abnormal Corpus (`TUAB`) dataset and convert the file format from the `EDF` to `NumPy memmap` for the speed-up.

-----

## Configure environments

In [1]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%cd ..

C:\Users\Minjae\Desktop\EEG_Project


In [2]:
# Load some packages
import os
import glob

import math
import json
import pyedflib
import numpy as np

import pprint
from tqdm.auto import tqdm

# custom package
from datasets.temple_eeg_dataset import *
from datasets.pipeline import *

In [3]:
# Data file path
origin_path = r'H:\Other_DB\Temple_EEG\tuh_eeg_abnormal\v2.0.0\edf'
desired_path = r'local/dataset/Temple_EEG/'

---
## `TUAB` dataset

### Age

In [None]:
count = 0
text_files = glob.glob(os.path.join(origin_path, 'train/normal/01_tcp_ar/*/*/*/*.txt'))
for filename in text_files:
    with open(filename, 'rt', encoding='UTF-8') as f:
        text_script = f.read()
        if 'year' in text_script:
            count += 1
        else:
            print(text_script)
            print('-----' * 4)
            
print(count, len(text_files)) 

## File Conversion to MEMMAP

In [None]:
ref_headers = ['EEG FP1-REF', 'EEG FP2-REF', 'EEG F3-REF', 'EEG F4-REF', 
               'EEG C3-REF', 'EEG C4-REF', 'EEG P3-REF', 'EEG P4-REF', 
               'EEG O1-REF', 'EEG O2-REF', 'EEG F7-REF', 'EEG F8-REF', 
               'EEG T3-REF', 'EEG T4-REF', 'EEG T5-REF', 'EEG T6-REF', 
               'EEG A1-REF', 'EEG A2-REF', 'EEG FZ-REF', 'EEG CZ-REF', 
               'EEG PZ-REF', 'EEG T1-REF', 'EEG T2-REF', 'EEG EKG1-REF']
C = len(ref_headers)

signal_labels_dict = dict()

for i, edf_file in enumerate(tqdm(glob.glob(os.path.join(origin_path, '*/*/*/*/*/*/*.edf')))):
    if i < 729:
        continue
        
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(edf_file)
    
    for sh in signal_headers:
        signal_labels_dict[sh['label']] = signal_labels_dict.get(sh['label'], 0) + 1

    signal_array = np.zeros((C, *signals[0].shape), dtype=np.float32)        
    
    for k, ch in enumerate(ref_headers):
        idx = -1
        for kk, sh in enumerate(signal_headers):
            if ch == sh['label']:
                idx = kk
        
        if idx < 0:
            raise ValueError(f"{edf_file} \n\t does not have {ch} label.")
        signal_array[k] = signals[idx]
    signals = signal_array
    
    train_or_eval = edf_file.split('edf')[1].split('01_tcp_ar')[0].split('\\')[1]
    pathology = edf_file.split('edf')[1].split('01_tcp_ar')[0].split('\\')[2]
    memmap_file = os.path.join(desired_path, train_or_eval, pathology, os.path.basename(edf_file).split('.edf')[0] + '.dat')
    os.makedirs(os.path.dirname(memmap_file), exist_ok=True)
    fp = np.memmap(memmap_file, 
                   dtype='float32', mode='w+', shape=signals.shape)
    fp[:] = signals[:]
    fp.flush()

print(signal_labels_dict)

  0%|          | 0/2993 [00:00<?, ?it/s]

In [7]:
signal_headers

[{'label': 'EEG FP1-REF',
  'dimension': 'uV',
  'sample_rate': 256.0,
  'physical_max': 5482.288,
  'physical_min': -5482.28,
  'digital_max': 32767,
  'digital_min': -32767,
  'prefilter': 'HP:0.000 Hz LP:0.0 Hz N:0.0',
  'transducer': 'Unknown'},
 {'label': 'EEG FP2-REF',
  'dimension': 'uV',
  'sample_rate': 256.0,
  'physical_max': 5482.288,
  'physical_min': -5482.28,
  'digital_max': 32767,
  'digital_min': -32767,
  'prefilter': 'HP:0.000 Hz LP:0.0 Hz N:0.0',
  'transducer': 'Unknown'},
 {'label': 'EEG F3-REF',
  'dimension': 'uV',
  'sample_rate': 256.0,
  'physical_max': 5482.288,
  'physical_min': -5482.28,
  'digital_max': 32767,
  'digital_min': -32767,
  'prefilter': 'HP:0.000 Hz LP:0.0 Hz N:0.0',
  'transducer': 'Unknown'},
 {'label': 'EEG F4-REF',
  'dimension': 'uV',
  'sample_rate': 256.0,
  'physical_max': 5482.288,
  'physical_min': -5482.28,
  'digital_max': 32767,
  'digital_min': -32767,
  'prefilter': 'HP:0.000 Hz LP:0.0 Hz N:0.0',
  'transducer': 'Unknown'},
 {

In [6]:
signal_labels_dict

{'EEG FP1-REF': 647,
 'EEG FP2-REF': 647,
 'EEG F3-REF': 647,
 'EEG F4-REF': 647,
 'EEG C3-REF': 647,
 'EEG C4-REF': 647,
 'EEG P3-REF': 647,
 'EEG P4-REF': 647,
 'EEG O1-REF': 647,
 'EEG O2-REF': 647,
 'EEG F7-REF': 647,
 'EEG F8-REF': 647,
 'EEG T3-REF': 647,
 'EEG T4-REF': 647,
 'EEG T5-REF': 647,
 'EEG T6-REF': 647,
 'EEG A1-REF': 647,
 'EEG A2-REF': 647,
 'EEG FZ-REF': 647,
 'EEG CZ-REF': 647,
 'EEG PZ-REF': 647,
 'EEG ROC-REF': 616,
 'EEG LOC-REF': 616,
 'EEG EKG1-REF': 646,
 'EEG T1-REF': 646,
 'EEG T2-REF': 646,
 'PHOTIC-REF': 622,
 'IBI': 647,
 'BURSTS': 647,
 'SUPPR': 647,
 'EMG-REF': 466,
 'EEG 26-REF': 442,
 'EEG 27-REF': 438,
 'EEG 28-REF': 438,
 'EEG 29-REF': 438,
 'EEG 30-REF': 438,
 'EEG 31-REF': 17,
 'EEG 32-REF': 17,
 'EEG C3P-REF': 18,
 'EEG C4P-REF': 18,
 'EEG SP1-REF': 16,
 'EEG SP2-REF': 16,
 'EEG OZ-REF': 1,
 'ECG EKG-REF': 1,
 'PULSE RATE': 1}

---
## Simple preprocessing with converting

#### Test trailing zero signals trimming

In [None]:
for i, f in enumerate(glob.glob(os.path.join(curate_path, 'signal/*.edf'))):
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(f)
    signals = trim_trailing_zeros(signals)  # trim garbage zeros
    print(f, end='\t\t')
    print(edf_header['startdate'], end='\t')
    print(edf_header['startdate'] + datetime.timedelta(seconds = signals.shape[1] / 200), end='\t')
    print()
    
    if i > 10:
        break

---
## Convert and Save

In [None]:
save_feather = True
save_memmap = True

In [None]:
if save_feather:
    os.makedirs(os.path.join(curate_path, 'signal/feather'), exist_ok=True)

if save_memmap:
    os.makedirs(os.path.join(curate_path, 'signal/memmap'), exist_ok=True)

for f in tqdm(glob.glob(os.path.join(curate_path, 'signal/*.edf'))):
    # file name
    serial = f.split('.edf')[0][-5:]
    
    # load signal
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(f)
    signals = trim_trailing_zeros(signals)
    signals = signals.astype('int32')
    
    # save as feather
    if save_feather:
        df = pd.DataFrame(data=signals.T, columns=[s_h['label'] for s_h in signal_headers], dtype=np.int32)
        feather.write_feather(df, os.path.join(curate_path, 'signal/feather', serial + '.feather'))

    # save as numpy memmap
    if save_memmap:
        fp = np.memmap(os.path.join(curate_path, 'signal/memmap', serial + '.dat'), 
                       dtype='int32', mode='w+', shape=signals.shape)
        fp[:] = signals[:]
        fp.flush()