# EEG Data Curation Part 2

`01_Data_Curation1` 노트북에서 생성한 메타데이터를 불러들여, 이후 학습이 가능하도록 정리하는 노트북.

전반적인 구성은 `01_Data_Curation2`를 따르되, 하나의 긴 EEG 파일을 여러 개의 Segment로 쪼개서 저장한다.

-----

## 환경 구성

In [1]:
# for auto-reloading external modules
# (lets edits to imported project modules take effect without restarting the kernel)
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load some packages
import os
import re
import copy
import glob
from openpyxl import load_workbook, Workbook, styles
import json

import numpy as np
import pyedflib
import datetime
from dateutil.relativedelta import relativedelta

import pprint
import warnings
import ctypes
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# custom package
from utils.eeg_dataset import MultiLabel

In [3]:
# Other settings
# Render matplotlib figures inline in the notebook, at high-DPI resolution.
%matplotlib inline
%config InlineBackend.figure_format = 'retina' # cleaner text

plt.style.use('default') 
# Alternative style names kept for reference (presumably the output of plt.style.available):
# ['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 
#  'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 
#  'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 
#  'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 
#  'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']

# Draw images without smoothing between pixels.
plt.rcParams['image.interpolation'] = 'nearest'
# NOTE(review): the NanumGothic font must be installed on the machine for this to take effect.
plt.rcParams["font.family"] = 'NanumGothic' # for Hangul in Windows

-----

## 메타데이터 불러오기

In [4]:
# Data file path
# root_path: root directory of the dataset; every other path is joined onto it.
# original_folder: sub-folder holding the source '<edfname>.edf' files and
#                  their '<edfname>.xlsx' event sheets.
# curated_folder: sub-folder that receives the segmented .npy files and the
#                 metadata JSON/XLSX files written by this notebook.
root_path = r'H:\Lab Database\2020 EEG\201020'
original_folder = r'01_Original_Data'
curated_folder = r'02_Curated_Segment_Data'

In [5]:
# Read the 'metadata' sheet of the DB list workbook into a list of dicts,
# one dict per usable recording.
meta_file = os.path.join(root_path, r'00_Data_Information/210531_DB_list.xlsx')
ws = load_workbook(meta_file, data_only=True)['metadata']

metadata = []

# Sheet columns 1-4, in order, become these dictionary keys.
columns = ('edfname', 'dx1', 'birth', 'anomaly')

# Row 1 is skipped (presumably a header row); data rows are walked from row 2
# until the first row whose file-name cell is empty.
num = 2
while True:
    m = {key: ws.cell(row=num, column=col).value
         for col, key in enumerate(columns, start=1)}
    # any content in the 4th column marks the row as anomalous
    m['anomaly'] = m['anomaly'] is not None
    num += 1

    # an empty file name is the end-of-data condition
    if m['edfname'] is None:
        break

    # rows flagged as anomalous are excluded from the dataset
    if not m['anomaly']:
        metadata.append(m)

print('Size:', len(metadata))
print()
print('Loaded metadata (only 3 displayed):')
print(json.dumps(metadata[:3], indent=4))

Size: 1387

Loaded metadata (only 3 displayed):
[
    {
        "edfname": "00001809_261018",
        "dx1": "mci_rf",
        "birth": 400602,
        "anomaly": false
    },
    {
        "edfname": "00029426_020817",
        "dx1": "smi",
        "birth": 601204,
        "anomaly": false
    },
    {
        "edfname": "00047327_090718",
        "dx1": "vascular mci",
        "birth": 241019,
        "anomaly": false
    }
]


-----

## EDF 파일 구성 확인 및 정리

### EDF Header Label 구조 확인

In [6]:
# Load the first recording and print its metadata row, the EDF file header
# and the per-channel signal headers.
m = metadata[0]
edf_file = os.path.join(root_path, original_folder, m['edfname'] + '.edf')
signals, signal_headers, edf_header  = pyedflib.highlevel.read_edf(edf_file)

# Keep this recording's signal headers as the reference layout; the export
# cell later skips any file whose signal headers differ from it.
refer_headers = signal_headers

pprint.pp(m)
print()
pprint.pp(edf_header)
print()
pprint.pp(signal_headers)

{'edfname': '00001809_261018',
 'dx1': 'mci_rf',
 'birth': 400602,
 'anomaly': False}

{'technician': '',
 'recording_additional': '',
 'patientname': '',
 'patient_additional': '',
 'patientcode': '',
 'equipment': '',
 'admincode': '',
 'gender': '',
 'startdate': datetime.datetime(2018, 10, 26, 15, 46, 26),
 'birthdate': '',
 'annotations': []}

[{'label': 'Fp1-AVG',
  'dimension': 'uV',
  'sample_rate': 200.0,
  'physical_max': 32767.0,
  'physical_min': -32768.0,
  'digital_max': 32767,
  'digital_min': -32768,
  'prefilter': '',
  'transducer': 'E'},
 {'label': 'F3-AVG',
  'dimension': 'uV',
  'sample_rate': 200.0,
  'physical_max': 32767.0,
  'physical_min': -32768.0,
  'digital_max': 32767,
  'digital_min': -32768,
  'prefilter': '',
  'transducer': 'E'},
 {'label': 'C3-AVG',
  'dimension': 'uV',
  'sample_rate': 200.0,
  'physical_max': 32767.0,
  'physical_min': -32768.0,
  'digital_max': 32767,
  'digital_min': -32768,
  'prefilter': '',
  'transducer': 'E'},
 {'label': 'P3-

In [7]:
# Reduce every signal header dict to its channel label for a compact overview.
# NOTE: this rebinds `signal_headers` to a new list of strings; `refer_headers`
# still refers to the original list of header dicts.
signal_headers = [s_h['label'] for s_h in signal_headers]
print(signal_headers)

['Fp1-AVG', 'F3-AVG', 'C3-AVG', 'P3-AVG', 'O1-AVG', 'Fp2-AVG', 'F4-AVG', 'C4-AVG', 'P4-AVG', 'O2-AVG', 'F7-AVG', 'T3-AVG', 'T5-AVG', 'F8-AVG', 'T4-AVG', 'T6-AVG', 'FZ-AVG', 'CZ-AVG', 'PZ-AVG', 'EKG', 'Photic']


### EDF 신호열의 Trailing Zero Columns 제거 테스트

In [8]:
def trim_zero_columns(a):
    """Strip the trailing all-zero columns from an array.

    A "column" is a slice along the last axis (``a[..., k]``). Columns are
    examined from the end; everything after the last column that contains a
    non-zero value is dropped. Zero columns in the middle are kept.

    Args:
        a: ``np.ndarray`` with at least one dimension.

    Returns:
        A view ``a[..., :keep]``. The input is returned at full length when
        its last column is already non-zero, and with a zero-length last axis
        when every column is zero.

    Raises:
        TypeError: if ``a`` is not a NumPy array.
    """
    if not isinstance(a, np.ndarray):
        raise TypeError(f'trim_zero_columns() expects np.ndarray, got {type(a)}')

    # Scan backwards so the work is proportional to the number of trailing
    # zero columns, not to the size of the whole array.
    keep = 0
    for end in range(a.shape[-1], 0, -1):
        if np.any(a[..., end - 1] != 0):
            keep = end
            break

    # Slice with an explicit end index: the earlier `a[..., :-trim]` form
    # collapses to an empty array when there is nothing to trim (`:-0`).
    return a[..., :keep]

In [9]:
# Sanity check on the first 12 recordings: print the file name, the recording
# start time and the end time implied by the trimmed signal length
# (sample count / 200 samples per second).
for (i, m) in enumerate(metadata[:12]):
    edf_file = os.path.join(root_path, original_folder, m['edfname'] + '.edf')
    signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(edf_file)
    signals = trim_zero_columns(signals)

    started = edf_header['startdate']
    finished = started + datetime.timedelta(seconds=signals.shape[1] / 200)
    print(f"{m['edfname']}\t\t{started}\t{finished}\t")

00001809_261018		2018-10-26 15:46:26	2018-10-26 15:58:31	
00029426_020817		2017-08-02 16:14:56	2017-08-02 16:32:14	
00047327_090718		2018-07-09 15:29:10	2018-07-09 15:40:00	
00048377_070819		2019-08-07 13:55:25	2019-08-07 14:09:26	
00048377_070916		2016-09-07 10:36:01	2016-09-07 10:51:16	
00048377_250719		2019-07-25 09:58:07	2019-07-25 10:12:04	
00050941_130116		2016-01-13 13:45:54	2016-01-13 14:00:54	
00055757_170414		2014-04-17 13:33:15	2014-04-17 13:41:33	
00056270_260219		2019-02-26 10:38:59	2019-02-26 10:56:33	
00062072_080319		2019-03-08 13:38:26	2019-03-08 13:55:08	
00088513_120717		2017-07-12 14:15:27	2017-07-12 14:30:43	
00110793_210917		2017-09-21 14:25:09	2017-09-21 14:37:51	


## 데이터셋 구성 및 저장

1. EDF / Signal Header Label 불일치 데이터 제거
2. EDF $\rightarrow$ NumPy Pickle 변환하여 저장
    - 하나의 EDF 파일을 30초 길이의 Segment 여러 개로 잘라서 저장
3. Metadata 정리 후 저장
    - `metadata_debug`: 디버깅을 위해 모든 정보를 총망라한 버전
    - `metadata_public`: 익명화까지 수행하여 향후 공개가 가능한 버전

In [10]:
def birth_to_datetime(b):
    """Convert a YYMMDD birth code into a ``datetime.date``.

    The two-digit year is always mapped to 19YY (``YY + 1900``).

    Args:
        b: birth code as an ``int`` (e.g. ``400602``) or a numeric ``str``
            (e.g. ``'400602'``); ``None`` is passed through.

    Returns:
        The corresponding ``datetime.date``, or ``None`` when ``b`` is
        ``None``, is of an unsupported type, or cannot be interpreted as a
        valid date (a warning is printed in the last case).
    """
    if b is None:
        return None

    # Only plain integers and strings are interpreted; any other type
    # (bool included) quietly yields None.
    if isinstance(b, bool) or not isinstance(b, (int, str)):
        return None

    try:
        code = int(b)
        year = 1900 + code // 10000
        month = (code % 10000) // 100
        day = code % 100
        return datetime.date(year, month, day)
    except (ValueError, OverflowError):
        # Non-numeric text or an impossible calendar date. Only these errors
        # are handled, so KeyboardInterrupt and the like still propagate.
        print(f'WARNING - Input to birth_to_datetime() is uninterpretable: {type(b)}, {b}')
    return None


def calculate_age(birth, record):
    """Return the age in completed years at the time of recording.

    The age is the difference of the calendar years, minus one when the
    birthday has not yet been reached in the recording year. Comparing
    (month, day) pairs directly avoids the month roll-over error of
    subtracting the birth date's year/month/day fields from the recording
    date, which under-counts by one year around the birthday.

    Args:
        birth: ``datetime.date`` (or ``datetime.datetime``) of birth, or ``None``.
        record: ``datetime.date`` or ``datetime.datetime`` of the recording.

    Returns:
        The age as an ``int``, or ``None`` when ``birth`` is ``None``, the
        inputs are not date-like, or the recording precedes the birth date.
        A warning is printed for ages outside the 40-100 range, but the value
        is still returned.
    """
    if birth is None:
        return None

    try:
        birthday_reached = (record.month, record.day) >= (birth.month, birth.day)
        age = record.year - birth.year - (0 if birthday_reached else 1)
    except (AttributeError, TypeError) as e:
        print(f'WARNING - calculate_age() has an exception: {e}')
        return None

    if age < 0:
        # a recording dated before the birth date cannot be a valid pair
        print(f'WARNING - calculate_age() has an exception: '
              f'record {record} precedes birth {birth}')
        return None

    if age < 40 or 100 < age:
        print(f'WARNING - calculate_age() generated an unordinary age: {age}')
    return age


def serialize_json(obj):
    """Fallback encoder meant to be passed as ``default=`` to ``json.dump``.

    Conversion rules, checked in this order:
      * ``datetime.datetime`` / ``datetime.date`` -> ISO-8601 string
      * ``MultiLabel`` -> the value of its ``get_true_keys()``
      * anything else -> the object's ``__dict__``
    """
    if isinstance(obj, (datetime.datetime, datetime.date)):
        return obj.isoformat()

    if isinstance(obj, MultiLabel):
        return obj.get_true_keys()

    return obj.__dict__

In [11]:
# Split every recording into fixed-length segments, save each segment as a
# .npy file and collect the matching metadata records.
sample_rate = 200  # Hz; the value this notebook assumes for every recording
segment_length = sample_rate * 30  # samples per segment = 30 seconds

# Ask for confirmation before wiping previously curated files.
# NOTE: ctypes.windll exists only on Windows, so this cell is Windows-only.
# MessageBoxW is the 4-argument variant (MessageBoxExW expects an extra
# language id). Flag 4 shows Yes/No buttons; return value 6 means "Yes".
text = f'Delete ALL files in {os.path.join(root_path, curated_folder)}?'
if ctypes.windll.user32.MessageBoxW(0, text, 'Question', 4) == 6: # Yes
    for f in glob.glob(os.path.join(root_path, curated_folder, '*.*')):
        os.remove(f)

metadata_debug = []
metadata_public = []

# Silence warnings only while the conversion runs. The context manager puts
# the previous warning filters back even if the loop is interrupted or raises.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for m in tqdm(metadata):
        # EDF file check
        edf_file = os.path.join(root_path, original_folder, m['edfname'] + '.edf')
        signals, signal_headers, edf_header = pyedflib.highlevel.read_edf(edf_file)
        signals = trim_zero_columns(signals)

        # skip recordings whose signal headers differ from the reference file
        if refer_headers != signal_headers:
            print('- Signal header differs from the majority:', m['edfname'])
            continue

        # EDF recording events: first sheet of the companion workbook,
        # column 3 = time text, column 4 = event text, starting at row 2
        event_file = os.path.join(root_path, original_folder, m['edfname'] + '.xlsx')
        wb = load_workbook(event_file, data_only=True)
        ws = wb[wb.sheetnames[0]]

        num = 2
        events = []

        while True:
            t = ws.cell(row=num, column=3).value
            e = ws.cell(row=num, column=4).value

            # an empty time cell ends the event list
            if t is None:
                break

            # The sheet stores only the time of day; prefix the recording date.
            # NOTE(review): the format string implies the cell text begins
            # with whitespace (' HH:MM:SS.ffffff') - confirm against the data.
            t = edf_header['startdate'].strftime('%Y%m%d') + t
            t = datetime.datetime.strptime(t, '%Y%m%d %H:%M:%S.%f')

            # Event times are measured relative to the FIRST listed event.
            # NOTE(review): presumably that row marks the recording start - confirm.
            if num == 2:
                startTime = t

            # convert the elapsed time into a sample index
            t = int(np.floor((t - startTime).total_seconds() * sample_rate))
            events.append((t, e))
            num += 1

        # birth date and age are per recording, so compute them once
        birth = birth_to_datetime(m['birth'])
        age = calculate_age(birth, edf_header['startdate'])

        # Only whole segments are kept; leftover samples at the end are dropped.
        for s in range(signals.shape[1] // segment_length):
            # sample range [r1, r2) covered by this segment
            r1, r2 = (segment_length * s, segment_length * (s + 1))

            # events inside the segment, re-based to the segment start
            events_segment = [(t - r1, e) for (t, e) in events if r1 <= t < r2]

            # metadata_debug
            m2 = {}
            m2['serial'] = f'{len(metadata_debug) + 1:05}'
            m2['edfname'] = m['edfname']
            m2['birth'] = birth
            m2['record'] = edf_header['startdate']
            m2['age'] = age
            m2['dx1'] = m['dx1']
            m2['label'] = MultiLabel.load_from_string(m['dx1'])
            m2['events'] = events_segment
            metadata_debug.append(m2)

            # metadata_public: serial, age and label only
            m3 = {}
            m3['serial'] = m2['serial']
            m3['age'] = age
            m3['label'] = m2['label']
            metadata_public.append(m3)

            # EDF signal: np.save appends '.npy' to the serial-number file name
            fname = os.path.join(root_path, curated_folder, m2['serial'])
            signal_segment = signals[:, r1:r2]
            np.save(fname, signal_segment.astype('float32'))

print('Done.')
print()
print(f'Among {len(metadata)}, {len(metadata_public)} data were saved.')

  0%|          | 0/1387 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Show the first five debug records as a quick check of the export cell's result.
pprint.pp(metadata_debug[:5])

In [None]:
# Show the first five public records (serial, age and label only).
pprint.pp(metadata_public[:5])

In [None]:
# save metadata_public as JSON
path = os.path.join(root_path, curated_folder, 'metadata_public.json')
with open(path, 'w') as json_file:
    json.dump(metadata_public, json_file, indent=4, default=serialize_json)

# save metadata_public as XLSX
# Both public files are built from metadata_public itself, so the spreadsheet
# cannot drift from the JSON file or pick up debug-only fields.
wb = Workbook()
ws = wb.active
ws.title = 'metadata'
ws.cell(row=1, column=1).value = 'serial'
ws.cell(row=1, column=2).value = 'age'

# Label column titles come from the first record; an empty dataset simply
# produces a sheet holding the two fixed headers.
label_types = metadata_public[0]['label'].get_label_types() if metadata_public else []
for (i, label) in enumerate(label_types):
    ws.cell(row=1, column=3 + i).value = label

# alternating row backgrounds, created once and reused for every cell
row_fills = [
    styles.PatternFill(start_color=color, end_color=color, fill_type="solid")
    for color in ('FDFDD0', 'D9E5FF')
]

for (i, m) in enumerate(metadata_public):
    ws.cell(row=2 + i, column=1).value = m['serial']
    ws.cell(row=2 + i, column=2).value = m['age']
    for (k, label) in enumerate(m['label'].get_label_values()):
        # leave the cell empty for labels that are False
        ws.cell(row=2 + i, column=3 + k).value = label if label is not False else None

    # coloring: the two fixed columns plus one column per label
    fill = row_fills[i % 2]
    for rows in ws.iter_rows(min_row=2 + i, max_row=2 + i, min_col=1, max_col=2 + m['label'].get_size()):
        for cell in rows:
            cell.fill = fill

path = os.path.join(root_path, curated_folder, 'metadata_public.xlsx')
wb.save(path)

In [None]:
# Write the full (non-anonymized) debug metadata as JSON ...
path = os.path.join(root_path, curated_folder, 'metadata_debug.json')
with open(path, 'w') as json_file:
    json.dump(metadata_debug, json_file, indent=4, default=serialize_json)

# ... and mirror the same records in a spreadsheet.
wb = Workbook()
ws = wb.active
ws.title = 'metadata'

# (header text, record key) for the six fixed leading columns
fixed_columns = [
    ('serial', 'serial'),
    ('EDF file', 'edfname'),
    ('birth', 'birth'),
    ('record', 'record'),
    ('age', 'age'),
    ('dx1', 'dx1'),
]

# header row: fixed columns first, then one column per label type
for col, (title, _) in enumerate(fixed_columns, start=1):
    ws.cell(row=1, column=col).value = title
for offset, label_name in enumerate(metadata_debug[0]['label'].get_label_types()):
    ws.cell(row=1, column=7 + offset).value = label_name

# one spreadsheet row per record, starting at row 2
for index, entry in enumerate(metadata_debug):
    row = 2 + index
    for col, (_, key) in enumerate(fixed_columns, start=1):
        ws.cell(row=row, column=col).value = entry[key]
    for offset, flag in enumerate(entry['label'].get_label_values()):
        # labels that are False are left as empty cells
        ws.cell(row=row, column=7 + offset).value = None if flag is False else flag

    # alternate the row background across the six fixed columns and the label columns
    color = 'D9E5FF' if index % 2 else 'FDFDD0'
    last_col = 6 + entry['label'].get_size()
    for col in range(1, last_col + 1):
        ws.cell(row=row, column=col).fill = styles.PatternFill(
            start_color=color, end_color=color, fill_type="solid")

path = os.path.join(root_path, curated_folder, 'metadata_debug.xlsx')
wb.save(path)