# Benchmark Construction

This notebook organizes the standard benchmark of our `CAUEEG` dataset using the previously generated signal, annotation, and event files.

-----

## Environment Setup

In [1]:
# Enable IPython's autoreload extension so that edits to imported external
# modules are picked up live, without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Load some packages
import os
import glob
import json
import pprint

import numpy as np
import random

# custom package
from datasets.caueeg_dataset import *
from datasets.pipeline import *

In [3]:
# Data file path: directory of the curated dataset. 'annotation.json' is read
# from here and the generated task JSON files are written back into it.
data_path = r'local/dataset/02_Curated_Data_220419/'

In [4]:
# Read the dataset-level annotation stored alongside the curated data.
anno_path = os.path.join(data_path, 'annotation.json')
with open(anno_path, 'r') as json_file:
    annotation = json.load(json_file)

# Preview every top-level field, truncating 'data' to its first five records.
annotation_preview = {}
for field, content in annotation.items():
    annotation_preview[field] = content[:5] if field == 'data' else content
pprint.pprint(annotation_preview, width=250)

{'data': [{'age': 78, 'serial': '00001', 'symptom': ['mci', 'mci_amnestic', 'mci_amnestic_rf']},
          {'age': 56, 'serial': '00002', 'symptom': ['normal', 'smi']},
          {'age': 93, 'serial': '00003', 'symptom': ['mci', 'mci_vascular']},
          {'age': 78, 'serial': '00004', 'symptom': ['dementia', 'ad', 'load']},
          {'age': 75, 'serial': '00005', 'symptom': ['mci', 'mci_amnestic', 'mci_amnestic_ef', 'mci_multi_domain']}],
 'dataset_name': 'CAUEEG dataset',
 'signal_header': ['Fp1-AVG', 'F3-AVG', 'C3-AVG', 'P3-AVG', 'O1-AVG', 'Fp2-AVG', 'F4-AVG', 'C4-AVG', 'P4-AVG', 'O2-AVG', 'F7-AVG', 'T3-AVG', 'T5-AVG', 'F8-AVG', 'T4-AVG', 'T6-AVG', 'FZ-AVG', 'CZ-AVG', 'PZ-AVG', 'EKG', 'Photic']}


## Helper functions

In [5]:
def shuffle_splitted_metadata(splitted_metadata, class_label_to_name, ratios, seed=None, verbose=False):
    """Build shuffled train/validation/test lists from per-class metadata groups.

    Every class group is shuffled and cut on its own, so each class contributes
    about the same fraction of its samples to each subset. The three resulting
    subsets are then shuffled once more so that classes are interleaved.

    Args:
        splitted_metadata: Iterable of lists, one list of metadata dicts per
            class. The lists themselves are left untouched; the returned lists
            reference the same dict objects.
        class_label_to_name: Sequence of class names. Only its length is used,
            to size the per-class counts printed when ``verbose`` is true.
        ratios: Indexable holding the train fraction at ``[0]`` and the
            validation fraction at ``[1]``; the test subset receives whatever
            remains of each group.
        seed: Seed for the shuffling. ``None`` seeds from system entropy, so
            the result differs between calls.
        verbose: When true, print the per-class sample counts of each subset.

    Returns:
        Tuple ``(metadata_train, metadata_val, metadata_test)`` of lists.
    """
    # A private generator produces the same sequence as seeding the
    # module-level one, but does not disturb (or afterwards re-randomize)
    # the global `random` state that callers may rely on.
    rng = random.Random(seed)

    metadata_train = []
    metadata_val = []
    metadata_test = []

    for split in splitted_metadata:
        # Shuffle a copy so the caller's groups keep their order and a re-run
        # on the same input with the same seed gives the same split.
        shuffled = list(split)
        rng.shuffle(shuffled)

        n1 = round(len(shuffled) * ratios[0])
        n2 = n1 + round(len(shuffled) * ratios[1])

        metadata_train.extend(shuffled[:n1])
        metadata_val.extend(shuffled[n1:n2])
        metadata_test.extend(shuffled[n2:])

    # Interleave the classes inside each subset.
    for subset in (metadata_train, metadata_val, metadata_test):
        rng.shuffle(subset)

    if verbose:
        named_subsets = (('Train', metadata_train),
                         ('Validation', metadata_val),
                         ('Test', metadata_test))
        for title, subset in named_subsets:
            # Plain ints keep the printed list readable regardless of how the
            # installed NumPy version renders its scalar types.
            class_dist = [sum(1 for m in subset if m['class_label'] == i)
                          for i in range(len(class_label_to_name))]
            print(f'<{title:^15}> data label distribution\t:', class_dist, '=', sum(class_dist))

    return metadata_train, metadata_val, metadata_test

-----

## Main Task 1: Classification of Three Symptoms (Normal, MCI, Dementia).

#### Define the target diagnoses and split them by their symptoms

In [6]:
# One entry per target class: a record belongs to the class when it carries
# every 'include' tag and none of the 'exclude' tags. Nothing is excluded
# here, so e.g. vascular subtypes of MCI and dementia are kept as well.
diagnosis_filter = [
    dict(name='Normal', include=['normal'], exclude=[]),
    dict(name='MCI', include=['mci'], exclude=[]),
    dict(name='Dementia', include=['dementia'], exclude=[]),
]

# The class label is the position of the entry in `diagnosis_filter`.
class_label_to_name = [entry['name'] for entry in diagnosis_filter]
print('class_label_to_name:', class_label_to_name)

class_name_to_label = {name: label for label, name in enumerate(class_label_to_name)}
print('class_name_to_label:', class_name_to_label)

class_label_to_name: ['Normal', 'MCI', 'Dementia']
class_name_to_label: {'Normal': 0, 'MCI': 1, 'Dementia': 2}


In [7]:
# Split the filtered dataset: each record goes to the first class whose filter
# it satisfies; records matching no filter are left out.
splitted_metadata = [[] for _ in diagnosis_filter]

for m in annotation['data']:
    symptom = m['symptom']
    symptom_set = set(symptom)
    for c, f in enumerate(diagnosis_filter):
        # every 'include' tag must be present, no 'exclude' tag may be present
        inc = set(f['include']) <= symptom_set
        exc = symptom_set.isdisjoint(f['exclude'])
        if inc and exc:
            # Label a copy of the record: writing into the shared annotation
            # entry would let a later task's labelling silently overwrite the
            # labels held by this task's split lists.
            labeled = dict(m, class_name=f['name'], class_label=c)
            splitted_metadata[c].append(labeled)
            break

for i, split in enumerate(splitted_metadata):
    if len(split) == 0:
        raise ValueError(f'(Warning) Split group {i} has no data.')
    print(f'- There are {len(split):} data belonging to {split[0]["class_name"]}')

- There are 459 data belonging to Normal
- There are 417 data belonging to MCI
- There are 311 data belonging to Dementia


#### Shuffle the divided data

In [8]:
# Relative sizes of the train / validation / test subsets, normalized so
# that they sum to one.
split_weights = np.array([8, 1, 1])
ratios = split_weights / split_weights.sum()
print('Train, validation, test sets ratios:', ratios)

Train, validation, test sets ratios: [0.8 0.1 0.1]


In [9]:
# Use a fixed seed so that Restart & Run All regenerates the same benchmark
# split instead of silently replacing task1.json with a different one.
metadata_train, metadata_val, metadata_test = shuffle_splitted_metadata(
    splitted_metadata,
    class_label_to_name,
    ratios,
    seed=0,
    verbose=True,
)

<     Train     > data label distribution	: [367, 334, 249] = 950
<  Validation   > data label distribution	: [46, 42, 31] = 119
<     Test      > data label distribution	: [46, 41, 31] = 118


#### Save the dataset as JSON file

In [10]:
# Bundle the task description, the label mappings and the three splits.
task_dict = {
    'task_name': 'CAUEEG-task1 benchmark',
    'task_description': 'Classification of [Normal], [MCI], and [Dementia] symptoms.',
    'class_label_to_name': class_label_to_name,
    'class_name_to_label': class_name_to_label,
    'train_split': metadata_train,
    'validation_split': metadata_val,
    'test_split': metadata_test,
}

# Abridged dump: lists longer than three items show only their first three
# entries and their last one.
print('{')
for field, value in task_dict.items():
    print(f'\t{field}:')
    is_long_list = isinstance(value, list) and len(value) > 3
    if not is_long_list:
        print(f'\t\t{value}')
    else:
        for head_item in value[:3]:
            print(f'\t\t{head_item}')
        print('\t\t.\n\t\t.\n\t\t.')
        print(f'\t\t{value[-1]}')
    print()
print('}')

# Write the task definition into the dataset directory.
task_file = os.path.join(data_path, 'task1.json')
with open(task_file, 'w') as json_file:
    json.dump(task_dict, json_file, indent=4)
    print('task1.json file is saved.')

{
	task_name:
		CAUEEG-task1 benchmark

	task_description:
		Classification of [Normal], [MCI], and [Dementia] symptoms.

	class_label_to_name:
		['Normal', 'MCI', 'Dementia']

	class_name_to_label:
		{'Normal': 0, 'MCI': 1, 'Dementia': 2}

	train_split:
		{'serial': '00422', 'age': 72, 'symptom': ['normal', 'cb_normal'], 'class_name': 'Normal', 'class_label': 0}
		{'serial': '01137', 'age': 61, 'symptom': ['normal', 'cb_normal'], 'class_name': 'Normal', 'class_label': 0}
		{'serial': '00634', 'age': 73, 'symptom': ['mci', 'mci_amnestic'], 'class_name': 'MCI', 'class_label': 1}
		.
		.
		.
		{'serial': '00594', 'age': 75, 'symptom': ['mci', 'mci_amnestic', 'mci_amnestic_ef'], 'class_name': 'MCI', 'class_label': 1}

	validation_split:
		{'serial': '00916', 'age': 72, 'symptom': ['normal', 'cb_normal'], 'class_name': 'Normal', 'class_label': 0}
		{'serial': '00918', 'age': 71, 'symptom': ['normal', 'smi'], 'class_name': 'Normal', 'class_label': 0}
		{'serial': '00606', 'age': 64, 'sympto

---

## Task 2: Classification of Normal and Abnormal Symptoms

#### Define the target diagnoses and split them by their symptoms

In [11]:
# One entry per target class: a record belongs to the class when it carries
# every 'include' tag and none of the 'exclude' tags. 'Abnormal' is therefore
# every record that is not tagged 'normal'.
diagnosis_filter = [
    dict(name='Normal', include=['normal'], exclude=[]),
    dict(name='Abnormal', include=[], exclude=['normal']),
]

# The class label is the position of the entry in `diagnosis_filter`.
class_label_to_name = [entry['name'] for entry in diagnosis_filter]
print('class_label_to_name:', class_label_to_name)

class_name_to_label = {name: label for label, name in enumerate(class_label_to_name)}
print('class_name_to_label:', class_name_to_label)

class_label_to_name: ['Normal', 'Abnormal']
class_name_to_label: {'Normal': 0, 'Abnormal': 1}


In [12]:
# Split the filtered dataset: each record goes to the first class whose filter
# it satisfies; records matching no filter are left out.
splitted_metadata = [[] for _ in diagnosis_filter]

for m in annotation['data']:
    symptom = m['symptom']
    symptom_set = set(symptom)
    for c, f in enumerate(diagnosis_filter):
        # every 'include' tag must be present, no 'exclude' tag may be present
        inc = set(f['include']) <= symptom_set
        exc = symptom_set.isdisjoint(f['exclude'])
        if inc and exc:
            # Label a copy of the record: writing into the shared annotation
            # entry would overwrite labels that another task's split lists
            # still reference.
            labeled = dict(m, class_name=f['name'], class_label=c)
            splitted_metadata[c].append(labeled)
            break

for i, split in enumerate(splitted_metadata):
    if len(split) == 0:
        raise ValueError(f'(Warning) Split group {i} has no data.')
    print(f'- There are {len(split):} data belonging to {split[0]["class_name"]}')

- There are 459 data belonging to Normal
- There are 929 data belonging to Abnormal


#### Shuffle the divided data

In [13]:
# Relative sizes of the train / validation / test subsets, normalized so
# that they sum to one.
split_weights = np.array([8, 1, 1])
ratios = split_weights / split_weights.sum()
print('Train, validation, test sets ratios:', ratios)

Train, validation, test sets ratios: [0.8 0.1 0.1]


In [14]:
# Use a fixed seed so that Restart & Run All regenerates the same benchmark
# split instead of silently replacing task2.json with a different one.
metadata_train, metadata_val, metadata_test = shuffle_splitted_metadata(
    splitted_metadata,
    class_label_to_name,
    ratios,
    seed=0,
    verbose=True,
)

<     Train     > data label distribution	: [367, 743] = 1110
<  Validation   > data label distribution	: [46, 93] = 139
<     Test      > data label distribution	: [46, 93] = 139


#### Save the dataset as JSON file

In [15]:
# Bundle the task description, the label mappings and the three splits.
task_dict = {
    'task_name': 'CAUEEG-task2 benchmark',
    'task_description': 'Classification of [Normal] and [Abnormal] symptoms',
    'class_label_to_name': class_label_to_name,
    'class_name_to_label': class_name_to_label,
    'train_split': metadata_train,
    'validation_split': metadata_val,
    'test_split': metadata_test,
}

# Abridged dump: lists longer than three items show only their first three
# entries and their last one.
print('{')
for field, value in task_dict.items():
    print(f'\t{field}:')
    is_long_list = isinstance(value, list) and len(value) > 3
    if not is_long_list:
        print(f'\t\t{value}')
    else:
        for head_item in value[:3]:
            print(f'\t\t{head_item}')
        print('\t\t.\n\t\t.\n\t\t.')
        print(f'\t\t{value[-1]}')
    print()
print('}')

# Write the task definition into the dataset directory.
task_file = os.path.join(data_path, 'task2.json')
with open(task_file, 'w') as json_file:
    json.dump(task_dict, json_file, indent=4)
    print('task2.json file is saved.')

{
	task_name:
		CAUEEG-task2 benchmark

	task_description:
		Classification of [Normal] and [Abnormal] symptoms

	class_label_to_name:
		['Normal', 'Abnormal']

	class_name_to_label:
		{'Normal': 0, 'Abnormal': 1}

	train_split:
		{'serial': '00763', 'age': 76, 'symptom': ['dementia', 'ad', 'load'], 'class_name': 'Abnormal', 'class_label': 1}
		{'serial': '01206', 'age': 77, 'symptom': ['mci', 'mci_amnestic', 'mci_amnestic_ef'], 'class_name': 'Abnormal', 'class_label': 1}
		{'serial': '00685', 'age': 72, 'symptom': ['mci', 'mci_vascular'], 'class_name': 'Abnormal', 'class_label': 1}
		.
		.
		.
		{'serial': '00767', 'age': 61, 'symptom': ['normal', 'cb_normal'], 'class_name': 'Normal', 'class_label': 0}

	validation_split:
		{'serial': '01249', 'age': 68, 'symptom': ['normal', 'smi'], 'class_name': 'Normal', 'class_label': 0}
		{'serial': '00252', 'age': 85, 'symptom': ['dementia', 'vd', 'sivd'], 'class_name': 'Abnormal', 'class_label': 1}
		{'serial': '00891', 'age': 69, 'symptom': ['