# EEG Data Statistics

`01_Data_Curation1`과 `02_Data_Curation2`에서 저장한 EEG 데이터의 분포를 전반적으로 살펴보는 노트북.

-----

## 환경 구성

In [1]:
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# Move one directory up; presumably this makes the relative `local/...` paths
# and the `datasets` package used below resolvable - TODO confirm the intended working directory.
%cd ..

C:\Users\Minjae\Desktop\EEG_Project


In [2]:
# Load some packages
import os
import glob
import json
import datetime

import statistics

import pprint
from tqdm.auto import tqdm

import numpy as np

# custom package
from datasets.caueeg_dataset import CauEegDataset
from datasets.caueeg_data_curation import MultiEegLabel

-----

## Curated Data 불러오기

In [3]:
# Data file path
# curate_folder: directory the JSON files are read from in the next cell
# output_folder: directory created below (relative to the current working directory)
curate_folder = r'local/dataset/caueeg-dataset'
output_folder = r'local/output/imgs'

# Create the output directory up front; exist_ok avoids an error when it already exists.
os.makedirs(output_folder, exist_ok=True)

In [4]:
# Load the debug annotation file. The encoding is given explicitly so the result
# does not depend on the platform's locale default.
with open(os.path.join(curate_folder, 'annotation_debug.json'), 'r', encoding='utf-8') as json_file:
    annotation = json.load(json_file)

# Preview the annotation: every key is shown in full except 'data',
# which is cut down to its first five entries to keep the output short.
annotation_preview = {k: (v[:5] if k == 'data' else v) for k, v in annotation.items()}
pprint.pprint(annotation_preview, width=250)

{'data': [{'age': 78, 'birth': '1940-06-02', 'dx1': 'mci_rf', 'edfname': '00001809_261018', 'record': '2018-10-26T15:46:26', 'serial': '00001', 'symptom': ['mci', 'mci_amnestic', 'mci_amnestic_rf']},
          {'age': 56, 'birth': '1960-12-04', 'dx1': 'smi', 'edfname': '00029426_020817', 'record': '2017-08-02T16:14:56', 'serial': '00002', 'symptom': ['normal', 'smi']},
          {'age': 93, 'birth': '1924-10-19', 'dx1': 'vascular mci', 'edfname': '00047327_090718', 'record': '2018-07-09T15:29:10', 'serial': '00003', 'symptom': ['mci', 'mci_vascular']},
          {'age': 78, 'birth': '1941-03-16', 'dx1': 'load', 'edfname': '00048377_070819', 'record': '2019-08-07T13:55:25', 'serial': '00004', 'symptom': ['dementia', 'ad', 'load']},
          {'age': 75, 'birth': '1941-03-16', 'dx1': 'mci (ef) multi-domain', 'edfname': '00048377_070916', 'record': '2016-09-07T10:36:01', 'serial': '00005', 'symptom': ['mci', 'mci_amnestic', 'mci_amnestic_ef', 'mci_multi_domain']}],
 'dataset_name': 'CAUEE

## CAUEEG-Dementia

### Train $\leftrightarrow$ Test

In [5]:
# CAUEEG-Dementia, Train <-> Test: find recordings that share an edf prefix across the two splits.
# The prefix is the part of `edfname` before the first '_' (presumably a per-subject ID - TODO confirm).
with open(os.path.join(curate_folder, 'dementia.json'), 'r', encoding='utf-8') as json_file:
    dementia_dataset = json.load(json_file)

# Index the annotation by serial once instead of rescanning the whole list for every sample.
# setdefault keeps the first entry per serial, which is what a first-match scan would pick.
anno_by_serial = {}
for anno_data in annotation['data']:
    anno_by_serial.setdefault(anno_data['serial'], anno_data)

# find edf information from annotation
for split_name in ('train_split', 'test_split'):
    for split_data in dementia_dataset[split_name]:
        anno_data = anno_by_serial.get(split_data['serial'])
        if anno_data is not None:
            split_data['edfname'] = anno_data['edfname'].split('_')[0]
            split_data['record'] = datetime.datetime.fromisoformat(anno_data['record'])

# First train sample per edf prefix; it seeds the train side of a newly found group.
first_train = {}
for train_data in dementia_dataset['train_split']:
    first_train.setdefault(train_data['edfname'], train_data)

# find duplicates: one group per edf prefix present in both splits,
# kept in order of first appearance in the test split (dicts preserve insertion order).
# NOTE(review): the *_symptoms sets are filled only from the first matching sample on each side;
# later samples of the same group contribute serials and records but not symptoms.
dup_by_edfname = {}
for test_data in dementia_dataset['test_split']:
    edfname = test_data['edfname']
    if edfname in dup_by_edfname:
        dup_data = dup_by_edfname[edfname]
        dup_data['test_serials'].add(test_data['serial'])
        dup_data['test_records'].add(test_data['record'])
    elif edfname in first_train:
        train_data = first_train[edfname]
        dup_by_edfname[edfname] = {
            'edfname': edfname,
            'test_serials': {test_data['serial']},
            'test_records': {test_data['record']},
            'test_symptoms': set(test_data['symptom']),
            'train_serials': {train_data['serial']},
            'train_records': {train_data['record']},
            'train_symptoms': set(train_data['symptom']),
        }

# Collect every train recording that belongs to one of the groups.
for train_data in dementia_dataset['train_split']:
    dup_data = dup_by_edfname.get(train_data['edfname'])
    if dup_data is not None:
        dup_data['train_serials'].add(train_data['serial'])
        dup_data['train_records'].add(train_data['record'])

# The following cell reads `duplicates` as a list of group dicts.
duplicates = list(dup_by_edfname.values())

print(len(dementia_dataset['test_split']))
print(len(duplicates))
print()

delta_list = []
symp_diff_count = 0   # test recordings in groups whose symptom sets differ
symp_diff_count2 = 0  # train recordings in those same groups
symp_diff_list = []

for dup_data in duplicates:
    # time gap between every (test, train) pair of recordings in the group
    for test_record in dup_data['test_records']:
        for train_record in dup_data['train_records']:
            delta_list.append(abs(test_record - train_record))

    if dup_data['train_symptoms'] != dup_data['test_symptoms']:
        symp_diff_list.append((dup_data['train_symptoms'], dup_data['test_symptoms']))
        symp_diff_count += len(dup_data['test_serials'])
        symp_diff_count2 += len(dup_data['train_serials'])

# Gaps are reported in whole days. The statistics are skipped when no overlap exists,
# because min(), max() and statistics.median() all raise on an empty list.
if delta_list:
    one_day = datetime.timedelta(days=1)
    sum_delta = sum(delta_list, datetime.timedelta(days=0))
    print('min delta:', round(min(delta_list) / one_day))
    print('mean delta:', round(sum_delta / len(delta_list) / one_day))
    print('median delta:', round(statistics.median(delta_list) / one_day))
    print('max delta:', round(max(delta_list) / one_day))
print()
print('symp_diff_count:', symp_diff_count)
print('symp_diff_count2:', symp_diff_count2)
pprint.pprint(symp_diff_list)
print()
# Uncomment to inspect the full groups:
# pprint.pprint(duplicates)

118
28

min delta: 10
mean delta: 370
median delta: 178
max delta: 1425

symp_diff_count: 6
symp_diff_count2: 11
[({'load', 'ad', 'dementia'}, {'mci_amnestic', 'mci_amnestic_ef', 'mci'}),
 ({'mci_amnestic', 'mci_amnestic_ef', 'mci'}, {'eoad', 'ad', 'dementia'}),
 ({'mci_amnestic', 'mci_amnestic_ef', 'mci'}, {'load', 'ad', 'dementia'}),
 ({'mci_amnestic', 'mci_amnestic_ef', 'mci'}, {'load', 'ad', 'dementia'}),
 ({'mci', 'mci_vascular'}, {'vd', 'dementia', 'sivd'}),
 ({'mci_multi_domain', 'mci_amnestic', 'mci_amnestic_ef', 'mci'},
  {'normal', 'smi'})]



In [6]:
# Flatten the per-group serial sets into two flat lists (group order is kept).
test_serials = []
train_serials = []
for group in duplicates:
    test_serials.extend(group['test_serials'])
    train_serials.extend(group['train_serials'])

# Number of groups, then count + content of each serial list, separated by blank lines.
print(len(duplicates))
for serial_list in (test_serials, train_serials):
    print()
    print(len(serial_list))
    print(serial_list)

28

28
['00789', '00934', '01034', '00495', '01028', '00294', '00565', '01087', '00860', '00004', '00709', '01252', '00435', '00061', '00837', '00841', '00301', '00690', '00259', '00298', '01091', '00243', '00770', '01118', '00787', '00516', '00828', '00143']

39
['00790', '00933', '01035', '01037', '00493', '01029', '00293', '00566', '01088', '00859', '00858', '00857', '00006', '00005', '00708', '00710', '00707', '01253', '00436', '00434', '00063', '00062', '00838', '00840', '00839', '00302', '00691', '00258', '00299', '00297', '01092', '00244', '00771', '01119', '00788', '00515', '00829', '00145', '00144']


### Train $\leftrightarrow$ Validation

In [7]:
# CAUEEG-Dementia, Train <-> Validation: find recordings that share an edf prefix across the two splits.
# The prefix is the part of `edfname` before the first '_' (presumably a per-subject ID - TODO confirm).
with open(os.path.join(curate_folder, 'dementia.json'), 'r', encoding='utf-8') as json_file:
    dementia_dataset = json.load(json_file)

# Index the annotation by serial once instead of rescanning the whole list for every sample.
# setdefault keeps the first entry per serial, which is what a first-match scan would pick.
anno_by_serial = {}
for anno_data in annotation['data']:
    anno_by_serial.setdefault(anno_data['serial'], anno_data)

# find edf information from annotation
for split_name in ('train_split', 'validation_split'):
    for split_data in dementia_dataset[split_name]:
        anno_data = anno_by_serial.get(split_data['serial'])
        if anno_data is not None:
            split_data['edfname'] = anno_data['edfname'].split('_')[0]
            split_data['record'] = datetime.datetime.fromisoformat(anno_data['record'])

# First train sample per edf prefix; it seeds the train side of a newly found group.
first_train = {}
for train_data in dementia_dataset['train_split']:
    first_train.setdefault(train_data['edfname'], train_data)

# Find duplicates: one group per edf prefix present in both splits,
# kept in order of first appearance in the validation split (dicts preserve insertion order).
# NOTE(review): the *_symptoms sets are filled only from the first matching sample on each side;
# later samples of the same group contribute serials and records but not symptoms.
dup_by_edfname = {}
for validation_data in dementia_dataset['validation_split']:
    edfname = validation_data['edfname']
    if edfname in dup_by_edfname:
        dup_data = dup_by_edfname[edfname]
        dup_data['validation_serials'].add(validation_data['serial'])
        dup_data['validation_records'].add(validation_data['record'])
    elif edfname in first_train:
        train_data = first_train[edfname]
        dup_by_edfname[edfname] = {
            'edfname': edfname,
            'validation_serials': {validation_data['serial']},
            'validation_records': {validation_data['record']},
            'validation_symptoms': set(validation_data['symptom']),
            'train_serials': {train_data['serial']},
            'train_records': {train_data['record']},
            'train_symptoms': set(train_data['symptom']),
        }

# Collect every train recording that belongs to one of the groups.
for train_data in dementia_dataset['train_split']:
    dup_data = dup_by_edfname.get(train_data['edfname'])
    if dup_data is not None:
        dup_data['train_serials'].add(train_data['serial'])
        dup_data['train_records'].add(train_data['record'])

# The following cell reads `duplicates` as a list of group dicts.
duplicates = list(dup_by_edfname.values())

print(len(dementia_dataset['validation_split']))
print(len(duplicates))
print()

delta_list = []
symp_diff_count = 0   # validation recordings in groups whose symptom sets differ
symp_diff_count2 = 0  # train recordings in those same groups
symp_diff_list = []

for dup_data in duplicates:
    # time gap between every (validation, train) pair of recordings in the group
    for validation_record in dup_data['validation_records']:
        for train_record in dup_data['train_records']:
            delta_list.append(abs(validation_record - train_record))

    if dup_data['train_symptoms'] != dup_data['validation_symptoms']:
        symp_diff_list.append((dup_data['train_symptoms'], dup_data['validation_symptoms']))
        symp_diff_count += len(dup_data['validation_serials'])
        symp_diff_count2 += len(dup_data['train_serials'])

# Gaps are reported in whole days. The statistics are skipped when no overlap exists,
# because min(), max() and statistics.median() all raise on an empty list.
if delta_list:
    one_day = datetime.timedelta(days=1)
    sum_delta = sum(delta_list, datetime.timedelta(days=0))
    print('min delta:', round(min(delta_list) / one_day))
    print('mean delta:', round(sum_delta / len(delta_list) / one_day))
    print('median delta:', round(statistics.median(delta_list) / one_day))
    print('max delta:', round(max(delta_list) / one_day))
print()
print('symp_diff_count:', symp_diff_count)
print('symp_diff_count2:', symp_diff_count2)
pprint.pprint(symp_diff_list)
print()
# Uncomment to inspect the full groups:
# pprint.pprint(duplicates)

119
35

min delta: 7
mean delta: 329
median delta: 136
max delta: 1417

symp_diff_count: 5
symp_diff_count2: 5
[({'normal', 'smi'}, {'mci_amnestic', 'mci_ad', 'mci'}),
 ({'load', 'ad', 'dementia'}, {'mci_amnestic', 'mci_amnestic_ef', 'mci'}),
 ({'mci_amnestic', 'mci_amnestic_ef', 'mci'},
  {'mci_amnestic', 'mci', 'mci_amnestic_rf'}),
 ({'mci_amnestic', 'mci_amnestic_ef', 'mci'}, {'eoad', 'ad', 'dementia'})]



In [8]:
# Flatten the per-group serial sets into two flat lists (group order is kept).
validation_serials = []
train_serials = []
for group in duplicates:
    validation_serials.extend(group['validation_serials'])
    train_serials.extend(group['train_serials'])

# Number of groups, then count + content of each serial list, separated by blank lines.
print(len(duplicates))
for serial_list in (validation_serials, train_serials):
    print()
    print(len(serial_list))
    print(serial_list)

35

37
['00172', '00807', '01052', '01036', '00999', '00935', '01310', '01309', '00586', '00957', '00539', '00627', '00753', '01101', '00481', '01099', '01241', '01305', '00039', '00338', '00917', '00938', '00028', '00478', '01178', '01058', '01156', '01165', '00460', '00276', '00974', '00729', '00701', '00703', '00494', '00160', '00992']

50
['00171', '00173', '00808', '00806', '01053', '01035', '01037', '01000', '00933', '01308', '01307', '00587', '00956', '00538', '00628', '00752', '01102', '00480', '00482', '01100', '01242', '01240', '01304', '00040', '00337', '00918', '00937', '00026', '00031', '00029', '00030', '00027', '00477', '01179', '01059', '01155', '01167', '01166', '00459', '00277', '00975', '00973', '00728', '00702', '00493', '00161', '00988', '00991', '00989', '00990']


### Test $\leftrightarrow$ Validation

In [9]:
# CAUEEG-Dementia, Test <-> Validation: find recordings that share an edf prefix across the two splits.
# The prefix is the part of `edfname` before the first '_' (presumably a per-subject ID - TODO confirm).
with open(os.path.join(curate_folder, 'dementia.json'), 'r', encoding='utf-8') as json_file:
    dementia_dataset = json.load(json_file)

# Index the annotation by serial once instead of rescanning the whole list for every sample.
# setdefault keeps the first entry per serial, which is what a first-match scan would pick.
anno_by_serial = {}
for anno_data in annotation['data']:
    anno_by_serial.setdefault(anno_data['serial'], anno_data)

# find edf information from annotation
for split_name in ('test_split', 'validation_split'):
    for split_data in dementia_dataset[split_name]:
        anno_data = anno_by_serial.get(split_data['serial'])
        if anno_data is not None:
            split_data['edfname'] = anno_data['edfname'].split('_')[0]
            split_data['record'] = datetime.datetime.fromisoformat(anno_data['record'])

# First test sample per edf prefix; it seeds the test side of a newly found group.
first_test = {}
for test_data in dementia_dataset['test_split']:
    first_test.setdefault(test_data['edfname'], test_data)

# Find duplicates: one group per edf prefix present in both splits,
# kept in order of first appearance in the validation split (dicts preserve insertion order).
# NOTE(review): the *_symptoms sets are filled only from the first matching sample on each side;
# later samples of the same group contribute serials and records but not symptoms.
dup_by_edfname = {}
for validation_data in dementia_dataset['validation_split']:
    edfname = validation_data['edfname']
    if edfname in dup_by_edfname:
        dup_data = dup_by_edfname[edfname]
        dup_data['validation_serials'].add(validation_data['serial'])
        dup_data['validation_records'].add(validation_data['record'])
    elif edfname in first_test:
        test_data = first_test[edfname]
        dup_by_edfname[edfname] = {
            'edfname': edfname,
            'validation_serials': {validation_data['serial']},
            'validation_records': {validation_data['record']},
            'validation_symptoms': set(validation_data['symptom']),
            'test_serials': {test_data['serial']},
            'test_records': {test_data['record']},
            'test_symptoms': set(test_data['symptom']),
        }

# Collect every test recording that belongs to one of the groups.
for test_data in dementia_dataset['test_split']:
    dup_data = dup_by_edfname.get(test_data['edfname'])
    if dup_data is not None:
        dup_data['test_serials'].add(test_data['serial'])
        dup_data['test_records'].add(test_data['record'])

# The following cell reads `duplicates` as a list of group dicts.
duplicates = list(dup_by_edfname.values())

print(len(dementia_dataset['validation_split']))
print(len(duplicates))
print()

delta_list = []
symp_diff_count = 0   # validation recordings in groups whose symptom sets differ
symp_diff_count2 = 0  # test recordings in those same groups
symp_diff_list = []

for dup_data in duplicates:
    # time gap between every (validation, test) pair of recordings in the group
    for validation_record in dup_data['validation_records']:
        for test_record in dup_data['test_records']:
            delta_list.append(abs(validation_record - test_record))

    if dup_data['test_symptoms'] != dup_data['validation_symptoms']:
        symp_diff_list.append((dup_data['test_symptoms'], dup_data['validation_symptoms']))
        symp_diff_count += len(dup_data['validation_serials'])
        symp_diff_count2 += len(dup_data['test_serials'])

# Gaps are reported in whole days. The statistics are skipped when no overlap exists,
# because min(), max() and statistics.median() all raise on an empty list.
if delta_list:
    one_day = datetime.timedelta(days=1)
    sum_delta = sum(delta_list, datetime.timedelta(days=0))
    print('min delta:', round(min(delta_list) / one_day))
    print('mean delta:', round(sum_delta / len(delta_list) / one_day))
    print('median delta:', round(statistics.median(delta_list) / one_day))
    print('max delta:', round(max(delta_list) / one_day))
print()
print('symp_diff_count:', symp_diff_count)
print('symp_diff_count2:', symp_diff_count2)
pprint.pprint(symp_diff_list)
print()
# Uncomment to inspect the full groups:
# pprint.pprint(duplicates)

119
3

min delta: 14
mean delta: 391
median delta: 314
max delta: 846

symp_diff_count: 1
symp_diff_count2: 1
[({'mci_amnestic', 'mci_amnestic_ef', 'mci'}, {'load', 'ad', 'dementia'})]



In [10]:
# Flatten the per-group serial sets into two flat lists (group order is kept).
validation_serials = []
test_serials = []
for group in duplicates:
    validation_serials.extend(group['validation_serials'])
    test_serials.extend(group['test_serials'])

# Number of groups, then count + content of each serial list, separated by blank lines.
print(len(duplicates))
for serial_list in (validation_serials, test_serials):
    print()
    print(len(serial_list))
    print(serial_list)

3

3
['01036', '00935', '00494']

3
['01034', '00934', '00495']


## CAUEEG-Abnormal

### Train $\leftrightarrow$ Test

In [11]:
# CAUEEG-Abnormal, Train <-> Test: find recordings that share an edf prefix across the two splits.
# The prefix is the part of `edfname` before the first '_' (presumably a per-subject ID - TODO confirm).
with open(os.path.join(curate_folder, 'abnormal.json'), 'r', encoding='utf-8') as json_file:
    abnormal_dataset = json.load(json_file)

# Index the annotation by serial once instead of rescanning the whole list for every sample.
# setdefault keeps the first entry per serial, which is what a first-match scan would pick.
anno_by_serial = {}
for anno_data in annotation['data']:
    anno_by_serial.setdefault(anno_data['serial'], anno_data)

# find edf information from annotation
for split_name in ('train_split', 'test_split'):
    for split_data in abnormal_dataset[split_name]:
        anno_data = anno_by_serial.get(split_data['serial'])
        if anno_data is not None:
            split_data['edfname'] = anno_data['edfname'].split('_')[0]
            split_data['record'] = datetime.datetime.fromisoformat(anno_data['record'])

# First train sample per edf prefix; it seeds the train side of a newly found group.
first_train = {}
for train_data in abnormal_dataset['train_split']:
    first_train.setdefault(train_data['edfname'], train_data)

# Find duplicates: one group per edf prefix present in both splits,
# kept in order of first appearance in the test split (dicts preserve insertion order).
# NOTE(review): the *_symptoms sets are filled only from the first matching sample on each side;
# later samples of the same group contribute serials and records but not symptoms.
dup_by_edfname = {}
for test_data in abnormal_dataset['test_split']:
    edfname = test_data['edfname']
    if edfname in dup_by_edfname:
        dup_data = dup_by_edfname[edfname]
        dup_data['test_serials'].add(test_data['serial'])
        dup_data['test_records'].add(test_data['record'])
    elif edfname in first_train:
        train_data = first_train[edfname]
        dup_by_edfname[edfname] = {
            'edfname': edfname,
            'test_serials': {test_data['serial']},
            'test_records': {test_data['record']},
            'test_symptoms': set(test_data['symptom']),
            'train_serials': {train_data['serial']},
            'train_records': {train_data['record']},
            'train_symptoms': set(train_data['symptom']),
        }

# Collect every train recording that belongs to one of the groups.
for train_data in abnormal_dataset['train_split']:
    dup_data = dup_by_edfname.get(train_data['edfname'])
    if dup_data is not None:
        dup_data['train_serials'].add(train_data['serial'])
        dup_data['train_records'].add(train_data['record'])

# The following cell reads `duplicates` as a list of group dicts.
duplicates = list(dup_by_edfname.values())

print(len(abnormal_dataset['test_split']))
print(len(duplicates))
print()

delta_list = []
symp_diff_count = 0   # test recordings in groups whose symptom sets differ
symp_diff_count2 = 0  # train recordings in those same groups
symp_diff_list = []

for dup_data in duplicates:
    # time gap between every (test, train) pair of recordings in the group
    for test_record in dup_data['test_records']:
        for train_record in dup_data['train_records']:
            delta_list.append(abs(test_record - train_record))

    if dup_data['train_symptoms'] != dup_data['test_symptoms']:
        symp_diff_list.append((dup_data['train_symptoms'], dup_data['test_symptoms']))
        symp_diff_count += len(dup_data['test_serials'])
        symp_diff_count2 += len(dup_data['train_serials'])

# Gaps are reported in whole days. The statistics are skipped when no overlap exists,
# because min(), max() and statistics.median() all raise on an empty list.
if delta_list:
    one_day = datetime.timedelta(days=1)
    sum_delta = sum(delta_list, datetime.timedelta(days=0))
    print('min delta:', round(min(delta_list) / one_day))
    print('mean delta:', round(sum_delta / len(delta_list) / one_day))
    print('median delta:', round(statistics.median(delta_list) / one_day))
    print('max delta:', round(max(delta_list) / one_day))
print()
print('symp_diff_count:', symp_diff_count)
print('symp_diff_count2:', symp_diff_count2)
pprint.pprint(symp_diff_list)
print()
# Uncomment to inspect the full groups:
# pprint.pprint(duplicates)

136
27

min delta: 7
mean delta: 340
median delta: 80
max delta: 1080

symp_diff_count: 6
symp_diff_count2: 9
[({'mci_amnestic', 'mci_amnestic_ef', 'mci'},
  {'mci_amnestic', 'mci', 'mci_amnestic_rf'}),
 ({'mci_multi_domain', 'mci_amnestic', 'mci_amnestic_ef', 'mci'},
  {'normal', 'smi'}),
 ({'load', 'ad', 'dementia'}, {'mci_amnestic', 'mci'}),
 ({'load', 'ad', 'dementia'}, {'mci_amnestic', 'mci_amnestic_ef', 'mci'}),
 ({'load', 'ad', 'dementia'}, {'mci_amnestic', 'mci_amnestic_ef', 'mci'}),
 ({'mci', 'mci_non_amnestic'}, {'ftd', 'bvftd'})]



In [12]:
# Flatten the per-group serial sets into two flat lists (group order is kept).
test_serials = []
train_serials = []
for group in duplicates:
    test_serials.extend(group['test_serials'])
    train_serials.extend(group['train_serials'])

# Number of groups, then count + content of each serial list, separated by blank lines.
print(len(duplicates))
for serial_list in (test_serials, train_serials):
    print()
    print(len(serial_list))
    print(serial_list)

27

27
['01156', '00999', '00481', '00690', '00787', '00028', '00298', '00172', '00841', '00753', '01118', '01178', '00860', '01165', '00494', '00259', '00938', '00586', '00371', '00039', '00243', '00789', '00750', '00565', '01028', '00516', '00924']

39
['01155', '01000', '00482', '00480', '00691', '00788', '00026', '00031', '00029', '00030', '00027', '00299', '00297', '00171', '00173', '00840', '00839', '00752', '01119', '01179', '00858', '00857', '00859', '01167', '01166', '00493', '00495', '00258', '00937', '00587', '00373', '00040', '00244', '00790', '00749', '00566', '01029', '00515', '00925']


### Train $\leftrightarrow$ Validation

In [13]:
# CAUEEG-Abnormal, Train <-> Validation: find recordings that share an edf prefix across the two splits.
# The prefix is the part of `edfname` before the first '_' (presumably a per-subject ID - TODO confirm).
with open(os.path.join(curate_folder, 'abnormal.json'), 'r', encoding='utf-8') as json_file:
    abnormal_dataset = json.load(json_file)

# Index the annotation by serial once instead of rescanning the whole list for every sample.
# setdefault keeps the first entry per serial, which is what a first-match scan would pick.
anno_by_serial = {}
for anno_data in annotation['data']:
    anno_by_serial.setdefault(anno_data['serial'], anno_data)

# find edf information from annotation
for split_name in ('train_split', 'validation_split'):
    for split_data in abnormal_dataset[split_name]:
        anno_data = anno_by_serial.get(split_data['serial'])
        if anno_data is not None:
            split_data['edfname'] = anno_data['edfname'].split('_')[0]
            split_data['record'] = datetime.datetime.fromisoformat(anno_data['record'])

# First train sample per edf prefix; it seeds the train side of a newly found group.
first_train = {}
for train_data in abnormal_dataset['train_split']:
    first_train.setdefault(train_data['edfname'], train_data)

# Find duplicates: one group per edf prefix present in both splits,
# kept in order of first appearance in the validation split (dicts preserve insertion order).
# NOTE(review): the *_symptoms sets are filled only from the first matching sample on each side;
# later samples of the same group contribute serials and records but not symptoms.
dup_by_edfname = {}
for validation_data in abnormal_dataset['validation_split']:
    edfname = validation_data['edfname']
    if edfname in dup_by_edfname:
        dup_data = dup_by_edfname[edfname]
        dup_data['validation_serials'].add(validation_data['serial'])
        dup_data['validation_records'].add(validation_data['record'])
    elif edfname in first_train:
        train_data = first_train[edfname]
        dup_by_edfname[edfname] = {
            'edfname': edfname,
            'validation_serials': {validation_data['serial']},
            'validation_records': {validation_data['record']},
            'validation_symptoms': set(validation_data['symptom']),
            'train_serials': {train_data['serial']},
            'train_records': {train_data['record']},
            'train_symptoms': set(train_data['symptom']),
        }

# Collect every train recording that belongs to one of the groups.
for train_data in abnormal_dataset['train_split']:
    dup_data = dup_by_edfname.get(train_data['edfname'])
    if dup_data is not None:
        dup_data['train_serials'].add(train_data['serial'])
        dup_data['train_records'].add(train_data['record'])

# The following cell reads `duplicates` as a list of group dicts.
duplicates = list(dup_by_edfname.values())

print(len(abnormal_dataset['validation_split']))
print(len(duplicates))
print()

delta_list = []
symp_diff_count = 0   # validation recordings in groups whose symptom sets differ
symp_diff_count2 = 0  # train recordings in those same groups
symp_diff_list = []

for dup_data in duplicates:
    # time gap between every (validation, train) pair of recordings in the group
    for validation_record in dup_data['validation_records']:
        for train_record in dup_data['train_records']:
            delta_list.append(abs(validation_record - train_record))

    if dup_data['train_symptoms'] != dup_data['validation_symptoms']:
        symp_diff_list.append((dup_data['train_symptoms'], dup_data['validation_symptoms']))
        symp_diff_count += len(dup_data['validation_serials'])
        symp_diff_count2 += len(dup_data['train_serials'])

# Gaps are reported in whole days. The statistics are skipped when no overlap exists,
# because min(), max() and statistics.median() all raise on an empty list.
if delta_list:
    one_day = datetime.timedelta(days=1)
    sum_delta = sum(delta_list, datetime.timedelta(days=0))
    print('min delta:', round(min(delta_list) / one_day))
    print('mean delta:', round(sum_delta / len(delta_list) / one_day))
    print('median delta:', round(statistics.median(delta_list) / one_day))
    print('max delta:', round(max(delta_list) / one_day))
print()
print('symp_diff_count:', symp_diff_count)
print('symp_diff_count2:', symp_diff_count2)
pprint.pprint(symp_diff_list)
print()
# Uncomment to inspect the full groups:
# pprint.pprint(duplicates)

136
27

min delta: 9
mean delta: 361
median delta: 119
max delta: 1508

symp_diff_count: 2
symp_diff_count2: 5
[({'load', 'ad', 'dementia'}, {'mci_amnestic', 'mci_amnestic_ef', 'mci'}),
 ({'mci', 'mci_vascular'}, {'vd', 'dementia', 'sivd'})]



In [14]:
# Flatten the per-group serial sets into two flat lists (group order is kept).
validation_serials = []
train_serials = []
for group in duplicates:
    validation_serials.extend(group['validation_serials'])
    train_serials.extend(group['train_serials'])

# Number of groups, then count + content of each serial list, separated by blank lines.
print(len(duplicates))
for serial_list in (validation_serials, train_serials):
    print()
    print(len(serial_list))
    print(serial_list)

27

31
['01034', '00372', '00934', '00935', '00160', '00427', '00807', '01310', '01309', '00294', '01260', '01259', '00729', '01305', '01087', '01091', '00718', '00627', '00917', '00478', '00770', '01058', '00393', '00395', '00338', '00709', '00061', '01052', '00837', '01101', '00957']

36
['01035', '01036', '01037', '00373', '00933', '00161', '00426', '00428', '00808', '00806', '01308', '01307', '00293', '01261', '00728', '01304', '01088', '01092', '00719', '00628', '00918', '00477', '00771', '01059', '00392', '00394', '00337', '00708', '00710', '00707', '00063', '00062', '01053', '00838', '01102', '00956']


### Test $\leftrightarrow$ Validation

In [15]:
# CAUEEG-Abnormal, Test <-> Validation: find recordings that share an edf prefix across the two splits.
# The prefix is the part of `edfname` before the first '_' (presumably a per-subject ID - TODO confirm).
with open(os.path.join(curate_folder, 'abnormal.json'), 'r', encoding='utf-8') as json_file:
    abnormal_dataset = json.load(json_file)

# Index the annotation by serial once instead of rescanning the whole list for every sample.
# setdefault keeps the first entry per serial, which is what a first-match scan would pick.
anno_by_serial = {}
for anno_data in annotation['data']:
    anno_by_serial.setdefault(anno_data['serial'], anno_data)

# find edf information from annotation
for split_name in ('test_split', 'validation_split'):
    for split_data in abnormal_dataset[split_name]:
        anno_data = anno_by_serial.get(split_data['serial'])
        if anno_data is not None:
            split_data['edfname'] = anno_data['edfname'].split('_')[0]
            split_data['record'] = datetime.datetime.fromisoformat(anno_data['record'])

# First test sample per edf prefix; it seeds the test side of a newly found group.
first_test = {}
for test_data in abnormal_dataset['test_split']:
    first_test.setdefault(test_data['edfname'], test_data)

# Find duplicates: one group per edf prefix present in both splits,
# kept in order of first appearance in the validation split (dicts preserve insertion order).
# NOTE(review): the *_symptoms sets are filled only from the first matching sample on each side;
# later samples of the same group contribute serials and records but not symptoms.
dup_by_edfname = {}
for validation_data in abnormal_dataset['validation_split']:
    edfname = validation_data['edfname']
    if edfname in dup_by_edfname:
        dup_data = dup_by_edfname[edfname]
        dup_data['validation_serials'].add(validation_data['serial'])
        dup_data['validation_records'].add(validation_data['record'])
    elif edfname in first_test:
        test_data = first_test[edfname]
        dup_by_edfname[edfname] = {
            'edfname': edfname,
            'validation_serials': {validation_data['serial']},
            'validation_records': {validation_data['record']},
            'validation_symptoms': set(validation_data['symptom']),
            'test_serials': {test_data['serial']},
            'test_records': {test_data['record']},
            'test_symptoms': set(test_data['symptom']),
        }

# Collect every test recording that belongs to one of the groups.
for test_data in abnormal_dataset['test_split']:
    dup_data = dup_by_edfname.get(test_data['edfname'])
    if dup_data is not None:
        dup_data['test_serials'].add(test_data['serial'])
        dup_data['test_records'].add(test_data['record'])

# The following cell reads `duplicates` as a list of group dicts.
duplicates = list(dup_by_edfname.values())

print(len(abnormal_dataset['validation_split']))
print(len(duplicates))
print()

delta_list = []
symp_diff_count = 0   # validation recordings in groups whose symptom sets differ
symp_diff_count2 = 0  # test recordings in those same groups
symp_diff_list = []

for dup_data in duplicates:
    # time gap between every (validation, test) pair of recordings in the group
    for validation_record in dup_data['validation_records']:
        for test_record in dup_data['test_records']:
            delta_list.append(abs(validation_record - test_record))

    if dup_data['test_symptoms'] != dup_data['validation_symptoms']:
        symp_diff_list.append((dup_data['test_symptoms'], dup_data['validation_symptoms']))
        symp_diff_count += len(dup_data['validation_serials'])
        symp_diff_count2 += len(dup_data['test_serials'])

# Gaps are reported in whole days. The statistics are skipped when no overlap exists,
# because min(), max() and statistics.median() all raise on an empty list.
if delta_list:
    one_day = datetime.timedelta(days=1)
    sum_delta = sum(delta_list, datetime.timedelta(days=0))
    print('min delta:', round(min(delta_list) / one_day))
    print('mean delta:', round(sum_delta / len(delta_list) / one_day))
    print('median delta:', round(statistics.median(delta_list) / one_day))
    print('max delta:', round(max(delta_list) / one_day))
print()
print('symp_diff_count:', symp_diff_count)
print('symp_diff_count2:', symp_diff_count2)
pprint.pprint(symp_diff_list)
print()
# Uncomment to inspect the full groups:
# pprint.pprint(duplicates)

136
3

min delta: 33
mean delta: 413
median delta: 36
max delta: 1170

symp_diff_count: 1
symp_diff_count2: 1
[({'ftd', 'bvftd'}, {'mci_multi_domain', 'mci_amnestic', 'mci'})]



In [16]:
# Flatten the per-group serial sets into two flat lists (group order is kept).
validation_serials = []
test_serials = []
for group in duplicates:
    validation_serials.extend(group['validation_serials'])
    test_serials.extend(group['test_serials'])

# Number of groups, then count + content of each serial list, separated by blank lines.
print(len(duplicates))
for serial_list in (validation_serials, test_serials):
    print()
    print(len(serial_list))
    print(serial_list)

3

3
['00372', '00169', '00498']

3
['00371', '00168', '00499']
