In [19]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import os
import pandas as pd

In [24]:
import os
# Root folder of the public eardrum data, given as a relative path.
DATA_MAIN_DIR='../../../data/eardrum_public_data'
# Sub-folder name of each dataset; each one is joined onto DATA_MAIN_DIR
# in the section that builds that dataset's dataframe.
CHILE_DATA_DIR='Chile'
OHIO_DATA_DIR='Ohio'
TURKEY_DATA_DIR='Turkey'

def get_file_list(data_dir,suffix_list=('.jpg','.png'),exclude='.ipynb_checkpoints'):
    """Recursively collect the files under ``data_dir`` that have a wanted suffix.

    Args:
        data_dir: Directory to walk.
        suffix_list: Iterable of file-name suffixes to keep (case-sensitive).
        exclude: Substring marking files and directories to skip. A directory
            whose name contains it is not descended into, and a file whose
            name contains it is ignored. A falsy value disables exclusion.

    Returns:
        List of matching paths, each built as ``os.path.join(root, file)``.
        Directories and files are visited in sorted name order, so the result
        does not depend on the filesystem's listing order.
    """
    suffixes=tuple(suffix_list)  # str.endswith accepts a tuple of candidates
    file_list=[]
    for root, dirs, files in os.walk(data_dir):
        # Prune in place so os.walk never enters excluded directories.
        # Checking only the file name (as before) let everything inside
        # '.ipynb_checkpoints/' through.
        dirs[:]=sorted(d for d in dirs if not (exclude and exclude in d))
        for file in sorted(files):
            if exclude and exclude in file:
                continue
            if file.endswith(suffixes):
                file_list.append(os.path.join(root, file))
    return file_list


## Generate dataframe for the Chile dataset

In [25]:
# Gather every Chile image and derive its metadata from the folder layout.
CHILE_DATA_PATH = os.path.join(DATA_MAIN_DIR, CHILE_DATA_DIR)
chile_file_list = get_file_list(CHILE_DATA_PATH)

df_chile = pd.DataFrame(chile_file_list, columns=['file_path'])
# Text that follows the Chile root folder in each full path.
chile_rel_paths = df_chile['file_path'].map(lambda full_path: full_path.split(CHILE_DATA_PATH)[1])
df_chile['relative_file_path'] = chile_rel_paths
# An image is a test image when its relative path contains 'Testing/'.
df_chile['is_test'] = chile_rel_paths.map(lambda rel_path: 'Testing/' in rel_path)
# The class label is the '/'-separated piece at index 2 of the relative path.
df_chile['class'] = chile_rel_paths.map(lambda rel_path: rel_path.split('/')[2])
print(df_chile['class'].value_counts(dropna=False))

# Binary target: 0 for 'Normal', 1 for every other class.
df_chile['binary_class'] = df_chile['class'].map(lambda label: int(label != 'Normal'))
print(df_chile['binary_class'].value_counts(dropna=False))
# Image counts per (is_test, class) pair.
print(df_chile.groupby(['is_test', 'class']).size())
# The full path is no longer needed; tag each row with its dataset of origin.
df_chile = df_chile.drop(columns=['file_path'])
df_chile['source'] = 'Chile'

Earwax plug             220
Normal                  220
Myringosclerosis        220
Chronic otitis media    220
Name: class, dtype: int64
1    660
0    220
Name: binary_class, dtype: int64
is_test  class               
False    Chronic otitis media    180
         Earwax plug             180
         Myringosclerosis        180
         Normal                  180
True     Chronic otitis media     40
         Earwax plug              40
         Myringosclerosis         40
         Normal                   40
dtype: int64


## Generate dataframe for the Ohio dataset

In [26]:
# Gather every Ohio image and derive its metadata from the folder layout.
OHIO_DATA_PATH = os.path.join(DATA_MAIN_DIR, OHIO_DATA_DIR)
ohio_file_list = get_file_list(OHIO_DATA_PATH)

df_ohio = pd.DataFrame(ohio_file_list, columns=['file_path'])
# Text that follows the Ohio root folder in each full path.
df_ohio['relative_file_path'] = df_ohio['file_path'].map(
    lambda full_path: full_path.split(OHIO_DATA_PATH)[1]
)
# No train/test flag is derived for Ohio, so the column is left missing.
df_ohio['is_test'] = None
# The class label is the '/'-separated piece at index 2 of the relative path.
df_ohio['class'] = df_ohio['relative_file_path'].map(lambda rel_path: rel_path.split('/')[2])
print(df_ohio['class'].value_counts(dropna=False))

# Binary target: 0 for 'Normal', 1 for every other class.
df_ohio['binary_class'] = df_ohio['class'].map(lambda label: int(label != 'Normal'))
df_ohio['source'] = 'Ohio'
# These counts are taken while the 'Tube' rows are still present.
print(df_ohio['binary_class'].value_counts(dropna=False))
df_ohio = df_ohio.drop(columns=['file_path'])

# Discard the 'Tube' class, renumber the rows, and show the counts again.
not_tube = df_ohio['class'] != 'Tube'
df_ohio = df_ohio.loc[not_tube].reset_index(drop=True)
print(df_ohio['binary_class'].value_counts(dropna=False))

Effusion    182
Normal      179
Tube         96
Name: class, dtype: int64
1    278
0    179
Name: binary_class, dtype: int64
1    182
0    179
Name: binary_class, dtype: int64


## Generate dataframe for the Turkey dataset

In [27]:
# Gather every Turkey image and derive its metadata from the folder layout.
TURKEY_DATA_PATH=os.path.join(DATA_MAIN_DIR,TURKEY_DATA_DIR)
turkey_file_list=get_file_list(TURKEY_DATA_PATH)
df_turkey=pd.DataFrame(turkey_file_list,columns=['file_path'])
# Text that follows the Turkey root folder in each full path.
df_turkey['relative_file_path']=df_turkey['file_path'].apply(lambda x: x.split(TURKEY_DATA_PATH)[1])
# The label sits at a different position for normal images: paths containing
# '/normal/' use the '/'-separated piece at index 1, all others use index 2.
df_turkey['class']=df_turkey['relative_file_path'].apply(lambda x: x.split('/')[1] if '/normal/' in x else x.split('/')[2])
print(df_turkey['class'].value_counts(dropna=False))
# An image is a test image when its relative path contains 'Test'.
df_turkey['is_test']=df_turkey['relative_file_path'].apply(lambda x: 'Test' in x)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so these counts were previously lost.
print(df_turkey['is_test'].value_counts(dropna=False))
# Binary target: 0 for 'normal', 1 for every other class.
df_turkey['binary_class']=df_turkey['class'].apply(lambda x: 0 if x=='normal' else 1)
# Drop the classes 'pseudoMembranes', 'foreignObjectEar', 'otitisexterna' and
# 'earVentilationTube'.
# 'tympanoskleros' (tympanosclerosis) is kept: it is similar to the
# Myringosclerosis class found in the Chile dataset.
excluded_classes=['pseudoMembranes','foreignObjectEar','otitisexterna','earVentilationTube']
df_turkey=df_turkey[~df_turkey['class'].isin(excluded_classes)].reset_index(drop=True)
df_turkey['source']='Turkey'
print(df_turkey.groupby(['class']).size())
# The full path is no longer needed once the metadata columns exist.
df_turkey=df_turkey.drop(columns=['file_path'])

normal                535
earwax                140
aom                   119
csom                   63
otitisexterna          41
tympanoskleros         28
earVentilationTube     16
pseudoMembranes        11
foreignObjectEar        3
Name: class, dtype: int64
class
aom               119
csom               63
earwax            140
normal            535
tympanoskleros     28
dtype: int64


## Combine all three dataframes and save as metadata.csv

In [29]:
# Stack the three per-dataset frames into one table with a fresh 0..n-1 index.
df = pd.concat([df_chile, df_ohio, df_turkey], axis=0, ignore_index=True)
# Writing the combined table to disk is currently disabled:
#df.to_csv(os.path.join('../metadata/metadata.csv'),index=False)

In [30]:
# Number of images per (source, class) pair in the combined frame.
df.groupby(['source','class']).size()

source  class               
Chile   Chronic otitis media    220
        Earwax plug             220
        Myringosclerosis        220
        Normal                  220
Ohio    Effusion                182
        Normal                  179
Turkey  aom                     119
        csom                     63
        earwax                  140
        normal                  535
        tympanoskleros           28
dtype: int64

### Check pre-specified testing sets for the Chile and Turkey datasets

In [11]:
# Number of images per (source, is_test, class) triple.
# Ohio does not appear in the result: its is_test values are None, and
# groupby leaves out rows whose group key is missing.
df.groupby(['source','is_test','class']).size()

source  is_test  class               
Chile   False    Chronic otitis media    180
                 Earwax plug             180
                 Myringosclerosis        180
                 Normal                  180
        True     Chronic otitis media     40
                 Earwax plug              40
                 Myringosclerosis         40
                 Normal                   40
Turkey  False    aom                      95
                 csom                     50
                 earwax                  112
                 normal                  428
                 tympanoskleros           28
        True     aom                      24
                 csom                     13
                 earwax                   28
                 normal                  107
dtype: int64