In [1]:
import os
import pandas as pd
import json
import re

In [2]:
%run ../utils/__init__.py
%run ../metrics/__init__.py

In [3]:
classification = False

In [5]:
BASE_FOLDER = os.path.join(WORKSPACE_DIR,
                           'classification' if classification else 'report_generation')
RESULTS_FOLDER = os.path.join(BASE_FOLDER, 'results')

In [6]:
def get_suffix(filename):
    match = re.search('.*metrics-(?P<suffix>\w*)\.json', filename)
    if match is None:
        suffix = ''
    else:
        suffix = match.group('suffix')
    return suffix

In [7]:
KEY_COLS = ['run_name', 'dataset_type', 'free']

In [23]:
def load_results():
    results_by_metric_type = {}

    for run_name in os.listdir(RESULTS_FOLDER):
        if run_name == 'debug':
            continue

        folder = os.path.join(RESULTS_FOLDER, run_name)
        for filename in os.listdir(folder):
            filepath = os.path.join(folder, filename)
            if not os.path.isfile(filepath) or not filename.endswith('json'):
                continue

            metric_type = 'chexpert' if filename.startswith('chexpert') else 'base'
            suffix = get_suffix(filename)
                
            with open(filepath, 'r') as f:
                results_dict = json.load(f)
            
            results_df = pd.DataFrame.from_dict(results_dict, orient='index')
            results_df.reset_index(inplace=True)
            results_df.rename(columns={'index': 'dataset_type'}, inplace=True)
            results_df['run_name'] = run_name
            results_df['free'] = suffix            
            
            if metric_type not in results_by_metric_type:
                results_by_metric_type[metric_type] = results_df
            else:
                prev = results_by_metric_type[metric_type]
                results_by_metric_type[metric_type] = prev.append(results_df, ignore_index=True)

    df = None
    cols_in_order = list(KEY_COLS)
    for results in results_by_metric_type.values():
        cols_in_order += [col for col in results.columns if col not in KEY_COLS]
        
        if df is None:
            df = results
        else:
            df = df.merge(results, on=KEY_COLS)
                
    return df[cols_in_order]

In [35]:
def filter_results(dataset_type=None, metrics=None, free=None, contains=None, drop=None):
    df = RESULTS_DF
    
    if dataset_type:
        if isinstance(dataset_type, str):
            df = df[df['dataset_type'] == dataset_type]
        elif isinstance(dataset_type, (list, tuple)):
            dataset_type = set(dataset_type)
            df = df[df['dataset_type'].isin(dataset_type)]
    
    if free is not None:
        free_str = 'free' if free else 'notfree'
        df = df.loc[df['free'] == free_str]
    
    if contains:
        filter_contains = lambda d, s: d.loc[d['run_name'].str.contains(s)]
        if isinstance(contains, (list, tuple)):
            for c in contains:
                df = filter_contains(df, c)
        elif isinstance(contains, str):
            df = filter_contains(df, contains)
    
    if drop:
        df = df.loc[~df['run_name'].str.contains(drop)]
        
    if metrics:
        columns = KEY_COLS + metrics
        df = df[columns]
    
    return df

In [25]:
RESULTS_DF = load_results()
RESULTS_DF.head()

Unnamed: 0,run_name,dataset_type,free,acc,acc-No Finding,acc-Enlarged Cardiomediastinum,acc-Cardiomegaly,acc-Lung Lesion,acc-Lung Opacity,acc-Edema,...,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD,distinct_words,distinct_sentences,word_acc
0,0915_174026_dummy-common-sentences-100,train,free,0.908746,0.430187,0.933986,0.790478,0.958973,0.827283,0.993247,...,0.340114,0.207458,0.137165,0.095464,0.19505,0.27658,0.152633,109.0,296.0,
1,0915_174026_dummy-common-sentences-100,test,free,0.911284,0.389628,0.926862,0.81383,0.956117,0.845745,0.994681,...,0.346689,0.21085,0.13844,0.095769,0.197937,0.280916,0.165551,109.0,222.0,
2,0915_174026_dummy-common-sentences-100,val,free,0.90993,0.442077,0.933422,0.785619,0.968043,0.861518,0.986684,...,0.330119,0.201151,0.133937,0.093666,0.189718,0.272445,0.160992,109.0,277.0,
3,0915_174026_dummy-common-sentences-100,train,notfree,0.908577,0.459396,0.925544,0.7714,0.958467,0.826439,0.992909,...,0.391287,0.241733,0.159835,0.110573,0.225857,0.297524,0.226741,109.0,1028.0,0.038364
4,0915_174026_dummy-common-sentences-100,val,notfree,0.91012,0.464714,0.916112,0.786951,0.968043,0.861518,0.986684,...,0.39388,0.241507,0.160103,0.111258,0.226687,0.295175,0.265806,109.0,339.0,0.037777


### Report generation

In [26]:
NLP_METRICS = ['bleu1', 'bleu2', 'bleu3', 'bleu4', 'bleu', 'rougeL', 'ciderD']
CHEXPERT_METRICS = ['acc', 'roc_auc',] #  'recall', 'prec', 'f1', 'roc_auc']

In [21]:
filter_results(dataset_type=['test-normal', 'test', 'test-abnormal'],
               metrics=NLP_METRICS + CHEXPERT_METRICS,
               free=True)

Unnamed: 0,run_name,dataset_type,free,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD,acc,roc_auc
1,0915_174026_dummy-common-sentences-100,test,free,0.346689,0.21085,0.13844,0.09576881,0.197937,0.280916,0.165551,0.911284,0.497568
7,0915_172915_dummy-common-words-10,test,free,0.312055,0.107749,0.024166,8.550974e-07,0.110993,0.243604,0.042768,0.915274,0.5
13,0915_173524_dummy-common-sentences-10,test,free,0.275183,0.170479,0.112833,0.07838136,0.159219,0.294405,0.15161,0.915274,0.5
20,0915_172709_dummy-constant,test,free,0.454623,0.311525,0.223393,0.165365,0.288726,0.356681,0.292646,0.915274,0.5
26,0918_144929_lstm-att-v2_lr0.0001_densenet-121_...,test,free,0.36085,0.226277,0.152071,0.1064681,0.211416,0.313536,0.186868,,0.507832
32,0919_144203_h-lstm-att-v2_lr0.0001_resnet-50_s...,test,free,0.416384,0.256044,0.17148,0.1224339,0.241586,0.354631,0.190768,,0.5
37,0918_125940_lstm-v2_lr0.0001_densenet-121_size256,test,free,0.379042,0.239385,0.163692,0.1170487,0.224792,0.337529,0.284303,,0.505421
44,0918_190428_h-lstm-att-v2_lr0.0001_densenet-12...,test,free,0.327633,0.204035,0.141362,0.1029318,0.19399,0.32148,0.114352,,0.5
50,0919_054554_lstm-v2_lr0.0001_mobilenet_size256,test,free,0.252249,0.160502,0.108879,0.07718667,0.149704,0.31654,0.227866,,0.504324
55,0915_173609_dummy-random,test,free,0.36248,0.196943,0.117449,0.07459662,0.187867,0.263606,0.111804,0.893522,0.507537


In [79]:
metrics = [
    col
    for col in RESULTS_DF.columns
    # if col.startswith('recall')
    # if col == 'recall-No Finding'
    if 'No Finding' in col
]
cols = KEY_COLS + metrics
RESULTS_DF[cols].sort_index()

Unnamed: 0,run_name,dataset_type,free,prec-No Finding,recall-No Finding,f1-No Finding,roc_auc-No Finding
0,0915_174026_dummy-common-sentences-100,val,free,0.408293,0.842105,0.549946,0.506064
1,0915_174026_dummy-common-sentences-100,train,free,0.400556,0.850400,0.544596,0.499848
2,0915_174026_dummy-common-sentences-100,test,free,0.356375,0.852941,0.502709,0.490012
3,0915_174026_dummy-common-sentences-100,train,notfree,0.412312,0.821323,0.549014,0.519394
4,0915_174026_dummy-common-sentences-100,val,notfree,0.418333,0.825658,0.555310,0.522449
...,...,...,...,...,...,...,...
163,0919_032831_h-lstm-att-v2_lr0.0001_precnn_size256,train,free,0.400642,1.000000,0.572083,0.500000
164,0919_032831_h-lstm-att-v2_lr0.0001_precnn_size256,val,free,0.404794,1.000000,0.576303,0.500000
165,0919_032831_h-lstm-att-v2_lr0.0001_precnn_size256,val,notfree,0.404794,1.000000,0.576303,0.500000
166,0919_032831_h-lstm-att-v2_lr0.0001_precnn_size256,train,notfree,0.400540,0.999579,0.571911,0.499789


In [43]:
replace_strs = [
    (r'^\d{4}_\d{6}_', ''),
    ('most-similar-image', '1nn'),
    ('_lr[\d\.]+', ''),
    ('_size256', ''),
    (r'_\d{4}_\d{6}_.*', ''),
    ('dummy-', ''),
    ('common', 'top'),
    ('-v2', ''),
    (r'top-(\w)\w+-(\d+)', r'top-\1-\2'),
    ('_densenet-121', ''),
]

def rename_runs(run_name):
    s = run_name
    for target, replace_with in replace_strs:
        s = re.sub(target, replace_with, s)
    return s

In [44]:
df = filter_results(dataset_type='test',
                    free=True,
                    contains='(?=_lstm-att-v2.*densenet|_lstm-v2.*densenet|dummy)',
                    drop='0915_173951|0915_174222|0916_104739',
                   )

In [46]:
columns = NLP_METRICS + CHEXPERT_METRICS
print(df.set_index('run_name').rename(index=rename_runs).sort_index().to_latex(
    columns=columns,
    float_format='%.3f',
    column_format='l' + 'c' * len(columns),
))

\begin{tabular}{lccccccccc}
\toprule
{} &  bleu1 &  bleu2 &  bleu3 &  bleu4 &  bleu &  rougeL &  ciderD &   acc &  roc\_auc \\
run\_name  &        &        &        &        &       &         &         &       &          \\
\midrule
1nn       &  0.383 &  0.220 &  0.142 &  0.100 & 0.211 &   0.288 &   0.230 & 0.903 &    0.518 \\
constant  &  0.455 &  0.312 &  0.223 &  0.165 & 0.289 &   0.357 &   0.293 & 0.915 &    0.500 \\
lstm      &  0.379 &  0.239 &  0.164 &  0.117 & 0.225 &   0.338 &   0.284 & 0.912 &    0.505 \\
lstm-att  &  0.361 &  0.226 &  0.152 &  0.106 & 0.211 &   0.314 &   0.187 & 0.918 &    0.508 \\
random    &  0.362 &  0.197 &  0.117 &  0.075 & 0.188 &   0.264 &   0.112 & 0.894 &    0.508 \\
top-s-10  &  0.275 &  0.170 &  0.113 &  0.078 & 0.159 &   0.294 &   0.152 & 0.915 &    0.500 \\
top-s-100 &  0.347 &  0.211 &  0.138 &  0.096 & 0.198 &   0.281 &   0.166 & 0.911 &    0.498 \\
top-s-50  &  0.333 &  0.206 &  0.137 &  0.097 & 0.193 &   0.283 &   0.188 & 0.911 &    0.500 \\

### Classification

In [64]:
# contains = 'covid-x'
# contains = 'cxr14'
contains = 'e0'
# contains = '0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid'
# contains = '0717_101812_covid-x_densenet-121_lr1e-06_os-max2_aug-covid'
contains = '0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid' # WINNER

contains = '0717_101812_covid-x_densenet-121_lr1e-06_os-max2_aug-covid'
contains = 'covid-uc'

In [71]:
results_df_train = results_df_train.loc[results_df_train.index.str.contains(contains)]
results_df_val = results_df_val.loc[results_df_val.index.str.contains(contains)]
results_df_test = results_df_test.loc[results_df_test.index.str.contains(contains)]

In [62]:
results_df_all

Unnamed: 0,loss,acc,prec_covid,prec_Non-COVID,prec_normal,recall_covid,recall_Non-COVID,recall_normal,spec_covid,spec_Non-COVID,spec_normal,cm
0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid---covid-uc_size256_frontal,3.348938,0.393496,0.050633,0.2,0.941964,0.666667,0.202703,0.408124,0.492386,0.889094,0.867347,"[[15, 8, 1], [44, 19, 11], [229, 83, 205]]"
0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid---covid-uc_size512_frontal,4.755333,0.269919,0.046083,0.282051,0.950704,0.833333,0.148649,0.261122,0.299492,0.948244,0.928571,"[[20, 4, 0], [55, 12, 7], [342, 35, 140]]"
0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid---covid-uc_size512,2.975181,0.234428,0.038241,0.17734,0.961783,0.714286,0.367347,0.199472,0.411696,0.787261,0.952381,"[[18, 10, 0], [49, 42, 7], [406, 185, 166]]"
0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid---covid-uc_size256,2.332392,0.322763,0.03304,0.164773,0.952569,0.535714,0.295918,0.318362,0.48655,0.812739,0.904762,"[[11, 17, 0], [52, 35, 11], [332, 189, 236]]"


In [111]:
results_df_train.sort_values(['acc', 'recall_covid', 'prec_covid'],
                           ascending=False).dropna(axis=1, how='all')

KeyError: 'recall_covid'

In [72]:
results_df_val.sort_values(['acc', 'recall_covid', 'prec_covid'],
                           ascending=False).dropna(axis=1, how='all')

Unnamed: 0,loss,acc,prec_covid,prec_normal,recall_covid,recall_normal,spec_covid,spec_normal,cm,prec_Non-COVID,recall_Non-COVID,spec_Non-COVID
0722_193534_covid-uc_densenet-121_lr0.0001_os-max10_aug_size256--,0.421872,0.897727,0.0,0.916667,0.0,0.974684,0.987952,0.222222,"[[0, 0, 5], [0, 2, 2], [1, 1, 77]]",0.666667,0.5,0.988095
0722_201835_covid-uc_densenet-121_lr0.0001_os-max10_aug_pre-covid-x--,1.011396,0.772727,0.181818,0.969697,0.8,0.810127,0.783133,0.777778,"[[4, 0, 1], [2, 0, 2], [13, 3, 63]]",0.0,0.0,1.0


In [67]:
results_df_test

Unnamed: 0,loss,acc,prec_covid,prec_pneumonia,prec_normal,recall_covid,recall_pneumonia,recall_normal,spec_covid,spec_pneumonia,...,cm_Infiltration,cm_Mass,cm_Nodule,cm_Pneumothorax,cm_Consolidation,cm_Edema,cm_Emphysema,cm_Fibrosis,cm_Pleural_Thickening,cm_Hernia


In [66]:
results_df_test.sort_values(['acc', 'recall_covid', 'prec_covid'],
                            ascending=False).dropna(axis=1, how='all')

In [36]:
print_columns = [
    'acc',
    'prec_covid','prec_pneumonia','prec_normal',
    'recall_covid','recall_pneumonia', 'recall_normal',
    'spec_covid','spec_pneumonia', 'spec_normal'
#     'acc_Pneumonia', 'prec_Pneumonia', 'recall_Pneumonia'
]
# print_columns = [c for c in results_df_test.columns if c != 'loss' and 'cm' not in c]

In [37]:
print((results_df_test*100).to_latex(columns=print_columns,
                               float_format='%.2f',
                              ))

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &   acc &  prec\_covid &  prec\_pneumonia &  prec\_normal &  recall\_covid &  recall\_pneumonia &  recall\_normal &  spec\_covid &  spec\_pneumonia &  spec\_normal \\
\midrule
0717\_101812\_covid-x\_densenet-121\_lr1e-06\_os-max... & 93.73 &       93.94 &           91.12 &        95.53 &         93.00 &             93.27 &          94.12 &       99.59 &           94.52 &        94.38 \\
\bottomrule
\end{tabular}



### Report-generation: results at different report lengths

In [32]:
vals_words = [20, 25, 27, 33, 44, None]
vals_sents = [3, 4, 5, 6, None]

In [40]:
max_words = vals_words[0]
suffix = f'max-words-{max_words}' if max_words else ''
all_results = load_results(suffix)
results_df_test = create_results_df(all_results, 'test')
results_df_test

Unnamed: 0,loss,word_acc,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD
0717_041434_lstm_lr0.0001_densenet-121,5.579509,0.149129,0.478934,0.325275,0.256634,0.2183,0.319786,0.395486,1.354237
0716_211601_lstm-att_lr0.0001_densenet-121,7.382415,0.068293,0.442359,0.28249,0.199632,0.150046,0.268632,0.358223,0.775244
0717_015057_h-lstm_lr0.0001_densenet-121,4.548621,0.162518,0.328184,0.19551,0.12641,0.082401,0.183126,0.334782,0.256378
0716_234501_h-lstm-att_lr0.0001_densenet-121,5.157984,0.132504,0.298223,0.17906,0.115951,0.075574,0.167202,0.315891,0.101786


### DEBUG

In [48]:
%run -n ../eval_report_generation_chexpert_labeler.py

ModuleNotFoundError: No module named 'medai.dataset'

In [29]:
import pandas as pd
import os

In [43]:
fpath = '/mnt/workspace/iu-x-ray/dataset/reports/reports_with_chexpert_labels.csv'
df = pd.read_csv(fpath, index_col=0)
df.replace((-1, -2), 0, inplace=True)
df.head()

Unnamed: 0,Reports,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Lesion,Lung Opacity,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,filename
0,the cardiac silhouette and mediastinum size ar...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.xml
1,the cardiomediastinal silhouette is within nor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.xml
2,both lungs are clear and expanded . heart and ...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.xml
3,there is xxxx increased opacity within the rig...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.xml
4,interstitial markings are diffusely prominent ...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1001.xml


In [41]:
rename_cols = { 'Reports': 'ground_truth' }
for label in CHEXPERT_LABELS:
    rename_cols[label] = f'{label}-gt'
rename_cols

{'Reports': 'ground_truth',
 'No Finding': 'No Finding-gt',
 'Enlarged Cardiomediastinum': 'Enlarged Cardiomediastinum-gt',
 'Cardiomegaly': 'Cardiomegaly-gt',
 'Lung Lesion': 'Lung Lesion-gt',
 'Lung Opacity': 'Lung Opacity-gt',
 'Edema': 'Edema-gt',
 'Consolidation': 'Consolidation-gt',
 'Pneumonia': 'Pneumonia-gt',
 'Atelectasis': 'Atelectasis-gt',
 'Pneumothorax': 'Pneumothorax-gt',
 'Pleural Effusion': 'Pleural Effusion-gt',
 'Pleural Other': 'Pleural Other-gt',
 'Fracture': 'Fracture-gt',
 'Support Devices': 'Support Devices-gt'}

In [42]:
df.rename(columns=rename_cols, inplace=True)
df.head()

Unnamed: 0,ground_truth,No Finding-gt,Enlarged Cardiomediastinum-gt,Cardiomegaly-gt,Lung Lesion-gt,Lung Opacity-gt,Edema-gt,Consolidation-gt,Pneumonia-gt,Atelectasis-gt,Pneumothorax-gt,Pleural Effusion-gt,Pleural Other-gt,Fracture-gt,Support Devices-gt,filename
0,the cardiac silhouette and mediastinum size ar...,1.0,0.0,0.0,-2.0,-2.0,0.0,0.0,-2.0,-2.0,0.0,0.0,-2.0,-2.0,-2.0,1.xml
1,the cardiomediastinal silhouette is within nor...,-2.0,0.0,-2.0,-2.0,0.0,-2.0,-2.0,-2.0,-2.0,0.0,0.0,-2.0,-2.0,-2.0,10.xml
2,both lungs are clear and expanded . heart and ...,1.0,0.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,100.xml
3,there is xxxx increased opacity within the rig...,-2.0,-2.0,0.0,-1.0,1.0,-2.0,-1.0,-2.0,-1.0,0.0,0.0,-2.0,-2.0,-2.0,1000.xml
4,interstitial markings are diffusely prominent ...,-2.0,-2.0,0.0,-2.0,1.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,-2.0,1001.xml


In [46]:
run_name = '0919_011224_h-lstm-v2_lr0.0001_precnn_size256'
debug = False
free = False

In [47]:
results_folder = get_results_folder(run_name, classification=False, debug=debug)

# Output file at the end of this process
suffix = 'free' if free else 'notfree'
labeled_output_path = os.path.join(results_folder, f'outputs-labeled-{suffix}.csv')

model_output_path = os.path.join(results_folder, f'outputs-{suffix}.csv')

if not os.path.isfile(model_output_path):
    print('Need to compute outputs for run first: ', model_output_path)

# Read outputs
df2 = pd.read_csv(model_output_path)
df2.head()

Unnamed: 0,filename,epoch,dataset_type,ground_truth,generated
0,2089.xml,1,train,no active disease .,the heart is normal in size . the lungs are cl...
1,2089.xml,1,train,no active disease .,the heart is normal in size . the lungs are cl...
2,2435.xml,1,train,no active disease .,the heart is normal in size . the lungs are cl...
3,2435.xml,1,train,no active disease .,the heart is normal in size . the lungs are cl...
4,2460.xml,1,train,no active disease .,the heart is normal in size . the lungs are cl...


In [58]:
%run -n ../eval_report_generation_chexpert_labeler.py

In [56]:
labels = _load_gt_labels(df2)
labels.shape

(7426, 14)

In [57]:
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [26]:
_save_cached_gt_labels(df)

KeyError: "None of [Index(['ground_truth', 'No Finding-gt', 'Enlarged Cardiomediastinum-gt',\n       'Cardiomegaly-gt', 'Lung Lesion-gt', 'Lung Opacity-gt', 'Edema-gt',\n       'Consolidation-gt', 'Pneumonia-gt', 'Atelectasis-gt', 'Pneumothorax-gt',\n       'Pleural Effusion-gt', 'Pleural Other-gt', 'Fracture-gt',\n       'Support Devices-gt'],\n      dtype='object')] are in the [columns]"