## Imports

In [1]:
import os
import pandas as pd
import json
import re

In [2]:
%run ../utils/__init__.py
%run ../utils/files.py
%run ../metrics/__init__.py

In [3]:
pd.options.display.max_columns = None

## Functions

In [4]:
# TASK = 'seg'
TASK = 'rg'

In [5]:
TASK_FOLDER = _get_task_folder(TASK)
BASE_FOLDER = os.path.join(WORKSPACE_DIR, TASK_FOLDER)
RESULTS_FOLDER = os.path.join(BASE_FOLDER, 'results')

In [6]:
def get_suffix(filename):
    match = re.search('.*metrics-(?P<suffix>\w*)\.json', filename)
    if match is None:
        suffix = ''
    else:
        suffix = match.group('suffix')
    return suffix

In [7]:
KEY_COLS = ['run_name', 'dataset_type']
if TASK == 'rg':
    KEY_COLS.append('free')
KEY_COLS

['run_name', 'dataset_type', 'free']

In [8]:
METRIC_TYPES = [
    'chexpert',
    'grad-cam',
    'mirqi',
]

In [9]:
def load_results():
    results_by_metric_type = {}

    for run_name in os.listdir(RESULTS_FOLDER):
        if run_name == 'debug':
            continue

        folder = os.path.join(RESULTS_FOLDER, run_name)
        for filename in os.listdir(folder):
            filepath = os.path.join(folder, filename)
            if not os.path.isfile(filepath) or not filename.endswith('json'):
                continue

            metric_type = next(
                (met for met in METRIC_TYPES if met in filename),
                'base', # Default if no specific metric_type is found
            )

            with open(filepath, 'r') as f:
                results_dict = json.load(f)
            
            results_df = pd.DataFrame.from_dict(results_dict, orient='index')
            results_df.reset_index(inplace=True)
            results_df.rename(columns={'index': 'dataset_type'}, inplace=True)
            results_df['run_name'] = run_name
            if TASK == 'rg':
                results_df['free'] = get_suffix(filename)           
            
            if metric_type not in results_by_metric_type:
                results_by_metric_type[metric_type] = results_df
            else:
                prev = results_by_metric_type[metric_type]
                results_by_metric_type[metric_type] = prev.append(results_df, ignore_index=True)

    df = None
    cols_in_order = list(KEY_COLS)
    for results in results_by_metric_type.values():
        cols_in_order += [col for col in results.columns if col not in cols_in_order]
        
        if df is None:
            df = results
        else:
            df = df.merge(results, on=KEY_COLS, how='outer')
                
    return df[cols_in_order], results_by_metric_type

In [10]:
def filter_results(dataset_type=None, metrics=None,
                   metrics_contain=None, free=None, contains=None,
                   drop=None, drop_na_rows=False, drop_key_cols=False):
    df = RESULTS_DF
    
    if dataset_type:
        if isinstance(dataset_type, str):
            df = df[df['dataset_type'] == dataset_type]
        elif isinstance(dataset_type, (list, tuple)):
            dataset_type = set(dataset_type)
            df = df[df['dataset_type'].isin(dataset_type)]
    
    if free is not None:
        free_str = 'free' if free else 'notfree'
        df = df.loc[df['free'] == free_str]
    
    if contains:
        filter_contains = lambda d, s: d.loc[d['run_name'].str.contains(s)]
        if isinstance(contains, (list, tuple)):
            for c in contains:
                df = filter_contains(df, c)
        elif isinstance(contains, str):
            df = filter_contains(df, contains)
    
    if drop:
        df = df.loc[~df['run_name'].str.contains(drop)]
        
    if metrics_contain:
        columns = KEY_COLS + [c for c in df.columns if metrics_contain in c]
        df = df[columns]
    elif metrics:
        columns = KEY_COLS + metrics
        df = df[columns]
    
    if drop_na_rows:
        df.dropna(axis=0, how='any', inplace=True)

    # Drop cols with all na
    df.dropna(axis=1, how='all', inplace=True)

    if drop_key_cols:
        columns = [c for c in df.columns if c == 'run_name' or c not in KEY_COLS]
        df = df[columns]
    
    return df

## Load results

In [11]:
RESULTS_DF, debug = load_results()
print(len(RESULTS_DF))
RESULTS_DF.head()

84


Unnamed: 0,run_name,dataset_type,free,acc,acc-No Finding,acc-Enlarged Cardiomediastinum,acc-Cardiomegaly,acc-Lung Lesion,acc-Lung Opacity,acc-Edema,acc-Consolidation,acc-Pneumonia,acc-Atelectasis,acc-Pneumothorax,acc-Pleural Effusion,acc-Pleural Other,acc-Fracture,acc-Support Devices,prec,prec-No Finding,prec-Enlarged Cardiomediastinum,prec-Cardiomegaly,prec-Lung Lesion,prec-Lung Opacity,prec-Edema,prec-Consolidation,prec-Pneumonia,prec-Atelectasis,prec-Pneumothorax,prec-Pleural Effusion,prec-Pleural Other,prec-Fracture,prec-Support Devices,recall,recall-No Finding,recall-Enlarged Cardiomediastinum,recall-Cardiomegaly,recall-Lung Lesion,recall-Lung Opacity,recall-Edema,recall-Consolidation,recall-Pneumonia,recall-Atelectasis,recall-Pneumothorax,recall-Pleural Effusion,recall-Pleural Other,recall-Fracture,recall-Support Devices,f1,f1-No Finding,f1-Enlarged Cardiomediastinum,f1-Cardiomegaly,f1-Lung Lesion,f1-Lung Opacity,f1-Edema,f1-Consolidation,f1-Pneumonia,f1-Atelectasis,f1-Pneumothorax,f1-Pleural Effusion,f1-Pleural Other,f1-Fracture,f1-Support Devices,roc_auc,roc_auc-No Finding,roc_auc-Enlarged Cardiomediastinum,roc_auc-Cardiomegaly,roc_auc-Lung Lesion,roc_auc-Lung Opacity,roc_auc-Edema,roc_auc-Consolidation,roc_auc-Pneumonia,roc_auc-Atelectasis,roc_auc-Pneumothorax,roc_auc-Pleural Effusion,roc_auc-Pleural Other,roc_auc-Fracture,roc_auc-Support Devices,loss,word_loss,stop_loss,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD,distinct_words,distinct_sentences,chex_timer,chex_acc,chex_acc_No Finding,chex_acc_Enlarged Cardiomediastinum,chex_acc_Cardiomegaly,chex_acc_Lung Lesion,chex_acc_Lung Opacity,chex_acc_Edema,chex_acc_Consolidation,chex_acc_Pneumonia,chex_acc_Atelectasis,chex_acc_Pneumothorax,chex_acc_Pleural Effusion,chex_acc_Pleural Other,chex_acc_Fracture,chex_acc_Support Devices,chex_prec,chex_prec_No Finding,chex_prec_Enlarged Cardiomediastinum,chex_prec_Cardiomegaly,chex_prec_Lung Lesion,chex_prec_Lung Opacity,chex_prec_Edema,chex_prec_Consolidation,chex_prec_Pneumonia,chex_prec_Atelectasis,chex_prec_Pneumothorax,chex_prec_Pleural Effusion,chex_prec_Pleural Other,chex_prec_Fracture,chex_prec_Support Devices,chex_recall,chex_recall_No Finding,chex_recall_Enlarged Cardiomediastinum,chex_recall_Cardiomegaly,chex_recall_Lung Lesion,chex_recall_Lung Opacity,chex_recall_Edema,chex_recall_Consolidation,chex_recall_Pneumonia,chex_recall_Atelectasis,chex_recall_Pneumothorax,chex_recall_Pleural Effusion,chex_recall_Pleural Other,chex_recall_Fracture,chex_recall_Support Devices,chex_spec,chex_spec_No Finding,chex_spec_Enlarged Cardiomediastinum,chex_spec_Cardiomegaly,chex_spec_Lung Lesion,chex_spec_Lung Opacity,chex_spec_Edema,chex_spec_Consolidation,chex_spec_Pneumonia,chex_spec_Atelectasis,chex_spec_Pneumothorax,chex_spec_Pleural Effusion,chex_spec_Pleural Other,chex_spec_Fracture,chex_spec_Support Devices,chex_npv,chex_npv_No Finding,chex_npv_Enlarged Cardiomediastinum,chex_npv_Cardiomegaly,chex_npv_Lung Lesion,chex_npv_Lung Opacity,chex_npv_Edema,chex_npv_Consolidation,chex_npv_Pneumonia,chex_npv_Atelectasis,chex_npv_Pneumothorax,chex_npv_Pleural Effusion,chex_npv_Pleural Other,chex_npv_Fracture,chex_npv_Support Devices,chex_f1,chex_f1_No Finding,chex_f1_Enlarged Cardiomediastinum,chex_f1_Cardiomegaly,chex_f1_Lung Lesion,chex_f1_Lung Opacity,chex_f1_Edema,chex_f1_Consolidation,chex_f1_Pneumonia,chex_f1_Atelectasis,chex_f1_Pneumothorax,chex_f1_Pleural Effusion,chex_f1_Pleural Other,chex_f1_Fracture,chex_f1_Support Devices,word_acc,att_loss,att_iou,att_iobb
0,0122_135739_dummy-common-sentences-100_front,test,free,0.906808,0.398438,0.867188,0.820312,0.953125,0.838542,0.994792,0.994792,0.989583,0.963542,0.986979,0.979167,0.984375,0.955729,0.96875,0.038305,0.345794,0.057143,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073287,0.840909,0.1,0.085106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047621,0.490066,0.072727,0.103896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500888,0.503788,0.50467,0.503978,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.0,-1.0,-1.0,0.349819,0.209409,0.135837,0.09359,0.197163,0.283324,0.157216,105,100,0.0/0.0,0.939174,0.994792,0.848958,0.8125,0.947917,0.825521,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.094866,0.994792,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085099,1.0,0.095238,0.096154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.919016,0.0,0.94152,0.924699,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875276,0.0,0.894444,0.867232,0.947917,0.825521,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.088611,0.997389,0.121212,0.121951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1,0122_135739_dummy-common-sentences-100_front,train,free,0.905571,0.432531,0.892775,0.780271,0.95843,0.82811,0.993402,0.984823,0.991092,0.950511,0.989772,0.968987,0.990762,0.970307,0.946222,0.043518,0.400627,0.057416,0.151203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073092,0.842539,0.085714,0.095032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052037,0.543039,0.068768,0.116711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50064,0.500753,0.508786,0.499424,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.0,-1.0,-1.0,0.336457,0.205273,0.135745,0.094295,0.192943,0.276334,0.152092,105,100,0.0/0.0,0.937621,0.996041,0.857803,0.772682,0.954141,0.815242,0.985153,0.982844,0.986143,0.929726,0.986803,0.95843,0.987133,0.968327,0.946222,0.088759,0.996041,0.081633,0.164948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081152,1.0,0.038961,0.097166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.918189,0.0,0.950422,0.904218,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.873911,0.0,0.897365,0.837226,0.954141,0.815242,0.985153,0.982844,0.986143,0.929726,0.986803,0.95843,0.987133,0.968327,0.946222,0.08379,0.998017,0.052747,0.122293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,0122_135739_dummy-common-sentences-100_front,val,free,0.90538,0.420779,0.888312,0.787013,0.968831,0.862338,0.98961,0.98961,0.987013,0.924675,0.987013,0.971429,0.994805,0.958442,0.945455,0.036717,0.394984,0.0,0.119048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064835,0.807692,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045659,0.530526,0.0,0.108696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49594,0.482449,0.46594,0.494776,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.0,-1.0,-1.0,0.326945,0.196131,0.126751,0.08611,0.183984,0.270924,0.158774,105,100,0.0/0.0,0.93859,0.994805,0.862338,0.768831,0.963636,0.844156,0.981818,0.98961,0.981818,0.906494,0.979221,0.968831,0.994805,0.955844,0.948052,0.081262,0.994805,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078692,1.0,0.0,0.101695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.917204,0.0,0.951289,0.889571,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875853,0.0,0.902174,0.845481,0.963636,0.844156,0.981818,0.98961,0.981818,0.906494,0.979221,0.968831,0.994805,0.955844,0.948052,0.079729,0.997396,0.0,0.118812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
3,0122_135739_dummy-common-sentences-100_front,val,notfree,0.909276,0.506494,0.898701,0.766234,0.966234,0.862338,0.984416,0.98961,0.987013,0.924675,0.979221,0.966234,0.994805,0.958442,0.945455,0.045728,0.444805,0.08,0.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079237,0.878205,0.111111,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057228,0.590517,0.093023,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.505051,0.56574,0.52422,0.491343,0.49866,0.5,0.497375,0.5,0.5,0.5,0.496053,0.497326,0.5,0.5,0.5,43.466139,6.963101,36.503038,0.393895,0.241075,0.157749,0.108631,0.225337,0.294778,0.236226,105,133,1.2/1.2,0.937291,0.98961,0.872727,0.763636,0.961039,0.844156,0.976623,0.98961,0.981818,0.906494,0.974026,0.963636,0.994805,0.955844,0.948052,0.093195,0.994778,0.117647,0.192308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08713,0.994778,0.055556,0.169492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.914967,0.0,0.95702,0.871166,0.997305,1.0,0.994709,1.0,1.0,1.0,0.994695,0.994638,1.0,1.0,1.0,0.876735,0.0,0.907609,0.852853,0.963542,0.844156,0.981723,0.98961,0.981818,0.906494,0.979112,0.968668,0.994805,0.955844,0.948052,0.089316,0.994778,0.075472,0.18018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038147,,,
4,0122_135739_dummy-common-sentences-100_front,test,notfree,0.902344,0.377604,0.864583,0.796875,0.950521,0.833333,0.994792,0.994792,0.989583,0.963542,0.981771,0.976562,0.984375,0.955729,0.96875,0.033314,0.326861,0.0,0.139535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063772,0.765152,0.0,0.12766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042242,0.45805,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.494744,0.469877,0.456044,0.508934,0.498634,0.496894,0.5,0.5,0.5,0.5,0.497361,0.49867,0.5,0.5,0.5,28.614465,6.94264,21.671825,0.398164,0.243229,0.159492,0.108269,0.227289,0.303682,0.232407,105,129,1.0/1.0,0.936198,0.994792,0.84375,0.783854,0.945312,0.820312,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.081023,0.994792,0.0,0.139535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07967,1.0,0.0,0.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.916205,0.0,0.947368,0.888554,0.997253,0.993691,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.874392,0.0,0.885246,0.865103,0.947781,0.824607,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.080265,0.997389,0.0,0.126316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036335,,,


In [32]:
set(
    col.replace('-', '_').split('_')[0]
    for col in RESULTS_DF.columns
)

{'MIRQI',
 'acc',
 'att',
 'bleu',
 'bleu1',
 'bleu2',
 'bleu3',
 'bleu4',
 'chex',
 'ciderD',
 'dataset',
 'distinct',
 'f1',
 'free',
 'loss',
 'prec',
 'recall',
 'roc',
 'rougeL',
 'run',
 'stop',
 'word'}

## Segmentation

In [38]:
SEG_METRICS = [
    'iou', 'iou-heart', 'iou-left lung', 'iou-right lung',
    'dice', 'dice-heart', 'dice-left lung', 'dice-right lung',
]

In [24]:
replace_strs = [
    (r'^\d{4}_\d{6}_jsrt_scan_', ''),
#     ('most-similar-image', '1nn'),
#     ('_lr[\d\.]+', ''),
#     ('_size256', ''),
#     (r'_\d{4}_\d{6}_.*', ''),
#     ('dummy-', ''),
#     ('common', 'top'),
#     ('-v2', ''),
#     (r'top-(\w)\w+-(\d+)', r'top-\1-\2'),
#     ('_densenet-121', ''),
]

def rename_runs(run_name):
    s = run_name
    for target, replace_with in replace_strs:
        s = re.sub(target, replace_with, s)
    return s

In [39]:
filter_results(
    metrics=SEG_METRICS,
    dataset_type='test',
    drop='1105_180035',
).sort_values('iou')# .set_index('run_name').rename(index=rename_runs)

Unnamed: 0,run_name,dataset_type,iou,iou-heart,iou-left lung,iou-right lung,dice,dice-heart,dice-left lung,dice-right lung
5,1106_180038_jsrt_scan_lr0.0002_normS_size1024_...,test,0.859004,0.762582,0.85502,0.891177,0.920927,0.860135,0.920031,0.941517
23,1106_131033_jsrt_scan_lr0.0001_normS_size1024_wce,test,0.876946,0.768236,0.89025,0.91201,0.93065,0.861632,0.940607,0.952947
17,1106_092037_jsrt_scan_lr0.0001_normD_size1024,test,0.882099,0.772974,0.89296,0.919901,0.934353,0.867182,0.942441,0.957549
14,1106_124922_jsrt_scan_lr0.0001_normS_size1024,test,0.885864,0.773531,0.906981,0.922493,0.936624,0.867728,0.95047,0.959147
8,1106_143944_jsrt_scan_lr0.0001_normS_size1024_...,test,0.888081,0.782473,0.911865,0.918951,0.937644,0.872197,0.953031,0.957002
11,1109_122859_jsrt_scan_lr0.001_normS_size1024_w...,test,0.906357,0.804471,0.930428,0.943959,0.948732,0.888313,0.963272,0.970937
20,1106_180455_jsrt_scan_lr0.0005_normS_size1024_...,test,0.907892,0.82499,0.922734,0.935546,0.950283,0.902217,0.959132,0.966459


## Report generation

In [13]:
NLP_METRICS = ['bleu1', 'bleu2', 'bleu3', 'bleu4', 'bleu', 'rougeL', 'ciderD']
CHEXPERT_METRICS = ['acc', 'roc_auc', 'recall', 'prec', 'f1', 'roc_auc']
CHEXPERT_METRICS = [c for c in RESULTS_DF.columns
                    if any(c.startswith(ch) for ch in CHEXPERT_METRICS)]
CHEXPERT_RUNTIME_METRICS = [col for col in RESULTS_DF.columns if col.startswith('chex')]
VAR_METRICS = [c for c in RESULTS_DF.columns if 'distinct' in c]
MIRQI_METRICS = [c for c in RESULTS_DF.columns if 'MIRQI' in c]

In [14]:
ESSENTIAL_METRICS = [
    'bleu', 'ciderD', 'rougeL',
    'chex_f1', 'chex_acc', # 'chex_recall', 'chex_prec', # Runtime-chexpert
    'MIRQI-v2-f',
    'roc_auc', 'acc', 'f1', # Holistic-chexpert
]

In [16]:
runtime_chexpert = [c for c in RESULTS_DF.columns if c.startswith('chex_f1')]
holistic_chexpert = [c for c in RESULTS_DF.columns if c.startswith('f1')]
# metrics = runtime_chexpert + holistic_chexpert
metrics = [
    'chex_f1', 'chex_f1_No Finding',
    'f1', 'f1-No Finding'
]

In [25]:
filter_results(
    # contains='^01\d{2}_\d{6}_(h-|lstm)',
    contains='.*dummy',
    dataset_type='test',
    free=True,
    metrics=metrics, # CHEXPERT_RUNTIME_METRICS
    # drop_key_cols=True,
    # drop_na_rows=True,
) # .sort_values('run_name')
# .replace(r'^\d{4}_\d{6}_(.*)', r'\1', regex=True)

Unnamed: 0,run_name,dataset_type,free,chex_f1,chex_f1_No Finding
2,0121_230020_dummy-random_front,test,free,0.118726,0.998696
26,0121_210210_dummy-common-words-100_front,test,free,0.130601,0.982781
74,0121_210044_dummy-constant_front,test,free,0.071429,1.0


### Compare runtime chexpert vs holistic chexpert

In [63]:
def subtract_cols(df, cols_a, cols_b, drop_na_rows=True):
    array_a = df[cols_a].to_numpy()
    array_b = df[cols_b].to_numpy()
    
    df_2 = df[KEY_COLS].copy()
    df_2 = pd.concat([df_2, pd.DataFrame(array_a - array_b, columns=cols_a)], axis=1)
    
    if drop_na_rows:
        df_2.dropna(axis=0, inplace=True, how='any')
    
    return df_2

In [70]:
metric = 'prec'

runtime_chexpert = [c for c in RESULTS_DF.columns if c.startswith(f'chex_{metric}')]
holistic_chexpert = [c for c in RESULTS_DF.columns if c.startswith(metric)]

In [71]:
df = subtract_cols(RESULTS_DF, runtime_chexpert, holistic_chexpert)
df.head()

Unnamed: 0,run_name,dataset_type,free,chex_prec,chex_prec_No Finding,chex_prec_Enlarged Cardiomediastinum,chex_prec_Cardiomegaly,chex_prec_Lung Lesion,chex_prec_Lung Opacity,chex_prec_Edema,chex_prec_Consolidation,chex_prec_Pneumonia,chex_prec_Atelectasis,chex_prec_Pneumothorax,chex_prec_Pleural Effusion,chex_prec_Pleural Other,chex_prec_Fracture,chex_prec_Support Devices
0,0122_135739_dummy-common-sentences-100_front,test,free,0.056561,0.648997,0.109524,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0122_135739_dummy-common-sentences-100_front,train,free,0.045241,0.595414,0.024216,0.013746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0122_135739_dummy-common-sentences-100_front,val,free,0.044545,0.599821,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0122_135739_dummy-common-sentences-100_front,val,notfree,0.047467,0.549973,0.037647,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0122_135739_dummy-common-sentences-100_front,test,notfree,0.047709,0.667931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
df.describe()

Unnamed: 0,chex_prec,chex_prec_No Finding,chex_prec_Enlarged Cardiomediastinum,chex_prec_Cardiomegaly,chex_prec_Lung Lesion,chex_prec_Lung Opacity,chex_prec_Edema,chex_prec_Consolidation,chex_prec_Pneumonia,chex_prec_Atelectasis,chex_prec_Pneumothorax,chex_prec_Pleural Effusion,chex_prec_Pleural Other,chex_prec_Fracture,chex_prec_Support Devices
count,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0,78.0
mean,0.045733,0.590839,0.016239,0.004462,0.001199,0.003852,0.00095,0.002631,2e-06,0.009322,0.002338,0.003983,0.0,0.004446,0.0
std,0.009792,0.06464,0.03706,0.012002,0.009484,0.009954,0.005462,0.019039,1.5e-05,0.030332,0.02287,0.023021,0.0,0.037737,0.0
min,0.029138,0.393939,0.0,-0.003003,-0.003268,0.0,-6.2e-05,-0.000342,0.0,-0.001812,-0.02381,0.0,0.0,0.0,0.0
25%,0.042115,0.588435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.042843,0.595843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.046503,0.644511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.102231,0.72549,0.222222,0.076923,0.083333,0.048101,0.035014,0.166667,0.000133,0.210526,0.2,0.2,0.0,0.333333,0.0


In [40]:
metrics = [
    col
    for col in RESULTS_DF.columns
    # if col.startswith('recall')
    # if col == 'recall-No Finding'
    # if 'No Finding' in col
]
cols = KEY_COLS + metrics
cols
# RESULTS_DF[cols].sort_index()

['run_name',
 'dataset_type',
 'free',
 'run_name',
 'dataset_type',
 'free',
 'MIRQI-r',
 'MIRQI-p',
 'MIRQI-f',
 'MIRQI-v2-r',
 'MIRQI-v2-p',
 'MIRQI-v2-sp',
 'MIRQI-v2-np',
 'MIRQI-v2-f',
 'MIRQI-v2-attr-p',
 'MIRQI-v2-attr-r',
 'acc',
 'acc-No Finding',
 'acc-Enlarged Cardiomediastinum',
 'acc-Cardiomegaly',
 'acc-Lung Lesion',
 'acc-Lung Opacity',
 'acc-Edema',
 'acc-Consolidation',
 'acc-Pneumonia',
 'acc-Atelectasis',
 'acc-Pneumothorax',
 'acc-Pleural Effusion',
 'acc-Pleural Other',
 'acc-Fracture',
 'acc-Support Devices',
 'prec',
 'prec-No Finding',
 'prec-Enlarged Cardiomediastinum',
 'prec-Cardiomegaly',
 'prec-Lung Lesion',
 'prec-Lung Opacity',
 'prec-Edema',
 'prec-Consolidation',
 'prec-Pneumonia',
 'prec-Atelectasis',
 'prec-Pneumothorax',
 'prec-Pleural Effusion',
 'prec-Pleural Other',
 'prec-Fracture',
 'prec-Support Devices',
 'recall',
 'recall-No Finding',
 'recall-Enlarged Cardiomediastinum',
 'recall-Cardiomegaly',
 'recall-Lung Lesion',
 'recall-Lung Opacity'

In [33]:
from collections import Counter

In [28]:
run_name = '0112_154506_lstm-v2_lr0.001_densenet-121-v2_noes'
debug = False
d1 = load_rg_outputs(run_name, debug=debug, free=True)
d2 = load_rg_outputs(run_name, debug=debug, free=False)
len(d1), len(d2)

(18704, 22278)

In [34]:
c1 = Counter(d1['filename'])
c2 = Counter(d2['filename'])
len(c1), len(c2)

(3826, 3826)

In [36]:
for fname in c1.keys():
    v1 = c1[fname]
    v2 = c2[fname]
    if v1 != v2:
        print('Wrong: ', fname, v1, v2)

Wrong:  2311.xml 5 6
Wrong:  2379.xml 4 6
Wrong:  2464.xml 4 6
Wrong:  2652.xml 4 6
Wrong:  2748.xml 4 6
Wrong:  28.xml 4 6
Wrong:  2843.xml 4 6
Wrong:  2871.xml 4 6
Wrong:  2951.xml 4 6
Wrong:  2952.xml 2 3
Wrong:  2955.xml 4 6
Wrong:  2981.xml 4 6
Wrong:  2986.xml 6 9
Wrong:  2999.xml 4 6
Wrong:  3010.xml 4 6
Wrong:  3040.xml 2 3
Wrong:  3057.xml 2 3
Wrong:  3064.xml 4 6
Wrong:  3170.xml 4 6
Wrong:  3256.xml 4 6
Wrong:  3388.xml 4 6
Wrong:  3398.xml 4 6
Wrong:  3409.xml 2 3
Wrong:  355.xml 4 6
Wrong:  3598.xml 4 6
Wrong:  3602.xml 4 6
Wrong:  3612.xml 4 6
Wrong:  3659.xml 4 6
Wrong:  3686.xml 4 6
Wrong:  3691.xml 4 6
Wrong:  3709.xml 6 9
Wrong:  3808.xml 4 6
Wrong:  3847.xml 4 6
Wrong:  3882.xml 4 6
Wrong:  424.xml 2 3
Wrong:  459.xml 4 6
Wrong:  498.xml 4 6
Wrong:  693.xml 4 6
Wrong:  743.xml 4 6
Wrong:  927.xml 4 6
Wrong:  944.xml 4 6
Wrong:  979.xml 4 6
Wrong:  1030.xml 4 6
Wrong:  1050.xml 4 6
Wrong:  1101.xml 6 9
Wrong:  1152.xml 4 6
Wrong:  1184.xml 4 6
Wrong:  1195.xml 4 6
Wro

Wrong:  802.xml 4 6
Wrong:  803.xml 2 3
Wrong:  1194.xml 2 3
Wrong:  1361.xml 4 6
Wrong:  1572.xml 4 6
Wrong:  2130.xml 4 6
Wrong:  2502.xml 4 6
Wrong:  2578.xml 4 6
Wrong:  2585.xml 4 6
Wrong:  3072.xml 4 6
Wrong:  3158.xml 4 6
Wrong:  525.xml 4 6
Wrong:  760.xml 4 6
Wrong:  966.xml 4 6
Wrong:  1614.xml 4 6
Wrong:  1740.xml 4 6
Wrong:  233.xml 6 9
Wrong:  2721.xml 4 6
Wrong:  3038.xml 4 6
Wrong:  3190.xml 4 6
Wrong:  702.xml 2 3
Wrong:  854.xml 4 6
Wrong:  101.xml 4 6
Wrong:  1683.xml 4 6
Wrong:  1704.xml 4 6
Wrong:  177.xml 4 6
Wrong:  189.xml 4 6
Wrong:  2092.xml 4 6
Wrong:  2961.xml 4 6
Wrong:  3188.xml 4 6
Wrong:  3435.xml 4 6
Wrong:  3490.xml 4 6
Wrong:  3926.xml 4 6
Wrong:  468.xml 4 6
Wrong:  549.xml 4 6
Wrong:  934.xml 4 6
Wrong:  2077.xml 4 6
Wrong:  2271.xml 2 3
Wrong:  2594.xml 6 9
Wrong:  2867.xml 4 6
Wrong:  1220.xml 6 9
Wrong:  1529.xml 4 6
Wrong:  2441.xml 4 6
Wrong:  3154.xml 4 6
Wrong:  3219.xml 4 6
Wrong:  3517.xml 2 3
Wrong:  3611.xml 2 3
Wrong:  3934.xml 4 6
Wrong:

In [32]:
d2.head()

Unnamed: 0,filename,epoch,dataset_type,ground_truth,generated
0,2089.xml,1,train,no active disease .,the heart size is normal .
1,2089.xml,1,train,no active disease .,the heart is normal in size
2,2435.xml,1,train,no active disease .,the heart is normal in size
3,2435.xml,1,train,no active disease .,the heart is normal in size
4,2460.xml,1,train,no active disease .,the heart is normal in size


In [31]:
set(d2['dataset_type'])

{'test', 'train', 'val'}

### Pretty-print (latex)

In [41]:
replace_strs = [
    (r'^\d{4}_\d{6}_', ''),
    ('most-similar-image', '1nn'),
    ('_lr[\d\.]+', ''),
    ('_size256', ''),
    (r'_\d{4}_\d{6}_.*', ''),
    ('dummy-', ''),
    ('common', 'top'),
    ('-v2', ''),
    (r'top-(\w)\w+-(\d+)', r'top-\1-\2'),
    ('_densenet-121', ''),
]

def rename_runs(run_name):
    s = run_name
    for target, replace_with in replace_strs:
        s = re.sub(target, replace_with, s)
    return s

In [56]:
columns = ['bleu', 'rougeL', 'ciderD'] + CHEXPERT_METRICS + MIRQI_METRICS

In [57]:
df = filter_results(dataset_type='test',
                    free=True,
                    metrics=columns,
                    contains='(?=_lstm-att-v2.*densenet|_lstm-v2.*densenet|dummy)',
                    drop='0915_173951|0915_174222|0916_104739',
                    drop_na_rows=True,
                   )

In [51]:
df

Unnamed: 0,run_name,dataset_type,free,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD,...,MIRQI-r,MIRQI-p,MIRQI-f,MIRQI-v2-r,MIRQI-v2-p,MIRQI-v2-sp,MIRQI-v2-np,MIRQI-v2-f,MIRQI-v2-attr-p,MIRQI-v2-attr-r
3,0915_174026_dummy-common-sentences-100,test,free,0.346689,0.21085,0.13844,0.09576881,0.197937,0.280916,0.165551,...,0.417426,0.415991,0.413998,0.310505,0.31516,0.813117,0.784744,0.31155,0.305851,0.305851
9,0915_172915_dummy-common-words-10,test,free,0.312055,0.107749,0.024166,8.550974e-07,0.110993,0.243604,0.042768,...,0.356383,0.356383,0.356383,,,,,,,
15,0915_173524_dummy-common-sentences-10,test,free,0.275183,0.170479,0.112833,0.07838136,0.159219,0.294405,0.15161,...,0.457447,0.449426,0.452115,0.356383,0.356383,0.813829,0.773725,0.356383,0.356383,0.356383
21,0915_172709_dummy-constant,test,free,0.454623,0.311525,0.223393,0.165365,0.288726,0.356681,0.292646,...,0.478457,0.457842,0.465502,0.356383,0.356383,0.964095,0.861018,0.356383,0.356383,0.356383
28,0918_144929_lstm-att-v2_lr0.0001_densenet-121_...,test,free,0.36085,0.226277,0.152071,0.1064681,0.211416,0.313536,0.186868,...,0.468323,0.467369,0.461983,0.360217,0.370789,0.896238,0.840001,0.360826,0.288435,0.284874
41,0918_125940_lstm-v2_lr0.0001_densenet-121_size256,test,free,0.379042,0.239385,0.163692,0.1170487,0.224792,0.337529,0.284303,...,0.448021,0.441593,0.442201,0.337433,0.341755,0.874132,0.823266,0.338564,0.321698,0.321077
58,0915_173609_dummy-random,test,free,0.36248,0.196943,0.117449,0.07459662,0.187867,0.263606,0.111804,...,0.431931,0.43007,0.423154,0.344548,0.343018,0.726773,0.72613,0.336981,0.175747,0.174897
107,0915_173754_dummy-common-sentences-50,test,free,0.332812,0.205903,0.137483,0.09688422,0.193271,0.282765,0.188344,...,0.42174,0.424866,0.420113,0.318429,0.327128,0.802019,0.777257,0.320464,0.31383,0.31383
108,0916_104837_dummy-most-similar-image_0717_1202...,test,free,0.382749,0.220256,0.142339,0.09958416,0.211232,0.288225,0.230329,...,0.435919,0.444452,0.433369,0.347119,0.355474,0.754784,0.753767,0.345257,0.186351,0.18588
115,0915_173307_dummy-common-words-100,test,free,0.348717,0.08966,0.016369,6.481232e-07,0.113687,0.206325,0.071575,...,0.224643,0.209504,0.200727,0.198094,0.160554,0.207445,0.328014,0.168722,0.059774,0.059563


In [59]:
shorten_cols = lambda s: s.replace('MIRQI-v2', 'v2')

In [61]:
print(df.set_index('run_name').rename(
    index=rename_runs,
    columns=shorten_cols,
).sort_index().to_latex(
    columns=[shorten_cols(c) for c in columns],
    float_format='%.3f',
    column_format='l' + 'c' * len(columns),
))

\begin{tabular}{lccccccccccccccc}
\toprule
{} &  bleu &  rougeL &  ciderD &   acc &  roc\_auc &  MIRQI-r &  MIRQI-p &  MIRQI-f &  v2-r &  v2-p &  v2-sp &  v2-np &  v2-f &  v2-attr-p &  v2-attr-r \\
run\_name  &       &         &         &       &          &          &          &          &       &       &        &        &       &            &            \\
\midrule
1nn       & 0.211 &   0.288 &   0.230 & 0.903 &    0.518 &    0.436 &    0.444 &    0.433 & 0.347 & 0.355 &  0.755 &  0.754 & 0.345 &      0.186 &      0.186 \\
constant  & 0.289 &   0.357 &   0.293 & 0.915 &    0.500 &    0.478 &    0.458 &    0.466 & 0.356 & 0.356 &  0.964 &  0.861 & 0.356 &      0.356 &      0.356 \\
lstm      & 0.225 &   0.338 &   0.284 & 0.912 &    0.505 &    0.448 &    0.442 &    0.442 & 0.337 & 0.342 &  0.874 &  0.823 & 0.339 &      0.322 &      0.321 \\
lstm-att  & 0.211 &   0.314 &   0.187 & 0.918 &    0.508 &    0.468 &    0.467 &    0.462 & 0.360 & 0.371 &  0.896 &  0.840 & 0.361 &      0.288 &  

## Classification

In [64]:
# contains = 'covid-x'
# contains = 'cxr14'
contains = 'e0'
# contains = '0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid'
# contains = '0717_101812_covid-x_densenet-121_lr1e-06_os-max2_aug-covid'
run_name = '0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid' # WINNER

contains = '0717_101812_covid-x_densenet-121_lr1e-06_os-max2_aug-covid'
contains = 'covid-uc'

In [12]:
filter_results(dataset_type='test-bbox', metrics_contain='iou')

Unnamed: 0,run_name,dataset_type,iou-Pneumonia,iou-Atelectasis,iou-Cardiomegaly,iou-Effusion,iou-Infiltration,iou-Mass,iou-Nodule,iou-Pneumothorax,iou-Consolidation,iou-Edema,iou-Emphysema,iou-Fibrosis,iou-Pleural_Thickening,iou-Hernia
172,0715_232631_cxr14_densenet-121_lr1e-06_os-max4...,test-bbox,0.104788,,,,,,,,,,,,,
173,1027_144914_cxr14_densenet-121_lr1e-05_aug_nor...,test-bbox,,,,0.089691,,,,,0.0,0.0,0.0,0.0,0.0,0.0
174,0722_121738_cxr14_densenet-121_lr0.0001_aug-0_...,test-bbox,,,,,,,,,,,,,,
175,0714_232500_cxr14_densenet-121_lr1e-06,test-bbox,0.082394,0.034604,0.183337,0.091254,0.097544,0.053841,0.004207,0.054527,0.0,0.0,0.0,0.0,0.0,0.0
176,0715_232418_cxr14_densenet-121_lr1e-06_os-max4...,test-bbox,0.102067,,,,,,,,,,,,,
177,0721_125845_cxr14_densenet-121_lr1e-05_aug-0_b...,test-bbox,,,,,,,,,,,,,,
178,0721_124730_cxr14_densenet-121_lr1e-05_bce_Car...,test-bbox,,,,,,,,,,,,,,
179,0716_133211_cxr14_densenet-121_lr1e-06_us_aug-...,test-bbox,0.048599,,,,,,,,,,,,,
180,0714_232518_cxr14_densenet-121_lr1e-06,test-bbox,0.102803,0.027257,0.145093,0.085544,0.037971,0.033063,0.005916,0.057814,0.0,0.0,0.0,0.0,0.0,0.0
181,0720_213359_cxr14_densenet-121_lr1e-06_aug-0_b...,test-bbox,,,0.197302,,,,,,,,,,,


In [66]:
results_df_test.sort_values(['acc', 'recall_covid', 'prec_covid'],
                            ascending=False).dropna(axis=1, how='all')

In [36]:
print_columns = [
    'acc',
    'prec_covid','prec_pneumonia','prec_normal',
    'recall_covid','recall_pneumonia', 'recall_normal',
    'spec_covid','spec_pneumonia', 'spec_normal'
#     'acc_Pneumonia', 'prec_Pneumonia', 'recall_Pneumonia'
]
# print_columns = [c for c in results_df_test.columns if c != 'loss' and 'cm' not in c]

In [37]:
print((results_df_test*100).to_latex(columns=print_columns,
                               float_format='%.2f',
                              ))

\begin{tabular}{lrrrrrrrrrr}
\toprule
{} &   acc &  prec\_covid &  prec\_pneumonia &  prec\_normal &  recall\_covid &  recall\_pneumonia &  recall\_normal &  spec\_covid &  spec\_pneumonia &  spec\_normal \\
\midrule
0717\_101812\_covid-x\_densenet-121\_lr1e-06\_os-max... & 93.73 &       93.94 &           91.12 &        95.53 &         93.00 &             93.27 &          94.12 &       99.59 &           94.52 &        94.38 \\
\bottomrule
\end{tabular}



## Report-generation: results at different report lengths

In [32]:
vals_words = [20, 25, 27, 33, 44, None]
vals_sents = [3, 4, 5, 6, None]

In [40]:
max_words = vals_words[0]
suffix = f'max-words-{max_words}' if max_words else ''
all_results = load_results(suffix)
results_df_test = create_results_df(all_results, 'test')
results_df_test

Unnamed: 0,loss,word_acc,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD
0717_041434_lstm_lr0.0001_densenet-121,5.579509,0.149129,0.478934,0.325275,0.256634,0.2183,0.319786,0.395486,1.354237
0716_211601_lstm-att_lr0.0001_densenet-121,7.382415,0.068293,0.442359,0.28249,0.199632,0.150046,0.268632,0.358223,0.775244
0717_015057_h-lstm_lr0.0001_densenet-121,4.548621,0.162518,0.328184,0.19551,0.12641,0.082401,0.183126,0.334782,0.256378
0716_234501_h-lstm-att_lr0.0001_densenet-121,5.157984,0.132504,0.298223,0.17906,0.115951,0.075574,0.167202,0.315891,0.101786
