## Imports

In [1]:
import os
import pandas as pd
import json
import re

In [2]:
# Execute the project's helper scripts inside the notebook namespace so that
# their top-level names are available to the cells below.
# NOTE(review): names used later but not defined in this notebook
# (WORKSPACE_DIR, _get_task_folder, load_rg_outputs) presumably come from
# these scripts — confirm.
%run ../utils/__init__.py
%run ../utils/files.py
%run ../metrics/__init__.py

In [3]:
# Never truncate wide result tables when they are displayed
pd.set_option('display.max_columns', None)

## Choose task

In [4]:
# Task whose results are analysed; keep exactly one assignment active.
# The value selects the task folder and, for 'rg', adds the 'free' key column.
# TASK = 'seg'
TASK = 'rg'
# TASK = 'cls'

In [5]:
# Columns that identify one result row; report-generation ('rg') results are
# additionally keyed by the 'free' column.
KEY_COLS = ['run_name', 'dataset_type'] + (['free'] if TASK == 'rg' else [])
KEY_COLS

['run_name', 'dataset_type', 'free']

## Functions

In [6]:
# Resolve the folder that holds one sub-folder of metric files per run.
# NOTE(review): _get_task_folder and WORKSPACE_DIR are not defined in this
# notebook; presumably provided by the %run cells above — confirm.
TASK_FOLDER = _get_task_folder(TASK)
BASE_FOLDER = os.path.join(WORKSPACE_DIR, TASK_FOLDER)
RESULTS_FOLDER = os.path.join(BASE_FOLDER, 'results')

In [7]:
def get_suffix(filename):
    """Return the suffix that follows ``metrics-`` in a results filename.

    The suffix must consist of word characters only (letters, digits,
    underscore) and be immediately followed by ``.json``.

    Args:
        filename: name of a metrics file, e.g. ``'metrics-free.json'``.

    Returns:
        The captured suffix (e.g. ``'free'``), or ``''`` when the filename
        does not match the ``metrics-<suffix>.json`` pattern.
    """
    # Raw string: '\w' and '\.' are regex escapes, not Python string escapes
    match = re.search(r'.*metrics-(?P<suffix>\w*)\.json', filename)
    return '' if match is None else match.group('suffix')

In [8]:
# Metric families recognised by load_results(): a results file is assigned to
# the first entry that appears as a substring of its filename; files matching
# none of them are grouped under 'base'.
METRIC_TYPES = [
    'chexpert',
    'grad-cam',
    'mirqi',
]

In [9]:
def load_results():
    """Load every run's JSON metric files from RESULTS_FOLDER into DataFrames.

    Each sub-folder of RESULTS_FOLDER (except 'debug') is treated as one run.
    Every ``*json`` file inside it is read as a ``{dataset_type: {metric: value}}``
    mapping and turned into one row per dataset type, tagged with `run_name`
    and, when TASK is 'rg', with the 'free' suffix taken from the filename.
    Files are grouped by metric type (first METRIC_TYPES entry contained in
    the filename, 'base' otherwise) and the groups are outer-merged on KEY_COLS.

    Returns:
        A tuple ``(df, results_by_metric_type)`` where `df` is the merged
        table with KEY_COLS first, and `results_by_metric_type` maps each
        metric type to its un-merged DataFrame. When no result file is found,
        `df` is an empty DataFrame with only the KEY_COLS columns and the
        dict is empty.
    """
    frames_by_metric_type = {}

    for run_name in os.listdir(RESULTS_FOLDER):
        folder = os.path.join(RESULTS_FOLDER, run_name)
        # Skip the debug folder and stray files that are not run folders
        if run_name == 'debug' or not os.path.isdir(folder):
            continue

        for filename in os.listdir(folder):
            filepath = os.path.join(folder, filename)
            if not os.path.isfile(filepath) or not filename.endswith('json'):
                continue

            metric_type = next(
                (met for met in METRIC_TYPES if met in filename),
                'base', # Default if no specific metric_type is found
            )

            with open(filepath, 'r') as f:
                results_dict = json.load(f)

            # One row per top-level key of the JSON file (the dataset type)
            results_df = (
                pd.DataFrame.from_dict(results_dict, orient='index')
                .reset_index()
                .rename(columns={'index': 'dataset_type'})
            )
            results_df['run_name'] = run_name
            if TASK == 'rg':
                results_df['free'] = get_suffix(filename)

            frames_by_metric_type.setdefault(metric_type, []).append(results_df)

    # Concatenate once per metric type: DataFrame.append was removed in
    # pandas 2.0 and appending inside the loop is quadratic.
    results_by_metric_type = {
        metric_type: pd.concat(frames, ignore_index=True, sort=False)
        for metric_type, frames in frames_by_metric_type.items()
    }

    # NOTE(review): if two metric types share a non-key column name, merge()
    # will suffix them (_x/_y) and the final column selection will fail.
    df = None
    cols_in_order = list(KEY_COLS)
    for results in results_by_metric_type.values():
        cols_in_order += [col for col in results.columns if col not in cols_in_order]
        df = results if df is None else df.merge(results, on=KEY_COLS, how='outer')

    if df is None:
        # Nothing was loaded: return an empty table instead of failing on None
        df = pd.DataFrame(columns=cols_in_order)

    return df[cols_in_order], results_by_metric_type

In [10]:
def filter_results(dataset_type=None, metrics=None,
                   metrics_contain=None, free=None,
                   contains=None, doesnt_contain=None,
                   drop=None, drop_na_rows=False, drop_key_cols=False):
    """Return a filtered view of RESULTS_DF without modifying it.

    Args:
        dataset_type: a single dataset type (str) or a list/tuple of them to keep.
        metrics: list of metric columns to keep (in addition to KEY_COLS);
            ignored when `metrics_contain` is given.
        metrics_contain: keep KEY_COLS plus every column containing this substring.
        free: if not None, keep rows whose 'free' column is 'free' (truthy)
            or 'notfree' (falsy).
        contains: regex (str) or list/tuple of regexes; a run is kept only if
            its `run_name` matches all of them.
        doesnt_contain: regex or list/tuple of regexes; runs whose `run_name`
            matches any of them are removed.
        drop: single regex; runs whose `run_name` matches it are removed.
        drop_na_rows: drop rows having any NaN in the selected columns.
        drop_key_cols: drop the key columns other than 'run_name'.

    Returns:
        A new DataFrame; columns that are entirely NaN are always removed.
    """
    def _as_patterns(value):
        # Normalise a str / list / tuple argument to a list of regex patterns;
        # empty or unsupported values mean "no filter".
        if not value:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        return []

    df = RESULTS_DF

    if dataset_type:
        if isinstance(dataset_type, str):
            df = df[df['dataset_type'] == dataset_type]
        elif isinstance(dataset_type, (list, tuple)):
            df = df[df['dataset_type'].isin(set(dataset_type))]

    if free is not None:
        free_str = 'free' if free else 'notfree'
        df = df.loc[df['free'] == free_str]

    for pattern in _as_patterns(contains):
        df = df.loc[df['run_name'].str.contains(pattern)]

    for pattern in _as_patterns(doesnt_contain):
        df = df.loc[~df['run_name'].str.contains(pattern)]

    if drop:
        df = df.loc[~df['run_name'].str.contains(drop)]

    if metrics_contain:
        columns = KEY_COLS + [c for c in df.columns if metrics_contain in c]
        df = df[columns]
    elif metrics:
        columns = KEY_COLS + list(metrics)
        df = df[columns]

    # dropna() is used without inplace=True on purpose: `df` may be a slice of
    # RESULTS_DF, or RESULTS_DF itself when no filter was applied, and must
    # not be mutated by a read-only query.
    if drop_na_rows:
        df = df.dropna(axis=0, how='any')

    # Drop cols with all na
    df = df.dropna(axis=1, how='all')

    if drop_key_cols:
        columns = [c for c in df.columns if c == 'run_name' or c not in KEY_COLS]
        df = df[columns]

    return df

## Load results

In [11]:
# Load every run's metrics once; RESULTS_DF is the merged table read by
# filter_results(). The second value is the per-metric-type dict returned by
# load_results().
# NOTE(review): the name `debug` is rebound to a bool in a later cell.
RESULTS_DF, debug = load_results()
print(len(RESULTS_DF))
RESULTS_DF.head()

84


Unnamed: 0,run_name,dataset_type,free,acc,acc-No Finding,acc-Enlarged Cardiomediastinum,acc-Cardiomegaly,acc-Lung Lesion,acc-Lung Opacity,acc-Edema,acc-Consolidation,acc-Pneumonia,acc-Atelectasis,acc-Pneumothorax,acc-Pleural Effusion,acc-Pleural Other,acc-Fracture,acc-Support Devices,prec,prec-No Finding,prec-Enlarged Cardiomediastinum,prec-Cardiomegaly,prec-Lung Lesion,prec-Lung Opacity,prec-Edema,prec-Consolidation,prec-Pneumonia,prec-Atelectasis,prec-Pneumothorax,prec-Pleural Effusion,prec-Pleural Other,prec-Fracture,prec-Support Devices,recall,recall-No Finding,recall-Enlarged Cardiomediastinum,recall-Cardiomegaly,recall-Lung Lesion,recall-Lung Opacity,recall-Edema,recall-Consolidation,recall-Pneumonia,recall-Atelectasis,recall-Pneumothorax,recall-Pleural Effusion,recall-Pleural Other,recall-Fracture,recall-Support Devices,f1,f1-No Finding,f1-Enlarged Cardiomediastinum,f1-Cardiomegaly,f1-Lung Lesion,f1-Lung Opacity,f1-Edema,f1-Consolidation,f1-Pneumonia,f1-Atelectasis,f1-Pneumothorax,f1-Pleural Effusion,f1-Pleural Other,f1-Fracture,f1-Support Devices,roc_auc,roc_auc-No Finding,roc_auc-Enlarged Cardiomediastinum,roc_auc-Cardiomegaly,roc_auc-Lung Lesion,roc_auc-Lung Opacity,roc_auc-Edema,roc_auc-Consolidation,roc_auc-Pneumonia,roc_auc-Atelectasis,roc_auc-Pneumothorax,roc_auc-Pleural Effusion,roc_auc-Pleural Other,roc_auc-Fracture,roc_auc-Support Devices,loss,word_loss,stop_loss,bleu1,bleu2,bleu3,bleu4,bleu,rougeL,ciderD,distinct_words,distinct_sentences,chex_timer,chex_acc,chex_acc_No Finding,chex_acc_Enlarged Cardiomediastinum,chex_acc_Cardiomegaly,chex_acc_Lung Lesion,chex_acc_Lung Opacity,chex_acc_Edema,chex_acc_Consolidation,chex_acc_Pneumonia,chex_acc_Atelectasis,chex_acc_Pneumothorax,chex_acc_Pleural Effusion,chex_acc_Pleural Other,chex_acc_Fracture,chex_acc_Support Devices,chex_prec,chex_prec_No Finding,chex_prec_Enlarged Cardiomediastinum,chex_prec_Cardiomegaly,chex_prec_Lung Lesion,chex_prec_Lung 
Opacity,chex_prec_Edema,chex_prec_Consolidation,chex_prec_Pneumonia,chex_prec_Atelectasis,chex_prec_Pneumothorax,chex_prec_Pleural Effusion,chex_prec_Pleural Other,chex_prec_Fracture,chex_prec_Support Devices,chex_recall,chex_recall_No Finding,chex_recall_Enlarged Cardiomediastinum,chex_recall_Cardiomegaly,chex_recall_Lung Lesion,chex_recall_Lung Opacity,chex_recall_Edema,chex_recall_Consolidation,chex_recall_Pneumonia,chex_recall_Atelectasis,chex_recall_Pneumothorax,chex_recall_Pleural Effusion,chex_recall_Pleural Other,chex_recall_Fracture,chex_recall_Support Devices,chex_spec,chex_spec_No Finding,chex_spec_Enlarged Cardiomediastinum,chex_spec_Cardiomegaly,chex_spec_Lung Lesion,chex_spec_Lung Opacity,chex_spec_Edema,chex_spec_Consolidation,chex_spec_Pneumonia,chex_spec_Atelectasis,chex_spec_Pneumothorax,chex_spec_Pleural Effusion,chex_spec_Pleural Other,chex_spec_Fracture,chex_spec_Support Devices,chex_npv,chex_npv_No Finding,chex_npv_Enlarged Cardiomediastinum,chex_npv_Cardiomegaly,chex_npv_Lung Lesion,chex_npv_Lung Opacity,chex_npv_Edema,chex_npv_Consolidation,chex_npv_Pneumonia,chex_npv_Atelectasis,chex_npv_Pneumothorax,chex_npv_Pleural Effusion,chex_npv_Pleural Other,chex_npv_Fracture,chex_npv_Support Devices,chex_f1,chex_f1_No Finding,chex_f1_Enlarged Cardiomediastinum,chex_f1_Cardiomegaly,chex_f1_Lung Lesion,chex_f1_Lung Opacity,chex_f1_Edema,chex_f1_Consolidation,chex_f1_Pneumonia,chex_f1_Atelectasis,chex_f1_Pneumothorax,chex_f1_Pleural Effusion,chex_f1_Pleural Other,chex_f1_Fracture,chex_f1_Support Devices,word_acc,att_loss,att_iou,att_iobb
0,0122_135739_dummy-common-sentences-100_front,test,free,0.906808,0.398438,0.867188,0.820312,0.953125,0.838542,0.994792,0.994792,0.989583,0.963542,0.986979,0.979167,0.984375,0.955729,0.96875,0.038305,0.345794,0.057143,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073287,0.840909,0.1,0.085106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047621,0.490066,0.072727,0.103896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500888,0.503788,0.50467,0.503978,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.0,-1.0,-1.0,0.349819,0.209409,0.135837,0.09359,0.197163,0.283324,0.157216,105,100,0.0/0.0,0.939174,0.994792,0.848958,0.8125,0.947917,0.825521,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.094866,0.994792,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.085099,1.0,0.095238,0.096154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.919016,0.0,0.94152,0.924699,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875276,0.0,0.894444,0.867232,0.947917,0.825521,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.088611,0.997389,0.121212,0.121951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
1,0122_135739_dummy-common-sentences-100_front,train,free,0.905571,0.432531,0.892775,0.780271,0.95843,0.82811,0.993402,0.984823,0.991092,0.950511,0.989772,0.968987,0.990762,0.970307,0.946222,0.043518,0.400627,0.057416,0.151203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073092,0.842539,0.085714,0.095032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052037,0.543039,0.068768,0.116711,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50064,0.500753,0.508786,0.499424,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.0,-1.0,-1.0,0.336457,0.205273,0.135745,0.094295,0.192943,0.276334,0.152092,105,100,0.0/0.0,0.937621,0.996041,0.857803,0.772682,0.954141,0.815242,0.985153,0.982844,0.986143,0.929726,0.986803,0.95843,0.987133,0.968327,0.946222,0.088759,0.996041,0.081633,0.164948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.081152,1.0,0.038961,0.097166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.918189,0.0,0.950422,0.904218,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.873911,0.0,0.897365,0.837226,0.954141,0.815242,0.985153,0.982844,0.986143,0.929726,0.986803,0.95843,0.987133,0.968327,0.946222,0.08379,0.998017,0.052747,0.122293,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
2,0122_135739_dummy-common-sentences-100_front,val,free,0.90538,0.420779,0.888312,0.787013,0.968831,0.862338,0.98961,0.98961,0.987013,0.924675,0.987013,0.971429,0.994805,0.958442,0.945455,0.036717,0.394984,0.0,0.119048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064835,0.807692,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045659,0.530526,0.0,0.108696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.49594,0.482449,0.46594,0.494776,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,-1.0,-1.0,-1.0,0.326945,0.196131,0.126751,0.08611,0.183984,0.270924,0.158774,105,100,0.0/0.0,0.93859,0.994805,0.862338,0.768831,0.963636,0.844156,0.981818,0.98961,0.981818,0.906494,0.979221,0.968831,0.994805,0.955844,0.948052,0.081262,0.994805,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078692,1.0,0.0,0.101695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.917204,0.0,0.951289,0.889571,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875853,0.0,0.902174,0.845481,0.963636,0.844156,0.981818,0.98961,0.981818,0.906494,0.979221,0.968831,0.994805,0.955844,0.948052,0.079729,0.997396,0.0,0.118812,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,
3,0122_135739_dummy-common-sentences-100_front,val,notfree,0.909276,0.506494,0.898701,0.766234,0.966234,0.862338,0.984416,0.98961,0.987013,0.924675,0.979221,0.966234,0.994805,0.958442,0.945455,0.045728,0.444805,0.08,0.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.079237,0.878205,0.111111,0.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.057228,0.590517,0.093023,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.505051,0.56574,0.52422,0.491343,0.49866,0.5,0.497375,0.5,0.5,0.5,0.496053,0.497326,0.5,0.5,0.5,43.466139,6.963101,36.503038,0.393895,0.241075,0.157749,0.108631,0.225337,0.294778,0.236226,105,133,1.2/1.2,0.937291,0.98961,0.872727,0.763636,0.961039,0.844156,0.976623,0.98961,0.981818,0.906494,0.974026,0.963636,0.994805,0.955844,0.948052,0.093195,0.994778,0.117647,0.192308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08713,0.994778,0.055556,0.169492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.914967,0.0,0.95702,0.871166,0.997305,1.0,0.994709,1.0,1.0,1.0,0.994695,0.994638,1.0,1.0,1.0,0.876735,0.0,0.907609,0.852853,0.963542,0.844156,0.981723,0.98961,0.981818,0.906494,0.979112,0.968668,0.994805,0.955844,0.948052,0.089316,0.994778,0.075472,0.18018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038147,,,
4,0122_135739_dummy-common-sentences-100_front,test,notfree,0.902344,0.377604,0.864583,0.796875,0.950521,0.833333,0.994792,0.994792,0.989583,0.963542,0.981771,0.976562,0.984375,0.955729,0.96875,0.033314,0.326861,0.0,0.139535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.063772,0.765152,0.0,0.12766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.042242,0.45805,0.0,0.133333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.494744,0.469877,0.456044,0.508934,0.498634,0.496894,0.5,0.5,0.5,0.5,0.497361,0.49867,0.5,0.5,0.5,28.614465,6.94264,21.671825,0.398164,0.243229,0.159492,0.108269,0.227289,0.303682,0.232407,105,129,1.0/1.0,0.936198,0.994792,0.84375,0.783854,0.945312,0.820312,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.081023,0.994792,0.0,0.139535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07967,1.0,0.0,0.115385,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.916205,0.0,0.947368,0.888554,0.997253,0.993691,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.874392,0.0,0.885246,0.865103,0.947781,0.824607,0.984375,0.986979,0.984375,0.916667,0.979167,0.966146,0.976562,0.955729,0.96875,0.080265,0.997389,0.0,0.126316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.036335,,,


In [12]:
# Distinct leading tokens of the column names ('-' and '_' both act as separators)
{
    column.replace('-', '_').split('_')[0]
    for column in RESULTS_DF.columns
}

{'acc',
 'att',
 'bleu',
 'bleu1',
 'bleu2',
 'bleu3',
 'bleu4',
 'chex',
 'ciderD',
 'dataset',
 'distinct',
 'f1',
 'free',
 'loss',
 'prec',
 'recall',
 'roc',
 'rougeL',
 'run',
 'stop',
 'word'}

## Segmentation

In [None]:
def add_macro_avg_column(target_col):
    """Add (or refresh) a column holding the row-wise mean of its 3 sub-columns.

    The sub-columns are the columns of RESULTS_DF whose name starts with
    `target_col`, excluding `target_col` itself so that the cell can be
    re-run after the average column has been created.

    Args:
        target_col: prefix of the columns to average; also the name of the
            column written to RESULTS_DF.

    Raises:
        AssertionError: if the number of matching sub-columns is not 3.
    """
    matching_cols = [
        c for c in RESULTS_DF.columns
        if c.startswith(target_col) and c != target_col
    ]
    assert len(matching_cols) == 3, f'Matching cols not 3: {matching_cols}'
    RESULTS_DF[target_col] = RESULTS_DF[matching_cols].mean(axis=1)
    print(f'Calculated col {target_col}')

In [None]:
# Add macro-average columns to RESULTS_DF for the two generated-shape metrics
add_macro_avg_column('n-shapes-gen')
add_macro_avg_column('n-holes-gen')

In [None]:
organs = ('heart', 'left lung', 'right lung')
SEG_METRICS = []

def _add_metric(metric_name, macro=True):
    """Register a metric in SEG_METRICS.

    Appends `metric_name` itself when `macro` is true, followed by one
    `<metric_name>-<organ>` entry for every organ.
    """
    entries = [metric_name] if macro else []
    entries += [f'{metric_name}-{organ}' for organ in organs]
    SEG_METRICS.extend(entries)

_add_metric('iou')
# _add_metric('dice')
_add_metric('n-shapes-gen')
_add_metric('n-holes-gen')
SEG_METRICS

In [None]:
# (regex, replacement) pairs applied in order by rename_runs(); commented
# entries are kept as optional shortenings.
replace_strs = [
    # (r'^\d{4}_\d{6}_', ''),
    (r'jsrt_scan_', ''),
#     ('most-similar-image', '1nn'),
#     ('_lr[\d\.]+', ''),
#     ('_size256', ''),
#     (r'_\d{4}_\d{6}_.*', ''),
#     ('dummy-', ''),
#     ('common', 'top'),
#     ('-v2', ''),
#     (r'top-(\w)\w+-(\d+)', r'top-\1-\2'),
#     ('_densenet-121', ''),
]

def rename_runs(run_name):
    """Shorten a run name by applying every pair of `replace_strs` with re.sub.

    `replace_strs` is looked up when the function is called, so the list
    bound to that name at call time is the one used.
    """
    shortened = run_name
    for pattern, substitute in replace_strs:
        shortened = re.sub(pattern, substitute, shortened)
    return shortened

In [None]:
# Test-set segmentation metrics per run (excluding runs matching '1105_180035'),
# sorted by the macro-averaged shape/hole counts and shown with shortened run names.
filter_results(
    metrics=SEG_METRICS,
    dataset_type='test',
    drop='1105_180035',
).sort_values(
    ['n-shapes-gen', 'n-holes-gen'],
    ascending=True,
).set_index('run_name').rename(index=rename_runs)

## Report generation

In [13]:
# Column groups used to select metrics from RESULTS_DF
NLP_METRICS = ['bleu1', 'bleu2', 'bleu3', 'bleu4', 'bleu', 'rougeL', 'ciderD']
# Every column starting with one of the holistic-chexpert metric prefixes
CHEXPERT_METRICS = [
    name for name in RESULTS_DF.columns
    if name.startswith(('acc', 'roc_auc', 'recall', 'prec', 'f1'))
]
CHEXPERT_RUNTIME_METRICS = [name for name in RESULTS_DF.columns if name.startswith('chex')]
VAR_METRICS = [name for name in RESULTS_DF.columns if 'distinct' in name]
MIRQI_METRICS = [name for name in RESULTS_DF.columns if 'MIRQI' in name]

In [14]:
# Compact metric selection passed as `metrics=` to filter_results() below.
# NOTE(review): 'f1-woNF' does not appear among the RESULTS_DF columns printed
# above — confirm the column exists for the runs being compared.
ESSENTIAL_METRICS = [
    'bleu', 'ciderD', 'rougeL',
    # 'chex_f1', 'chex_acc', # 'chex_recall', 'chex_prec', # Runtime-chexpert
    # 'MIRQI-v2-f',
    'roc_auc', 'acc', 'f1', 'f1-woNF', 'prec', 'recall', # Holistic-chexpert
]

In [15]:
# F1 columns of the runtime ('chex_f1*') and holistic ('f1*') chexpert metrics.
# `metrics` is currently a hand-picked subset; the commented line would use
# every column of both groups instead.
runtime_chexpert = [c for c in RESULTS_DF.columns if c.startswith('chex_f1')]
holistic_chexpert = [c for c in RESULTS_DF.columns if c.startswith('f1')]
# metrics = runtime_chexpert + holistic_chexpert
metrics = [
    'chex_f1', 'chex_f1_No Finding',
    'f1', 'f1-No Finding'
]

In [16]:
# Show floats with three decimals in displayed tables
pd.set_option('display.float_format', '{:.3f}'.format)

In [18]:
# Essential test-set metrics (free=True rows) for the runs matching `contains`.
# The regex replace strips a leading `dddd_dddddd_` prefix from string values,
# which shortens the run names before sorting.
filter_results(
    # contains='^01\d{2}_\d{6}_(h-|lstm)',
    contains='.*dummy',
    dataset_type='test',
    free=True,
    metrics=ESSENTIAL_METRICS, # CHEXPERT_RUNTIME_METRICS,
    drop_key_cols=True,
    drop_na_rows=True,
).replace(r'^\d{4}_\d{6}_(.*)', r'\1', regex=True).sort_values('run_name')

Unnamed: 0,run_name,bleu,ciderD,rougeL,roc_auc,acc,f1,prec,recall
0,dummy-common-sentences-100_front,0.197,0.157,0.283,0.501,0.907,0.048,0.038,0.073
32,dummy-common-words-100_front,0.118,0.087,0.215,0.492,0.81,0.055,0.061,0.126
79,dummy-constant_front,0.286,0.29,0.355,0.5,0.913,0.037,0.025,0.071
8,dummy-random_front,0.196,0.142,0.275,0.498,0.887,0.063,0.058,0.071


### Compare runtime chexpert vs holistic chexpert

In [None]:
def subtract_cols(df, cols_a, cols_b, drop_na_rows=True):
    """Return the key columns of `df` plus the element-wise `cols_a - cols_b`.

    Columns are paired by position, so `cols_a` and `cols_b` must have the
    same length; the differences are stored under the `cols_a` names.

    Args:
        df: DataFrame containing KEY_COLS, `cols_a` and `cols_b`.
        cols_a: list of minuend column names.
        cols_b: list of subtrahend column names.
        drop_na_rows: drop rows with any NaN in the result.

    Returns:
        A new DataFrame indexed like `df` (minus dropped rows).
    """
    array_a = df[cols_a].to_numpy()
    array_b = df[cols_b].to_numpy()

    # Reuse df's index: with a default RangeIndex, concat(axis=1) would align
    # the differences with the wrong rows whenever `df` has been filtered.
    diff = pd.DataFrame(array_a - array_b, columns=cols_a, index=df.index)
    df_2 = pd.concat([df[KEY_COLS], diff], axis=1)

    if drop_na_rows:
        df_2 = df_2.dropna(axis=0, how='any')

    return df_2

In [None]:
# Metric to compare between the runtime ('chex_<metric>*') and holistic
# ('<metric>*') chexpert evaluations; the two lists are paired by position
# when passed to subtract_cols().
metric = 'f1'

runtime_chexpert = [c for c in RESULTS_DF.columns if c.startswith(f'chex_{metric}')]
holistic_chexpert = [c for c in RESULTS_DF.columns if c.startswith(metric)]

In [None]:
# Keep only the runs whose name does not contain 'dummy'
df = RESULTS_DF.loc[~RESULTS_DF['run_name'].str.contains('dummy')]
len(df)

In [None]:
# Distinct run names left after the filter
set(df['run_name'])

In [None]:
# Runtime minus holistic chexpert values per row.
# NOTE(review): this rebinds `df` to the difference table, which no longer has
# the holistic columns, so re-running this cell alone fails; re-run the
# filtering cell first.
df = subtract_cols(df, runtime_chexpert, holistic_chexpert)
df.head()

In [None]:
# Summary statistics of the runtime-vs-holistic differences
df.describe()

In [None]:
# Full difference table
df

In [None]:
from collections import Counter

In [None]:
# Load the outputs of one run twice, with free=True and free=False, to compare them.
# NOTE(review): load_rg_outputs is not defined in this notebook; presumably it
# comes from one of the %run scripts — confirm. `debug` here replaces the value
# bound when the results were loaded.
run_name = '0112_154506_lstm-v2_lr0.001_densenet-121-v2_noes'
debug = False
d1 = load_rg_outputs(run_name, debug=debug, free=True)
d2 = load_rg_outputs(run_name, debug=debug, free=False)
len(d1), len(d2)

In [None]:
# Number of rows per filename in each output
c1 = Counter(d1['filename'])
c2 = Counter(d2['filename'])
len(c1), len(c2)

In [None]:
# Report every filename of c1 whose row count differs in c2
for fname, v1 in c1.items():
    v2 = c2[fname]
    if v1 == v2:
        continue
    print('Wrong: ', fname, v1, v2)

In [None]:
# Peek at the free=False outputs
d2.head()

In [None]:
# Dataset types present in the free=False outputs
set(d2['dataset_type'])

### Pretty-print (latex)

In [None]:
# (regex, replacement) pairs applied in order by rename_runs() to turn run
# names into short labels for the latex table. All patterns are raw strings
# so that regex escapes such as '\d' are not read as Python string escapes.
replace_strs = [
    (r'^\d{4}_\d{6}_', ''),
    (r'most-similar-image', '1nn'),
    (r'_lr[\d\.]+', ''),
    (r'_size256', ''),
    (r'_\d{4}_\d{6}_.*', ''),
    (r'dummy-', ''),
    (r'common', 'top'),
    (r'-v2', ''),
    (r'top-(\w)\w+-(\d+)', r'top-\1-\2'),
    (r'_densenet-121', ''),
]

def rename_runs(run_name):
    """Shorten a run name by applying every pair of `replace_strs` with re.sub.

    Args:
        run_name: full run name, e.g. '0122_135739_dummy-common-sentences-100_front'.

    Returns:
        The name after all substitutions, applied in list order.
    """
    s = run_name
    for target, replace_with in replace_strs:
        s = re.sub(target, replace_with, s)
    return s

In [None]:
# Metric columns of the latex table, in display order
columns = ['bleu', 'rougeL', 'ciderD'] + CHEXPERT_METRICS + MIRQI_METRICS

In [None]:
# Test-set, free=True rows of the runs selected by the `contains` regex,
# excluding the runs listed in `drop`; rows with any missing metric are removed.
df = filter_results(dataset_type='test',
                    free=True,
                    metrics=columns,
                    contains='(?=_lstm-att-v2.*densenet|_lstm-v2.*densenet|dummy)',
                    drop='0915_173951|0915_174222|0916_104739',
                    drop_na_rows=True,
                   )

In [None]:
# Inspect the table before exporting it
df

In [None]:
def shorten_cols(s):
    """Abbreviate 'MIRQI-v2' to 'v2' in a column name."""
    return s.replace('MIRQI-v2', 'v2')

In [None]:
# Print the table as latex: rows are the shortened run names (sorted), columns
# are the shortened metric names, one left-aligned label column followed by
# centred metric columns.
print(df.set_index('run_name').rename(
    index=rename_runs,
    columns=shorten_cols,
).sort_index().to_latex(
    columns=[shorten_cols(c) for c in columns],
    float_format='%.3f',
    column_format='l' + 'c' * len(columns),
))

## Classification

In [None]:
# Run selection for the classification task.
# NOTE(review): `contains` is assigned twice below, so only the last value
# ('covid-uc') is in effect after this cell runs.
# contains = 'covid-x'
# contains = 'cxr14'
# contains = 'e0'
# contains = '0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid'
# contains = '0717_101812_covid-x_densenet-121_lr1e-06_os-max2_aug-covid'
run_name = '0717_120222_covid-x_densenet-121_lr1e-06_os_aug-covid' # WINNER

contains = '0717_101812_covid-x_densenet-121_lr1e-06_os-max2_aug-covid'
contains = 'covid-uc'

In [None]:
# Essential metrics for classification.
# NOTE(review): this rebinds ESSENTIAL_METRICS, replacing the report-generation
# list defined earlier in the notebook.
ESSENTIAL_METRICS = [
    'acc', 'roc_auc', 'hamming', # 'prec', 'recall',
]

In [None]:
# Metric columns shown in the classification table below (rebinds `metrics`)
metrics = [
    'acc', 'roc_auc', 'prec', 'recall', 'roc_auc_Cardiomegaly', 'roc_auc_Pneumonia',
    'recall_Cardiomegaly', 'recall_Pneumonia',
    'iobb-masks', 'iobb-masks-Cardiomegaly', 'iobb-masks-Pneumonia',
]

In [None]:
# Show floats with three decimals in displayed tables
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
def simplify_names(s):
    """Reduce a cxr14 run name to its model part, keeping a '_hint' marker.

    The model part is whatever sits between the `dddd_dddddd_cxr14_` prefix
    and the last `_lr` in the name; names that do not match the pattern are
    returned unchanged. '_hint' is appended when 'hint' occurs anywhere in
    the original name.
    """
    model_name = re.sub(r'^\d{4}_\d{6}_cxr14_(.*)_lr.*', r'\1', s)
    return f'{model_name}_hint' if 'hint' in s else model_name

In [None]:
# Validation metrics of the selected cxr14 runs, best roc_auc first.
# NOTE(review): ('val') is a plain string, not a tuple (no trailing comma);
# filter_results handles it through its str branch.
d = filter_results(
    contains=r'^(01|12|02)\w+_cxr14_',
    doesnt_contain=[
        r'_Card',
        r'_Pneu',
        '1027_144914', # really bad results
    ],
    dataset_type=('val'), # -bbox
    # metrics_contain='iou',
    metrics=metrics, # ESSENTIAL_METRICS,
    drop_key_cols=True,
    # drop_na_rows=True,
).sort_values('roc_auc', ascending=False)
# d['run_name'] = d['run_name'].apply(simplify_names)
d

## Report-generation: results at different report lengths

In [None]:
# Candidate report-length limits in words and sentences.
# A None word limit yields an empty filename suffix in the next cell.
vals_words = [20, 25, 27, 33, 44, None]
vals_sents = [3, 4, 5, 6, None]

In [None]:
# Load the results computed with a given maximum report length.
# NOTE(review): this cell looks stale — load_results() as defined in this
# notebook takes no arguments, so load_results(suffix) raises TypeError, and
# create_results_df is not defined anywhere in this notebook.
max_words = vals_words[0]
suffix = f'max-words-{max_words}' if max_words else ''
all_results = load_results(suffix)
results_df_test = create_results_df(all_results, 'test')
results_df_test