# Save paper results

Save baseline/paper results to file

In [None]:
import json
import os
import numpy as np
import pprint

In [None]:
%run ../datasets/common/constants.py
%run ../utils/__init__.py
%run ../utils/files.py

# Utils

In [None]:
def _save_metrics(folder, filename, results_dict):
    """Dump `results_dict` as indented JSON into `folder/filename`.

    Args:
        folder: target directory; created (with missing parents) if absent.
        filename: name of the JSON file inside `folder`.
        results_dict: JSON-serializable object to store.

    Prints the path of the written file once the dump is done.
    """
    target_path = os.path.join(folder, filename)

    # The directory must exist before the file can be opened for writing
    os.makedirs(folder, exist_ok=True)

    with open(target_path, 'w') as handle:
        json.dump(results_dict, handle, indent=2)

    print(f'Saved dict to {target_path}')

In [None]:
def save_mirqi_metrics(folder, results):
    """Store `results` in `folder` under 'mirqi-metrics-free.json'."""
    _save_metrics(
        folder=folder,
        filename='mirqi-metrics-free.json',
        results_dict=results,
    )

def save_chexpert_metrics(folder, results):
    """Store `results` in `folder` under 'chexpert-metrics-free.json'."""
    _save_metrics(
        folder=folder,
        filename='chexpert-metrics-free.json',
        results_dict=results,
    )

def save_runtime_metrics(folder, results):
    """Store `results` in `folder` under 'metrics-free.json'."""
    _save_metrics(
        folder=folder,
        filename='metrics-free.json',
        results_dict=results,
    )

In [None]:
def get_paper_folder(dataset, paper, save_mode=False):
    """Return the results folder used for a paper baseline on `dataset`.

    The run is named '<dataset>_paper_<paper>', wrapped as
    RunId(run_name, False, 'rg') and resolved through get_results_folder,
    to which `save_mode` is forwarded unchanged.

    Args:
        dataset: 'iu-x-ray' or 'mimic-cxr'; anything else trips the assertion.
        paper: identifier of the paper, used as the run-name suffix.
        save_mode: passed through to get_results_folder.
    """
    supported_datasets = ('iu-x-ray', 'mimic-cxr')
    assert dataset in supported_datasets

    run_id = RunId(f'{dataset}_paper_{paper}', False, 'rg')
    return get_results_folder(run_id, save_mode=save_mode)

In [None]:
def compute_mean(metrics, prefixes, no_finding=False,
                 diseases=CHEXPERT_DISEASES, verbose=False):
    """Average per-disease metrics, one average per metric prefix.

    Args:
        metrics: mapping with '<prefix>-<disease>' entries.
        prefixes: a single prefix string or an iterable of prefixes
            (e.g. 'f1', 'prec', 'recall').
        no_finding: when falsy, diseases whose lowercased name is
            'no finding' are left out of the average.
        diseases: disease names to average over.
        verbose: when truthy, print the averaged slice for each prefix.

    Returns:
        dict mapping '<prefix>' (if no_finding) or '<prefix>-woNF'
        (otherwise) to the np.mean of the selected entries.

    Raises:
        KeyError: if `metrics` lacks one of the selected keys.
    """
    prefix_list = (prefixes,) if isinstance(prefixes, str) else prefixes
    key_suffix = '' if no_finding else '-woNF'

    averages = {}
    for prefix in prefix_list:
        # Collect the metric keys taking part in this prefix's average
        metric_keys = []
        for disease in diseases:
            if not no_finding and disease.lower() == 'no finding':
                continue
            metric_keys.append(f'{prefix}-{disease}')

        selected_values = [metrics[name] for name in metric_keys]
        average = np.mean(selected_values)
        averages[f'{prefix}{key_suffix}'] = average

        if verbose:
            formatted = pprint.pformat(dict(zip(metric_keys, selected_values)))
            print(f'Prefix={prefix}, avg={average}, slice: {len(metric_keys)}, {formatted}')

    return averages

# Papers

## Paper MIRQI

In [None]:
folder = get_paper_folder('iu-x-ray', 'zhang-et-al-mirqi')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.441, 0.291, 0.203, 0.147
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.304,
        'rougeL': 0.367,
    }
}
mirqi_results = {
    'test': {
        'MIRQI-r': 0.483,
        'MIRQI-p': 0.490,
        'MIRQI-f': 0.478,
    }
}

In [None]:
save_mirqi_metrics(folder, mirqi_results)
save_runtime_metrics(folder, runtime_results)

## Lovelace et al

In [None]:
folder = get_paper_folder('mimic-cxr', 'lovelace-et-al')

In [None]:
# Using their transformer w/fine-tuning ablation
bleu1, bleu2, bleu3, bleu4 = 0.415, 0.272, 0.193, 0.146
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.316, # not sure if Cider-D or Cider
        'rougeL': 0.318,
    }
}
_values = {
    'f1': 22.8,
    'prec': 33.3,
    'recall': 21.7,

    'f1-Atelectasis': 32.2,
    'f1-Cardiomegaly': 43.3,
    'f1-Consolidation': 7.3,
    'f1-Edema': 29.8,
    'f1-Enlarged Cardiomediastinum': 5.9,
    'f1-Fracture': 0,
    'f1-Lung Lesion': 1.4,
    'f1-Lung Opacity': 17.1,
    'f1-No Finding': 54.1,
    'f1-Pleural Effusion': 48.0,
    'f1-Pleural Other': 0.9,
    'f1-Pneumonia': 3.9,
    'f1-Pneumothorax': 9.8,
    'f1-Support Devices': 66.0,

    'prec-Atelectasis': 43.0,
    'prec-Cardiomegaly': 46.9,
    'prec-Consolidation': 15.7,
    'prec-Edema': 37.6,
    'prec-Enlarged Cardiomediastinum': 12.3,
    'prec-Fracture': 0,
    'prec-Lung Lesion': 23.8,
    'prec-Lung Opacity': 64.0,
    'prec-No Finding': 39.0,
    'prec-Pleural Effusion': 71.2,
    'prec-Pleural Other': 16.1,
    'prec-Pneumonia': 7,
    'prec-Pneumothorax': 12.9,
    'prec-Support Devices': 77.0,

    'recall-Atelectasis': 25.8,
    'recall-Cardiomegaly': 40.2,
    'recall-Consolidation': 4.8,
    'recall-Edema': 24.6,
    'recall-Enlarged Cardiomediastinum': 3.9,
    'recall-Fracture': 0,
    'recall-Lung Lesion': 0.7,
    'recall-Lung Opacity': 9.9,
    'recall-No Finding': 88.2,
    'recall-Pleural Effusion': 36.2,
    'recall-Pleural Other': 0.5,
    'recall-Pneumonia': 2.7,
    'recall-Pneumothorax': 7.8,
    'recall-Support Devices': 57.8,
}
woNF = calculate_avg_woNF(_values, ['f1', 'recall', 'prec'])
_values.update(woNF)
chexpert_results = {
    'test': {
        k: value / 100
        for k, value in _values.items()
    },
}
woNF

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

## Boag et al

### 1NN

In [None]:
folder = get_paper_folder('mimic-cxr', 'boag-et-al-1nn')

In [None]:
# Using their 1-NN model
bleu1, bleu2, bleu3, bleu4 = 0.305, 0.171, 0.098, 0.057
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.755, # not sure if Cider-D or Cider
    }
}
_values = {
    'acc': 0.818,
    'prec': 0.253,
    'f1': 0.258,

    'f1-Support Devices': 0.527,
    'f1-Lung Opacity': 0.417,
    'f1-Cardiomegaly': 0.445,
    'f1-Atelectasis': 0.375,
    'f1-No Finding': 0.455,
    'f1-Pleural Effusion': 0.532,
    'f1-Edema': 0.286,
    'f1-Enlarged Cardiomediastinum': 0.142,
    'f1-Pneumonia': 0.08,
    'f1-Pneumothorax': 0.111,
    'f1-Fracture': 0.060,
    'f1-Lung Lesion': 0.062,
    'f1-Consolidation': 0.085,
    'f1-Pleural Other': 0.039,
}
woNF = calculate_avg_woNF(_values, 'f1')
_values.update(woNF)
chexpert_results = {
    'test': _values,
}
woNF

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

### cnn-rnn (without beam-search)

In [None]:
folder = get_paper_folder('mimic-cxr', 'boag-et-al-cnn-rnn')

In [None]:
# Using their CNN-RNN (wo-beam)
bleu1, bleu2, bleu3, bleu4 = 0.004, 0.001, 0.001, 0.001
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.066, # not sure if Cider-D or Cider
    }
}
_values = {
    'acc': 0.822, 
    'prec': 0.144,
    'f1': 0.067,

    'f1-Support Devices': 0.106,
    'f1-Lung Opacity': 0.330,
    'f1-Cardiomegaly': 0.022,
    'f1-Atelectasis': 0.054,
    'f1-No Finding': 0.362,
    'f1-Pleural Effusion': 0.001, # less than that
    'f1-Edema': 0.009,
    'f1-Enlarged Cardiomediastinum': 0.001, # less than that
    'f1-Pneumonia': 0.01,
    'f1-Pneumothorax': 0.042,
    'f1-Fracture': 0.001, # less than that
    'f1-Lung Lesion': 0.005,
    'f1-Consolidation': 0.002,
    'f1-Pleural Other': 0.001, # less than that
}

In [None]:
wNF = compute_mean(_values, ['f1'], no_finding=True)
woNF = compute_mean(_values, ['f1'], no_finding=False)
woNF, wNF

In [None]:
chexpert_results = {'test': {**_values,**woNF}}

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

### cnn-rnn-beam

In [None]:
folder = get_paper_folder('mimic-cxr', 'boag-et-al-cnn-rnn-beam')

In [None]:
# Using their CNN-RNN-beam
bleu1, bleu2, bleu3, bleu4 = 0.305, 0.201, 0.137, 0.092
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.850, # not sure if Cider-D or Cider
    }
}
_values = {
    'acc': 0.837, 
    'prec': 0.304,
    'f1': 0.186,

    'f1-Support Devices': 0.613,
    'f1-Lung Opacity': 0.077,
    'f1-Cardiomegaly': 0.390,
    'f1-Atelectasis': 0.146,
    'f1-No Finding': 0.407,
    'f1-Pleural Effusion': 0.473,
    'f1-Edema': 0.271,
    'f1-Enlarged Cardiomediastinum': 0.134,
    'f1-Pneumonia': 0.03,
    'f1-Pneumothorax': 0.043,
    'f1-Fracture': 0.001,
    'f1-Lung Lesion': 0.001, # less than that
    'f1-Consolidation': 0.014,
    'f1-Pleural Other': 0.001, # less than that
}
woNF = calculate_avg_woNF(_values, ['f1'], CHEXPERT_DISEASES)
_values.update(woNF)
chexpert_results = {
    'test': _values,
}
woNF

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

## Liu et al

### CCR

In [None]:
folder = get_paper_folder('mimic-cxr', 'liu-et-al-ccr')

In [None]:
# Using their CCR ablation
bleu1, bleu2, bleu3, bleu4 = 0.294, 0.190, 0.134, 0.094
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.956,
        'rougeL': 0.284,
    }
}
_values = {
    'acc': 0.868,
    'prec': 0.313,
    'recall': 0.126,

    'prec-No Finding': 0.491,
    'prec-Enlarged Cardiomediastinum': 0.202,
    'prec-Cardiomegaly': 0.678,
    'prec-Lung Lesion': 0,
    'prec-Lung Opacity': 0.640,
    'prec-Edema': 0.280,
    'prec-Consolidation': 0.037,
    'prec-Pneumonia': 0,
    'prec-Atelectasis': 0.476,
    'prec-Pneumothorax': 0.039,
    'prec-Pleural Effusion': 0.683,
    'prec-Pleural Other': 0,
    'prec-Fracture': 0,
    'prec-Support Devices': 0.849,
}
woNF = calculate_avg_woNF(_values, ['prec'], CHEXPERT_DISEASES, verbose=False)
_values.update(woNF)
chexpert_results = {
    'test': _values,
}
woNF

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

### Full

In [None]:
folder = get_paper_folder('mimic-cxr', 'liu-et-al-full')

In [None]:
# Using their full model
bleu1, bleu2, bleu3, bleu4 = 0.313, 0.206, 0.146, 0.103
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 1.046,
        'rougeL': 0.306,
    }
}
_values = {
    'acc': 0.867,
    'prec': 0.309,
    'recall': 0.134,

    'prec-No Finding': 0.405,
    'prec-Enlarged Cardiomediastinum': 0.167,
    'prec-Cardiomegaly': 0.704,
    'prec-Lung Lesion': 0,
    'prec-Lung Opacity': 0.460,
    'prec-Edema': 0,
    'prec-Consolidation': 0,
    'prec-Pneumonia': 0.4,
    'prec-Atelectasis': 0.521,
    'prec-Pneumothorax': 0.098,
    'prec-Pleural Effusion': 0.689,
    'prec-Pleural Other': 0,
    'prec-Fracture': 0,
    'prec-Support Devices': 0.880,
}
# NOTE(review): calculate_avg_woNF is not defined in the visible part of this
# notebook -- presumably it is provided by one of the %run'd files; confirm.
# compute_mean (defined in the Utils section) takes `no_finding` as its third
# positional parameter, so it is not a drop-in replacement for this call.
woNF = calculate_avg_woNF(_values, ['prec'], CHEXPERT_DISEASES, verbose=False)
woNF

In [None]:
_values.update(woNF)

chexpert_results = {
    'test': _values,
}

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

## Ni et al

In [None]:
def calculate_f1(values, diseases):
    """Derive per-disease F1 scores from precision and recall.

    Args:
        values: mapping with 'prec-<disease>' and 'recall-<disease>' entries.
        diseases: iterable of disease names to compute F1 for.

    Returns:
        dict with one 'f1-<disease>' entry per disease plus 'f1', the
        np.mean of those per-disease scores.

    Raises:
        KeyError: if precision or recall is missing for a disease.
    """
    f1s = dict()
    for disease in diseases:
        prec = values[f'prec-{disease}']
        recall = values[f'recall-{disease}']

        denominator = prec + recall
        if denominator:
            f1 = 2 * (prec * recall) / denominator
        else:
            # Precision and recall are both 0: the harmonic mean is undefined,
            # so F1 is taken as 0 instead of raising ZeroDivisionError
            f1 = 0.0
        f1s[f'f1-{disease}'] = f1
    f1s['f1'] = np.mean(list(f1s.values()))
    return f1s

In [None]:
folder = get_paper_folder('mimic-cxr', 'ni-et-al')

In [None]:
# MIMIC-CXR dataset but only with abnormal findings!!!
# approx 30k samples in total
# CVSE + mutual exclusivity ablation 
bleu4, bleu1 = 0.036, 0.192
# meteor = 0.077
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu4': bleu4,
        'rougeL': 0.153,
    }
}
_values = {
    'acc': 0.863,
    'prec': 0.317,
    'recall': 0.224,

    'acc-No Finding': 0.769,
    'acc-Enlarged Cardiomediastinum': 0.926,
    'acc-Cardiomegaly': 0.801,
    'acc-Lung Lesion': 0.921,
    'acc-Lung Opacity': 0.692,
    'acc-Edema': 0.920,
    'acc-Consolidation': 0.876,
    'acc-Pneumonia': 0.859,
    'acc-Atelectasis': 0.773,
    'acc-Pneumothorax': 0.964,
    'acc-Pleural Effusion': 0.894,
    'acc-Pleural Other': 0.962,
    'acc-Fracture': 0.917,
    'acc-Support Devices': 0.808,

    'prec-No Finding': 0.346,
    'prec-Enlarged Cardiomediastinum': 0.063,
    'prec-Cardiomegaly': 0.512,
    'prec-Lung Lesion': 0.192,
    'prec-Lung Opacity': 0.635,
    'prec-Edema': 0.405,
    'prec-Consolidation': 0.130,
    'prec-Pneumonia': 0.364,
    'prec-Atelectasis': 0.525,
    'prec-Pneumothorax': 0.073,
    'prec-Pleural Effusion': 0.640,
    'prec-Pleural Other': 0.145,
    'prec-Fracture': 0.063,
    'prec-Support Devices': 0.348,

    'recall-No Finding': 0.265,
    'recall-Enlarged Cardiomediastinum': 0.060,
    'recall-Cardiomegaly': 0.606,
    'recall-Lung Lesion': 0.121,
    'recall-Lung Opacity': 0.237,
    'recall-Edema': 0.206,
    'recall-Consolidation': 0.181,
    'recall-Pneumonia': 0.214,
    'recall-Atelectasis': 0.320,
    'recall-Pneumothorax': 0.051,
    'recall-Pleural Effusion': 0.465,
    'recall-Pleural Other': 0.036,
    'recall-Fracture': 0.050,
    'recall-Support Devices': 0.321,
}
f1 = calculate_f1(_values, CHEXPERT_DISEASES)
_values.update(f1)
woNF = calculate_avg_woNF(_values, ['recall', 'f1', 'prec'], CHEXPERT_DISEASES, verbose=False)
_values.update(woNF)

chexpert_results = {
    'test': _values,
}
f1, woNF

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

## Chen et al

In [None]:
folder = get_paper_folder('mimic-cxr', 'chen-et-al')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.353, 0.218, 0.145, 0.103
# meteor = 0.142 # unused!
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.277,
    }
}
chexpert_results = {
    'test': {
        'f1': 0.276,
        'prec': 0.333,
        'recall': 0.273,
    },
}

In [None]:
save_chexpert_metrics(folder, chexpert_results)
save_runtime_metrics(folder, runtime_results)

## RTEx paper

In [None]:
mimic_folder = get_paper_folder('mimic-cxr', 'rtex')

In [None]:
runtime_results = {
    'test': {
        'bleu4': 5.9 / 100, # Assume is bleu4
        'rougeL': 20.5 / 100
    }
}
chexpert_results = {
    'test': {
        'prec': 0.229,
        'recall': 0.284,
    },
}

In [None]:
save_chexpert_metrics(mimic_folder, chexpert_results)
save_runtime_metrics(mimic_folder, runtime_results)

In [None]:
iu_folder = get_paper_folder('iu-x-ray', 'rtex')

In [None]:
runtime_results = {
    'test': {
        'bleu4': 5.5 / 100,
        'rougeL': 20.2 / 100
    }
}
chexpert_results = {
    'test': {
        'prec': 0.193,
        'recall': 0.222,
    },
}

In [None]:
save_chexpert_metrics(iu_folder, chexpert_results)
save_runtime_metrics(iu_folder, runtime_results)

## Nguyen et al

MV+T+I variant

In [None]:
mimic_folder = get_paper_folder('mimic-cxr', 'nguyen-et-al')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.495, 0.360, 0.278, 0.224
# meteor = 0.222 # unused!
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.390,
    }
}
chexpert_results = {
    'test': {
        'acc': 0.887,
        # Macro scores:
        # 'auc': 0.784,
        'f1': 0.412,
        'prec': 0.432,
        'recall': 0.418,
        ## Micro scores:
        # 'micro-auc': 0.874,
        # 'micro-f1': 0.576,
        # 'micro-prec': 0.567,
        # 'micro-recall': 0.585,
    },
}

In [None]:
save_chexpert_metrics(mimic_folder, chexpert_results)
save_runtime_metrics(mimic_folder, runtime_results)

In [None]:
iu_folder = get_paper_folder('iu-x-ray', 'nguyen-et-al')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.515, 0.378, 0.293, 0.235
# meteor = 0.219 # unused!
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.362,
    }
}
chexpert_results = {
    'test': {
        'acc': 0.937,
        # Macro scores:
        # 'auc': 0.702,
        'f1': 0.152,
        'prec': 0.142,
        'recall': 0.173,
        ## Micro scores:
        # 'micro-auc': 0.877,
        # 'micro-f1': 0.626,
        # 'micro-prec': 0.604,
        # 'micro-recall': 0.649,
    },
}

In [None]:
save_chexpert_metrics(iu_folder, chexpert_results)
save_runtime_metrics(iu_folder, runtime_results)

## Nishino et al

TS-MRGen without modification is the fair comparison.

In [None]:
mimic_folder = get_paper_folder('mimic-cxr', 'nishino-et-al')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.217, 0.118, 0.073, 0.048
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
    }
}
chexpert_results = {
    'test': {
        'acc': 0.873,
        # Macro scores:
        'f1': 0.217,
        # 'micro-f1': 0.296,
        # Most likely micro!! (though not specified)
        # Unclear if macro or micro! probably micro
        # 'micro-prec': 0.482,
    },
}

In [None]:
save_chexpert_metrics(mimic_folder, chexpert_results)
save_runtime_metrics(mimic_folder, runtime_results)

## Liu et al 2021: contrastive attention

In [None]:
mimic_folder = get_paper_folder('mimic-cxr', 'liu-2021-et-al-CA')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.350, 0.290, 0.152, 0.109
# meteor = 0.151
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.283,
    }
}
chexpert_results = {
    'test': {
        'f1': 0.303,
        'prec': 0.352,
        'recall': 0.298,
    },
}

In [None]:
save_chexpert_metrics(mimic_folder, chexpert_results)
save_runtime_metrics(mimic_folder, runtime_results)

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.492, 0.314, 0.222, 0.169
# meteor = 0.193
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.381,
    }
}

In [None]:
iu_folder = get_paper_folder('iu-x-ray', 'liu-2021-et-al-CA')
save_runtime_metrics(iu_folder, runtime_results)

## Variational topic inference (VTI)

Najdenkoska et al

In [None]:
mimic_folder = get_paper_folder('mimic-cxr', 'vti')

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.418, 0.293, 0.152, 0.109
# meteor = 0.177
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.302,
    }
}
chexpert_results = {
    'test': {
        'f1': 0.210,
        'prec': 0.350,
        'recall': 0.151,
        # Micro:
        # 'micro-f1': 0.403,
        # 'micro-prec': 0.497,
        # 'micro-recall': 0.342,
    },
}

In [None]:
save_chexpert_metrics(mimic_folder, chexpert_results)
save_runtime_metrics(mimic_folder, runtime_results)

In [None]:
bleu1, bleu2, bleu3, bleu4 = 0.493, 0.360, 0.291, 0.154
# meteor = 0.218
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.375,
    }
}

In [None]:
iu_folder = get_paper_folder('iu-x-ray', 'vti')
save_runtime_metrics(iu_folder, runtime_results)

## RATCHET

In [None]:
mimic_folder = get_paper_folder('mimic-cxr', 'ratchet')

In [None]:
# meteor = 0.101
# spice = 0.127
runtime_results = {
    'test': {
        'bleu1': 0.232, # 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        # 'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'rougeL': 0.240,
        'ciderD': 0.493,
    }
}

In [None]:
_f1_values = {
    'f1-No Finding': 0.451,
    'f1-Enlarged Cardiomediastinum': 0.015,
    'f1-Cardiomegaly': 0.446,
    'f1-Lung Lesion': 0.069,
    'f1-Lung Opacity': 0.344,
    'f1-Edema': 0.407,
    'f1-Consolidation': 0.041,
    'f1-Pneumonia': 0.234,
    'f1-Atelectasis': 0.411,
    'f1-Pneumothorax': 0.110,
    'f1-Pleural Effusion': 0.633,
    'f1-Pleural Other': 0,
    'f1-Fracture': 0,
    'f1-Support Devices': 0.697,
}

_prec_values = {
    'prec-No Finding': 0.344,
    'prec-Enlarged Cardiomediastinum': 0.096,
    'prec-Cardiomegaly': 0.405,
    'prec-Lung Lesion': 0.162,
    'prec-Lung Opacity': 0.500,
    'prec-Edema': 0.582,
    'prec-Consolidation': 0.233,
    'prec-Pneumonia': 0.422,
    'prec-Atelectasis': 0.465,
    'prec-Pneumothorax': 0.110,
    'prec-Pleural Effusion': 0.704,
    'prec-Pleural Other': 0,
    'prec-Fracture': 0,
    'prec-Support Devices': 0.628,
}

_recall_values = {
    'recall-No Finding': 0.653,
    'recall-Enlarged Cardiomediastinum': 0.008,
    'recall-Cardiomegaly': 0.496,
    'recall-Lung Lesion': 0.044,
    'recall-Lung Opacity': 0.262,
    'recall-Edema': 0.312,
    'recall-Consolidation': 0.022,
    'recall-Pneumonia': 0.162,
    'recall-Atelectasis': 0.368,
    'recall-Pneumothorax': 0.110,
    'recall-Pleural Effusion': 0.575,
    'recall-Pleural Other': 0,
    'recall-Fracture': 0,
    'recall-Support Devices': 0.783,
}

In [None]:
# Merge aggregate scores with the per-disease RATCHET values defined in the
# previous cell; the per-disease entries are unpacked after the aggregates.
# NOTE(review): full_mean and woNF_mean are not defined in the visible part of
# this notebook -- presumably they return the mean over all diseases and the
# mean excluding 'No Finding', and come from one of the %run'd files; confirm.
_values = {
    'recall': full_mean(_recall_values),
    'recall-woNF': woNF_mean(_recall_values),
    'prec': full_mean(_prec_values),
    'prec-woNF': woNF_mean(_prec_values),
    'f1': full_mean(_f1_values),
    'f1-woNF': woNF_mean(_f1_values),
    **_f1_values,
    **_prec_values,
    **_recall_values,
}
# Results are stored under the 'test' split key, like the other papers
chexpert_results = {
    'test': _values,
}

In [None]:
save_chexpert_metrics(mimic_folder, chexpert_results)
save_runtime_metrics(mimic_folder, runtime_results)

## Miura et al

Two ablations: fc_E and fc_EN

### FC_E ablation in MIMIC

In [None]:
mimic_folder_fce = get_paper_folder('mimic-cxr', 'miura-et-al-fce')

In [None]:
runtime_results = {
    'test': {
        'bleu4': 0.111,
        'ciderD': 0.492,
    }
}

In [None]:
_values = {
    'prec-Atelectasis': 0.379,
    'recall-Atelectasis': 0.805,
    'f1-Atelectasis': 0.516,
    'prec-Cardiomegaly': 0.343,
    'recall-Cardiomegaly': 0.813,
    'f1-Cardiomegaly': 0.482,
    'prec-Consolidation': 0.196,
    'recall-Consolidation': 0.057,
    'f1-Consolidation': 0.089,
    'prec-Edema': 0.56,
    'recall-Edema': 0.699,
    'f1-Edema': 0.622,
    'prec-Pleural Effusion': 0.682,
    'recall-Pleural Effusion': 0.785,
    'f1-Pleural Effusion': 0.730,
    'prec-Enlarged Cardiomediastinum': 0.048,
    'recall-Enlarged Cardiomediastinum': 0.198,
    'f1-Enlarged Cardiomediastinum': 0.077,
    'prec-Fracture': 0.107,
    'recall-Fracture': 0.054,
    'f1-Fracture': 0.071,
    'prec-Lung Lesion': 0.222,
    'recall-Lung Lesion': 0.021,
    'f1-Lung Lesion': 0.038,
    'prec-Lung Opacity': 0.535,
    'recall-Lung Opacity': 0.104,
    'f1-Lung Opacity': 0.174,
    'prec-No Finding': 0.498,
    'recall-No Finding': 0.417,
    'f1-No Finding': 0.454,
    'prec-Pleural Other': 0,
    'recall-Pleural Other': 0,
    'f1-Pleural Other': 0,
    'prec-Pneumonia': 0.621,
    'recall-Pneumonia': 0.17,
    'f1-Pneumonia': 0.267,
    'prec-Pneumothorax': 0.37,
    'recall-Pneumothorax': 0.128,
    'f1-Pneumothorax': 0.19,
    'prec-Support Devices': 0.532,
    'recall-Support Devices': 0.787,
    'f1-Support Devices': 0.635,
}

In [None]:
d1 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=True)
d2 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=False)
d1, d2

In [None]:
chexpert_results = {'test': { **_values, **d1, **d2 } }

In [None]:
save_chexpert_metrics(mimic_folder_fce, chexpert_results)
save_runtime_metrics(mimic_folder_fce, runtime_results)

### FC_E ablation in IU

In [None]:
iu_folder_fce = get_paper_folder('iu-x-ray', 'miura-et-al-fce')

In [None]:
runtime_results = {'test': { 'bleu4': 0.12, 'ciderD': 0.996 }}

In [None]:
_values = {
    'prec-Atelectasis': 0.358,
    'recall-Atelectasis': 0.477,
    'f1-Atelectasis': 0.409,
    'prec-Cardiomegaly': 0.573,
    'recall-Cardiomegaly': 0.556,
    'f1-Cardiomegaly': 0.564,
    'prec-Consolidation': 0.152,
    'recall-Consolidation': 0.263,
    'f1-Consolidation': 0.192,
    'prec-Edema': 0.309,
    'recall-Edema': 0.507,
    'f1-Edema': 0.384,
    'prec-Pleural Effusion': 0.594,
    'recall-Pleural Effusion': 0.664,
    'f1-Pleural Effusion': 0.627,
    'prec-Enlarged Cardiomediastinum': 0.02,
    'recall-Enlarged Cardiomediastinum': 0.042,
    'f1-Enlarged Cardiomediastinum': 0.027,
    'prec-Fracture': 0.0,
    'recall-Fracture': 0.0,
    'f1-Fracture': 0.0,
    'prec-Lung Lesion': 0.0,
    'recall-Lung Lesion': 0.0,
    'f1-Lung Lesion': 0.0,
    'prec-Lung Opacity': 0.578,
    'recall-Lung Opacity': 0.076,
    'f1-Lung Opacity': 0.134,
    'prec-No Finding': 0.821,
    'recall-No Finding': 0.915,
    'f1-No Finding': 0.865,
    'prec-Pleural Other': 0.0,
    'recall-Pleural Other': 0.0,
    'f1-Pleural Other': 0.0,
    'prec-Pneumonia': 0.386,
    'recall-Pneumonia': 0.248,
    'f1-Pneumonia': 0.302,
    'prec-Pneumothorax': 0.0,
    'recall-Pneumothorax': 0.0,
    'f1-Pneumothorax': 0.0,
    'prec-Support Devices': 0.197,
    'recall-Support Devices': 0.366,
    'f1-Support Devices': 0.256,
}

In [None]:
d1 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=True)
d2 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=False)
d1, d2

In [None]:
chexpert_results = {'test': { **_values, **d1, **d2 } }

In [None]:
save_chexpert_metrics(iu_folder_fce, chexpert_results)
save_runtime_metrics(iu_folder_fce, runtime_results)

### FC_EN ablation in MIMIC

In [None]:
mimic_folder_fcen = get_paper_folder('mimic-cxr', 'miura-et-al-fcen')

In [None]:
runtime_results = {'test': { 'bleu4': 0.114, 'ciderD': 0.509 }}

In [None]:
_values = {
    'prec-Atelectasis': 0.406,
    'recall-Atelectasis': 0.762,
    'f1-Atelectasis': 0.530,
    'prec-Cardiomegaly': 0.375,
    'recall-Cardiomegaly': 0.613,
    'f1-Cardiomegaly': 0.466,
    'prec-Consolidation': 0.192,
    'recall-Consolidation': 0.032,
    'f1-Consolidation': 0.055,
    'prec-Edema': 0.656,
    'recall-Edema': 0.527,
    'f1-Edema': 0.585,
    'prec-Pleural Effusion': 0.659,
    'recall-Pleural Effusion': 0.820,
    'f1-Pleural Effusion': 0.731,
    'prec-Enlarged Cardiomediastinum': 0.046,
    'recall-Enlarged Cardiomediastinum': 0.477,
    'f1-Enlarged Cardiomediastinum': 0.084,
    'prec-Fracture': 0.261,
    'recall-Fracture': 0.107,
    'f1-Fracture': 0.152,
    'prec-Lung Lesion': 0.444,
    'recall-Lung Lesion': 0.041,
    'f1-Lung Lesion': 0.075,
    'prec-Lung Opacity': 0.549,
    'recall-Lung Opacity': 0.266,
    'f1-Lung Opacity': 0.358,
    'prec-No Finding': 0.488,
    'recall-No Finding': 0.399,
    'f1-No Finding': 0.439,
    'prec-Pleural Other': 0.0,
    'recall-Pleural Other': 0.0,
    'f1-Pleural Other': 0.0,
    'prec-Pneumonia': 0.0,
    'recall-Pneumonia': 0.0,
    'f1-Pneumonia': 0.0,
    'prec-Pneumothorax': 0.500,
    'recall-Pneumothorax': 0.103,
    'f1-Pneumothorax': 0.170,
    'prec-Support Devices': 0.490,
    'recall-Support Devices': 0.897,
    'f1-Support Devices': 0.633,
}

In [None]:
d1 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=True)
d2 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=False)
d1, d2

In [None]:
chexpert_results = {'test': { **_values, **d1, **d2 } }

In [None]:
save_chexpert_metrics(mimic_folder_fcen, chexpert_results)
save_runtime_metrics(mimic_folder_fcen, runtime_results)

### FC_EN ablation in IU

In [None]:
iu_folder_fcen = get_paper_folder('iu-x-ray', 'miura-et-al-fcen')

In [None]:
runtime_results = {'test': { 'bleu4': 0.131, 'ciderD': 1.034 }}

In [None]:
_values = {
    'prec-Atelectasis': 0.394,
    'recall-Atelectasis': 0.454,
    'f1-Atelectasis': 0.422,
    'prec-Cardiomegaly': 0.600,
    'recall-Cardiomegaly': 0.467,
    'f1-Cardiomegaly': 0.525,
    'prec-Consolidation': 0.143,
    'recall-Consolidation': 0.053,
    'f1-Consolidation': 0.077,
    'prec-Edema': 0.414,
    'recall-Edema': 0.320,
    'f1-Edema': 0.361,
    'prec-Pleural Effusion': 0.560,
    'recall-Pleural Effusion': 0.664,
    'f1-Pleural Effusion': 0.608,
    'prec-Enlarged Cardiomediastinum': 0.040,
    'recall-Enlarged Cardiomediastinum': 0.208,
    'f1-Enlarged Cardiomediastinum': 0.067,
    'prec-Fracture': 0.031,
    'recall-Fracture': 0.023,
    'f1-Fracture': 0.027,
    'prec-Lung Lesion': 0.667,
    'recall-Lung Lesion': 0.045,
    'f1-Lung Lesion': 0.084,
    'prec-Lung Opacity': 0.411,
    'recall-Lung Opacity': 0.221,
    'f1-Lung Opacity': 0.287,
    'prec-No Finding': 0.817,
    'recall-No Finding': 0.884,
    'f1-No Finding': 0.849,
    'prec-Pleural Other': 0.0,
    'recall-Pleural Other': 0.0,
    'f1-Pleural Other': 0.0,
    'prec-Pneumonia': 0.0,
    'recall-Pneumonia': 0.0,
    'f1-Pneumonia': 0.0,
    'prec-Pneumothorax': 1.00,
    'recall-Pneumothorax': 0.133,
    'f1-Pneumothorax': 0.235,
    'prec-Support Devices': 0.131,
    'recall-Support Devices': 0.561,
    'f1-Support Devices': 0.213,
}

In [None]:
d1 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=True)
d2 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=False)
d1, d2

In [None]:
chexpert_results = {'test': { **_values, **d1, **d2 } }

In [None]:
save_chexpert_metrics(iu_folder_fcen, chexpert_results)
save_runtime_metrics(iu_folder_fcen, runtime_results)

## ARL

### IU

In [None]:
iu_folder_arl = get_paper_folder('iu-x-ray', 'arl')

In [None]:
runtime_results = {'test': {
    'bleu4': 0.125, 'meteor': 0.171, 'rougeL': 0.262, 'ciderD': 0.366,
}}

In [None]:
save_runtime_metrics(iu_folder_arl, runtime_results)

### MIMIC

In [None]:
mimic_folder_arl = get_paper_folder('mimic-cxr', 'arl')

In [None]:
runtime_results = {'test': {
    'bleu4': 0.148, 'meteor': 0.253, 'rougeL': 0.329, 'ciderD': 0.402,
}}

_values = {
    'prec-No Finding': 0.718,
    'recall-No Finding': 0.741,
    'f1-No Finding': 0.729,

    'prec-Lung Opacity': 0.522,
    'recall-Lung Opacity': 0.458,
    'f1-Lung Opacity': 0.488,

    'prec-Atelectasis': 0.426,
    'recall-Atelectasis': 0.259,
    'f1-Atelectasis': 0.322,

    'prec-Pleural Effusion': 0.347,
    'recall-Pleural Effusion': 0.140,
    'f1-Pleural Effusion': 0.200,

    'prec-Pneumonia': 0.356,
    'recall-Pneumonia': 0.148,
    'f1-Pneumonia': 0.209,

    'prec-Cardiomegaly': 0.221,
    'recall-Cardiomegaly': 0.071,
    'f1-Cardiomegaly': 0.108,

    'prec-Edema': 0.134,
    'recall-Edema': 0.038,
    'f1-Edema': 0.059,

    'prec-Support Devices': 0.143,
    'recall-Support Devices': 0.017,
    'f1-Support Devices': 0.0317,

    'prec-Consolidation': 0.089,
    'recall-Consolidation': 0.0119,
    'f1-Consolidation': 0.0209,

    'prec-Lung Lesion': 0.0667,
    'recall-Lung Lesion': 0.0062,
    'f1-Lung Lesion': 0.0113,

    'prec-Pneumothorax': 0.0323,
    'recall-Pneumothorax': 0.0021,
    'f1-Pneumothorax': 0.0040,

    'prec-Enlarged Cardiomediastinum': 0.,
    'recall-Enlarged Cardiomediastinum': 0.,
    'f1-Enlarged Cardiomediastinum': 0.,
    'prec-Fracture': 0.,
    'recall-Fracture': 0.,
    'f1-Fracture': 0.,
    'prec-Pleural Other': 0.,
    'recall-Pleural Other': 0.,
    'f1-Pleural Other': 0.,
}

In [None]:
d1 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=True)
d2 = compute_mean(_values, ['prec', 'recall', 'f1'], no_finding=False)
d1, d2

In [None]:
chexpert_results = {'test': { **_values, **d1, **d2 } }

In [None]:
save_chexpert_metrics(mimic_folder_arl, chexpert_results)
save_runtime_metrics(mimic_folder_arl, runtime_results)

## Survey IU papers

Papers that only report NLP metrics in IU

In [None]:
PAPER_RESULTS = [
    # paper, bleu1, bleu2, bleu3, bleu4, rougeL, cider-D
    # ('coatt', 0.517, 0.386, 0.306, 0.247, 0.447, 0.327), # findings+impression
#     ('hrgr', 0.438, 0.298, 0.208, 0.151, 0.369, 0.343),
#     ('kerp', 0.482, 0.325, 0.226, 0.162, 0.339, 0.280),
#     ('tienet', 0.330, 0.194, 0.124, 0.081, 0.311, 1.334), # Reported in Liu et al.
#     ('rtmic', 0.350, 0.234, 0.143, 0.096, None, 0.323), # Cider, not -D
#     ('clara', 0.471, 0.324, 0.214, 0.199, None, 0.359),
#     ('syeda-et-al', 0.560, 0.510, 0.500, 0.490, 0.580, None), # findings+impression, apparently
#     ('vispi', 0.419, 0.280, 0.201, 0.150, 0.371, 0.553), # findings + impression
]

In [None]:
# Save the NLP metrics of every surveyed IU paper. Each row is
# (paper, bleu1, bleu2, bleu3, bleu4, rougeL, ciderD); None marks a metric
# that is not available for that paper and is left out of the saved dict.
for result in PAPER_RESULTS:
    paper, bleu1, bleu2, bleu3, bleu4, rougeL, ciderD = result
    folder = get_paper_folder('iu-x-ray', paper)

    bleus = {'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4}
    d = {name: score for name, score in bleus.items() if score is not None}
    if len(d) == len(bleus):
        # The averaged BLEU is only computed when all four n-gram scores
        # exist; np.mean would raise on a None entry
        d['bleu'] = np.mean(list(bleus.values()))
    if ciderD is not None:
        d['ciderD'] = ciderD
    if rougeL is not None:
        d['rougeL'] = rougeL

    runtime_results = {'test': d}

    save_runtime_metrics(folder, runtime_results)

## Show (attend) tell re-implementations

Also more CoAtt re-implementations

In [None]:
import pprint

In [None]:
def _check_validity(d, expected_len=12):
    """Print the structural problems found in a results table.

    Args:
        d: dict mapping a model name to an iterable of result entries. Each
            entry is expected to be a tuple whose first item is the paper name.
        expected_len: number of items every entry tuple must have.

    Returns:
        None. Problems are printed per model, followed by the total count.
    """
    total_errors = 0
    for model_name, reimpls in d.items():
        errors = []
        for reim in reimpls:
            if not isinstance(reim, tuple):
                errors.append(f'Not tuple: {reim}')
                continue
            if len(reim) != expected_len:
                # An empty tuple has no paper name to show; report it instead
                # of crashing on reim[0].
                paper_name = reim[0] if reim else '<empty>'
                errors.append(
                    f'Len not {expected_len}: {paper_name}, len={len(reim)}'
                )
        if errors:
            print(f'{model_name}:\n\t{pprint.pformat(errors)}')
        total_errors += len(errors)

    print('Total errors: ', total_errors)

In [None]:
# Scores that papers report for re-implementations of each model; this table
# is saved with dataset name 'iu-x-ray'.
# Keys are model names. Each active entry is a 12-tuple laid out as in the
# column comment below; None values are dropped when the results are saved.
IU_RESULTS = {
    # paper, bleu1, bleu2, bleu3, bleu4, rougeL, meteor, cider-D, chex-acc, -f1, -p, -r
    'show-tell': [
        # coatt values are also used in Yin et al, CDGPT2
        ('coatt', 0.316, 0.211, 0.140, 0.095, 0.267, 0.159, 0.111, None, None, None, None),
        ('liu-et-al', 0.265, 0.157, 0.105, 0.073, 0.306, None, 0.926, 0.915, None, None, None),
        ('huang-et-al', 0.251, 0.137, 0.098, 0.069, 0.294, None, 0.108, None, None, None, None),

        # HRGR values are also used in: KERP, Vispi, RTMIC??, CLARA, CMAS, Chen-et-al,
        # MedWriter
        ('hrgr', 0.216, 0.124, 0.087, 0.066, 0.306, None, 0.294, None, None, None, None),

        ('xue-et-al-18', 0.273, 0.144, 0.116, 0.082, 0.226, 0.125, None, None, None, None, None),
        ('singh-et-al', 0.289, 0.173, 0.119, 0.088, 0.265, 0.137, 0.270, None, None, None, None),
        ('harzig-et-al-19a', 0.333, 0.205, 0.136, 0.094, 0.272, 0.145, 0.306, None, None, None, None),
        ('a3fn', 0.311, 0.218, 0.137, 0.092, 0.262, None, 0.124, None, None, None, None),
        ('rtex', None, None, None, 0.069, 0.236, None, None, None, None, 0.118, 0.088),

        # CA also used by: CMCL
        ('liu-2021-et-al-CA', 0.352, 0.227, 0.154, 0.109, 0.313, 0.133, None, None, None, None, None),
    ],
    'show-attend-tell': [
        # coatt values are also used in: Yuan et al, Yin et al, S-M (citation wrong) ??
        ('coatt', 0.399, 0.251, 0.168, 0.118, 0.323, 0.167, 0.302, None, None, None, None),
        ('liu-et-al', 0.328, 0.195, 0.123, 0.080, 0.313, None, 1.276, 0.908, None, None, None),
        ('huang-et-al', 0.328, 0.184, 0.109, 0.083, 0.319, None, 0.154, None, None, None, None),
        ('a3fn', 0.351, 0.237, 0.161, 0.120, 0.314, None, 0.278, None, None, None, None),
        ('zhang-et-al-mirqi', 0.433, 0.281, 0.193, 0.138, 0.361, None, 0.320, None, None, None, None),

        ('liu-2021-et-al-CA', 0.371, 0.233, 0.159, 0.118, 0.320, 0.147, None, None, None, None, None),
    ],
    'coatt': [
        # CA also used in: CMCL
        ('liu-2021-et-al-CA', 0.463, 0.293, 0.207, 0.155, 0.365, 0.178, None, None, None, None, None),

        # Disabled entries below use an older 7-column layout
        # (paper, bleu1-4, rougeL, cider-D); they would need padding to 12
        # columns before being re-enabled.
# ('original-coatt', 0.517, 0.386, 0.306, 0.247, 0.447, 0.327), # findings+impression
#     ('coatt_re-impl-hrgr', 0.455, 0.288, 0.205, 0.154, 0.369, 0.277),
#     ('coatt_re-impl-huang-et-al', 0.429, 0.295, 0.201, 0.148, 0.340, 0.278),
#     ('coatt_re-impl-a3fn', 0.421, 0.324, 0.225, 0.174, 0.341, 0.331),
    ]
}

In [None]:
# Scores that papers report for re-implementations of each model; this table
# is saved with dataset name 'mimic-cxr'.
# Keys are model names. Each entry is a 12-tuple laid out as in the column
# comment below; None values are dropped when the results are saved.
MIMIC_RESULTS = {
    # paper, bleu1, bleu2, bleu3, bleu4, rougeL, meteor, cider-D, chex-acc, -f1, -p, -r
    'show-tell': [
        # Liu et al very similar to boag et al (also both use mimic-alpha version)
        ('liu-et-al', 0.307, 0.201, 0.137, 0.093, 0.300, None, 0.886, 0.837, None, 0.304, 0.173),

        # Boag also used by: nishino et al
        ('boag-et-al', 0.305, 0.201, 0.137, 0.092, None, None, 0.850, 0.837, 0.186, 0.304, None),

        # TODO: ratchet has chexpert details (by abn) in the supplementary
        ('ratchet', 0.208, None, None, None, 0.217, 0.108, 0.419, None, 0.186, 0.293, 0.232),

        # Chen-et-al also used by: AlignTransformer,
        ('chen-et-al', 0.299, 0.184, 0.121, 0.084, 0.263, 0.124, None, None, None, None, None),

        ('rtex', None, None, None, 0.078, 0.257, None, None, None, None, 0.08, 0.118),

        # CA also used by: CMCL
        ('liu-2021-et-al-CA', 0.290, 0.182, 0.119, 0.081, 0.249, 0.112, None, None, None, None, None),

        ('medwriter', 0.247, 0.165, 0.124, 0.098, 0.314, None, 0.245, None, None, None, None),
    ],
    'show-attend-tell': [
        ('liu-et-al', 0.318, 0.205, 0.137, 0.093, 0.288, None, 0.967, 0.849, None, 0.312, 0.232),
        ('lovelace-et-al', 0.370, 0.240, 0.170, 0.128, 0.310, 0.141, 0.278, None, None, None, None),
        ('liu-2021-et-al-CA', 0.318, 0.186, 0.122, 0.085, 0.267, 0.119, None, None, None, None, None),
    ],
    'coatt': [
        # CA also used in: CMCL
        ('liu-2021-et-al-CA', 0.329, 0.206, 0.133, 0.095, 0.273, 0.129, None, None, None, None, None),

        ('medwriter', 0.410, 0.267, 0.189, 0.144, 0.274, None, 0.234, None, None, None, None),
    ]
}

In [None]:
# Both tables are saved by position-based slicing, so validate both of them
_check_validity(IU_RESULTS)
_check_validity(MIMIC_RESULTS)

In [None]:
# Optional model-name abbreviations looked up by save_results when it builds
# folder names. Currently empty, so full model names are kept. Example mapping:
#   {'show-tell': 'ST', 'show-attend-tell': 'SAT'}
_SHORTEN_NAMES = {}

In [None]:
# Metric names, in the same order as the value columns of the result tuples:
# NLP_METRICS names the seven values right after the paper name,
# CHEX_METRICS names the last four values.
NLP_METRICS = ['bleu1', 'bleu2', 'bleu3', 'bleu4', 'rougeL', 'meteor', 'ciderD']
CHEX_METRICS = ['acc', 'f1', 'prec', 'recall']

In [None]:
def save_results(res, dataset_name):
    """Save the re-implementation results of a table as metric files.

    For every entry, a folder for the paper id
    ``{model_name}_re-impl-{paper_name}`` is resolved with
    ``get_paper_folder`` and two files are written into it: the NLP metrics
    (via ``save_runtime_metrics``) and the CheXpert metrics (via
    ``save_chexpert_metrics``), each under the ``'test'`` key. Entries whose
    folder already exists are skipped, so edited values are not re-saved
    unless the folder is removed first.

    Args:
        res: dict mapping a model name to a list of result tuples. Item 0 is
            the paper name, the following items are matched in order with
            ``NLP_METRICS`` and the last four items with ``CHEX_METRICS``.
            ``None`` values are left out of the saved dicts.
        dataset_name: dataset identifier forwarded to ``get_paper_folder``.
    """
    for model_name, reimpls in res.items():
        model_name = _SHORTEN_NAMES.get(model_name, model_name)
        for reim in reimpls:
            paper_name = reim[0]
            paper = f'{model_name}_re-impl-{paper_name}'
            folder = get_paper_folder(dataset_name, paper)

            if os.path.isdir(folder):
                print(f'Folder already exists, skipping: {folder}')
                continue

            # Build nlp metrics dict
            nlp_metrics = {
                metric_name: value
                for value, metric_name in zip(reim[1:], NLP_METRICS)
                if value is not None
            }
            # Average BLEU only when all four scores are present. Compare
            # with None explicitly: a reported score of 0.0 is falsy and
            # would otherwise suppress the average.
            bleu1, bleu2, bleu3, bleu4 = reim[1:5]
            bleus = [bleu1, bleu2, bleu3, bleu4]
            if all(value is not None for value in bleus):
                nlp_metrics['bleu'] = np.mean(bleus)

            save_runtime_metrics(folder, {'test': nlp_metrics})

            # Build Chex metrics dict (may be empty when nothing is reported)
            chex_metrics = {
                metric_name: value
                for value, metric_name in zip(reim[-4:], CHEX_METRICS)
                if value is not None
            }

            save_chexpert_metrics(folder, {'test': chex_metrics})

In [None]:
# Write the IU X-ray table to disk (entries with an existing folder are skipped)
save_results(IU_RESULTS, 'iu-x-ray')

In [None]:
# Write the MIMIC-CXR table to disk (entries with an existing folder are skipped)
save_results(MIMIC_RESULTS, 'mimic-cxr')