# Save paper results

Save the results reported in baseline papers to JSON files, one results folder per paper.

In [None]:
import json
import os
import numpy as np

In [None]:
%run ../datasets/common/constants.py
%run ../utils/__init__.py
%run ../utils/files.py

# Utils

In [None]:
def _save_metrics(folder, filename, results_dict):
    """Write a results dictionary to ``folder/filename`` as indented JSON.

    Args:
        folder: Directory that will contain the file; it is created when it
            does not exist yet. An empty string means the current directory.
        filename: Name of the JSON file inside ``folder``.
        results_dict: JSON-serializable mapping to dump.
    """
    # Create the target directory up front so saving does not fail with
    # FileNotFoundError for a folder that has not been created yet.
    if folder:
        os.makedirs(folder, exist_ok=True)

    filepath = os.path.join(folder, filename)
    # Pin the encoding so the written file does not depend on the platform locale.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(results_dict, f, indent=2)
    print(f'Saved dict to {filepath}')

In [None]:
def save_mirqi_metrics(folder, results):
    """Store ``results`` as ``mirqi-metrics-free.json`` inside ``folder``."""
    filename = 'mirqi-metrics-free.json'
    _save_metrics(folder, filename, results_dict=results)

def save_chexpert_metrics(folder, results):
    """Store ``results`` as ``chexpert-metrics-free.json`` inside ``folder``."""
    filename = 'chexpert-metrics-free.json'
    _save_metrics(folder, filename, results_dict=results)

def save_runtime_metrics(folder, results):
    """Store ``results`` as ``metrics-free.json`` inside ``folder``."""
    filename = 'metrics-free.json'
    _save_metrics(folder, filename, results_dict=results)

In [None]:
def get_paper_folder(dataset, paper):
    """Return the results folder used for a paper's numbers on a dataset.

    Args:
        dataset: Either 'iu-x-ray' or 'mimic-cxr'; anything else trips the assert.
        paper: Identifier of the paper, used as the suffix of the run name.

    Returns:
        Whatever ``get_results_folder`` returns for the run named
        ``<dataset>_paper_<paper>``.
    """
    assert dataset in ('iu-x-ray', 'mimic-cxr')
    # NOTE(review): the meaning of the positional False and 'rg' arguments is
    # defined by RunId in the utils module — confirm there.
    run_id = RunId(f'{dataset}_paper_{paper}', False, 'rg')
    return get_results_folder(run_id, save_mode=True)

In [None]:
def calculate_avg_woNF(metrics, prefixes, diseases=CHEXPERT_DISEASES, verbose=False):
    """Average per-disease metrics while leaving out 'No Finding'.

    Args:
        metrics: Mapping with ``<prefix>-<disease>`` entries.
        prefixes: One metric prefix as a string, or an iterable of prefixes.
        diseases: Disease names to average over; a name equal to
            'no finding' (compared case-insensitively) is skipped.
        verbose: When true, print the keys averaged for each prefix.

    Returns:
        Dict mapping ``<prefix>-woNF`` to the ``np.mean`` of the selected entries.
    """
    prefix_seq = (prefixes,) if isinstance(prefixes, str) else prefixes

    averages = {}
    for pfx in prefix_seq:
        # Collect the metric keys of every disease except 'No Finding'.
        selected = []
        for name in diseases:
            if name.lower() == 'no finding':
                continue
            selected.append(f'{pfx}-{name}')

        mean_value = np.mean([metrics[key] for key in selected])
        if verbose:
            print(f'Prefix={pfx}, averaging: {selected}')

        averages[f'{pfx}-woNF'] = mean_value
    return averages

# Papers

## Paper MIRQI

In [None]:
# Results folder for the 'zhang-et-al-mirqi' paper entry on IU X-ray.
folder = get_paper_folder(dataset='iu-x-ray', paper='zhang-et-al-mirqi')

In [None]:
# BLEU-1 to BLEU-4 scores recorded for this paper.
bleu1 = 0.441
bleu2 = 0.291
bleu3 = 0.203
bleu4 = 0.147

# NLP metrics; 'bleu' is the unweighted mean of the four BLEU scores.
runtime_results = {
    'test': dict(
        bleu1=bleu1,
        bleu2=bleu2,
        bleu3=bleu3,
        bleu4=bleu4,
        bleu=np.mean([bleu1, bleu2, bleu3, bleu4]),
        ciderD=0.304,
        rougeL=0.367,
    ),
}

# MIRQI recall / precision / F-score.
mirqi_results = {
    'test': {'MIRQI-r': 0.483, 'MIRQI-p': 0.490, 'MIRQI-f': 0.478},
}

In [None]:
# Persist the MIRQI and NLP metrics of this paper into its results folder.
save_mirqi_metrics(folder=folder, results=mirqi_results)
save_runtime_metrics(folder=folder, results=runtime_results)

## Lovelace et al

In [None]:
# Results folder for the 'lovelace-et-al' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='lovelace-et-al')

In [None]:
# Using their transformer w/fine-tuning ablation
bleu1, bleu2, bleu3, bleu4 = 0.415, 0.272, 0.193, 0.146
# NLP metrics; 'bleu' is the unweighted mean of BLEU-1 to BLEU-4.
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.316, # not sure if Cider-D or Cider
        'rougeL': 0.318,
    }
}
# CheXpert metrics are entered here as percentages; they are divided by 100
# further down when chexpert_results is built.
_values = {
    'f1': 22.8,
    'prec': 33.3,
    'recall': 21.7,

    'f1-Atelectasis': 32.2,
    'f1-Cardiomegaly': 43.3,
    'f1-Consolidation': 7.3,
    'f1-Edema': 29.8,
    'f1-Enlarged Cardiomediastinum': 5.9,
    'f1-Fracture': 0,
    'f1-Lung Lesion': 1.4,
    'f1-Lung Opacity': 17.1,
    'f1-No Finding': 54.1,
    'f1-Pleural Effusion': 48.0,
    'f1-Pleural Other': 0.9,
    'f1-Pneumonia': 3.9,
    'f1-Pneumothorax': 9.8,
    'f1-Support Devices': 66.0,

    'prec-Atelectasis': 43.0,
    'prec-Cardiomegaly': 46.9,
    'prec-Consolidation': 15.7,
    'prec-Edema': 37.6,
    'prec-Enlarged Cardiomediastinum': 12.3,
    'prec-Fracture': 0,
    'prec-Lung Lesion': 23.8,
    'prec-Lung Opacity': 64.0,
    'prec-No Finding': 39.0,
    'prec-Pleural Effusion': 71.2,
    'prec-Pleural Other': 16.1,
    'prec-Pneumonia': 7,
    'prec-Pneumothorax': 12.9,
    'prec-Support Devices': 77.0,

    'recall-Atelectasis': 25.8,
    'recall-Cardiomegaly': 40.2,
    'recall-Consolidation': 4.8,
    'recall-Edema': 24.6,
    'recall-Enlarged Cardiomediastinum': 3.9,
    'recall-Fracture': 0,
    'recall-Lung Lesion': 0.7,
    'recall-Lung Opacity': 9.9,
    'recall-No Finding': 88.2,
    'recall-Pleural Effusion': 36.2,
    'recall-Pleural Other': 0.5,
    'recall-Pneumonia': 2.7,
    'recall-Pneumothorax': 7.8,
    'recall-Support Devices': 57.8,
}
# Averages that leave out 'No Finding', computed while the values are still
# percentages, then merged into the same dict.
woNF = calculate_avg_woNF(_values, ['f1', 'recall', 'prec'])
_values.update(woNF)
# Rescale every entry (the woNF averages included) by dividing by 100.
chexpert_results = {
    'test': {
        k: value / 100
        for k, value in _values.items()
    },
}
# Bare expression so the notebook displays the averages (not rescaled here).
woNF

In [None]:
# Persist the CheXpert and NLP metrics of this paper into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

## Boag et al

### 1NN

In [None]:
# Results folder for the 'boag-et-al-1nn' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='boag-et-al-1nn')

In [None]:
# Using their 1-NN model
bleu1, bleu2, bleu3, bleu4 = 0.305, 0.171, 0.098, 0.057
# NLP metrics; 'bleu' is the unweighted mean of BLEU-1 to BLEU-4.
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.755, # not sure if Cider-D or Cider
    }
}
# CheXpert metrics: overall accuracy / precision / F1, then per-disease F1.
_values = {
    'acc': 0.818,
    'prec': 0.253,
    'f1': 0.258,

    'f1-Support Devices': 0.527,
    'f1-Lung Opacity': 0.417,
    'f1-Cardiomegaly': 0.445,
    'f1-Atelectasis': 0.375,
    'f1-No Finding': 0.455,
    'f1-Pleural Effusion': 0.532,
    'f1-Edema': 0.286,
    'f1-Enlarged Cardiomediastinum': 0.142,
    'f1-Pneumonia': 0.08,
    'f1-Pneumothorax': 0.111,
    'f1-Fracture': 0.060,
    'f1-Lung Lesion': 0.062,
    'f1-Consolidation': 0.085,
    'f1-Pleural Other': 0.039,
}
# Add the F1 average that leaves out 'No Finding' (a single prefix may be
# passed as a plain string).
woNF = calculate_avg_woNF(_values, 'f1')
_values.update(woNF)
# 'test' refers to the very same dict object as _values.
chexpert_results = {
    'test': _values,
}
# Bare expression so the notebook displays the computed average.
woNF

In [None]:
# Persist the CheXpert and NLP metrics of this model into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

### cnn-rnn-beam

In [None]:
# Results folder for the 'boag-et-al-cnn-rnn-beam' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='boag-et-al-cnn-rnn-beam')

In [None]:
# Using their CNN-RNN-beam
bleu1, bleu2, bleu3, bleu4 = 0.305, 0.201, 0.137, 0.092
# NLP metrics; 'bleu' is the unweighted mean of BLEU-1 to BLEU-4.
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.850, # not sure if Cider-D or Cider
    }
}
# CheXpert metrics: overall accuracy / precision / F1, then per-disease F1.
_values = {
    'acc': 0.837, 
    'prec': 0.304,
    'f1': 0.186,

    'f1-Support Devices': 0.613,
    'f1-Lung Opacity': 0.077,
    'f1-Cardiomegaly': 0.390,
    'f1-Atelectasis': 0.146,
    'f1-No Finding': 0.407,
    'f1-Pleural Effusion': 0.473,
    'f1-Edema': 0.271,
    'f1-Enlarged Cardiomediastinum': 0.134,
    'f1-Pneumonia': 0.03,
    'f1-Pneumothorax': 0.043,
    'f1-Fracture': 0.001,
    'f1-Lung Lesion': 0.001, # actual value is less than that; 0.001 is stored as an upper bound
    'f1-Consolidation': 0.014,
    'f1-Pleural Other': 0.001, # actual value is less than that; 0.001 is stored as an upper bound
}
# Add the F1 average that leaves out 'No Finding'.
woNF = calculate_avg_woNF(_values, ['f1'], CHEXPERT_DISEASES)
_values.update(woNF)
# 'test' refers to the very same dict object as _values.
chexpert_results = {
    'test': _values,
}
# Bare expression so the notebook displays the computed average.
woNF

In [None]:
# Persist the CheXpert and NLP metrics of this model into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

## Liu et al

### CCR

In [None]:
# Results folder for the 'liu-et-al-ccr' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='liu-et-al-ccr')

In [None]:
# Using their CCR ablation
bleu1, bleu2, bleu3, bleu4 = 0.294, 0.190, 0.134, 0.094
# NLP metrics; 'bleu' is the unweighted mean of BLEU-1 to BLEU-4.
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 0.956,
        'rougeL': 0.284,
    }
}
# CheXpert metrics: overall accuracy / precision / recall, then per-disease
# precision.
_values = {
    'acc': 0.868,
    'prec': 0.313,
    'recall': 0.126,

    'prec-No Finding': 0.491,
    'prec-Enlarged Cardiomediastinum': 0.202,
    'prec-Cardiomegaly': 0.678,
    'prec-Lung Lesion': 0,
    'prec-Lung Opacity': 0.640,
    'prec-Edema': 0.280,
    'prec-Consolidation': 0.037,
    'prec-Pneumonia': 0,
    'prec-Atelectasis': 0.476,
    'prec-Pneumothorax': 0.039,
    'prec-Pleural Effusion': 0.683,
    'prec-Pleural Other': 0,
    'prec-Fracture': 0,
    'prec-Support Devices': 0.849,
}
# Add the precision average that leaves out 'No Finding'.
woNF = calculate_avg_woNF(_values, ['prec'], CHEXPERT_DISEASES, verbose=False)
_values.update(woNF)
# 'test' refers to the very same dict object as _values.
chexpert_results = {
    'test': _values,
}
# Bare expression so the notebook displays the computed average.
woNF

In [None]:
# Persist the CheXpert and NLP metrics of this ablation into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

### Full

In [None]:
# Results folder for the 'liu-et-al-full' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='liu-et-al-full')

In [None]:
# Using their full model (these numbers are saved under 'liu-et-al-full',
# not the CCR ablation)
bleu1, bleu2, bleu3, bleu4 = 0.313, 0.206, 0.146, 0.103
# NLP metrics; 'bleu' is the unweighted mean of BLEU-1 to BLEU-4.
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu2': bleu2, 'bleu3': bleu3, 'bleu4': bleu4,
        'bleu': np.mean([bleu1, bleu2, bleu3, bleu4]),
        'ciderD': 1.046,
        'rougeL': 0.306,
    }
}
# CheXpert metrics: overall accuracy / precision / recall, then per-disease
# precision.
_values = {
    'acc': 0.867,
    'prec': 0.309,
    'recall': 0.134,

    'prec-No Finding': 0.405,
    'prec-Enlarged Cardiomediastinum': 0.167,
    'prec-Cardiomegaly': 0.704,
    'prec-Lung Lesion': 0,
    'prec-Lung Opacity': 0.460,
    'prec-Edema': 0,
    'prec-Consolidation': 0,
    'prec-Pneumonia': 0.4,
    'prec-Atelectasis': 0.521,
    'prec-Pneumothorax': 0.098,
    'prec-Pleural Effusion': 0.689,
    'prec-Pleural Other': 0,
    'prec-Fracture': 0,
    'prec-Support Devices': 0.880,
}
# Precision average that leaves out 'No Finding'; it is only computed and
# displayed here, not yet merged into _values.
woNF = calculate_avg_woNF(_values, ['prec'], CHEXPERT_DISEASES, verbose=False)
woNF

In [None]:
# Merge the averages without 'No Finding' into the metric table, then expose
# that same dict object as the 'test' split.
_values.update(woNF)
chexpert_results = dict(test=_values)

In [None]:
# Persist the CheXpert and NLP metrics of this model into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

## Ni et al

In [None]:
def calculate_f1(values, diseases):
    """Derive per-disease F1 scores and their mean from precision and recall.

    Args:
        values: Mapping holding ``prec-<disease>`` and ``recall-<disease>``
            entries for every requested disease.
        diseases: Iterable of disease names to compute F1 for.

    Returns:
        Dict with one ``f1-<disease>`` entry per disease plus ``f1``, the
        unweighted mean over all of those entries.

    Raises:
        KeyError: If a precision or recall entry is missing from ``values``.
    """
    f1s = {}
    for disease in diseases:
        prec = values[f'prec-{disease}']
        recall = values[f'recall-{disease}']

        denominator = prec + recall
        if denominator == 0:
            # Precision and recall are both zero: the harmonic mean is taken
            # as 0 instead of dividing by zero.
            f1 = 0.0
        else:
            f1 = 2 * (prec * recall) / denominator
        f1s[f'f1-{disease}'] = f1
    # Computed before the 'f1' key is inserted, so only per-disease scores
    # take part in the mean.
    f1s['f1'] = np.mean(list(f1s.values()))
    return f1s

In [None]:
# Results folder for the 'ni-et-al' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='ni-et-al')

In [None]:
# MIMIC-CXR dataset but only with abnormal findings!!!
# approx 30k samples in total
# CVSE + mutual exclusivity ablation
bleu4, bleu1 = 0.036, 0.192
# meteor = 0.077
# Only BLEU-1, BLEU-4 and ROUGE-L are stored, so no averaged 'bleu' entry here.
runtime_results = {
    'test': {
        'bleu1': bleu1, 'bleu4': bleu4,
        'rougeL': 0.153,
    }
}
# CheXpert metrics: overall accuracy / precision / recall, followed by
# per-disease accuracy, precision and recall.
_values = {
    'acc': 0.863,
    'prec': 0.317,
    'recall': 0.224,

    'acc-No Finding': 0.769,
    'acc-Enlarged Cardiomediastinum': 0.926,
    'acc-Cardiomegaly': 0.801,
    'acc-Lung Lesion': 0.921,
    'acc-Lung Opacity': 0.692,
    'acc-Edema': 0.920,
    'acc-Consolidation': 0.876,
    'acc-Pneumonia': 0.859,
    'acc-Atelectasis': 0.773,
    'acc-Pneumothorax': 0.964,
    'acc-Pleural Effusion': 0.894,
    'acc-Pleural Other': 0.962,
    'acc-Fracture': 0.917,
    'acc-Support Devices': 0.808,

    'prec-No Finding': 0.346,
    'prec-Enlarged Cardiomediastinum': 0.063,
    'prec-Cardiomegaly': 0.512,
    'prec-Lung Lesion': 0.192,
    'prec-Lung Opacity': 0.635,
    'prec-Edema': 0.405,
    'prec-Consolidation': 0.130,
    'prec-Pneumonia': 0.364,
    'prec-Atelectasis': 0.525,
    'prec-Pneumothorax': 0.073,
    'prec-Pleural Effusion': 0.640,
    'prec-Pleural Other': 0.145,
    'prec-Fracture': 0.063,
    'prec-Support Devices': 0.348,

    'recall-No Finding': 0.265,
    'recall-Enlarged Cardiomediastinum': 0.060,
    'recall-Cardiomegaly': 0.606,
    'recall-Lung Lesion': 0.121,
    'recall-Lung Opacity': 0.237,
    'recall-Edema': 0.206,
    'recall-Consolidation': 0.181,
    'recall-Pneumonia': 0.214,
    'recall-Atelectasis': 0.320,
    'recall-Pneumothorax': 0.051,
    'recall-Pleural Effusion': 0.465,
    'recall-Pleural Other': 0.036,
    'recall-Fracture': 0.050,
    'recall-Support Devices': 0.321,
}
# F1 is not entered by hand: it is derived from the precision/recall entries
# and merged into the table before the averages are computed.
f1 = calculate_f1(_values, CHEXPERT_DISEASES)
_values.update(f1)
# Add the recall / F1 / precision averages that leave out 'No Finding'.
woNF = calculate_avg_woNF(_values, ['recall', 'f1', 'prec'], CHEXPERT_DISEASES, verbose=False)
_values.update(woNF)

# 'test' refers to the very same dict object as _values.
chexpert_results = {
    'test': _values,
}
# Bare tuple expression so the notebook displays both computed dicts.
f1, woNF

In [None]:
# Persist the CheXpert and NLP metrics of this paper into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

## Chen et al

In [None]:
# Results folder for the 'chen-et-al' paper entry on MIMIC-CXR.
folder = get_paper_folder(dataset='mimic-cxr', paper='chen-et-al')

In [None]:
# BLEU-1 to BLEU-4 scores recorded for this paper.
# (A METEOR score of 0.142 was also noted, but it is not stored.)
bleu1 = 0.353
bleu2 = 0.218
bleu3 = 0.145
bleu4 = 0.103

# NLP metrics; 'bleu' is the unweighted mean of the four BLEU scores.
runtime_results = {
    'test': dict(
        bleu1=bleu1,
        bleu2=bleu2,
        bleu3=bleu3,
        bleu4=bleu4,
        bleu=np.mean([bleu1, bleu2, bleu3, bleu4]),
        rougeL=0.277,
    ),
}

# Overall CheXpert metrics only; no per-disease values are recorded.
chexpert_results = {
    'test': dict(f1=0.276, prec=0.333, recall=0.273),
}

In [None]:
# Persist the CheXpert and NLP metrics of this paper into its results folder.
save_chexpert_metrics(folder=folder, results=chexpert_results)
save_runtime_metrics(folder=folder, results=runtime_results)

## RTEx paper

In [None]:
# Results folder for the 'rtex' paper entry on MIMIC-CXR.
mimic_folder = get_paper_folder(dataset='mimic-cxr', paper='rtex')

In [None]:
# NLP metrics, entered as percentages and divided by 100 in place.
# The BLEU figure is assumed to be BLEU-4.
runtime_results = {
    'test': dict(bleu4=5.9 / 100, rougeL=20.5 / 100),
}

# Overall CheXpert precision and recall.
chexpert_results = {
    'test': dict(prec=0.229, recall=0.284),
}

In [None]:
# Persist the MIMIC-CXR CheXpert and NLP metrics into the MIMIC results folder.
save_chexpert_metrics(folder=mimic_folder, results=chexpert_results)
save_runtime_metrics(folder=mimic_folder, results=runtime_results)

In [None]:
# Results folder for the 'rtex' paper entry on IU X-ray.
iu_folder = get_paper_folder(dataset='iu-x-ray', paper='rtex')

In [None]:
# NLP metrics, entered as percentages and divided by 100 in place.
runtime_results = {
    'test': dict(bleu4=5.5 / 100, rougeL=20.2 / 100),
}

# Overall CheXpert precision and recall.
chexpert_results = {
    'test': dict(prec=0.193, recall=0.222),
}

In [None]:
# Persist the IU X-ray CheXpert and NLP metrics into the IU results folder.
save_chexpert_metrics(folder=iu_folder, results=chexpert_results)
save_runtime_metrics(folder=iu_folder, results=runtime_results)

## Survey IU papers

Papers that only report NLP metrics on the IU X-ray dataset.

In [None]:
# NOTE(review): `iu_folder` is not referenced by the other cells of this
# section (the save loop resolves one folder per paper), and the identical call
# already appears in the RTEx section — possibly a leftover; confirm before
# removing.
iu_folder = get_paper_folder(dataset='iu-x-ray', paper='rtex')

In [None]:
# One row per paper, saved under the IU X-ray results folder of that paper.
# Row layout: (paper, bleu1, bleu2, bleu3, bleu4, rougeL, ciderD).
# None marks a metric that is not available for that paper.
# Commented-out rows are kept for reference but are not saved.
PAPER_RESULTS = [
    # paper, bleu1, bleu2, bleu3, bleu4, rougeL, cider-D
    # ('coatt', 0.517, 0.386, 0.306, 0.247, 0.447, 0.327), # findings+impression
    ('coatt_re-impl-hrgr', 0.455, 0.288, 0.205, 0.154, 0.369, 0.277),
    ('coatt_re-impl-huang-et-al', 0.429, 0.295, 0.201, 0.148, 0.340, 0.278),
    ('coatt_re-impl-a3fn', 0.421, 0.324, 0.225, 0.174, 0.341, 0.331),
#     ('hrgr', 0.438, 0.298, 0.208, 0.151, 0.369, 0.343),
#     ('kerp', 0.482, 0.325, 0.226, 0.162, 0.339, 0.280),
#     ('tienet', 0.330, 0.194, 0.124, 0.081, 0.311, 1.334), # Reported in Liu et al.
#     ('rtmic', 0.350, 0.234, 0.143, 0.096, None, 0.323), # Cider, not -D
#     ('clara', 0.471, 0.324, 0.214, 0.199, None, 0.359),
#     ('syeda-et-al', 0.560, 0.510, 0.500, 0.490, 0.580, None), # findings+impression, apparently
]

In [None]:
# Save the NLP metrics of every survey row into that paper's IU X-ray folder.
for result in PAPER_RESULTS:
    paper, bleu1, bleu2, bleu3, bleu4, rougeL, ciderD = result
    folder = get_paper_folder('iu-x-ray', paper)

    # Only store the metrics that are available (None means missing), so the
    # JSON file never contains null entries.
    named_bleus = (
        ('bleu1', bleu1), ('bleu2', bleu2), ('bleu3', bleu3), ('bleu4', bleu4),
    )
    d = {name: value for name, value in named_bleus if value is not None}

    # The averaged BLEU needs all four scores; np.mean raises a TypeError on a
    # list that contains None, so it is skipped when any score is missing.
    if len(d) == len(named_bleus):
        d['bleu'] = np.mean([bleu1, bleu2, bleu3, bleu4])
    if ciderD is not None:
        d['ciderD'] = ciderD
    if rougeL is not None:
        d['rougeL'] = rougeL

    runtime_results = {'test': d}

    save_runtime_metrics(folder, runtime_results)