# Imports

In [None]:
import os
from collections import Counter, defaultdict
import importlib
import json
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
%run ../../datasets/common/constants.py

In [None]:
from medai.datasets import iu_xray, mimic_cxr
IU_DIR = iu_xray.DATASET_DIR
MIMIC_DIR = mimic_cxr.DATASET_DIR

# Load sentences and reports

In [None]:
# Pick which dataset's report files are loaded below: IU X-ray is active,
# MIMIC-CXR is the commented-out alternative.
dataset_dir = IU_DIR
# dataset_dir = MIMIC_DIR

# Sentence-level CSV stored under <dataset_dir>/reports.
fpath = os.path.join(dataset_dir, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.head(3)

In [None]:
# Report-level CSV from the same folder; its 'Reports' column is what the
# CIDEr document-frequency cell further down reads.
fpath = os.path.join(dataset_dir, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath)
REPORTS_DF.head(3)

# Utils

Calculate metrics

In [None]:
def metric_to_str(name, value, fmt='%.3f'):
    """Render a metric as ``<name>=<value>``.

    Args:
        name: label placed before the equals sign.
        value: metric value, formatted with ``fmt``.
        fmt: printf-style specifier applied to ``value`` (three decimals
            by default).

    Returns:
        The formatted string.
    """
    template = ''.join(('%s=', fmt))
    return template % (name, value)

## NLP

### Cider IDF adaptation

In [None]:
%run ../../metrics/report_generation/nlp/cider_idf.py

In [None]:
# Corpus-level statistics for CIDEr, computed once over every report in
# REPORTS_DF and reused by calculate_cider for each scored pair.
reports = list(REPORTS_DF['Reports'])
DOC_FREQ = compute_doc_freq(reports)
# Natural log of the number of reports; calculate_cider assigns it to
# scorer.ref_len.
LOG_REF_LEN = np.log(len(reports))
# Displayed as a quick sanity check of the computed sizes.
len(DOC_FREQ), len(reports), LOG_REF_LEN

In [None]:
def calculate_cider(gt, gen):
    """Score one generated report against one ground truth with CIDEr.

    Uses the corpus-level document frequencies (DOC_FREQ) and log corpus
    size (LOG_REF_LEN) computed from the loaded reports instead of
    statistics derived from the single pair being scored.

    Args:
        gt: ground-truth report, as a string.
        gen: generated report, as a string.

    Returns:
        The first element returned by ``scorer.compute_score()``.

    Raises:
        AssertionError: if ``gt`` or ``gen`` is not a string.
    """
    # Validate first so bad input fails before any scorer is built.
    assert isinstance(gt, str)
    assert isinstance(gen, str)

    scorer = CiderScorerIDFModified()
    scorer.document_frequency = DOC_FREQ
    scorer.ref_len = LOG_REF_LEN

    # Feed (candidate, [references]), the same order calculate_bleu uses.
    # The previous (gt, [gen]) order scored the ground truth as if it were
    # the candidate.
    # NOTE(review): assumes CiderScorerIDFModified keeps pycocoevalcap's
    # `+=` convention of (test, refs) -- confirm against cider_idf.py.
    scorer += (gen, [gt])

    cider, _ = scorer.compute_score()
    return cider

### Others

In [None]:
from pycocoevalcap.bleu.bleu_scorer import BleuScorer
from pycocoevalcap.rouge.rouge import Rouge

In [None]:
def calculate_rouge(gt, gen):
    """Score one generated report against one ground truth with Rouge.

    Args:
        gt: ground-truth report, as a string.
        gen: generated report, as a string.

    Returns:
        Whatever ``Rouge.calc_score`` returns for the single candidate
        ``gen`` and the single reference ``gt``.

    Raises:
        AssertionError: if ``gt`` or ``gen`` is not a string.
    """
    for text in (gt, gen):
        assert isinstance(text, str)
    # calc_score takes the candidate list first, then the reference list.
    return Rouge().calc_score([gen], [gt])

In [None]:
def calculate_bleu(gt, gen):
    """Score one generated report against one ground truth with BLEU.

    Args:
        gt: ground-truth report, as a string.
        gen: generated report, as a string.

    Returns:
        The first element returned by ``scorer.compute_score()``.
        calculate_nlp treats it as a sequence (takes its mean and its
        last item).

    Raises:
        AssertionError: if ``gt`` or ``gen`` is not a string.
    """
    assert isinstance(gt, str)
    assert isinstance(gen, str)
    # Pass the n-gram order by keyword: BleuScorer's first positional
    # parameter is the candidate sentence (`test`), not `n`. `BleuScorer(4)`
    # only worked because no references were given and `n` defaults to 4.
    scorer = BleuScorer(n=4)
    # (candidate, [references])
    scorer += (gen, [gt])
    bleus, _ = scorer.compute_score()
    return bleus

### All

In [None]:
def calculate_nlp(gt, gen, show=True, **show_kwargs):
    """Compute BLEU, CIDEr and Rouge for one (ground truth, generated) pair.

    Args:
        gt: ground-truth report, as a string.
        gen: generated report, as a string.
        show: when true, print a one-line summary of the metrics.
        **show_kwargs: forwarded to ``metric_to_str`` (e.g. ``fmt``).

    Returns:
        Tuple ``(bleus, rouge, cider)``.
    """
    bleus = calculate_bleu(gt, gen)
    cider = calculate_cider(gt, gen)
    rouge = calculate_rouge(gt, gen)

    if not show:
        return bleus, rouge, cider

    # 'B' is the mean over all BLEU values, 'B4' the last one.
    summary = [
        ('B', np.mean(bleus)),
        ('B4', bleus[-1]),
        ('R', rouge),
        ('C', cider),
    ]
    pieces = [
        metric_to_str(label, score, **show_kwargs)
        for label, score in summary
    ]
    print('   '.join(pieces))
    return bleus, rouge, cider

## CheXpert

In [None]:
%run ../../metrics/report_generation/chexpert.py

In [None]:
from sklearn.metrics import precision_recall_fscore_support as prf1s

In [None]:
def calculate_chexpert(gt, gens, verbose=False, diseases=None):
    """Compare CheXpert labels of generated reports against a ground truth.

    The labeler is run once over ``[gt] + gens``. Label values -2 are mapped
    to 0 and -1 to 1 before scoring.

    Args:
        gt: ground-truth report, as a string.
        gens: list of generated reports.
        verbose: when true, print the mapped label matrix (before any
            disease filtering).
        diseases: optional list of names from CHEXPERT_DISEASES; when given,
            only those label columns are scored.

    Returns:
        Tuple ``(p, r, f, raw_labels)``: arrays with one precision / recall /
        F1 entry per generated report, plus the unmodified labeler output
        (never filtered by ``diseases``).

    Raises:
        AssertionError: if ``gt`` is not a string or ``gens`` is not a list.
    """
    assert isinstance(gt, str)
    assert isinstance(gens, list)

    # Row 0 is the ground truth, rows 1.. are the generated reports.
    raw_labels = apply_labeler_to_column([gt] + gens)

    # Work on a copy so the raw labeler output can be returned untouched.
    labels = raw_labels.copy()
    labels[raw_labels == -2] = 0
    labels[raw_labels == -1] = 1

    if verbose:
        print('Chexpert labels: \n', labels)

    if diseases is not None:
        keep_columns = [CHEXPERT_DISEASES.index(name) for name in diseases]
        labels = labels[:, keep_columns]

    # Each report is passed to the scorer as a single-row 2D array.
    reference = np.expand_dims(labels[0, :], 0)

    precisions, recalls, f1_scores = [], [], []
    for row in range(1, len(gens) + 1):
        candidate = np.expand_dims(labels[row, :], 0)
        precision, recall, f1, _ = prf1s(reference, candidate, zero_division=0)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    return (
        np.array(precisions),
        np.array(recalls),
        np.array(f1_scores),
        raw_labels,
    )

In [None]:
def get_only_mentions_index(raw_labels):
    """Flag the columns holding at least one value other than -2.

    Args:
        raw_labels: label matrix of shape (n_samples, n_diseases).
            Presumably -2 is the labeler's "not mentioned" code -- confirm.

    Returns:
        Boolean index over columns: true where some sample differs from -2.
    """
    # A column is dropped only when every sample in it equals -2.
    all_unmentioned = (raw_labels == -2).all(axis=0)
    return ~all_unmentioned

## MIRQI

In [None]:
%run ../../metrics/report_generation/mirqi.py
%run -n ../../eval_report_generation_mirqi.py

In [None]:
def calculate_mirqi(gt, gens, verbose=False):
    """Score generated reports against a ground truth with MIRQI.

    Attributes are extracted once for ``[gt] + gens``; every generated
    report is then scored against the ground-truth attributes.

    Args:
        gt: ground-truth report.
        gens: list of generated reports.
        verbose: when true, print the extracted attributes.

    Returns:
        Tuple ``(scores, attributes)``: one MIRQI result per generated
        report, and the attribute list (ground truth first).
    """
    # Expected shape of the extractor output: (1 + n_gens, 1).
    extracted = _call_mirqi_for_reports([gt] + gens)
    attributes = _attributes_to_list(extracted.squeeze())

    if verbose:
        print('MIRQI attributes: \n', attributes)

    # MIRQI takes lists of attributes, so each side is wrapped in a list.
    reference = [attributes[0]]
    scores = []
    for position in range(1, len(gens) + 1):
        scores.append(MIRQI(reference, [attributes[position]]))

    return scores, attributes

# Toy samples 1

Errors made by NLP metrics on negations, uncertain findings and synonyms

## Negations, synonyms

In [None]:
# Ground truth: enlarged heart plus a right pneumothorax.
gt = 'heart size is mildly enlarged . small right pneumothorax is seen .'
gens = [
    # Both findings negated, wording close to the ground truth.
    'heart size is normal . no pneumothorax is seen .',
    # Enlarged heart rephrased, pneumothorax negated.
    'the cardiac silhouette is enlarged . no pneumothorax .',
    # Both findings present, expressed with different words.
    'mild cardiomegaly . pneumothorax on right lung .',
]

In [None]:
# calculate_nlp prints one summary line per generated report (show=True).
for gen in gens:
    bleus, rouge, cider = calculate_nlp(gt, gen)

In [None]:
# gt is also passed as the first candidate, so scores[0] is the ground truth
# scored against itself.
scores, attrs = calculate_mirqi(gt, [gt] + gens)
scores

In [None]:
# One precision / recall / F1 entry per generated report.
p, r, f, raw_labels = calculate_chexpert(gt, gens)
p, r, f

## Out of reach info

In [None]:
# Ground truth with one finding sentence and two sentences (comparison,
# contacted doctor) around it.
gt = 'comparison to previous exam . heart size is enlarged . dr xxxx was contacted .'
gens = [
    # Same sentences with the 'xxxx' token removed.
    # NOTE(review): 'exam.' has no space before the period, unlike every
    # other sentence here; it may be an unintended extra token difference
    # -- confirm.
    'comparison to previous exam. heart size is enlarged . dr was contacted .',
    # Only the finding sentence.
    'heart size is enlarged .',
]

In [None]:
# calculate_nlp prints one summary line per generated report (show=True).
for gen in gens:
    bleus, rouge, cider = calculate_nlp(gt, gen)

In [None]:
# One precision / recall / F1 entry per generated report.
p, r, f, raw_labels = calculate_chexpert(gt, gens)
p, r, f

In [None]:
# gt is also passed as the first candidate, so scores[0] is the ground truth
# scored against itself.
scores, attrs = calculate_mirqi(gt, [gt] + gens)
scores

# Toy samples 2

Error gradation examples

## First example: Pleural Effusion

In [None]:
# Each candidate changes exactly one word of the ground truth.
gt = 'there is a large right sided effusion .'
gens = [
    # Size changed: large -> minimal.
    'there is a minimal right sided effusion .',
    # Side changed: right -> left.
    'there is a large left sided effusion .',
    # Finding changed: effusion -> mass.
    'there is a large right sided mass .',
]

In [None]:
# calculate_nlp prints one summary line per generated report (show=True).
for gen in gens:
    bleus, rouge, cider = calculate_nlp(gt, gen)

In [None]:
# One precision / recall / F1 entry per generated report.
p, r, f, raw_labels = calculate_chexpert(gt, gens)
p, r, f

In [None]:
# gt is also passed as the first candidate, so scores[0] is the ground truth
# scored against itself.
scores, attrs = calculate_mirqi(gt, [gt] + gens)
scores

In [None]:
# Extracted attributes: ground truth first, then the candidates passed above.
attrs

## Second example: Atelectasis vs pneumonia

In [None]:
# Each candidate changes one part of the ground-truth sentence.
gt2 = 'opacities in the lung bases may represent atelectasis .'
gens2 = [
    # Location changed: lung bases -> left lung.
    'opacities in the left lung may represent atelectasis .',
    # Suggested finding changed: atelectasis -> pneumonia.
    'opacities in the lung bases may represent pneumonia .',
]

In [None]:
# calculate_nlp prints one summary line per generated report (show=True).
for gen2 in gens2:
    bleus, rouge, cider = calculate_nlp(gt2, gen2)

In [None]:
# verbose=True also prints the mapped label matrix.
prec, recall, f1, raw_labels = calculate_chexpert(gt2, gens2, verbose=True)
prec, recall, f1

In [None]:
# gt2 is also passed as the first candidate, so scores2[0] is the ground
# truth scored against itself.
scores2, attrs2 = calculate_mirqi(gt2, [gt2] + gens2)
scores2

In [None]:
# Extracted attributes: ground truth first, then the candidates passed above.
attrs2

# MIRQI extraction example

In [None]:
%run ../../metrics/report_generation/mirqi.py

In [None]:
# Ground truth: normal heart, right effusion, hiatal hernia.
gt = 'heart size is normal . right effusion is present . there is a moderate hiatal hernia .'
gens = [
    # Candidate: effusion on both sides, atelectasis and cardiomegaly; no
    # hernia sentence.
    'right effusion with mild atelectasis . left effusion is also present . cardiomegaly is present .',
]

In [None]:
# Unlike the toy samples above, gt is not prepended to the candidates, so
# scores has a single entry.
scores, attributes = calculate_mirqi(gt, gens)
scores

In [None]:
# Extracted attributes: ground truth first, then the single candidate.
attributes

## Patch MIRQI

By adding explicit negative sentences for the findings that a report leaves unmentioned

In [None]:
%run ../../metrics/report_generation/mirqi.py

In [None]:
# Append a negative sentence to each report of the extraction example:
# atelectasis is negated in the ground truth, hernia in the candidate.
gt2 = gt + ' no atelectasis .'
gens2 = [
    gens[0] + ' no hernia .',
]

In [None]:
# Same call as in the unpatched example, now on the patched reports.
scores, attributes = calculate_mirqi(gt2, gens2)
scores

In [None]:
# Extracted attributes: ground truth first, then the single candidate.
attributes

In [None]:
%run ../../metrics/report_generation/mirqi.py

In [None]:
# Unpacking requires exactly two entries: the ground-truth attributes and
# those of the single candidate scored above.
[gt_attr, gen_attr] = attributes

In [None]:
# Call MIRQI directly on the two attribute sets, wrapped in lists the same
# way calculate_mirqi does.
s = MIRQI([gt_attr], [gen_attr])
s

# Other

In [None]:
# Non-medical sentence pair used as a quick check of the NLP metrics.
gt = 'the fox then jumped over the puppy'
gen = 'the fox jumped over the dog'

In [None]:
# r is the (bleus, rouge, cider) tuple; this rebinds the name `r` that
# earlier cells used for recall.
r = calculate_nlp(gt, gen)
r