## Import

In [None]:
%env CUDA_VISIBLE_DEVICES=2,3
!echo $CUDA_VISIBLE_DEVICES

In [None]:
import torch

In [None]:
# DEVICE = torch.device('cpu')
DEVICE = torch.device('cuda')
DEVICE

In [None]:
from tqdm.notebook import tqdm

In [None]:
%run ../datasets/iu_xray.py
%run ../training/report_generation/hierarchical.py
%run ../utils/nlp.py

In [None]:
train_dataset = IUXRayDataset('train')
len(train_dataset)

## Compute dataset stats

In [None]:
from collections import Counter

In [None]:
def get_dataset_stats(dataset):
    """Collect word-frequency and length statistics over a report dataset.

    Args:
        dataset: iterable yielding ``(image, report)`` pairs, where ``report``
            is a list of word indices. The reports are only read, never
            modified.

    Returns:
        Tuple ``(word_appearances, words_count, sentences_count)``:
            - word_appearances: dict mapping each word to the number of times
              it appears across all reports (the implicit closing dot
              described below is not included in these counts).
            - words_count: list with the number of words of each report.
            - sentences_count: list with the number of
              ``END_OF_SENTENCE_IDX`` tokens of each report.

        A non-empty report that does not end with ``END_OF_SENTENCE_IDX`` is
        counted as if the closing dot were present (one extra word and one
        extra sentence). An empty report counts as 0 words and 0 sentences.
    """
    word_appearances = dict()

    words_count = []
    sentences_count = []

    for _image, report in tqdm(dataset):
        # Save appearances
        for word in report:
            word_appearances[word] = word_appearances.get(word, 0) + 1

        # Reports must end with a dot: account for a missing one in the
        # counts instead of appending to `report`, which would mutate the
        # object handed out by the dataset.
        has_words = len(report) > 0
        missing_final_dot = int(has_words and report[-1] != END_OF_SENTENCE_IDX)

        # Count words
        words_count.append(len(report) + missing_final_dot)

        # Count sentences
        n_sentences = report.count(END_OF_SENTENCE_IDX) + missing_final_dot
        sentences_count.append(n_sentences)

    return word_appearances, words_count, sentences_count

In [None]:
word_appearances, words_count, sentences_count = get_dataset_stats(train_dataset)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def plot_counter(counter, title):
    """Draw a bar chart with one bar per key of `counter`, titled `title`."""
    positions = list(counter)
    heights = list(counter.values())

    plt.bar(positions, heights)
    plt.title(title)

In [None]:
plot_counter(Counter(words_count), 'words')

In [None]:
plot_counter(Counter(sentences_count), 'sentences')

In [None]:
counter = Counter(sentences_count)

In [None]:
def get_percentiles(counter):
    """Return the cumulative percentage of observations up to each key.

    Args:
        counter: mapping from a sortable key (e.g. a number of words) to the
            number of times that key was observed.

    Returns:
        List of ``(key, cumulative_percentage)`` tuples sorted by key, where
        the percentage is the share of all observations whose key is less
        than or equal to ``key``. Returns an empty list for an empty counter.
    """
    items = sorted(counter.items())
    if not items:
        return []

    keys, values = zip(*items)

    cumulative = np.cumsum(values)
    # The last cumulative value is the total number of observations
    total = cumulative[-1]

    return list(zip(keys, cumulative / total * 100))

In [None]:
get_percentiles(Counter(words_count))

In [None]:
get_percentiles(Counter(sentences_count))

## Evaluate models in subsets

In [None]:
%run -n train_report_generation.py
%run datasets/__init__.py
%run models/checkpoint/__init__.py
%run training/report_generation/flat.py
%run training/report_generation/hierarchical.py
%run models/report_generation/__init__.py

In [None]:
def eval_in_subset(run_name, compiled_model, debug=True, max_n_words=None, max_n_sentences=None,
                   device='cuda', batch_size=50):
    """Evaluate a model on subsets of IU X-ray limited by report length.

    Builds the train/val/test datasets with the model's vocabulary, keeps the
    subset selected by ``create_report_dataset_subset`` for the given limits,
    and calls ``evaluate_and_save`` with a suffix describing those limits.

    Args:
        run_name: name of the run, forwarded to ``evaluate_and_save``.
        compiled_model: object exposing ``.model`` and ``.metadata`` (with
            the ``'vocab'`` and ``'decoder_kwargs'`` entries).
        debug: forwarded to ``evaluate_and_save``.
        max_n_words: word limit forwarded to ``create_report_dataset_subset``
            (None for no limit).
        max_n_sentences: sentence limit forwarded to
            ``create_report_dataset_subset`` (None for no limit).
        device: forwarded to ``evaluate_and_save``.
        batch_size: batch size used for the three dataloaders.
    """
    # Create datasets
    vocab = compiled_model.metadata['vocab']
    train_dataset = IUXRayDataset('train', vocab=vocab)
    val_dataset = IUXRayDataset('val', vocab=vocab)
    test_dataset = IUXRayDataset('test', vocab=vocab)

    # Prepare subsets
    subset_kwargs = {
        'max_n_words': max_n_words,
        'max_n_sentences': max_n_sentences,
    }

    train_subset = create_report_dataset_subset(train_dataset, **subset_kwargs)
    val_subset = create_report_dataset_subset(val_dataset, **subset_kwargs)
    test_subset = create_report_dataset_subset(test_dataset, **subset_kwargs)

    # Decide hierarchical
    decoder_name = compiled_model.metadata['decoder_kwargs']['decoder_name']
    hierarchical = is_decoder_hierarchical(decoder_name)
    if hierarchical:
        create_dataloader = create_hierarchical_dataloader
    else:
        create_dataloader = create_flat_dataloader

    # Create dataloaders
    train_dataloader = create_dataloader(train_subset, batch_size=batch_size)
    val_dataloader = create_dataloader(val_subset, batch_size=batch_size)
    test_dataloader = create_dataloader(test_subset, batch_size=batch_size)

    # Create a suffix naming every limit that was applied, so results computed
    # with different limits never share a name.
    suffix_parts = []
    if max_n_words is not None:
        suffix_parts.append(f'max-words-{max_n_words}')
    if max_n_sentences is not None:
        suffix_parts.append(f'max-sentences-{max_n_sentences}')
    # NOTE(review): with no limits the suffix is '' — assumes
    # evaluate_and_save accepts an empty suffix; confirm.
    suffix = '_'.join(suffix_parts)

    evaluate_and_save(run_name,
                      compiled_model.model,
                      train_dataloader,
                      val_dataloader,
                      test_dataloader,
                      hierarchical=hierarchical,
                      debug=debug,
                      device=device,
                      suffix=suffix,
                     )

In [None]:
# Maximum-word limits used to build the evaluation subsets.
# The trailing percentages are presumably the cumulative share of reports with
# at most that many words (cf. the get_percentiles output) — TODO confirm.
eval_n_words = [
    20 , # --> 15%
    25 , # --> 26%
    27 , # --> 33%
    33 , # --> 50%
#     39 , # --> 66%
#     41 , # --> 70%
    44 , # --> 75%
#     47 , # --> 80%
#     58 , # --> 90%
    # None, # --> 100%
]

In [None]:
# Maximum-sentence limits used to build the evaluation subsets.
# The trailing numbers are presumably the cumulative percentage of reports
# with at most that many sentences (cf. the get_percentiles output) — TODO confirm.
eval_n_sentences = [
#     1, # 1.2324835387472564
#     2, # 4.761100793516799
    3, # 25.730204288367382
    4, # 55.10720918453487
    5, # 76.66722944453824
    6, # 89.39726489954415
#     7, # 95.03629917271653
#     8, # 97.6194496032416
#     9, # 98.86881647813608
#     10, # 99.42596657099443
#     11, # 99.71298328549722
#     12, # 99.89869998311667
#     13, # 99.96623332770555
#     17, # 99.98311666385278
#     18, # 100
]

In [None]:
# Runs to evaluate in subsets; commented-out entries are skipped
run_names = [
#     '0717_041434_lstm_lr0.0001_densenet-121',
    '0716_211601_lstm-att_lr0.0001_densenet-121', # 33 and 34 are missing
#     '0717_015057_h-lstm_lr0.0001_densenet-121',
#     '0716_234501_h-lstm-att_lr0.0001_densenet-121',
]
debug = False

In [None]:
# For every run: load the model once, then evaluate it on each word-limited
# subset followed by each sentence-limited subset.
for run_name in run_names:
    compiled_model = load_compiled_model_report_generation(run_name,
                                                           debug=debug,
                                                           multiple_gpu=True,
                                                           device=DEVICE)
    # Subsets limited by number of words
    for n_words in tqdm(eval_n_words):
        eval_in_subset(run_name,
                       compiled_model,
                       max_n_words=n_words,
                       max_n_sentences=None,
                       debug=debug,
                       device=DEVICE,
                      )
    # Subsets limited by number of sentences
    for n_sentences in tqdm(eval_n_sentences):
        eval_in_subset(run_name,
                       compiled_model,
                       max_n_words=None,
                       max_n_sentences=n_sentences,
                       debug=debug,
                       device=DEVICE,
                      )

## Eval H-LSTM outputs

### Load model

In [None]:
%run ../models/report_generation/__init__.py
%run ../models/checkpoint/__init__.py

In [None]:
# run_name = '0716_234501_h-lstm-att_lr0.0001_densenet-121'
# run_name = '0717_015057_h-lstm_lr0.0001_densenet-121'
# run_name = '0720_192858_lstm_lr0.0001_densenet-121_size256'
# run_name = '0717_041434_lstm_lr0.0001_densenet-121'
run_name = '0716_211601_lstm-att_lr0.0001_densenet-121'
debug = False

In [None]:
compiled_model = load_compiled_model_report_generation(run_name,
                                                       debug=debug,
                                                       device=DEVICE,
                                                      )

_ = compiled_model.model.eval()

In [None]:
compiled_model.metadata['decoder_kwargs']

In [None]:
VOCAB = compiled_model.metadata['vocab']
len(VOCAB)

### Load data

In [None]:
%run ../datasets/iu_xray.py

In [None]:
dataset_kwargs = {
    'max_samples': None,
    'frontal_only': False,
    'image_size': (512, 512),
    'vocab': VOCAB,
}

train_dataset = IUXRayDataset(dataset_type='train', **dataset_kwargs)
val_dataset = IUXRayDataset(dataset_type='val', **dataset_kwargs)
test_dataset = IUXRayDataset(dataset_type='test', **dataset_kwargs)
len(train_dataset), len(val_dataset), len(test_dataset)

### Eval

In [None]:
from ignite.engine import Engine, Events

In [None]:
%run ../training/report_generation/hierarchical.py
%run ../training/report_generation/flat.py
%run ../utils/nlp.py
%run ../utils/__init__.py
%run ../metrics/report_generation/__init__.py

In [None]:
# Decoders whose name starts with 'h-' are treated as hierarchical; pick the
# matching step function and dataloader factory.
is_hierarchical = compiled_model.metadata['decoder_kwargs']['decoder_name'].startswith('h-')

get_step_fn, create_dataloader = (
    (get_step_fn_hierarchical, create_hierarchical_dataloader)
    if is_hierarchical
    else (get_step_fn_flat, create_flat_dataloader)
)

In [None]:
dataloader = create_dataloader(test_dataset, batch_size=1)

In [None]:
free = True

In [None]:
# Build an ignite Engine around the model's step function in evaluation mode
# (training=False), then attach the report-generation metrics and the report writer.
tester = Engine(get_step_fn(compiled_model.model, training=False, free=free, device=DEVICE))
attach_metrics_report_generation(tester, hierarchical=is_hierarchical, free=free)
# NOTE(review): debug=True is hard-coded here although `debug` is set to False earlier — confirm this is intended
attach_report_writer(tester, VOCAB, run_name, debug=True)

In [None]:
# NOTE(review): tester.state.dataloader is presumably only populated once
# tester.run(...) has been called — confirm this cell is run after the next one.
tester.state.dataloader.dataset.dataset_type

In [None]:
tester.run(dataloader, 1)

In [None]:
tester.state.metrics

In [None]:
def eval_sample(compiled_model, image, report,
                show=True, device=DEVICE, free=False, **kwargs):
    """Generate a report for one sample and return it with the ground truth.

    Args:
        compiled_model: object exposing ``.model`` and ``.metadata`` (with
            the ``'vocab'`` and ``'decoder_kwargs'`` entries).
        image: image tensor without batch dimension (one is added here).
        report: ground-truth report as a sequence of word indices.
        show: if True, print the original and the generated report.
        device: device where the inputs are moved before calling the model.
        free: forwarded to the model call.
        **kwargs: extra arguments forwarded to the model call.
            ``max_sentences`` is dropped for non-hierarchical decoders, so
            the same call works for both kinds of decoder.

    Returns:
        Tuple ``(original_report, generated_report)`` with both reports
        converted to text by ``ReportReader.idx_to_text``.
    """
    report_reader = ReportReader(compiled_model.metadata['vocab'])

    decoder_name = compiled_model.metadata['decoder_kwargs']['decoder_name']
    is_hierarchical = decoder_name.startswith('h-')

    # Prepare inputs
    images = image.unsqueeze(0).to(device)
    if is_hierarchical:
        reports = split_sentences_and_pad(report)
    else:
        reports = torch.tensor(report)

    reports = reports.unsqueeze(0).to(device)

    # Pass thru model
    if not is_hierarchical:
        # Optional for the caller: do not fail if it was never passed
        kwargs.pop('max_sentences', None)

    # Inference only: avoid tracking gradients
    with torch.no_grad():
        tup = compiled_model.model(images, reports, free=free, **kwargs)

    # Parse outputs
    if is_hierarchical:
        generated = _flatten_gen_reports(tup[0], tup[1])
    else:
        # Keep the index with the highest value along the last dimension
        _, generated = tup[0].max(dim=-1)

    generated = generated.squeeze(0).cpu()

    # Print result
    original_report = report_reader.idx_to_text(report)
    generated_report = report_reader.idx_to_text(generated)
    if show:
        print(original_report)
        print('-'*20)
        print(generated_report)

    return original_report, generated_report

In [None]:
idx = 200

image, report = train_dataset[idx]

gt, gen = eval_sample(compiled_model, image, report,
                      free=True, max_sentences=100, max_words=100)

### Eval metrics on dataset

#### BLEU

In [None]:
from pycocoevalcap.bleu import bleu_scorer

In [None]:
s = bleu_scorer.BleuScorer(n=4)
s += (gen, [gt])
scores_avg, scores_all = s.compute_score()
scores_avg, len(scores_all[0])

In [None]:
def eval_dataset(dataset, max_samples=None, free=False):
    """Compute BLEU (up to 4-grams) for generated reports over a dataset.

    Uses the notebook-level ``compiled_model`` to generate each report via
    ``eval_sample``.

    Args:
        dataset: indexable dataset returning ``(image, report)`` pairs.
        max_samples: evaluate only the first ``max_samples`` samples. None
            (or a value larger than the dataset) evaluates all of them.
        free: forwarded to ``eval_sample``.

    Returns:
        Tuple ``(scores_avg, scores_all, report_lens)``: the two outputs of
        ``BleuScorer.compute_score()`` (the second converted to a numpy
        array) and the list of ground-truth report lengths.
    """
    scorer = bleu_scorer.BleuScorer(n=4)
    report_lens = []

    # Never index past the end of the dataset
    n_samples = len(dataset)
    if max_samples is not None:
        n_samples = min(max_samples, n_samples)

    for idx in tqdm(range(n_samples)):
        image, report = dataset[idx]

        report_lens.append(len(report))

        gt, gen = eval_sample(compiled_model, image, report, show=False,
                              free=free, max_sentences=100, max_words=20)

        scorer += (gen, [gt])
    scores_avg, scores_all = scorer.compute_score()
    scores_all = np.array(scores_all)

    return scores_avg, scores_all, report_lens

In [None]:
train_results = eval_dataset(train_dataset, free=True)
train_results[0]

In [None]:
val_results = eval_dataset(val_dataset, free=True)
val_results[0]

In [None]:
test_results = eval_dataset(test_dataset, free=True)
test_results[0]

#### Rouge-L

In [None]:
from pycocoevalcap.rouge import rouge

In [None]:
idx = 800

image, report = train_dataset[idx]

gt, gen = eval_sample(compiled_model, image, report,
                      free=True, max_sentences=100, max_words=100)

scorer = rouge.Rouge()
scorer.calc_score([gen], [gt])

In [None]:
def eval_dataset(dataset, max_samples=None, free=False):
    """Compute ROUGE-L for generated reports over a dataset.

    Uses the notebook-level ``compiled_model`` to generate each report via
    ``eval_sample``.

    Args:
        dataset: indexable dataset returning ``(image, report)`` pairs.
        max_samples: evaluate only the first ``max_samples`` samples. None
            (or a value larger than the dataset) evaluates all of them.
        free: forwarded to ``eval_sample``.

    Returns:
        Tuple ``(mean_score, scores, report_lens)``: the mean of the
        per-sample scores, the per-sample scores as a numpy array, and the
        list of ground-truth report lengths.
    """
    scorer = rouge.Rouge()
    scores = []
    report_lens = []

    # Never index past the end of the dataset
    n_samples = len(dataset)
    if max_samples is not None:
        n_samples = min(max_samples, n_samples)

    for idx in tqdm(range(n_samples)):
        image, report = dataset[idx]

        report_lens.append(len(report))

        gt, gen = eval_sample(compiled_model, image, report, show=False,
                              free=free, max_sentences=100, max_words=20)

        scores.append(scorer.calc_score([gen], [gt]))

    return np.mean(scores), np.array(scores), report_lens

In [None]:
test_results = eval_dataset(test_dataset, free=True)
test_results[0]

#### CIDEr

In [None]:
from pycocoevalcap.cider import cider_scorer

In [None]:
idx = 800
image, report = train_dataset[idx]

gt, gen = eval_sample(compiled_model, image, report,
                      free=True, max_sentences=100, max_words=100)

scorer = cider_scorer.CiderScorer(n=4)
scorer += (gen, [gt])

idx = 0
image, report = train_dataset[idx]

gt, gen = eval_sample(compiled_model, image, report,
                      free=True, max_sentences=100, max_words=100)
scorer += (gen, [gt])

scorer.compute_score()

In [None]:
def eval_dataset(dataset, max_samples=None, free=False):
    """Compute CIDEr (up to 4-grams) for generated reports over a dataset.

    Uses the notebook-level ``compiled_model`` to generate each report via
    ``eval_sample``.

    Args:
        dataset: indexable dataset returning ``(image, report)`` pairs.
        max_samples: evaluate only the first ``max_samples`` samples. None
            (or a value larger than the dataset) evaluates all of them.
        free: forwarded to ``eval_sample``.

    Returns:
        Tuple ``(scores_avg, scores_all, report_lens)``: the two outputs of
        ``CiderScorer.compute_score()`` (the second converted to a numpy
        array) and the list of ground-truth report lengths.
    """
    scorer = cider_scorer.CiderScorer(n=4)
    report_lens = []

    # Never index past the end of the dataset
    n_samples = len(dataset)
    if max_samples is not None:
        n_samples = min(max_samples, n_samples)

    for idx in tqdm(range(n_samples)):
        image, report = dataset[idx]

        report_lens.append(len(report))

        gt, gen = eval_sample(compiled_model, image, report, show=False,
                              free=free, max_sentences=100, max_words=20)

        scorer += (gen, [gt])
    scores_avg, scores_all = scorer.compute_score()
    scores_all = np.array(scores_all)

    return scores_avg, scores_all, report_lens

In [None]:
val_results = eval_dataset(val_dataset, free=False)
val_results[0]

In [None]:
test_results = eval_dataset(test_dataset, free=False)
test_results[0]

#### Plots

In [None]:
import matplotlib.pyplot as plt

In [None]:
results = test_results

In [None]:
all_scores = results[1]
# all_scores = all_scores.mean(axis=0)
# all_scores = all_scores[0, :]
all_scores.shape

In [None]:
lens = np.array(results[2])
lens.shape

###### Plot scatter

In [None]:
plt.scatter(lens, all_scores)
plt.xlabel('Report len')
plt.ylabel('Metric')

###### Fit linear regression

In [None]:
from sklearn.linear_model import LinearRegression as LR

In [None]:
lr = LR()
lr.fit(lens.reshape(-1, 1), all_scores)

In [None]:
lr.coef_, lr.intercept_

###### Plot hist

In [None]:
plt.hist(all_scores, bins=50)
plt.xlabel('Metric')
plt.ylabel('Occurences')

In [None]:
results[0]

In [None]:
results[1].mean(axis=0).shape

#### Eval sample with ordered scores

In [None]:
results = test_results
dataset = test_dataset

In [None]:
scores_all = results[1]
# BLEU
# ordered_values = [
#     (i, *scores_all[:, i]) 
#     for i in range(scores_all.shape[1])
# ]
ordered_values = list(enumerate(scores_all)) # ROUGE-L, CIDEr
len(ordered_values)

In [None]:
ordered_values = sorted(ordered_values, key=lambda x:x[1], reverse=True)

In [None]:
idx, *scores = ordered_values[2]

print(scores)
image, report = dataset[idx]

gt, gen = eval_sample(compiled_model, image, report,
                      free=True, max_sentences=100, max_words=100)

## Debug metrics

### Distinct words

In [None]:
%run ../metrics/report_generation/distinct_words.py

In [None]:
dw = DistinctWords()
dw.reset()

In [None]:
reports = torch.tensor([[1, 2, 3, 4],
                        [21, 12, 1, 0],
                       ])

In [None]:
reports2 = reports + 1
reports2

In [None]:
dw.update(reports)
dw.update(reports2)
dw.compute()

In [None]:
dw.words_seen

### Distinct sentences

In [None]:
%run ../metrics/report_generation/distinct_sentences.py

In [None]:
ds = DistinctSentences()
ds.reset()

In [None]:
reports = torch.tensor([
    [[11, 21, 37, 4],
     [29, 32, 52, 4],
    ],
    [[11, 21, 37, 4],
     [29, 32, 52, 4],
    ],
    [[12, 23, 47, 4],
     [30, 33, 53, 5],
    ],
])

In [None]:
ds.update(reports)
ds.update(reports + 1)
ds.compute()

In [None]:
ds.sentences_seen

## Debug others

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

In [None]:
a = np.array([[1, 0, 0],
              [0, 1, 0],
              [0, 1, 1],
             ])
b = np.array([[1, 0, 0],
              [0, 1, 0],
              [0, 1, 1],
             ])

In [None]:
roc_auc_score(a, b, average=None)

In [None]:
import hashlib

In [None]:
%run -n ../eval_report_generation_chexpert_labeler.py

In [None]:
run_name = '0915_172446_dummy-common-sentences-100'

In [None]:
evaluate_run(run_name, debug=True, max_samples=30, override=True)

## Debug MIRQI

In [None]:
import pandas as pd
import re

In [None]:
%run -n ../eval_report_generation_mirqi.py

### Load MIRQI output

In [None]:
df = pd.read_csv('~/software/MIRQI/testing2.csv')
df.fillna('', inplace=True)
df.head()

In [None]:
attributes_gt = _attributes_to_list(df['attributes-gt'])
attributes_gen = _attributes_to_list(df['attributes-gen'])
len(attributes_gt), len(attributes_gen)

In [None]:
df['attributes-gt']

In [None]:
%run -n ../eval_report_generation_mirqi.py

In [None]:
scores = MIRQI_v2(attributes_gt, attributes_gen)

In [None]:
scores['MIRQI-v2-attr-p']

In [None]:
idx = 2
attributes_gt[idx], attributes_gen[idx]

## MIRQI Examples

In [None]:
%run -n ../eval_report_generation_mirqi.py
# %run -n ~/software/MIRQI/evaluate.py

### MIRQI original def

In [None]:
def MIRQI(gt_list, cand_list, pos_weight=0.8, attribute_weight=0.3, verbose=False):
    """Score keyword and attribute matches between GT and candidate reports.

    Each report is a list of entities indexed as ``[_, label, status,
    attributes]``, where ``status`` is compared against ``'NEGATIVE'`` and
    ``attributes`` is a ``'/'``-separated string. Every candidate entity is
    compared against the first GT entity with the same label.

    Returns a dict with one value per report pair:
        'MIRQI-r' (recall: hits in gt)
        'MIRQI-p' (precision: correct ratio of all candidates)
        'MIRQI-f' (harmonic mean of both, 0.0 when both are 0)
    """
    eps = 0.000001

    recalls = []
    precisions = []
    f_scores = []

    for gt_entities, cand_entities in zip(gt_list, cand_list):
        tp = fp = tn = fn = 0.0

        n_pos_gt = sum(1 for entity in gt_entities if entity[2] != 'NEGATIVE')
        n_pos_cand = 0

        for cand in cand_entities:
            cand_is_negative = cand[2] == 'NEGATIVE'
            if not cand_is_negative:
                n_pos_cand += 1

            # Only the first GT entity sharing the candidate's label counts
            match = next((gt for gt in gt_entities if gt[1] == cand[1]), None)
            if match is None:
                continue

            gt_is_negative = match[2] == 'NEGATIVE'

            if cand_is_negative:
                if gt_is_negative:
                    tn = tn + 1     # true negative hit
                else:
                    fn = fn + 1     # false negative hit
                continue

            if gt_is_negative:
                fp = fp + 1         # false positive hit
                continue

            # True positive hit: keyword part
            tp = tp + 1.0 - attribute_weight

            if match[3] == '':
                continue

            # True positive hit: attributes part (fraction of GT attributes
            # found in the candidate's attribute string)
            gt_attributes = match[3].split('/')
            n_hits = sum(1 for attribute in gt_attributes if attribute in cand[3])
            tp = tp + n_hits / len(gt_attributes) * attribute_weight

        # Score for positive/uncertain mentions
        if n_pos_gt == 0 or n_pos_cand == 0:
            # Perfect only when neither side has positive mentions
            score_r = score_p = 1.0 if n_pos_gt == n_pos_cand else 0.0
        else:
            score_r = tp / (tp + fn + eps)
            score_p = tp / (tp + fp + eps)

        # Blend in the score for negative mentions
        if tn != 0:
            score_r = score_r * pos_weight + tn / (tn + fp + eps) * (1.0 - pos_weight)
            score_p = score_p * pos_weight + tn / (tn + fn + eps) * (1.0 - pos_weight)

        recalls.append(score_r)
        precisions.append(score_p)

        rec_prec = (score_r + score_p)
        f_scores.append(2*(score_r * score_p) / rec_prec if rec_prec != 0.0 else 0.0)

    return {
        'MIRQI-r': recalls,
        'MIRQI-p': precisions,
        'MIRQI-f': f_scores,
    }

### Robust matching

#### Repeated nodes with different attributes

In [None]:
# GT and generated reports contain the same two entities, listed in a
# different order. The MIRQI defined in this notebook compares each candidate
# against the FIRST GT entity with the same label, so the 'left/pleural'
# candidate is scored against the 'right/present' GT attributes.
report_gt = "right effusion with mild atelectasis. left effusion is also present."
entities_gt = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right/present'],
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'left/pleural'],
]
report_gen = report_gt
entities_gen = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'left/pleural'],
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right/present'],
]
# Compare the scores of both implementations side by side
{
    **MIRQI([entities_gt], [entities_gen]),
    **MIRQI_v2([entities_gt], [entities_gen]),
}

#### GT nodes matched twice

In [None]:
# The generated report has an extra entity with the same label as the single
# GT entity. In the MIRQI defined in this notebook both candidates match that
# GT entity and both add to tp; fp is only incremented when the matched GT
# entity is NEGATIVE, so the extra 'left' effusion is not penalized.
report_gt = "right pleural effusion."
entities_gt = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right'],
]
report_gen = "right pleural effusion. left pleural effusion"
entities_gen = [
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'right'],
    ['effusion', 'Pleural Effusion', 'POSITIVE', 'left'],
]
# Compare the scores of both implementations side by side
{
    **MIRQI([entities_gt], [entities_gen]),
    **MIRQI_v2([entities_gt], [entities_gen]),
}