# Imports

In [None]:
from collections import Counter
from tqdm import tqdm

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: white figure background and a wide
# (15 x 5) default figure size.
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import numpy as np

In [None]:
import pandas as pd
# Do not truncate the column list when displaying DataFrames.
pd.options.display.max_columns = None

In [None]:
# Execute the project's utils package init inside the notebook namespace.
# NOTE(review): `config_logging` and `logging` are not imported in this
# notebook, so they presumably come from that script -- confirm.
%run ../../utils/__init__.py
config_logging(logging.INFO)

# Load outputs

In [None]:
# Pull the project helpers into the notebook namespace.
# NOTE(review): presumably these scripts define `RunId` and
# `load_rg_outputs`, which are used below -- confirm.
%run ../../utils/files.py
%run ../../metrics/report_generation/writer.py

In [None]:
# Select the run to analyse: uncomment exactly one line.
# NOTE: the alternatives that assign only `run_name` do not set `best`, yet
# the loading cell below always passes `best` to load_rg_outputs. When
# switching to one of them, set `best` explicitly; otherwise it is either
# undefined or left over from a previously executed selection.
# run_name = '1119_183153' # mimic-cxr ST
run_name, best = '1113_185718', 'bleu4' # mimic-cxr SAT
# run_name, best = '1102_190559', None # mimic-cxr tpl single

# run_name, best = '1123_001440', 'bleu4' # IU SAT
# run_name = '1119_183609' # IU ST
# run_name = '1118_210821' # IU TPL grouped
# run_name, best = '1118_210509', None # IU TPL single

In [None]:
# Build the identifier of the selected (non-debug) run for the 'rg' task.
run_id = RunId(run_name, debug=False, task='rg')

In [None]:
# Load the outputs of the selected run into a DataFrame.
# NOTE(review): the meaning of `free`, `best` and `labeled` is defined by
# load_rg_outputs, which is not visible in this notebook.
df = load_rg_outputs(run_id, free=True, best=best, labeled=True)
# Preview the first two rows.
df.head(2)

In [None]:
# Number of rows per value of the 'dataset_type' column.
df['dataset_type'].value_counts()

In [None]:
# Keep only the rows whose dataset_type is 'test'; every later cell works on
# this subset.
df = df.loc[df['dataset_type'] == 'test']
# Display how many rows remain.
len(df)

# Compute metrics for samples

## NLP metrics

In [None]:
from pycocoevalcap.bleu.bleu_scorer import BleuScorer
from pycocoevalcap.cider.cider_scorer import CiderScorer

In [None]:
# Execute the local utils.py in the notebook namespace.
# NOTE(review): presumably defines add_nlp_metrics_to_df -- confirm.
%run ./utils.py

In [None]:
# Replace df with the result of add_nlp_metrics_to_df.
# NOTE(review): presumably this adds the bleu1-4 / rougeL / ciderD columns
# that are read further down -- confirm in utils.py.
df = add_nlp_metrics_to_df(df)
# df.head(2)

## CheX

In [None]:
from collections import defaultdict

In [None]:
from sklearn.metrics import precision_recall_fscore_support as prf1s

In [None]:
# Execute the local utils.py again in the notebook namespace.
# NOTE(review): presumably defines add_chex_metrics_to_df -- confirm.
%run ./utils.py

In [None]:
# Replace df with the result of add_chex_metrics_to_df.
# NOTE(review): presumably this adds the chex-prec / chex-recall / chex-f1 /
# chex-acc columns that are read further down -- confirm in utils.py.
df = add_chex_metrics_to_df(df)
# df.head(2)

## Check out data points

In [None]:
# Names of the NLP metric columns read from df: bleu1..bleu4, rougeL, ciderD.
NLP_COLS = ['bleu' + str(order) for order in (1, 2, 3, 4)]
NLP_COLS.extend(('rougeL', 'ciderD'))
# Names of the CheX metric columns read from df, all prefixed with 'chex-'.
CHEX_COLS = ['chex-' + suffix for suffix in ['prec', 'recall', 'f1', 'acc']]

In [None]:
# Show the reference and generated texts next to every metric column for the
# first five rows.
cols = ['ground_truth', 'generated'] + NLP_COLS + CHEX_COLS
df[cols].head(5)

## Correlation matrix

In [None]:
import seaborn as sns

In [None]:
from scipy.stats import pearsonr
from itertools import product

In [None]:
def compute_corr_df(df, show=True):
    """Compute the pairwise Pearson correlation between all metric columns.

    Args:
        df: DataFrame containing every column listed in NLP_COLS and
            CHEX_COLS.
        show: if True, display a tqdm progress bar over the column pairs.

    Returns:
        Square float DataFrame whose index and columns are the metric names;
        entry (a, b) is the Pearson correlation between df[a] and df[b].
    """
    cols = NLP_COLS + CHEX_COLS

    # Extract each column once instead of once per pair.
    arrays = {col: df[col].to_numpy() for col in cols}

    corr_df = pd.DataFrame(columns=cols, index=cols)

    for col1, col2 in tqdm(product(cols, cols), disable=not show, total=len(cols) * len(cols)):
        # pearsonr also returns a p-value, which is not used here.
        corr, _ = pearsonr(arrays[col1], arrays[col2])
        corr_df.loc[col1, col2] = corr

    # The frame starts out empty with object dtype, so cast it to float.
    # The builtin `float` is used because the `np.float` alias was deprecated
    # in NumPy 1.20 and removed in 1.24.
    return corr_df.astype(float)

In [None]:
def plot_corr_df(corr_df):
    """Draw an annotated heatmap of a correlation matrix.

    Args:
        corr_df: DataFrame of correlation values; its index labels the rows
            and its columns label the columns of the heatmap.
    """
    # Heatmap rows run along the y-axis and columns along the x-axis, so the
    # index supplies the y tick labels and the columns the x tick labels.
    sns.heatmap(corr_df.to_numpy(), annot=True, square=True,
                cmap='Blues', fmt='.2f',
                xticklabels=list(corr_df.columns),
                yticklabels=list(corr_df.index))
    plt.title('Metrics pearson corr', fontsize=18)

In [None]:
# Pairwise Pearson correlations between all NLP and CheX metric columns.
corr_df = compute_corr_df(df)

In [None]:
# Print which run the figure belongs to, then draw the correlation heatmap.
print('Run name: ', run_id)
plot_corr_df(corr_df)

## Plot samples

In [None]:
def plot_scatter_and_pearson(df, col1, col2):
    """Scatter-plot two columns of df against each other.

    Args:
        df: DataFrame holding both columns.
        col1: name of the column drawn on the x-axis.
        col2: name of the column drawn on the y-axis.

    The Pearson correlation of the two columns is shown in the plot title.
    """
    xs, ys = (df[name].to_numpy() for name in (col1, col2))

    # Only the correlation coefficient is reported; the p-value is discarded.
    corr = pearsonr(xs, ys)[0]

    plt.scatter(xs, ys)
    plt.title(f'Samples (pearson = {corr:.3f})', fontsize=18)
    for set_label, text in ((plt.xlabel, col1), (plt.ylabel, col2)):
        set_label(text, fontsize=17)

In [None]:
# Example: bleu1 (x-axis) against chex-prec (y-axis) over the test samples.
plot_scatter_and_pearson(df, 'bleu1', 'chex-prec')