# Imports

In [None]:
import os
from collections import Counter, defaultdict
import importlib
import json
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)
plt.rcParams.update({'font.family': 'serif', 'font.sans-serif': ['CMU', 'Helvetica']})

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
%run ../../datasets/common/constants.py

In [None]:
%run -n ./nlp_in_chexpert_groups.py

# Utils

In [None]:
# Load experiments
def load_experiments(dataset_name):
    """Load the pickled experiment of every abnormality for one dataset.

    Iterates over ``CHEXPERT_DISEASES[1:]`` (the first entry is skipped) and
    looks for a pickle named ``<dataset_name>-<abnormality>``, with the
    abnormality lower-cased and its spaces replaced by dashes.

    Returns a dict mapping each abnormality to its loaded experiment.
    Pickles that do not exist are skipped and their names printed.
    """
    loaded = {}
    missing = []
    for abnormality in CHEXPERT_DISEASES[1:]:
        slug = abnormality.replace(" ", "-").lower()
        pickle_name = f'{dataset_name}-{slug}'
        if exist_experiment_pickle(pickle_name):
            loaded[abnormality] = load_experiment_pickle(pickle_name)
        else:
            missing.append(pickle_name)

    if missing:
        print('Not found: ', missing)

    return loaded

# Debug running experiments

In [None]:
%run -n ./nlp_in_chexpert_groups.py

In [None]:
dataset_info = init_dataset_info('iu')
dataset_info.name

In [None]:
# Build the experiment for one abnormality and display it.
# (The cell used to display `exp_LO`, a name that is not defined anywhere in
# this notebook, which raised a NameError.)
exp = init_experiment('Cardiomegaly', dataset_info)
exp

In [None]:
%%time

kwargs = {
    # 'metric': 'bleu',
    # 'metric': 'rouge',
    'metric': 'cider-IDF',
    'k_times': 500,
    # 'k_gts': 1,
    'max_n': 500,
}
exp.append(calc_score_matrices(exp.grouped_2, dataset_info, groups=(0, 1), **kwargs))
# exp.append(calc_score_matrices(exp.grouped, dataset_info, **kwargs))

In [None]:
exp[-1].cube

# Plot experiments

## Plot one example

In [None]:
# exp = load_experiment_pickle('iu-lung-opacity')
exp = load_experiment_pickle('mimic-cardiomegaly')
# exp = load_experiment_pickle('iu-cardiomegaly')
# exp = load_experiment_pickle('mimic-lung-lesion')
exp

In [None]:
[(i, r.metric, r.groups) for i, r in enumerate(exp.results)]

In [None]:
RESULT_I = 1
METRIC_I = 0

In [None]:
plt.figure(figsize=(15, 6))

plt.subplot(1, 2, 1)
plot_heatmap(exp, result_i=RESULT_I, metric_i=METRIC_I, annot_kws={'fontsize':13})

# if len(exp.results) > 1:
#     plt.subplot(1, 2, 2)
#     plot_heatmap(exp, result_i=-2, metric_i=METRIC_I)

In [None]:
%run -n ./nlp_in_chexpert_groups.py

In [None]:
def build_image_fpath(exp, result_i, metric_i, suffix=''):
    """Build (and print) the PDF path used to save a figure of one result.

    The file is ``nlp-vs-chex-<metric>-<abn>-<dataset>[-<suffix>].pdf`` inside
    ``FIGURES_DIR``, where ``<abn>`` is the lower-cased ``ABN_SHORTCUTS`` entry
    of the experiment abnormality. For the 'bleu' metric, ``metric_i + 1`` is
    appended to the metric name (e.g. ``bleu1``).
    """
    metric_name = exp.results[result_i].metric
    if metric_name == 'bleu':
        # Tell the BLEU variants apart in the file name
        metric_name = f'{metric_name}{metric_i+1}'

    abn_short = ABN_SHORTCUTS[exp.abnormality].lower()
    stem = f'nlp-vs-chex-{metric_name}-{abn_short}-{exp.dataset}'
    stem = f'{stem}-{suffix}' if suffix else stem

    fpath = os.path.join(FIGURES_DIR, f'{stem}.pdf')
    print('Filepath: ', fpath)
    return fpath

In [None]:
def build_suptitle(exp, result_i, metric_i):
    """Compose the figure suptitle for one result of an experiment.

    Format: ``<pretty metric> in <abnormality> sentences (<dataset> dataset)``.
    Any dataset other than ``'iu'`` is labelled 'MIMIC-CXR'.
    """
    metric_label = get_pretty_metric(exp.results[result_i].metric, metric_i=metric_i)
    if exp.dataset == 'iu':
        dataset_label = 'IU X-ray'
    else:
        dataset_label = 'MIMIC-CXR'
    return f'{metric_label} in {exp.abnormality} sentences ({dataset_label} dataset)'

In [None]:
# plt.figure(figsize=(8, 6))
shape = (2, 2) # Axes shape
ax1 = plt.subplot2grid(shape, (0, 0), rowspan=2)
ax_hist1 = plt.subplot2grid(shape, (0, 1))
ax_hist2 = plt.subplot2grid(shape, (1, 1))

_kw = {'xlabel_fontsize': 14, 'ylabel_fontsize': 14, 'title_fontsize': 15,
       'result_i': RESULT_I, 'metric_i': METRIC_I,
      }
plot_heatmap(exp, ax=ax1, title=False, annot_kws={'fontsize':13}, **_kw)

_kw = {'add_n_to_label': False, 'bins': 50, 'legend_fontsize': 12,
       'range': (0,1),
       **_kw}
plot_hists(exp,
    [
        (0, 0), (0, 1),
    ], title=False, xlabel=False, ax=ax_hist1, **_kw)
plot_hists(exp,
    [
        (1, 1), (1, 0),
    ], title=False, ax=ax_hist2, **_kw) # , range=(0,2)

# Set suptitle
plt.suptitle(build_suptitle(exp, RESULT_I, METRIC_I), fontsize=17)

# Set titles
ax_hist1.set_title('Scores distribution', fontsize=_kw['title_fontsize'])
ax1.set_title('Scores matrix', fontsize=_kw['title_fontsize'])

_LOG_SCALE = False
_SAVE = True

if _LOG_SCALE:
    ax_hist1.set_yscale('log')
    ax_hist2.set_yscale('log')

# increase fontsize of ticks in the first plot (HACKy way)
a = ax1.figure.axes[0] # get the first plot
a.set_xticklabels(a.get_xticklabels(), fontsize=12)
a.set_yticklabels(a.get_yticklabels(), fontsize=12)

_image_fpath = build_image_fpath(exp, RESULT_I, METRIC_I,
                                 suffix='logscale' if _LOG_SCALE else '')
if _SAVE:
    ax1.figure.savefig(_image_fpath, bbox_inches='tight')

## Plot many matrices

In [None]:
GROUPS = [0, 1]
# groups = [-2, 0, -1, 1]
SAMPLER = 'random-gen_k500_n500'

In [None]:
# keydict = { metric: i for i, metric in enumerate(metrics) } # Not necessary

def build_cubes_df(exp_by_abn, abnormalities, metrics, sampler=SAMPLER, groups=GROUPS):
    """Collect the score cube of every (abnormality, metric) pair in a DataFrame.

    Rows are ``abnormalities`` and columns are ``metrics``; each cell holds the
    ``cube`` of the matching result. Results are filtered by ``sampler`` and by
    ``groups`` (compared after sorting); passing None disables a filter.

    If the number of matching results of an abnormality differs from
    ``len(metrics)``, a message is printed and that row is left empty, which
    makes the final assertion fail.
    """
    def _is_wanted(res):
        if sampler is not None and res.sampler != sampler:
            return False
        return groups is None or sorted(res.groups) == sorted(groups)

    cubes_df = pd.DataFrame(index=abnormalities, columns=metrics)

    for abn in abnormalities:
        selected = [res for res in exp_by_abn[abn].results if _is_wanted(res)]

        if len(selected) != len(metrics):
            print(f'Diff amount of results than metrics: {abn}, {len(selected)}, {len(metrics)}')
            continue

        for res in selected:
            cubes_df.loc[abn, res.metric] = res.cube

    # Every cell must have been filled
    has_missing = cubes_df.isnull().any(axis=0).any(axis=0)
    assert not has_missing
    return cubes_df

In [None]:
def get_limits_per_abnormality(df, abns, metrics):
    """Compute the (min, max) score of each abnormality across several metrics.

    For every metric the first entry of the cube (``cube[0]``) is considered;
    for 'bleu' the entry ``cube[3]`` is considered as well.

    Returns a dict mapping abnormality to a ``(min, max)`` tuple.
    """
    slices_by_abn = defaultdict(list)
    for abn in abns:
        for metric in metrics:
            cube = df.loc[abn, metric]
            picked = [cube[0], cube[3]] if metric == 'bleu' else [cube[0]]
            slices_by_abn[abn].extend(picked)

    limits = {}
    for abn, slices in slices_by_abn.items():
        stacked = np.array(slices)
        limits[abn] = (stacked.min(), stacked.max())
    return limits

In [None]:
import matplotlib.gridspec as gridspec

In [None]:
def plot_many_matrices(exp_by_abn, abnormalities, metrics,
                       bleu_all=False, groups=GROUPS, sampler=SAMPLER,
                       outer=None, fig=None,
                      ):
    """Plot a grid of score heatmaps: one row per abnormality, one column per metric.

    The plotted columns are fixed: BLEU (``metric_i`` 0 and 3, plus 1 and 2
    when ``bleu_all``), ROUGE and CIDEr-IDF, so ``metrics`` must make
    ``build_cubes_df`` provide the 'bleu', 'rouge' and 'cider-IDF' columns.
    BLEU and ROUGE heatmaps of one abnormality share the color limits returned
    by ``get_limits_per_abnormality``; CIDEr heatmaps use their own scale.

    Args:
        exp_by_abn: dict of experiments by abnormality name.
        abnormalities: abnormalities to plot, one per row.
        metrics: metrics forwarded to ``build_cubes_df``.
        bleu_all: also plot the BLEU entries with ``metric_i`` 1 and 2.
        groups: group keys of the results to plot; also mapped through
            ``KEY_TO_LABEL`` to get the tick labels.
        sampler: sampler name of the results to plot.
        outer: optional SubplotSpec. When given, the grid is nested inside it
            and the axes are added to ``fig`` instead of creating a new figure.
        fig: figure used together with ``outer`` (defaults to the current one).
    """
    print('Building cube...')
    cubes_df = build_cubes_df(exp_by_abn, abnormalities, metrics, groups=groups,
                              sampler=sampler)

    limits = get_limits_per_abnormality(cubes_df, abnormalities, ['bleu', 'rouge'])
    # limits_cider = get_limits_per_abnormality(cubes_df, abns, ['cider-IDF'])

    TICKS = [KEY_TO_LABEL[k] for k in groups]

    # (index inside the cube, metric name) of every plotted column
    metrics_plotable = [(0, 'bleu')]
    if bleu_all:
        metrics_plotable.extend([(1, 'bleu'), (2, 'bleu')])
    metrics_plotable.extend([(3, 'bleu'), (0, 'rouge'), (0, 'cider-IDF')])

    n_rows = len(abnormalities)
    n_cols = len(metrics_plotable)

    if outer is not None:
        # Nested layout: build the axes inside the given outer SubplotSpec
        if fig is None:
            fig = plt.gcf()

        inner = gridspec.GridSpecFromSubplotSpec(
            n_rows, n_cols, subplot_spec=outer, wspace=0.15, hspace=0.1)
        get_ax = lambda i, j: plt.Subplot(fig, inner[i, j])
    else:
        # squeeze=False keeps `axes` 2-D even with a single abnormality,
        # so axes[i][j] is always valid
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*5, n_rows*5),
                                 squeeze=False)
        get_ax = lambda i, j: axes[i][j]

    for abn_i, abn in enumerate(abnormalities):
        for metric_col_j, (metric_i, metric) in enumerate(metrics_plotable):
            cube = cubes_df.loc[abn, metric]

            # cbar params
            if 'cider' in metric:
                cbar_params = {'cmap': 'Blues'} # 'vmin': MIN_CIDER, 'vmax': MAX_CIDER,
            else:
                min_value, max_value = limits[abn]
                cbar_params = {'vmin': min_value, 'vmax': max_value, 'cmap': 'YlOrRd'}
                # cbar_params = {'vmin': 0, 'vmax': max_value + 0.2, 'cmap': 'YlOrRd'}

            ax = get_ax(abn_i, metric_col_j)
            a = sns.heatmap(
                cube[metric_i], annot=True, square=True,
                xticklabels=TICKS, yticklabels=TICKS, fmt='.3f', # robust=True,
                cbar=True,
                annot_kws={'fontsize':15},
                ax=ax,
                **cbar_params,
            )
            a.set_xticklabels(a.get_xticklabels(), fontsize=13)
            a.set_yticklabels(a.get_yticklabels(), fontsize=13)

            title_metric = True # (abn_i == 0)
            include_ylabel = (metric_col_j == 0)
            include_xlabel = False # True # (abn_i == n_rows - 1)

            if title_metric:
                pretty_metric = get_pretty_metric(metric, metric_i=metric_i, include_range=True)
                ax.set_title(pretty_metric, fontsize=18)

            if include_xlabel:
                ax.set_xlabel('Generated', fontsize=18)

            if include_ylabel:
                ax.set_ylabel(f'{abn}', fontsize=18) # \nGround Truth

            if outer is not None:
                # Axes built with plt.Subplot must be added to the figure by hand
                fig.add_subplot(ax)

In [None]:
# exp_by_abn_iu = load_experiments('iu')
# exp_by_abn_mimic = load_experiments('mimic')
len(exp_by_abn_iu), len(exp_by_abn_mimic)

In [None]:
exp_by_abn, dataset_name = exp_by_abn_iu, 'iu'
# exp_by_abn, dataset_name = exp_by_abn_mimic, 'mimic'
# abnormalities = ['Atelectasis', 'Cardiomegaly', 'Pleural Effusion']
abns_half1 = CHEXPERT_DISEASES[1:7]
abns_half2 = CHEXPERT_DISEASES[7:]
metrics = ['bleu', 'rouge', 'cider-IDF']
# groups = [0, 1]
groups = [-2, -1, 0, 1]

In [None]:
_kw = {
    'metrics': metrics, 'bleu_all': True,
    # 'save': True, # 'suptitley': 0.90,
    'groups': groups,
    'sampler': ('random-gen_k500_n500' if dataset_name == 'iu' else 'random-gen_k50_n100'),
}

n_abns = max(len(abns_half1), len(abns_half2))
n_cols = 2 * 6
fig = plt.figure(figsize=(n_cols * 5, n_abns * 5))
outer = gridspec.GridSpec(1, 2, wspace=0.15, hspace=0.1)

plot_many_matrices(exp_by_abn, abns_half1, outer=outer[0], fig=fig, **_kw)
plot_many_matrices(exp_by_abn, abns_half2, outer=outer[1], fig=fig, **_kw)

dataset = 'IU X-ray' if dataset_name == 'iu' else 'MIMIC-CXR'
suptitle = f'Matrices for multiple abnormalities and NLP metrics ({dataset} dataset)'
plt.suptitle(suptitle, fontsize=26, y=0.9)

# HACK: there is a dummy bbox outside # remove it manually
ax_dummy = fig.axes[0]
if not bool(ax_dummy.get_label()) and not bool(ax_dummy.get_title()):
    # (make sure it does not remove an important axis!)
    ax_dummy.remove()

_save = True
_close = False

_fig_fpath = os.path.join(
    FIGURES_DIR,
    f'nlp-vs-chex-all-{len(groups)}x{len(groups)}-{dataset_name}.pdf',
)
print(f'Filepath (save={_save}): {_fig_fpath}')
if _save:
    fig.savefig(_fig_fpath, bbox_inches='tight')
    
if _close:
    plt.close(fig)

In [None]:
# def _save_many_to_file(fig, groups, abnormalities, metrics_plotable, suffix='', save=False):
#     # Build image_fpath
#     _fig_fname = '-'.join(s for s in [
#         'nlp-vs-chex-many',
#         f'{len(groups)}x{len(groups)}',
#         dataset_name,
#         f'{len(abnormalities)}A',
#         f'{len(metrics)}M',
#         suffix,
#     ] if s)
#     _fig_fpath = os.path.join(
#         FIGURES_DIR,
#         f'{_fig_fname}.pdf',
#     )
#     print(f'Filepath (save={save}): {_fig_fpath}')
#     if save:
#         fig.savefig(_fig_fpath, bbox_inches='tight')

## Plot matrices for each experiment in pdfs

For each experiment, plot all of its matrices and save them in a single PDF file.

In [None]:
import math

In [None]:
def plot_exp_matrices(exp, target_groups=4, save=True, n_rows=2):
    """Plot all the score matrices of one experiment in a single figure.

    One heatmap is drawn per target: the four BLEU entries (``metric_i`` 0-3)
    of the BLEU result, plus the ROUGE and CIDEr-IDF results, all taken from
    the results that have ``target_groups`` groups.

    Args:
        exp: experiment providing ``results``, ``abnormality`` and ``dataset``.
        target_groups: number of groups a result must have to be plotted.
        save: whether to write the figure as a PDF inside ``FIGURES_DIR``.
            The flag used to be overwritten with True inside the function, so
            the default is True to keep argument-less calls saving as before;
            ``save=False`` is now honored.
        n_rows: number of subplot rows.

    Returns:
        The matplotlib figure (left open; the caller may close it).

    Raises:
        AssertionError: if a metric does not have exactly one result with
            ``target_groups`` groups.
    """
    def _find_result(m):
        # Index of the only result for metric `m` with `target_groups` groups
        ress = [
            i
            for i, r in enumerate(exp.results)
            if r.metric == m and len(r.groups) == target_groups
        ]
        assert len(ress) == 1, ress
        return ress[0]

    # (result index, metric index) of every heatmap, in plotting order
    bleu_result = _find_result('bleu')
    targets = [(bleu_result, metric_i) for metric_i in range(4)]
    targets.append((_find_result('rouge'), 0))
    targets.append((_find_result('cider-IDF'), 0))

    n_cols = math.ceil(len(targets) / n_rows)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6*n_cols, 5*n_rows))
    axes = axes.flatten()

    _kw = {'xlabel_fontsize': 14, 'ylabel_fontsize': 14, 'title_fontsize': 16}

    for ax_i, (ax, (result_i, metric_i)) in enumerate(zip(axes, targets)):
        # x-labels only on the last row, y-labels only on the first column
        plot_heatmap(exp, ax=ax, result_i=result_i, metric_i=metric_i,
                     xlabel=(ax_i // n_cols + 1 == n_rows),
                     ylabel=(ax_i % n_cols == 0),
                     title=False, annot_kws={'fontsize':12}, **_kw)

        pretty_metric = get_pretty_metric(exp[result_i].metric, metric_i=metric_i)
        ax.set_title(pretty_metric, fontsize=_kw['title_fontsize'])

        ax.set_xticklabels(ax.get_xticklabels(), fontsize=12)
        ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)

    # Set suptitle
    suptitle = f'{exp.abnormality} in {"IU X-ray" if exp.dataset == "iu" else "MIMIC-CXR"}'
    fig.suptitle(suptitle, fontsize=20, y=0.94)

    prefix = f'nlp-vs-chex-matrices-{target_groups}'
    _image_fpath = os.path.join(
        FIGURES_DIR, f'{prefix}-{ABN_SHORTCUTS[exp.abnormality].lower()}-{exp.dataset}.pdf',
    )
    print('Filepath: ', _image_fpath)
    if save:
        fig.savefig(_image_fpath, bbox_inches='tight')

    return fig

In [None]:
# exp_by_abn = load_experiments('iu')
exp_by_abn = load_experiments('mimic')
len(exp_by_abn)

In [None]:
for exp in exp_by_abn.values():
    fig = plot_exp_matrices(exp)
    plt.close(fig) # Prevent showing inline

# Attempting to optimize threshold

## Attempt 1

(failed)

In [None]:
result = exp[0]
result

In [None]:
# target1, target2 = (0, 0), (0, 1) # TN, FP (specificity)
target1, target2 = (1, 1), (1, 0) # TP, FN (precision)

arr1 = result.dists[target1]
arr2 = result.dists[target2]
arr1.shape, arr2.shape

In [None]:
assert target1[0] == target1[1]
CORRECT = target1[0]
INCORRECT = 1 - CORRECT

merged = [(value, CORRECT) for value in arr1] + [(value, INCORRECT) for value in arr2]
merged = sorted(merged, reverse=bool(not CORRECT))
merged[:2], merged[-2:]

In [None]:
all_threshs = []
denominator = 0 # TP + FN
numerator = 0 # TP
for value, label in merged:
    current_thresh = value
    if label == CORRECT: # add 1 TP
        numerator += 1

    denominator += 1

    all_threshs.append((current_thresh, numerator / denominator))
all_threshs[:5]

In [None]:
max(all_threshs, key=lambda x: x[1])

In [None]:
x, y = tuple(zip(*all_threshs))
plt.plot(x, y)

## Attempt 2

with sklearn
Failed

In [None]:
from sklearn.metrics import precision_recall_curve as pr_curve

In [None]:
pred, gt = tuple(zip(*merged))
pred = np.array(pred)
gt = np.array(gt)
pred.shape, gt.shape

In [None]:
# pred /= 10 # CIDER re-scaling

In [None]:
precision, recall, thresholds = pr_curve(gt, pred, pos_label=CORRECT)
precision.shape, recall.shape, thresholds.shape

In [None]:
f1 = divide_arrays(2*precision*recall, precision + recall)
f1.shape

In [None]:
best_idx = f1.argmax()
best_idx

In [None]:
thresholds[best_idx], f1[best_idx], precision[best_idx], recall[best_idx]

## Attempt 3: accuracy/prec/recall

The CheXpert 4-class classification task turns into a binary classification task when it is evaluated with NLP scores
(i.e. NLP scores convey less information)

In [None]:
exp = load_experiment_pickle('mimic-cardiomegaly')
exp

In [None]:
result = exp.results[-1]
result.metric

In [None]:
result.dists

In [None]:
# Sorted (value, correct-or-not, original-key) triples.
# A key is "correct" (1) when both of its entries match, else 0.
merged = sorted(
    (value, int(key[0] == key[1]), key)
    for key in [(0, 0), (1, 1), (0, 1), (1, 0)]
    for value in result.dists[key]
)
len(merged), merged[:3]

In [None]:
n_correct = sum(1 for _, correct, _ in merged if correct)
n_incorrect = sum(1 for _, correct, _ in merged if not correct)
n_correct, n_incorrect

In [None]:
def smart_division(a, b):
    """Return ``a / b``, or 0 when the denominator ``b`` equals zero."""
    return 0 if b == 0 else a / b

In [None]:
all_threshs = []

# Before the sweep every sample is on the positive side of the threshold
# --> no negative predictions, so TN = FN = 0
total = len(merged)
TP = sum(1 for _, correct, _ in merged if correct)
FP = sum(1 for _, correct, _ in merged if not correct)
TN = FN = 0

assert TP + FP + FN + TN == total, f'Begin: {TP + FP + FN + TN} vs {total}'

# `merged` is sorted by value, so the threshold only moves upwards
for value, correct, _ in merged:
    current_thresh = value

    # The sample holding this value moves to the negative side
    if correct:
        TP, FN = TP - 1, FN + 1
    else:
        FP, TN = FP - 1, TN + 1

    assert TP + FP + FN + TN == total, f'Thresh={value}: {TP + FP + FN + TN} vs {total}'

    # Metrics of the positive class
    acc = (TP + TN) / total
    prec = smart_division(TP, TP + FP)
    recall = smart_division(TP, TP + FN)
    f1 = smart_division(2*prec*recall, prec+recall)
    # Same metrics seen from the negative class
    spec = smart_division(TN, TN + FP)
    npv = smart_division(TN, TN + FN)
    f1_neg = smart_division(2*npv*spec, spec+npv)
    CM = (TP, FN, FP, TN)

    all_threshs.append(dict(
        thresh=current_thresh,
        acc=acc,
        prec=prec,
        recall=recall,
        f1=f1,
        npv=npv,
        spec=spec,
        f1_neg=f1_neg,
        CM=CM,
    ))
all_threshs[:1]

In [None]:
max(all_threshs, key=lambda x: x['acc'])

In [None]:
sl = lambda k: tuple(zip(*[(x['thresh'], x[k]) for x in all_threshs]))

In [None]:
plt.figure(figsize=(6, 5))
keys = ('prec', 'recall', 'acc', 'f1') # 'f1', 
# keys = ('acc', )
# keys = ('npv', 'spec', 'f1_neg')
for k in keys:
    thresh, y = sl(k)
    plt.plot(thresh, y, label=k)
plt.legend()
plt.xlabel('Thresh')
plt.ylabel('Value')
plt.title('Optimize by')

In [None]:
best = max(all_threshs, key=lambda x: x['acc'])
best

In [None]:
def plot_cm(cm, title=None):
    """Draw a 2x2 confusion matrix as an annotated heatmap.

    ``cm`` is a ``(TP, FN, FP, TN)`` tuple, laid out with ``[TP, FN]`` in the
    first row and ``[FP, TN]`` in the second. The y-axis is labelled 'Real'
    and the x-axis 'Scored by Metric'; ``title`` is set only when truthy.
    """
    true_pos, false_neg, false_pos, true_neg = cm
    matrix = [
        [true_pos, false_neg],
        [false_pos, true_neg],
    ]
    class_names = ['Entailment', 'Contradiction']
    sns.heatmap(
        matrix,
        annot=True,
        square=True,
        cmap='Blues',
        fmt=',',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel('Scored by Metric')
    plt.ylabel('Real')
    if title:
        plt.title(title)

In [None]:
plot_cm(best['CM'], title=f'CM for {exp.abnormality} with {get_pretty_metric(result.metric)}')

## Attempt 4: use AUC

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
def prepare_gt_pred_for_roc(result, metric_i=0, keys=None):
    """Flatten the score distributions of a result into ROC-ready lists.

    Args:
        result: object whose ``dists`` maps ``(a, b)`` keys to score arrays.
        metric_i: row to use when a distribution is 2-D (BLEU case).
        keys: keys of ``result.dists`` to include; all of them when None.

    Returns:
        ``(gt, pred)``: ``pred`` holds the scores of the selected keys, and
        ``gt`` holds 1 for scores coming from a key with ``a == b``, else 0.
    """
    selected_keys = list(result.dists.keys()) if keys is None else keys

    labels, scores = [], []
    for first, second in selected_keys:
        values = result.dists[(first, second)]
        if values.ndim == 2:
            values = values[metric_i]  # BLEU case
        scores.extend(values)
        labels.extend([int(first == second)] * len(values))

    return labels, scores

In [None]:
result = exp.results[-1]

In [None]:
gt, pred = prepare_gt_pred_for_roc(result)

In [None]:
fpr, tpr, thresholds = roc_curve(gt, pred)

J_stat = tpr - fpr
best_idx = J_stat.argmax()

thresholds[best_idx], J_stat[best_idx]

In [None]:
roc = roc_auc_score(gt, pred)
roc

## Compute AUC for all abnormalities

In [None]:
dataset_name = 'mimic'

In [None]:
exp_by_abn_iu = load_experiments(dataset_name)
len(exp_by_abn_iu)

In [None]:
show = True
target_sampler = None # 'random-gen_k500_n500'
target_groups = [0, 1] # [-2, -1, 0, 1]
# keys = [(0, 0), (0, 1), (1, 1), (1, 0)]
# keys = None
# Each entry is a group of dists keys scored together as one binary problem
set_of_keys = [
    ((0, 0), (0, 1)),
    ((1, 1), (1, 0)),
]

final_records = []

# NOTE(review): despite its name, exp_by_abn_iu holds the experiments of
# `dataset_name` (see the cell that loads it)
for abnormality in CHEXPERT_DISEASES[1:]:
    if abnormality not in exp_by_abn_iu:
        continue
    exp = exp_by_abn_iu[abnormality]

    for result in tqdm(exp.results, desc=abnormality, disable=not show):
        if not (target_sampler is None or result.sampler == target_sampler):
            continue
        if not (target_groups is None or sorted(result.groups) == target_groups):
            continue

        # (record name, row of the dists) pairs to score.
        # HACK: BLEU is additionally scored with row 3, recorded as '<metric>-4'
        variants = [(result.metric, 0)]
        if result.metric == 'bleu':
            variants.append((f'{result.metric}-4', 3))

        for keys in set_of_keys:
            for metric_name, metric_i in variants:
                gt, pred = prepare_gt_pred_for_roc(result, metric_i=metric_i, keys=keys)
                roc = roc_auc_score(gt, pred)
                final_records.append((
                    abnormality, metric_name, result.groups, result.sampler, keys,
                    roc,
                ))

len(final_records)

In [None]:
final_records[:1]

In [None]:
cols = ['disease', 'metric', 'groups', 'sampler', 'keys', 'roc']
df = pd.DataFrame(final_records, columns=cols)
df.head(2)

In [None]:
Counter(df['groups']), Counter(df['sampler'])

In [None]:
df = df.loc[df['groups'] == (0, 1)]
# df = df.loc[df['sampler'] == 'random-gen_k500_n500'] # IU
df = df.loc[df['sampler'] == 'random-gen_k50_n100'] # MIMIC
del df['sampler'], df['groups']
df.head(2)

In [None]:
df_recall = df.loc[df['keys'] == ((1, 1), (1, 0))]
df_spec = df.loc[df['keys'] == ((0, 0), (0, 1))]
len(df_recall), len(df_spec)

In [None]:
def get_renamer(replace_strs):
    """Build a function that applies a chain of regex substitutions to a name.

    ``replace_strs`` holds ``(pattern, replacement)`` pairs, which the returned
    function applies with ``re.sub`` in the given order.
    """
    def _apply(name):
        for pattern, replacement in replace_strs:
            name = re.sub(pattern, replacement, name)
        return name
    return _apply

def bold(s):
    r"""Wrap ``s`` in a LaTeX ``\textbf{...}`` command.

    The prefix is a raw string on purpose: in a regular literal ``\t`` is the
    tab escape, so ``'\textbf{'`` yields a tab followed by ``extbf{``.
    """
    return r'\textbf{' + s + '}'

# Abbreviations for the table header. Order matters: 'bleu-4' is replaced
# before the bare r'bleu\b' pattern, which also matches the start of 'bleu-4'.
shorten_cols = get_renamer(
    [
        ('cider-IDF', 'C-D'),
        ('bleu-4', 'B-4'),
        (r'bleu\b', 'B-1'),
        ('rouge', 'R-L'),
        ('disease', 'Abnormality'),
    ]
)
def latexify_cols(col):
    """Abbreviate a column name with ``shorten_cols`` and pass it through ``bold``."""
    short_name = shorten_cols(col)
    return bold(short_name)

In [None]:
def df_to_table(df):
    """Reshape the long AUC records into one row per disease, one column per metric.

    Args:
        df: DataFrame with (at least) the columns 'disease', 'metric' and 'roc'.

    Returns:
        DataFrame indexed by disease whose columns are the metric names and
        whose values are the matching 'roc' entries.
    """
    # Pick the columns by name instead of by their position in the full frame:
    # depending on the pandas version the grouping column is or is not part of
    # the sub-frames passed to apply(), which shifts positional indices.
    scores_by_disease = df.groupby('disease')[['metric', 'roc']].apply(
        lambda subdf: dict(zip(subdf['metric'], subdf['roc']))
    )
    # Expand each {metric: roc} dict into columns
    return scores_by_disease.apply(pd.Series)
def table_to_latex(table):
    """Print ``table`` as a LaTeX tabular, with its index as the first column.

    Column names go through ``latexify_cols``; floats are printed with three
    decimals, cell contents are not escaped, and runs of spaces in the output
    are squeezed into a single space.
    """
    flat = table.reset_index()
    flat = flat.rename(columns=latexify_cols)
    # One left-aligned column for the index, centered columns for the values
    column_format = 'l' + 'c' * len(table.columns)
    latex = flat.to_latex(
        float_format='%.3f',
        escape=False,
        index=False,
        column_format=column_format,
    )
    latex = re.sub(r' +', ' ', latex, flags=re.M)
    print(latex)

In [None]:
table_recall = df_to_table(df_recall)
table_recall

In [None]:
table_spec = df_to_table(df_spec)
table_spec

In [None]:
long_table = pd.concat([table_recall, table_spec], axis=1)
table_to_latex(long_table)

In [None]:
def get_result(exp_by_abn, abnormality, metric,
               groups=(0, 1), sampler='random-gen_k500_n500'):
    """Find the first result of an abnormality matching metric, groups and sampler.

    Args:
        exp_by_abn: dict of experiments by abnormality name.
        abnormality: abnormality whose experiment is searched.
        metric: metric name the result must have.
        groups: groups the result must have (order is ignored); None accepts
            any groups.
        sampler: sampler name the result must have; None accepts any sampler.

    Returns:
        ``(exp, index)`` with the position of the result in ``exp.results``;
        ``(exp, None)`` if no result matches, and ``(None, None)`` if there is
        no experiment for the abnormality. A message is printed in both
        failure cases.
    """
    if abnormality not in exp_by_abn:
        print(f'No exp for {abnormality}')
        return None, None

    exp = exp_by_abn[abnormality]
    # Normalize once; keep None as "do not filter by groups"
    wanted_groups = None if groups is None else sorted(groups)

    for i, result in enumerate(exp.results):
        if sampler is not None and result.sampler != sampler:
            continue
        if wanted_groups is not None and sorted(result.groups) != wanted_groups:
            continue
        if result.metric != metric:
            continue

        return exp, i

    print('No experiment found with conditions')
    return exp, None

In [None]:
exp, result_i = get_result(exp_by_abn_iu, 'Atelectasis', 'bleu')
exp

In [None]:
plot_heatmap(exp, result_i=result_i, metric_i=3)

In [None]:
plot_hists(exp, keys=[(0, 0), (0, 1)], result_i=result_i, metric_i=3, bins=50)

# Statistical tests

In [None]:
from scipy.stats import ttest_ind, mannwhitneyu, f_oneway, kruskal

In [None]:
# exp = load_experiment_pickle('mimic-cardiomegaly')
len(exp.results)

In [None]:
plot_heatmap(exp, result_i=-1)

In [None]:
EXP_I = -1
result = exp[EXP_I]
result.metric

In [None]:
key1 = (0, 0)
key2 = (0, 1)
group1 = result.dists[key1]
group2 = result.dists[key2]
if result.metric == 'bleu':
    group1 = group1[0]
    group2 = group2[0]
group1.shape, group2.shape

In [None]:
plot_hists(exp, [key1, key2], result_i=EXP_I, bins=50, range=(0, 1))

In [None]:
r = mannwhitneyu(group1, group2)
r

In [None]:
r = ttest_ind(group1, group2, equal_var=False)
r

In [None]:
# BLEU dists are indexed by metric first, so keep only the first entry (as done
# above for group1/group2); the tests below expect one flat sample per group
groups = [
    result.dists[k][0] if result.metric == 'bleu' else result.dists[k]
    for k in [(0, 0), (0, 1), (1, 0), (1, 1)]
]

In [None]:
anova = f_oneway(*groups)
anova

In [None]:
kru = kruskal(*groups)
kru