# Imports

In [None]:
import torch
import os
import json
import pandas as pd

In [None]:
import matplotlib.pyplot as plt
import matplotlib
# Global figure defaults: white figure background and a 15x5 default size.
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)
# NOTE(review): the family is set to 'serif' while the font list is supplied
# under 'font.sans-serif'; presumably 'font.serif' was intended -- confirm.
plt.rcParams.update({'font.family': 'serif', 'font.sans-serif': ['CMU', 'Helvetica']})

In [None]:
from medai.datasets.common.constants import CHEXPERT_DISEASES, ABN_SHORTCUTS

In [None]:
%run ../../utils/__init__.py

# Utils

In [None]:
from medai.datasets.iu_xray import DATASET_DIR as IU_DIR
from medai.datasets.mimic_cxr import DATASET_DIR as MIMIC_DIR

In [None]:
# 'reports' sub-folder of each dataset directory; the report JSON/CSV files
# read later in this notebook are loaded from these folders.
IU_REPORTS_DIR = os.path.join(IU_DIR, 'reports')
MIMIC_REPORTS_DIR = os.path.join(MIMIC_DIR, 'reports')

## Images in each split

In [None]:
%run ../../datasets/__init__.py

In [None]:
# Expected number of dataset items per dataset name and split.
# `_get_subset_df` asserts that the loaded dataset length matches these values.
_LEN_VALUES = {
    'iu': {'all': 3311, 'train': 2638, 'val': 336, 'test': 337},
    'mimic': {'all': 243326, 'train': 237964, 'val': 1959, 'test': 3403},
}

In [None]:
def _get_subset_df(dataset_name, split):
    """Return the (report_fname, image_fname) pairs of one dataset split.

    Args:
        dataset_name: key of `_LEN_VALUES`; 'iu' selects `IUXRayDataset`,
            any other accepted name selects `MIMICCXRDataset`.
        split: value passed as `dataset_type` and used to look up the
            expected length in `_LEN_VALUES`.

    Returns:
        DataFrame with columns `report_fname` and `image_fname`, one row per
        dataset item.

    Raises:
        AssertionError: if `dataset_name` is unknown or the dataset length
            differs from the expected one.
    """
    assert dataset_name in _LEN_VALUES

    # Flags shared by both dataset classes (judging by their names they skip
    # image/report loading and keep frontal views only -- confirm in medai).
    common_kwargs = {
        'do_not_load_image': True,
        'do_not_load_report': True,
        'frontal_only': True,
    }
    if dataset_name == 'iu':
        dataset = IUXRayDataset(reports_version='v4-1', dataset_type=split, **common_kwargs)
    else:
        dataset = MIMICCXRDataset(reports_version='v4-2', dataset_type=split, **common_kwargs)

    n_expected = _LEN_VALUES[dataset_name][split]
    assert n_expected == len(dataset), f'{n_expected} vs {len(dataset)}'

    rows = []
    for item in dataset:
        rows.append((item.report_fname, item.image_fname))
    return pd.DataFrame(rows, columns=['report_fname', 'image_fname'])

In [None]:
# (report_fname, image_fname) tables for every split of the 'iu' dataset.
IU_ALL = _get_subset_df('iu', 'all')
IU_TRAIN = _get_subset_df('iu', 'train')
IU_VAL = _get_subset_df('iu', 'val')
IU_TEST = _get_subset_df('iu', 'test')

In [None]:
# (report_fname, image_fname) tables for every split of the 'mimic' dataset.
MIMIC_ALL = _get_subset_df('mimic', 'all')
MIMIC_TRAIN = _get_subset_df('mimic', 'train')
MIMIC_VAL = _get_subset_df('mimic', 'val')
MIMIC_TEST = _get_subset_df('mimic', 'test')

In [None]:
def reduce_df_to_subset(df, subset_info):
    """Restrict `df` to the entries of a subset, one output row per entry.

    `df` (key column `filename`) is right-joined onto `subset_info`
    (key column `report_fname`), so the result follows `subset_info`.

    Args:
        df: DataFrame with a `filename` column.
        subset_info: DataFrame with a `report_fname` column.

    Returns:
        The merged DataFrame (columns of both inputs).

    Raises:
        AssertionError: if the merged frame does not have exactly
            `len(subset_info)` rows.
    """
    n_expected = len(subset_info)
    merged = df.merge(
        subset_info,
        how='right',
        left_on='filename',
        right_on='report_fname',
    )
    n_merged = len(merged)
    assert n_merged == n_expected, f'{n_merged} vs {n_expected}'
    return merged

# Word and sentence distribution

## Load reports

In [None]:
# Load the cleaned IU reports (file version v4-1) from JSON.
with open(os.path.join(IU_REPORTS_DIR, 'reports.clean.v4-1.json')) as f:
    reports_IU = json.load(f)
# Cell output: number of top-level entries loaded.
len(reports_IU)

In [None]:
# Load the cleaned MIMIC reports (file version v4-2) from JSON.
with open(os.path.join(MIMIC_REPORTS_DIR, 'reports.clean.v4-2.json')) as f:
    reports_MIMIC = json.load(f)
# Cell output: number of top-level entries loaded.
len(reports_MIMIC)

In [None]:
def reports_to_df(reports):
    """Build a per-report table with word and sentence counts.

    Text is tokenized on whitespace. A sentence is counted for every
    stand-alone '.' token, plus one more when the report does not end with
    a '.' token. Reports with no tokens get 0 words and 0 sentences.

    Args:
        reports: mapping whose values are dicts holding the report text
            under the key 'clean_text'. The mapping keys are not used.

    Returns:
        DataFrame with columns 'Report', 'n_words' and 'n_sentences', one row
        per report. The number of rows is also printed.
    """
    samples = []

    for report in reports.values():
        text = report['clean_text']
        words = text.split()

        n_sentences = words.count('.')
        # Trailing words without a closing '.' still form a sentence. The
        # `words` guard keeps an empty report from raising IndexError.
        if words and words[-1] != '.':
            n_sentences += 1

        samples.append((text, len(words), n_sentences))

    df = pd.DataFrame(samples, columns=['Report', 'n_words', 'n_sentences'])
    print(len(df))
    return df

In [None]:
# Word/sentence counts per IU report; preview the first two rows.
df_iu = reports_to_df(reports_IU)
df_iu.head(2)

In [None]:
# Word/sentence counts per MIMIC report; preview the first two rows.
df_mimic = reports_to_df(reports_MIMIC)
df_mimic.head(2)

In [None]:
# Load the IU sentence table; its 'sentence' column is used further below
# to count words per sentence.
fpath = os.path.join(IU_REPORTS_DIR, 'sentences_with_chexpert_labels.csv')
sentences_df_iu = pd.read_csv(fpath)
print(len(sentences_df_iu))
sentences_df_iu.head(2)

In [None]:
# Load the MIMIC sentence table; its 'sentence' column is used further below
# to count words per sentence.
fpath = os.path.join(MIMIC_REPORTS_DIR, 'sentences_with_chexpert_labels.csv')
sentences_df_mimic = pd.read_csv(fpath)
print(len(sentences_df_mimic))
sentences_df_mimic.head(2)

## Load vocabs

In [None]:
%run ../../datasets/vocab/__init__.py

In [None]:
# Load one vocabulary per dataset with `load_vocab` (presumably brought into
# scope by the `%run` cell above -- confirm), using the same version tags as
# the report files, and show both sizes.
vocab_iu = load_vocab(IU_REPORTS_DIR, 'v4-1')
vocab_mimic = load_vocab(MIMIC_REPORTS_DIR, 'v4-2')
len(vocab_iu), len(vocab_mimic)

## Words per sentence

In [None]:
def count_words_in_sentences(sentences):
    """Return the number of whitespace-separated tokens of each sentence.

    Args:
        sentences: iterable of strings.

    Returns:
        List of ints, one per input sentence, in input order.
    """
    word_counts = []
    for text in sentences:
        tokens = text.split()
        word_counts.append(len(tokens))
    return word_counts

In [None]:
# Add a words-per-sentence column to both sentence tables; preview the IU one.
sentences_df_mimic['n_words_per_sentence'] = count_words_in_sentences(sentences_df_mimic['sentence'])
sentences_df_iu['n_words_per_sentence'] = count_words_in_sentences(sentences_df_iu['sentence'])
sentences_df_iu.head(2)

## Plot!

In [None]:
# Font sizes used by the histogram figure in the next cell.
X_LABEL_FONTSIZE = Y_LABEL_FONTSIZE = 15
LEGEND_FONTSIZE = 12
TITLE_FONTSIZE = 16

In [None]:
# Three side-by-side histograms comparing both datasets: words per report,
# sentences per report and words per sentence. The figure is saved as a PDF.
plt.figure(figsize=(15, 4))
# NOTE(review): n_reports is only referenced by commented-out code in this cell.
n_reports = len(df_iu)

iu_label = 'IU X-ray'
mimic_label = 'MIMIC-CXR'

n_rows = 1
n_cols = 3

# Panel 1: words per report. density=True normalizes each histogram, so the
# y axis shows densities even though it is labelled 'Frequency'.
plt.subplot(n_rows, n_cols, 1)
plt.hist(df_iu['n_words'], bins=20, density=True, alpha=0.5, label=iu_label)
plt.hist(df_mimic['n_words'], bins=20, density=True, alpha=0.5, label=mimic_label)
plt.title('Number of words per report', fontsize=TITLE_FONTSIZE)
plt.xlabel('Number of words', fontsize=X_LABEL_FONTSIZE)
plt.ylabel('Frequency', fontsize=Y_LABEL_FONTSIZE)
plt.legend(fontsize=LEGEND_FONTSIZE)

# Panel 2: sentences per report (no y label; only the first panel has one).
plt.subplot(n_rows, n_cols, 2)
plt.hist(df_iu['n_sentences'], bins=15, density=True, alpha=0.5, label=iu_label)
plt.hist(df_mimic['n_sentences'], bins=15, density=True, alpha=0.5, label=mimic_label)
plt.title('Number of sentences per report', fontsize=TITLE_FONTSIZE)
plt.xlabel('Number of sentences', fontsize=X_LABEL_FONTSIZE)
#  plt.ylabel('Frequency', fontsize=Y_LABEL_FONTSIZE) # f'Reports (N={n_reports:,})'
plt.legend(fontsize=LEGEND_FONTSIZE)

# Panel 3: words per sentence, taken from the sentence tables.
plt.subplot(n_rows, n_cols, 3)
plt.hist(sentences_df_iu['n_words_per_sentence'], bins=15, density=True, alpha=0.5,
         label=iu_label)
plt.hist(sentences_df_mimic['n_words_per_sentence'], bins=15, density=True, alpha=0.5,
         label=mimic_label)
plt.title('Number of words per sentence', fontsize=TITLE_FONTSIZE)
plt.xlabel('Number of words', fontsize=X_LABEL_FONTSIZE)
# plt.ylabel('Frequency', fontsize=Y_LABEL_FONTSIZE) # f'Reports (N={n_reports:,})'
plt.legend(fontsize=LEGEND_FONTSIZE)

# FIGURES_DIR is not defined in this notebook's visible cells; presumably it
# comes from one of the `%run` cells -- confirm.
_fig_fpath = os.path.join(FIGURES_DIR, 'datasets-distributions-word-sentences.pdf')
plt.gcf().savefig(_fig_fpath, bbox_inches='tight')

In [None]:
# Mean number of words and sentences per report, for each dataset.
cols = ['n_words', 'n_sentences']
df_iu[cols].mean(axis=0), df_mimic[cols].mean(axis=0)

# Abnormality distribution

In [None]:
# Rebinds `df_iu`: from here on it holds the IU report-level label table
# read from CSV, no longer the word/sentence counts built earlier.
df_iu = pd.read_csv(os.path.join(IU_REPORTS_DIR, 'reports_with_chexpert_labels.csv'))
print(len(df_iu))
df_iu.head(2)

In [None]:
# Rebinds `df_mimic`: from here on it holds the MIMIC report-level label table
# read from CSV, no longer the word/sentence counts built earlier.
df_mimic = pd.read_csv(os.path.join(MIMIC_REPORTS_DIR, 'reports_with_chexpert_labels.csv'))
print(len(df_mimic))
df_mimic.head(2)

In [None]:
def _shorten_ec(disease):
    if disease == 'Enlarged Cardiomediastinum':
        return 'Enlarged Cardiom.'
    return disease

In [None]:
def plot_abn_dist(df, name, ignore_NF=True, thousands=False, ylabel=True, txt_rot=0):
    """Draw a bar chart with the per-disease amount of positive rows.

    For every disease column the labels are summed after mapping -2 to 0 and
    -1 to 1, so -1 rows count as positive. Bars are sorted from the largest
    to the smallest amount and annotated with the amount and its percentage
    over all rows of `df`. Everything is drawn with pyplot on the current axes.

    Args:
        df: DataFrame with one column per entry of `CHEXPERT_DISEASES`.
        name: dataset name shown in the title, next to the row count.
        ignore_NF: if True, the 'No Finding' column is left out.
        thousands: if True, amounts are written as e.g. '1.2k' instead of
            '1,200'.
        ylabel: if True, label the y axis with 'N images'.
        txt_rot: rotation (degrees) of the text above each bar.
    """
    n_images = len(df)

    diseases = list(CHEXPERT_DISEASES)
    if ignore_NF:
        diseases.remove('No Finding')

    # Only the disease columns take part in the sum, so only they are remapped.
    labels = df[diseases].replace({-2: 0, -1: 1})
    amounts_by_disease = labels.sum(axis=0).sort_values(ascending=False)

    tick_labels = [_shorten_ec(d) for d in amounts_by_disease.index]
    values = amounts_by_disease.values

    plt.bar(tick_labels, values)
    plt.xticks(rotation=90, fontsize=16)
    if ylabel:
        plt.ylabel('N images', fontsize=16)
    plt.title(f'{name} (N={n_images:,})', fontsize=18)

    max_amount = max(values)
    y_padding = int(max_amount * 0.03)
    # Leave head-room above the tallest bar for its text annotation.
    plt.ylim(0, max_amount * 1.15)

    for index, value in enumerate(values):
        value = int(value)
        perc = value / n_images * 100
        # The `thousands` flag must not be reassigned here: storing
        # `value / 1000` in it turned the flag falsy after a zero amount and
        # changed the format of all the following annotations.
        if thousands:
            s = f'{value / 1000:.1f}k'
        else:
            s = f'{value:,}'
        plt.text(index, value + y_padding, f'{s}\n{perc:.0f}%', ha='center', rotation=txt_rot)

In [None]:
# Side-by-side abnormality distributions over the 'all' split of each dataset,
# saved as a PDF. No figure is created explicitly, so the default figure size
# from rcParams applies.
# plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
df = reduce_df_to_subset(df_iu, IU_ALL)
plot_abn_dist(df, 'IU X-ray')

plt.subplot(1, 2, 2)
df = reduce_df_to_subset(df_mimic, MIMIC_ALL)
# MIMIC amounts are written in thousands ('…k').
plot_abn_dist(df, 'MIMIC-CXR', thousands=True, ylabel=True)
# plt.tight_layout()

_fig_fpath = os.path.join(FIGURES_DIR, 'datasets-distributions-chexpert-labels.pdf')
plt.gcf().savefig(_fig_fpath, bbox_inches='tight')

## Test split only

### Plot

In [None]:
from collections import Counter
import numpy as np

In [None]:
def plot_abn_dist_subclasses(df, title=None,
                             horizontal=False,
                             sortby='None',
                             thousands=False, width=0.3, ignore_NF=True):
    """Draw grouped bars with the amount of each label value per disease.

    Label values are counted per disease column after mapping -1 to 1, and
    shown as three bars per disease: 'None' (-2), 'Neg' (0) and 'Pos' (1).
    Each bar is annotated with its amount and its percentage over the rows
    of `df`. Everything is drawn with pyplot on the current axes.

    Args:
        df: DataFrame with one column per entry of `CHEXPERT_DISEASES`.
        title: plot title.
        horizontal: if True, diseases are laid out along the x axis and the
            bars are vertical (`plt.bar`); if False, diseases go along the
            y axis with horizontal bars (`plt.barh`).
        sortby: name of the count column ('None', 'Neg' or 'Pos') used to
            sort the diseases, or the `None` object to keep their order.
            Note the default is the string 'None', i.e. the -2 column.
        thousands: if True, amounts are written as e.g. '1.2k' instead of
            '1,200'.
        width: thickness of each bar.
        ignore_NF: if True, the 'No Finding' column is left out.
    """
    n_reports = len(df)

    diseases = list(CHEXPERT_DISEASES)
    if ignore_NF:
        diseases.remove('No Finding')

    amounts_by_label = df.replace({ -1: 1 })[diseases].apply(Counter).apply(pd.Series).rename(
        index={ i: disease for i, disease in enumerate(diseases) },
        columns={
            -2: 'None',
            0: 'Neg',
            1: 'Pos',
        },
    ).fillna(0)
    if sortby is not None:
        amounts_by_label = amounts_by_label.sort_values(sortby, ascending=not horizontal)

    absolute_max = amounts_by_label.max().max()
    Y_PADDING_FACTOR = 0.03
    # Gap between the end of a bar and its annotation; same for every bar.
    y_padding = int(absolute_max * Y_PADDING_FACTOR)

    x_labels = [ABN_SHORTCUTS[label] for label in amounts_by_label.index]
    x = np.arange(len(x_labels))
    for i, c in enumerate(['None', 'Neg', 'Pos']):
        # Shift the three bars of a disease around its tick position.
        x_offset = (i-1)*width
        y = amounts_by_label[c].values

        if horizontal:
            plt.bar(x + x_offset, y, width=width, label=c)
        else:
            plt.barh(x + x_offset, y, height=width, label=c)

        for index, value in enumerate(y):
            value = int(value)
            perc = value / n_reports * 100
            # The `thousands` flag must not be reassigned here: storing
            # `value / 1000` in it turned the flag falsy after a zero amount
            # and changed the format of all the following annotations.
            if thousands:
                s = f'{value / 1000:.1f}k'
            else:
                s = f'{value:,}'

            if horizontal:
                plt.text(index + x_offset, value + y_padding,
                         f'{s}\n{perc:.0f}%', ha='center')
            else:
                plt.text(value + y_padding, index + x_offset,
                         f'{s} ({perc:.0f}%)', ha='left', va='center')

    if horizontal:
        plt.xticks(ticks=x, labels=x_labels, rotation=90)
        plt.ylim(0, absolute_max * 1.15)
        plt.legend()
        plt.ylabel('N reports')
    else:
        plt.yticks(ticks=x, labels=x_labels)
        plt.xlim(0, absolute_max * 1.2)
        plt.legend(loc='lower right')
        plt.xlabel('N reports')
    plt.title(title)

In [None]:
# Pick the dataset to inspect; swap the commented line to switch to MIMIC-CXR.
df, subset_info, name = df_iu, IU_TEST, 'IU X-ray'
# df, subset_info, name = df_mimic, MIMIC_TEST, 'MIMIC-CXR'

# Keep only the test-split rows and plot the per-label amounts with the
# diseases along the x axis.
df = reduce_df_to_subset(df, subset_info)
# plot_abn_dist(df, f'{name} (test-only)')
plot_abn_dist_subclasses(df, f'{name} (test-only)', horizontal=True)

In [None]:
# Per-label amounts on the test split of both datasets, side by side.
# sortby=None keeps the diseases in their original order in both panels, so
# the two charts can be compared row by row.
plt.figure(figsize=(15,10))

plt.subplot(1, 2, 1)
df1 = reduce_df_to_subset(df_iu, IU_TEST)
plot_abn_dist_subclasses(df1, 'IU X-ray (test-subset-only)', sortby=None)

plt.subplot(1, 2, 2)
df2 = reduce_df_to_subset(df_mimic, MIMIC_TEST)
# The dataset is named 'MIMIC-CXR' everywhere else in this notebook.
plot_abn_dist_subclasses(df2, 'MIMIC-CXR (test-subset-only)', sortby=None)

In [None]:
def df_to_test_and_to_amounts(df, subset_info, ignore_NF=True):
    """Count label values per disease over the rows of a subset.

    `df` is first restricted with `reduce_df_to_subset`; label -1 is then
    merged into 1 before counting.

    Args:
        df: label table with a `filename` column and one column per entry of
            `CHEXPERT_DISEASES`.
        subset_info: subset table with a `report_fname` column.
        ignore_NF: if True, the 'No Finding' column is left out.

    Returns:
        DataFrame with one row per disease and the count columns
        'Unmention' (-2), 'Negative' (0) and 'Positive + Uncertain' (1);
        missing counts are filled with 0.
    """
    subset_df = reduce_df_to_subset(df, subset_info)

    diseases = list(CHEXPERT_DISEASES)
    if ignore_NF:
        diseases.remove('No Finding')

    merged_labels = subset_df.replace({-1: 1})[diseases]
    counts = merged_labels.apply(Counter).apply(pd.Series)

    position_to_disease = dict(enumerate(diseases))
    value_to_name = {
        -2: 'Unmention',
        0: 'Negative',
        1: 'Positive + Uncertain',
    }
    renamed = counts.rename(index=position_to_disease, columns=value_to_name)
    return renamed.fillna(0)

In [None]:
# Per-disease label counts on the test split of each dataset; the cell output
# shows the number of rows (diseases) of each table.
am_iu = df_to_test_and_to_amounts(df_iu, IU_TEST)
am_mimic = df_to_test_and_to_amounts(df_mimic, MIMIC_TEST)
len(am_iu), len(am_mimic)

In [None]:
# Sum the counts over diseases and divide by 13 * number of test entries.
# NOTE(review): 13 is presumably the number of diseases once 'No Finding' is
# removed -- confirm. `am_iu` is overwritten with the result, so re-running
# this cell alone divides an already-normalized value again.
am_iu = am_iu.sum(axis=0) / 13 / len(IU_TEST)
am_iu

In [None]:
# Sum the counts over diseases and divide by 13 * number of test entries.
# NOTE(review): 13 is presumably the number of diseases once 'No Finding' is
# removed -- confirm. `am_mimic` is overwritten with the result, so re-running
# this cell alone divides an already-normalized value again.
am_mimic = am_mimic.sum(axis=0) / 13 / len(MIMIC_TEST)
am_mimic

In [None]:
# Put both normalized series side by side, name the two columns after the
# datasets and transpose: one row per dataset, one column per label category.
am = pd.concat([am_iu, am_mimic], axis=1).rename(
    columns={ 0: 'IU X-ray', 1: 'MIMIC-CXR' },
).transpose()
am

In [None]:
# Sanity check: total of the normalized values per dataset (presumably both
# should be 1 if every label falls into one of the counted categories).
am_iu.sum(), am_mimic.sum()

In [None]:
# Grouped bar chart of the normalized label amounts in `am`: one group per
# dataset (rows of `am`) and one bar per label category (columns of `am`).
# The figure is saved as a PDF.
# thousands = False
# NOTE(review): `horizontal` is assigned but not read anywhere in this cell.
horizontal = True
density = True
to_100 = True

plt.figure(figsize=(8, 6))

width = 0.2
x_labels = am.index
x = np.arange(len(x_labels))

# Optionally rescale the fractions to percentages.
if to_100:
    am2 = am * 100
    max_y = 100
else:
    am2 = am
    max_y = 1
# With density=True the full scale (max_y) is used instead of the data maximum.
absolute_max = am2.max().max() if not density else max_y
Y_PADDING_FACTOR = 0.03
y_padding = float(absolute_max * Y_PADDING_FACTOR)

for i, c in enumerate(am.columns):
    # Shift the bars of each category around the dataset tick position.
    x_offset = (i-1)*width
    y = am2[c]
    
    plt.bar(x + x_offset, y, width=width, label=c)
    
    for index, value in enumerate(y):
        # HACK!!
        # NOTE(review): n_reports is computed here but only referenced by the
        # commented-out lines below, so it has no effect on the figure.
        if index == 1:
            n_reports = len(MIMIC_TEST) * 13
        else:
            n_reports = len(IU_TEST) * 13
        # n_reports = int(n_reports * value)
#         value = int(value)
#         perc = value / n_reports * 100

        # t = f'{value*100:.1f}%'
        # The '%' suffix is appended regardless of `to_100`.
        t = f'{value:.1f}%'
        # t += '\n({n_reports:,})'
        plt.text(index + x_offset, value + y_padding, t, ha='center', fontsize=14)
    
plt.xticks(ticks=x, labels=x_labels, fontsize=16)
plt.ylabel(r'Percentage of mentions', fontsize=16) # of all images $\times$ all abnormalities
plt.title('Abnormality mentions (test subset)', fontsize=18)
plt.ylim(0, max_y)
plt.legend(fontsize=14)

_fig_fpath = os.path.join(FIGURES_DIR, 'stress-test-datasets-valorations-dist.pdf')
plt.gcf().savefig(_fig_fpath, bbox_inches='tight')