# Imports

In [None]:
import os
import numpy as np
from collections import Counter, defaultdict
import importlib
import re

In [None]:
import matplotlib.pyplot as plt
import matplotlib
# Notebook-wide plotting defaults: white figure background and wide, short figures.
matplotlib.rcParams.update({
    'figure.facecolor': 'white',
    'figure.figsize': (15, 5),
})

In [None]:
import pandas as pd
# Do not truncate wide DataFrames when they are displayed: show every column.
pd.set_option('display.max_columns', None)

In [None]:
# Execute the shared utils initializer so its names become available in this notebook.
# NOTE(review): `config_logging` and `logging` are not imported anywhere in this
# notebook, so they are presumably provided by the script run here -- confirm.
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
# NOTE(review): later cells use CHEXPERT_DISEASES without importing it, so it is
# presumably defined by this script -- confirm against constants.py.
%run ../../datasets/common/constants.py

In [None]:
from medai.datasets import iu_xray, mimic_cxr
# Root directories of the two datasets, as exposed by the medai dataset modules.
# Later cells read `<dir>/reports/...` files under each of them.
IU_DIR = iu_xray.DATASET_DIR
MIMIC_DIR = mimic_cxr.DATASET_DIR

# Load sentences

In [None]:
# Load the IU sentences CSV (sentences with CheXpert labels, per the file name).
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF_IU = pd.read_csv(fpath)
# Preview the first rows as the cell output.
SENTENCES_DF_IU.head(3)

In [None]:
# Load the MIMIC sentences CSV (same file name and layout as the IU one is assumed
# by the cells below, which apply the same functions to both frames).
fpath = os.path.join(MIMIC_DIR, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF_MIMIC = pd.read_csv(fpath)
# Preview the first rows as the cell output.
SENTENCES_DF_MIMIC.head(3)

# Amounts by abnormality

In [None]:
def get_amounts_by_abn(sentences_df):
    """Count, per abnormality, how many sentences carry each label value.

    Args:
        sentences_df: DataFrame with one column per entry of
            ``CHEXPERT_DISEASES`` holding the label codes -2, -1, 0 or 1.

    Returns:
        DataFrame with one row per abnormality and the columns ``Unmention``,
        ``Negative``, ``Uncertain`` and ``Positive``, always in that order.
        A label value that never occurs is reported as 0. Counts may come
        back as floats, because gaps are NaN before being filled.
    """
    label_names = {-2: 'Unmention', -1: 'Uncertain', 0: 'Negative', 1: 'Positive'}
    cols = ['Unmention', 'Negative', 'Uncertain', 'Positive']

    # One Counter per abnormality column, expanded to one column per label value.
    amounts_by_abn = sentences_df[CHEXPERT_DISEASES].apply(Counter).apply(pd.Series)
    amounts_by_abn = amounts_by_abn.rename(
        # Covers the case where the index comes back positional instead of named.
        index=dict(enumerate(CHEXPERT_DISEASES)),
        columns=label_names,
    ).fillna(0)

    # reindex (rather than ``[cols]``) so that a label value absent from the
    # whole dataset yields a zero column instead of raising KeyError.
    return amounts_by_abn.reindex(columns=cols, fill_value=0)

In [None]:
# Label counts per abnormality for the IU sentences; displayed as the cell output.
df_iu = get_amounts_by_abn(SENTENCES_DF_IU)
df_iu

In [None]:
# Label counts per abnormality for the MIMIC sentences; displayed as the cell output.
df_mimic = get_amounts_by_abn(SENTENCES_DF_MIMIC)
df_mimic

In [None]:
# Side-by-side table: IU columns first, then MIMIC columns.
# Both halves use the same column labels, so the labels appear twice in the result.
final_table = pd.concat([df_iu, df_mimic], axis=1)
final_table

In [None]:
# Scratch check of the thousands-separator format spec: this evaluates to '3,000'.
x = 3000
f'{x:,}'

In [None]:
def bold(s):
    """Return ``s`` wrapped in a LaTeX bold-text (``textbf``) command."""
    # Raw string on purpose: in a plain literal the leading "\t" is parsed as a
    # TAB character, which silently turns the command into "<TAB>extbf{".
    return r'\textbf{' + s + '}'
def shorten(s):
    """Map a full label name to its short table-header form.

    Raises KeyError for any name other than the four known labels.
    """
    short_names = dict(
        Unmention='None',
        Negative='Neg',
        Positive='Pos',
        Uncertain='Unc',
    )
    return short_names[s]

# Header cells for the LaTeX table: shortened label names, set in bold.
latex_headers = {col: bold(shorten(col)) for col in final_table.columns}

# escape=False keeps the LaTeX commands in the headers intact; the float format
# prints counts as whole numbers with thousands separators.
s = final_table.rename(columns=latex_headers).to_latex(
    float_format='{:,.0f}'.format,
    escape=False,
)

# Collapse the column-alignment padding so the output is compact to paste.
s = re.sub(r' +', ' ', s)
print(s)

In [None]:
# Number of rows in each sentences DataFrame, as an (IU, MIMIC) tuple.
len(SENTENCES_DF_IU), len(SENTENCES_DF_MIMIC)

# Other stats

In [None]:
def print_stats(df, name):
    """Print the row count and the number of distinct sentences of ``df``.

    Args:
        df: DataFrame with a ``sentence`` column.
        name: Label printed as the heading above the two statistics.
    """
    print(name)

    n_total = len(df)
    print(f'\tTotal sentences in DF: {n_total:,}')

    unique_sentences = df['sentence'].unique()
    print(f'\tUnique sentences in DF: {len(unique_sentences):,}')

In [None]:
# Report total vs. distinct sentence counts for each dataset.
print_stats(SENTENCES_DF_IU, 'iu')
print_stats(SENTENCES_DF_MIMIC, 'mimic')

In [None]:
# Every label except the first one.
# NOTE(review): assumes CHEXPERT_DISEASES[0] is 'No Finding' -- confirm against constants.
ACTUAL_DISEASES = CHEXPERT_DISEASES[1:]

In [None]:
# IU sentences where every actual disease is labelled -2 and 'No Finding' is 1.
no_disease_mentioned = (SENTENCES_DF_IU[ACTUAL_DISEASES] == -2).all(axis=1)
marked_no_finding = SENTENCES_DF_IU['No Finding'] == 1
d = SENTENCES_DF_IU.loc[no_disease_mentioned & marked_no_finding]
print(len(d))
d

In [None]:
# Show the text of the sentences currently selected in `d` as a plain list.
list(d['sentence'])