# Imports

In [None]:
import os
import random
import pandas as pd

In [None]:
# All CheXpert observation names; used below as column names of the
# sentence-level label tables.
CHEXPERT_LABELS = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Lesion',
    'Lung Opacity',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
]

# Five-label subset (note: alphabetical, not the order used in CHEXPERT_LABELS).
CHEXPERT_LABELS_5 = [
    'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion',
]

# The five-label subset plus 'Lung Opacity'.
CHEXPERT_LABELS_6 = [*CHEXPERT_LABELS_5, 'Lung Opacity']

In [None]:
# Folder holding the gold-sentence CSV files this notebook reads and writes.
FOLDER = '/home/pdpino/workspace-medical-ai/report_generation/nlp-chex-gold-sentences'

In [None]:
# Notebook-wide pandas settings: silence chained-assignment warnings, never
# truncate long cell text (sentences), and show floats with three decimals.
for _option, _value in (
    ('mode.chained_assignment', None),
    ('display.max_colwidth', None),
    ('display.float_format', '{:.3f}'.format),
):
    pd.set_option(_option, _value)

# Load sentences

In [None]:
# Dataset selector read by the loading cell below ('iu' or 'mimic'); it is
# also interpolated into the exported file name.
dataset = 'mimic'

In [None]:
# Load the per-sentence CheXpert labels for the selected dataset into
# `sentences_df` (one row per sentence, one column per CheXpert label).
if dataset == 'iu':
    path = '/mnt/workspace/iu-x-ray/dataset-pdpino/sentences_with_chexpert_labels.csv'
    sentences_df = pd.read_csv(path)
elif dataset == 'mimic':
    path1 = '/mnt/data/mimic-cxr-jpg/reports/sentences_with_chexpert_labels.csv'
    path2 = '/mnt/data/mimic-cxr-jpg/reports/sentences.csv'
    # Inner join on the sentence text: only sentences present in both files survive.
    sentences_df = pd.read_csv(path1).merge(pd.read_csv(path2), on='sentence')
else:
    # Fail loudly instead of silently loading MIMIC under a mistyped name.
    raise ValueError(f"Unknown dataset {dataset!r}; expected 'iu' or 'mimic'")

# Labels must be integer valuations; the cast raises if a label column holds
# missing or non-numeric values.
sentences_df[CHEXPERT_LABELS] = sentences_df[CHEXPERT_LABELS].astype(int)
print(f'Using dataset {dataset}, n_sentences={len(sentences_df)}')
sentences_df.head()

# Sample random sentences

Make sure every abnormality and every valuation is well represented in the sample.

In [None]:
# Sampling targets; the values mirror the defaults of `sample_sentences`.
N_per_abn_per_val = 100  # sentences wanted per (abnormality, valuation) pair
N_total = 1000  # overall sample-size target

In [None]:
def sample_from_df(df, n):
    """Draw ``n`` index labels of ``df`` at random, without replacement.

    Returns a list of labels. ``random.sample`` raises ``ValueError`` when
    ``n`` is larger than the number of rows.
    """
    # TODO: handle repetitions properly
    # return random.sample(list(df.index), n, counts=df['appearances'])
    candidate_labels = list(df.index)
    return random.sample(candidate_labels, n)

In [None]:
def grab_n_samples(df_target, chosen_indexes, n_target):
    """Sample the index labels still needed to reach ``n_target`` rows of ``df_target``.

    Rows of ``df_target`` whose label is already in ``chosen_indexes`` count
    towards the target and are never re-drawn.

    Returns:
        A list of newly sampled labels. It is empty when the target is already
        met or no unchosen rows remain, and shorter than the shortfall when
        ``df_target`` runs out of unchosen rows.
    """
    already_chosen = set(chosen_indexes).intersection(df_target.index)

    shortfall = n_target - len(already_chosen)
    if shortfall <= 0:
        return []

    # Only rows not picked before are eligible.
    eligible = df_target[~df_target.index.isin(already_chosen)]
    n_grab = min(shortfall, len(eligible))
    if n_grab <= 0:
        return []

    return sample_from_df(eligible, n_grab)

In [None]:
# Label values to balance over. NOTE(review): presumably the same encoding as
# the expert-answer cleaning cell (1 Abnormal, -1 Uncertain, 0 Normal,
# -2 blank) — confirm against the CheXpert label files.
VALUATIONS = (1, -1, 0, -2)
# ABNORMALITIES = tuple(CHEXPERT_LABELS[1:])
ABNORMALITIES = CHEXPERT_LABELS_5

def sample_sentences(df, n_total=1000, n_per_target=100):
    """Sample rows so that every (abnormality, valuation) pair is represented.

    For each valuation in ``VALUATIONS`` and each abnormality in
    ``ABNORMALITIES``, rows are added until ``n_per_target`` rows with that
    value are in the sample (or the rows run out). The sample is then topped
    up with arbitrary rows until it holds at least ``n_total`` rows.

    Note that ``n_total`` is a floor, not a cap: if the per-target passes
    already selected more than ``n_total`` rows, nothing is removed.

    Args:
        df: DataFrame with one integer column per abnormality.
        n_total: Minimum number of rows in the result (if ``df`` has enough).
        n_per_target: Rows wanted per (abnormality, valuation) pair.

    Returns:
        The selected rows of ``df``, in random order.
    """
    chosen_indexes = []

    for valuation in VALUATIONS:
        for abn in ABNORMALITIES:
            df_target = df.loc[df[abn] == valuation]
            chosen_indexes.extend(grab_n_samples(df_target, chosen_indexes, n_per_target))

    # Top up with arbitrary rows until the overall target is met.
    chosen_indexes.extend(grab_n_samples(df, chosen_indexes, n_total))

    random.shuffle(chosen_indexes)
    # `chosen_indexes` holds index *labels* (drawn from `df.index`), so select
    # by label; positional `.iloc` is only right for a default RangeIndex.
    return df.loc[chosen_indexes]

In [None]:
# Pass the configured targets explicitly so that editing N_total or
# N_per_abn_per_val actually changes the sample.
df = sample_sentences(
    sentences_df,
    n_total=N_total,
    n_per_target=N_per_abn_per_val,
)[['sentence'] + CHEXPERT_LABELS]
# Row count vs. number of distinct index labels; they differ only if a row was
# selected more than once.
len(df), len(df.index.unique())

In [None]:
# Display the sampled sentences with their CheXpert labels.
df

In [None]:
def count_per_abn(df, labels=None):
    """Count how many rows take each value, separately for every label column.

    Args:
        df: DataFrame with one column per entry of ``labels``.
        labels: Column names to tabulate. Defaults to ``CHEXPERT_LABELS``,
            looked up when the function is called.

    Returns:
        DataFrame with one row per label and one column per distinct value
        found in any of the columns; combinations that never occur are 0.
    """
    if labels is None:
        labels = CHEXPERT_LABELS
    labels = list(labels)

    # Name the columns through `keys`: since pandas 2.0 `value_counts()`
    # returns a Series named "count" rather than the source column name, so
    # without it every row of the result would be labelled "count".
    counts = pd.concat(
        [df[abn].value_counts() for abn in labels],
        axis=1,
        keys=labels,
    )
    return counts.fillna(0).astype(int).transpose()
# Check how well each valuation is represented per abnormality in the sample.
count_per_abn(df)

In [None]:
# fname_filled = f'{folder}/{dataset}-filled.csv'
# Destination of the sentence-only sheet exported below.
fname_empty = f'{FOLDER}/{dataset}-empty.csv'

In [None]:
# Keep only the sentence text; the CheXpert labels are left out of the export.
df_out = df[['sentence']]
# for col in CHEXPERT_LABELS_5 + ['Any other finding', '', 'Missing Context']:
#     df_out[col] = ""
df_out

In [None]:
# Write the sheet without the index column, then echo the path.
df_out.to_csv(fname_empty, index=False)
fname_empty

# Clean answers and save

In [None]:
cols = ['Sentence'] + CHEXPERT_LABELS_6 # + ['Not understood or malformed']
# Expert answer strings -> integer valuations; blank cells become -2 below.
ANSWER_TO_VALUE = {'Abnormal': 1, 'Normal': 0, 'Uncertain': -1}

dfs = {}
should_ignore = set()
for expert in (1, 2):
    fname = f'cxr-sentence-assessment-expert{expert}.csv'
    raw_df = pd.read_csv(f'{FOLDER}/{fname}')

    # Collect the sentences this expert flagged. `== True` is deliberate:
    # blank (NaN) flags must count as "not flagged", while a plain boolean
    # cast would treat NaN as truthy.
    flagged = raw_df['Not understood or malformed'] == True  # noqa: E712
    should_ignore.update(raw_df.loc[flagged, 'Sentence'])

    expert_df = raw_df[cols].rename({'Sentence': 'sentence'}, axis=1)
    # Convert the label columns only, so the sentence text is never touched
    # by the replacement (e.g. a sentence that is exactly "Normal"). The
    # explicit int cast raises on any unexpected answer instead of silently
    # leaving strings in the saved file.
    expert_df[CHEXPERT_LABELS_6] = (
        expert_df[CHEXPERT_LABELS_6]
        .replace(ANSWER_TO_VALUE)
        .fillna(-2)
        .astype(int)
    )
    dfs[expert] = expert_df

# A sentence flagged by either expert is removed from both experts' tables.
dfs = {
    expert: expert_df.loc[~expert_df['sentence'].isin(should_ignore)]
    for expert, expert_df in dfs.items()
}
len(dfs[1]), len(dfs[2])

In [None]:
# Persist one cleaned CSV per expert, without the index column.
for expert, expert_df in dfs.items():
    expert_df.to_csv(f'{FOLDER}/mimic-expert{expert}.csv', index=False)

# Analyze expert answers

In [None]:
# Reload the cleaned answers of both experts from disk, keyed by expert number.
dfs = {
    expert: pd.read_csv(f'{FOLDER}/mimic-expert{expert}.csv')
    for expert in (1, 2)
}
dfs[1].head()

In [None]:
# CheXpert labels per sentence, with a `_chex` suffix on every label column so
# they can sit next to the experts' columns after merging.
chex_columns = {abn: f'{abn}_chex' for abn in CHEXPERT_LABELS_6}
chex_gt = sentences_df[['sentence', *CHEXPERT_LABELS_6]].rename(columns=chex_columns)
chex_gt.head()

In [None]:
# One row per sentence rated by both experts and labelled by CheXpert; label
# columns are suffixed `_exp1`, `_exp2` and `_chex`.
experts_merged = dfs[1].merge(dfs[2], on='sentence', suffixes=("_exp1", "_exp2"))
gts = experts_merged.merge(chex_gt, on='sentence')

# Binary view of the same table: -1 is folded into 1 and -2 into 0.
gts2 = gts.replace(-1, 1)
gts2 = gts2.replace(-2, 0)
len(gts)

## Compute Kappa agreement
  - use 2x2 and 4x4
  - per abnormality
  - chexpert vs each expert

In [None]:
from sklearn.metrics import cohen_kappa_score
from scipy.stats import pearsonr

In [None]:
def compute_kappas(gts, score_fn=cohen_kappa_score):
    """Score pairwise agreement between the two experts and CheXpert.

    Args:
        gts: DataFrame with columns ``{abn}_exp1``, ``{abn}_exp2`` and
            ``{abn}_chex`` for every ``abn`` in ``CHEXPERT_LABELS_6``.
        score_fn: Callable taking two label columns and returning a scalar
            score; Cohen's kappa by default.

    Returns:
        DataFrame indexed by abnormality with the columns ``'R1 - R2'``,
        ``'R1 vs CheX'`` and ``'R2 vs CheX'``.
    """
    # (output column, left rater suffix, right rater suffix)
    comparisons = (
        ('R1 - R2', 'exp1', 'exp2'),
        ('R1 vs CheX', 'exp1', 'chex'),
        ('R2 vs CheX', 'exp2', 'chex'),
    )

    kappas_df = pd.DataFrame()
    for abn in CHEXPERT_LABELS_6:
        for column, left, right in comparisons:
            kappas_df.loc[abn, column] = score_fn(
                gts[f'{abn}_{left}'],
                gts[f'{abn}_{right}'],
            )

    return kappas_df

In [None]:
# Agreement on the original labels, before -1 and -2 are folded into 1 and 0.
kappas4 = compute_kappas(gts)
kappas4

In [None]:
# Agreement on the binarized labels (gts2).
kappas2 = compute_kappas(gts2)
kappas2

In [None]:
# Same pairwise comparison, scored with Pearson's r instead of kappa;
# `pearsonr` returns (statistic, p-value), so only the first element is kept.
pearson2 = compute_kappas(gts2, score_fn=lambda x, y: pearsonr(x, y)[0])
pearson2

In [None]:
# Emit the binarized kappa table as LaTeX, three decimals per value.
print(kappas2.style.format(precision=3).to_latex())

## CheXpert test set

- Are all sentences in the test set? (How many are missing?)

In [None]:
%run ../../datasets/mimic_cxr.py
%run ../../utils/nlp.py

In [None]:
# Test split of MIMIC-CXR. NOTE(review): `MIMICCXRDataset` presumably comes
# from the `%run ../../datasets/mimic_cxr.py` cell above — confirm.
mimic_dataset = MIMICCXRDataset(dataset_type='test', do_not_load_image=True)
len(mimic_dataset)

In [None]:
# Clean report text for every row of the test master table, looked up by
# study id. Iterating the column directly avoids building a Series per row
# with `iterrows()` (and the dtype upcasting that can come with it).
reports = [
    mimic_dataset._reports[study_id]['clean_text']
    for study_id in mimic_dataset.master_df['study_id']
]
len(reports)

In [None]:
# Distinct sentences found across all test-set reports.
TEST_SENTENCES = {
    sentence
    for report in reports
    for sentence in split_sentences_text(report)
}
len(TEST_SENTENCES)

In [None]:
# Sentences that carry labels from both experts and CheXpert (rows of `gts`).
GT_SENTENCES = list(gts['sentence'])
len(GT_SENTENCES)

In [None]:
# How many distinct gold sentences also occur in some test-set report.
len(TEST_SENTENCES.intersection(GT_SENTENCES))

In [None]:
# For each test report, count how many of its sentences are gold sentences.
# Build the lookup set once: testing membership against the GT_SENTENCES list
# would rescan the whole list for every sentence of every report.
gt_sentence_set = set(GT_SENTENCES)

count = []
for report in reports:
    sentences = split_sentences_text(report)
    n_appearances = sum(sentence in gt_sentence_set for sentence in sentences)
    # (report text, number of gold sentences in it, total sentences in it)
    count.append((report, n_appearances, len(sentences)))
len(count)

In [None]:
# (gold sentences, total sentences) for every report with at least one gold sentence.
[
    (n_gold, n_sentences)
    for _, n_gold, n_sentences in count
    if n_gold > 0
]

## Compute CheX against experts

In [None]:
from sklearn.metrics import precision_recall_fscore_support as prf1s

In [None]:
# NOTE(review): in the visible source, `gt` and `target` are first assigned in
# a later cell of this section, so this cell raises NameError on a fresh
# top-to-bottom run — confirm the intended cell order.
# `average=None` asks for one score per class/label instead of an aggregate.
p, r, f1, _ = prf1s(gt, target, zero_division=0, average=None)
p.shape

In [None]:
%run ../../metrics/report_generation/chexpert.py

In [None]:
# Score the CheXpert labels against each expert on the binarized table.
target_cols = [f'{abn}_chex' for abn in CHEXPERT_LABELS_6]
target = gts2[target_cols].to_numpy()

# One row per abnormality; precision/recall/F1 columns are added per expert.
results = pd.DataFrame(index=CHEXPERT_LABELS_6)
for expert in (1, 2):
    gt_cols = [f'{abn}_exp{expert}' for abn in CHEXPERT_LABELS_6]
    # The expert's answers play the role of ground truth; after the loop `gt`
    # still holds expert 2's labels.
    gt = gts2[gt_cols].to_numpy()
    
    # NOTE(review): `calculate_metrics` is presumably defined by the
    # `%run ../../metrics/report_generation/chexpert.py` cell — confirm its
    # argument order and that each returned metric has one entry per label.
    acc, precision, recall, f1, roc_auc, pr_auc = calculate_metrics(gt, target)
    
    results[f'prec-expert{expert}'] = precision
    results[f'recall-expert{expert}'] = recall
    results[f'f1-expert{expert}'] = f1
results

In [None]:
# Emit the precision/recall/F1 table as LaTeX, three decimals per value.
print(results.style.format('{:.3f}').to_latex())