# Imports

In [None]:
import os
import re

In [None]:
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: white figure background and a (15, 5)
# default figure size for every figure that does not set its own.
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
import pandas as pd
# Lift the column limit so wide DataFrames are displayed in full.
pd.set_option('display.max_columns', None)

In [None]:
from medai.datasets.iu_xray import DATASET_DIR as IU_DIR
from medai.datasets.mimic_cxr import DATASET_DIR as MIMIC_DIR

# Sentences

In [None]:
# Read the IU sentence table and add a normalised copy of every sentence:
# lower-cased, 'xxxx' tokens blanked out, runs of whitespace collapsed.
fpath = os.path.join(IU_DIR, 'reports', 'sentences_with_extra_info.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF['clean_sentence'] = SENTENCES_DF['sentence'].map(
    lambda s: ' '.join(s.lower().replace('xxxx', ' ').split())
)
SENTENCES_DF.head(1)

In [None]:
# Plain Python list of the raw (un-normalised) sentences, plus its size.
SENTENCES = SENTENCES_DF['sentence'].tolist()
len(SENTENCES)

In [None]:
# Keep the raw sentences that contain "vertebral" (case-sensitive search on
# the un-normalised text), then show how many there are and the first ten.
# A second condition such as re.search(r'height', s) can be added to narrow it.
l = list(filter(lambda s: re.search(r'vertebral', s), SENTENCES_DF['sentence']))
len(l), l[:10]

In [None]:
# Remaining matches after the first ten.
l[10:]

# Apply labeler

In [None]:
# Dataset root used by the "Apply labeler" section. Swap which line is
# commented to switch between the IU and MIMIC directories.
# DATASET_CHOSEN = IU_DIR
DATASET_CHOSEN = MIMIC_DIR

## Load reports

In [None]:
# Load the reports table (one row per report, with CheXpert label columns
# according to the file name) for the chosen dataset and preview one row.
fpath = os.path.join(DATASET_CHOSEN, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath)
REPORTS_DF.head(1)

In [None]:
# Report texts as a plain list, in table order, and how many there are.
REPORTS = REPORTS_DF['Reports'].tolist()
len(REPORTS)

## Apply

In [None]:
%run ../../datasets/vocab/__init__.py
%run ../../utils/nlp.py
%run ../../metrics/report_generation/abn_match/textray.py

In [None]:
# Load the 'v4-2' vocabulary stored under the chosen dataset's reports folder.
# load_vocab is presumably provided by the %run of datasets/vocab above — confirm.
vocab = load_vocab(os.path.join(DATASET_CHOSEN, 'reports'), 'v4-2')
len(vocab)

In [None]:
# Build the TextRay labeler on CPU. use_idx=False presumably means the input
# is raw text rather than vocabulary indices — TODO confirm in textray.py.
labeler = TextRayLabeler(vocab, device='cpu', use_idx=False)
labeler

In [None]:
%%time

# Label every report; the result exposes .size() and .cpu(), so it is
# presumably a torch tensor with one row per report — confirm.
labels = labeler.label_reports(REPORTS)
labels.size()

In [None]:
# Table with the report text first, followed by one column per labeler disease.
cols = ['Reports'] + labeler.diseases
df = (
    pd.DataFrame(labels.cpu().numpy(), columns=labeler.diseases)
    .assign(Reports=REPORTS)
    [cols]
)
df.head(2)

## Check leftout reports

In [None]:
# Rows where every disease column equals -2 (presumably "nothing matched" —
# confirm the meaning of -2 in the labeler), and how many such rows exist.
d = df.loc[df[labeler.diseases].eq(-2).all(axis=1)]
len(d)

In [None]:
# Left-out report texts ordered from shortest to longest; preview the first five.
leftout = sorted(d['Reports'], key=len)
len(leftout), leftout[:5]

In [None]:
# Remaining left-out reports after the five shortest.
leftout[5:]

In [None]:
# Total number of labelled reports, for comparison with the left-out count.
len(df)

In [None]:
# Persist the TextRay labels next to the reports they were computed from.
# The target folder follows DATASET_CHOSEN (the source of REPORTS), so labels
# of one dataset are never written into the other dataset's folder.
df.to_csv(
    os.path.join(DATASET_CHOSEN, 'reports', 'reports_with_textray_labels.csv'),
    index=False,
)

## Labels distribution

In [None]:
# Turn every -2 into 0, in place, so -2 entries do not pull down the
# per-disease column sums computed for the distribution plot.
df.replace({-2: 0}, inplace=True)
df.head(2)

In [None]:
# Horizontal bar chart of the per-disease label sums, smallest at the bottom.
plt.figure(figsize=(10, 10))
amount_by_abn = df[labeler.diseases].sum(axis=0).sort_values(ascending=True)
diseases = list(amount_by_abn.index)
amounts = amount_by_abn.values

plt.barh(diseases, amounts, 0.8)
# Identify the dataset by comparing against the known directory constant;
# looking for the substring 'iu' in a filesystem path can match unrelated
# path components.
dataset_name = 'IU' if DATASET_CHOSEN == IU_DIR else 'MIMIC'
plt.title(f'{dataset_name} label distribution', fontsize=20)
plt.xlabel(f'N reports (N={len(df):,})', fontsize=15)
max_value = max(amounts)
# Extra room on the right so the text next to the longest bar fits.
plt.xlim(0, max_value * 1.18)

# Annotate each bar with its absolute amount and its share of all reports.
for idx, amount in enumerate(amounts):
    perc = amount / len(df) * 100
    plt.text(
        amount + max_value*0.01,
        idx,
        f'{amount:,} ({perc:.1f}%)',
        va='center',
    )

plt.margins(y=0.01)

# MIMIC reports

In [None]:
%run ../../datasets/common/constants.py

In [None]:
# All CheXpert labels except the first one.
# NOTE(review): the first entry is presumably 'No Finding' — confirm in constants.py.
ACTUAL_DISEASES = CHEXPERT_DISEASES[1:]

In [None]:
from medai.datasets.mimic_cxr import DATASET_DIR as MIMIC_DIR

In [None]:
# Load the MIMIC reports table (with CheXpert label columns, per the file name)
# and preview one row.
fpath = os.path.join(MIMIC_DIR, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath)
REPORTS_DF.head(1)

In [None]:
# MIMIC report texts as a plain list, in table order, and how many there are.
REPORTS = REPORTS_DF['Reports'].tolist()
len(REPORTS)

In [None]:
# Load the MIMIC 'v4-2' vocabulary, print its size, and rebuild the labeler
# on top of it (this rebinds the notebook-global `labeler`).
mimic_vocab = load_vocab(os.path.join(MIMIC_DIR, 'reports'), 'v4-2')
print(len(mimic_vocab))
labeler = TextRayLabeler(mimic_vocab, device='cpu', use_idx=False)
labeler

In [None]:
%%time

# Label every MIMIC report by calling the labeler directly.
# NOTE(review): the "Apply" section uses labeler.label_reports(REPORTS) instead;
# confirm that both entry points are equivalent for a list of raw texts.
labels = labeler(REPORTS)
labels.size()

In [None]:
# Table with the report text first, followed by one column per labeler disease.
# The -2 values are deliberately kept here (no replace with 0), because the
# following cells filter on them.
cols = ['Reports'] + labeler.diseases
df = (
    pd.DataFrame(labels.cpu().numpy(), columns=labeler.diseases)
    .assign(Reports=REPORTS)
    [cols]
)
df.head(2)

In [None]:
# Reports for which at least one of the ACTUAL_DISEASES columns equals 1,
# collected as a set of distinct report texts.
rd = REPORTS_DF.loc[REPORTS_DF[ACTUAL_DISEASES].eq(1).any(axis=1)]
reports_with_some = set(rd['Reports'])
len(reports_with_some)

In [None]:
# Rows whose text belongs to reports_with_some but whose TextRay columns are
# all -2, i.e. flagged in the CheXpert table yet unmatched by this labeler.
d = df.loc[
    df['Reports'].isin(reports_with_some)
    & df[labeler.diseases].eq(-2).all(axis=1)
]
len(d)

In [None]:
# Distinct texts among those rows, in order of first appearance.
l = d['Reports'].unique().tolist()
len(l)

In [None]:
# Spot check: rows of the CheXpert-positive subset whose text is exactly this
# "no acute process" sentence.
rd.loc[rd['Reports'] == 'no acute cardiopulmonary process .']

In [None]:
# Membership test on the values: `in` applied to a Series checks its index,
# so the column is converted to a list first.
'no acute cardiopulmonary process .' in list(rd['Reports'])

In [None]:
%run ../../metrics/report_generation/chexpert.py

In [None]:
# Run the labeler from chexpert.py on that single sentence to see which
# labels it assigns (apply_labeler_to_column presumably comes from the %run above).
labels2 = apply_labeler_to_column(['no acute cardiopulmonary process .'])
labels2

In [None]:
# Display the current contents of `l`.
l

In [None]:
# Inspect the matcher stored at the position of 'Lung Opacity'; this assumes
# disease_matchers is ordered like labeler.diseases — TODO confirm.
i = labeler.diseases.index('Lung Opacity')
labeler.disease_matchers[i]

In [None]:
# CheXpert-positive report texts ordered from shortest to longest.
sorted(reports_with_some, key=len)

In [None]:
# Spot check: rows whose whole report text is the degenerate string 'is .'.
REPORTS_DF.loc[REPORTS_DF['Reports'] == 'is .']

In [None]:
def load_report(fname, base_dir=None):
    """Return the full text of a raw report file.

    Args:
        fname: Path of the report relative to the ``raw-reports`` folder,
            e.g. ``'p13/p13290560/s52121407.txt'``.
        base_dir: Dataset root that contains ``raw-reports``. Defaults to
            ``MIMIC_DIR`` when omitted.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
    """
    if base_dir is None:
        base_dir = MIMIC_DIR
    report_path = os.path.join(base_dir, 'raw-reports', fname)
    # Explicit encoding so the result does not depend on the platform locale.
    with open(report_path, encoding='utf-8') as f:
        return f.read()

In [None]:
# Read one specific raw report and check its length in characters.
text = load_report('p13/p13290560/s52121407.txt')
len(text)

In [None]:
# Show the whole raw report.
print(text)

In [None]:
# Show the report from character 372 onwards.
# NOTE(review): 372 is a hand-picked offset, presumably the position returned
# by text.index('Lung volumes') for this report — confirm.
print(text[372:])

In [None]:
# Character offset of the first 'Lung volumes' occurrence (raises ValueError
# if the phrase is absent).
text.index('Lung volumes')

In [None]:
%run ../../metrics/report_generation/chexpert.py

In [None]:
# Check what the labeler from chexpert.py returns for a degenerate,
# content-free report text.
apply_labeler_to_column(['is and for .'])

In [None]:
import json

In [None]:
# Load the cleaned MIMIC reports. The encoding is set explicitly because JSON
# text is UTF-8 and the platform default encoding may differ.
with open(
    os.path.join(MIMIC_DIR, 'reports', 'reports.clean.v4.json'),
    'r',
    encoding='utf-8',
) as f:
    RR = json.load(f)
len(RR)

In [None]:
# Look up one entry of the cleaned reports by its string key
# (presumably a study id — confirm against the JSON layout).
RR['57179687']