# Imports

In [None]:
import os
from collections import Counter, defaultdict
import importlib
import json
import re
import numpy as np

In [None]:
import matplotlib.pyplot as plt
import matplotlib
# Notebook-wide figure defaults: white figure background and a (15, 5) figure size.
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)
# NOTE(review): the family is set to 'serif' while the font list is given under
# 'font.sans-serif'; if CMU/Helvetica are meant to be used, 'font.serif' may be
# the intended key -- confirm against the rendered figures.
plt.rcParams.update({'font.family': 'serif', 'font.sans-serif': ['CMU', 'Helvetica']})

In [None]:
import pandas as pd
# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Run the project's utils initializer and bring its names into the notebook.
# NOTE(review): neither `config_logging` nor `logging` is imported above, so
# both presumably come from that script -- confirm.
%run ../../utils/__init__.py
config_logging(logging.INFO)

In [None]:
# NOTE(review): presumably the source of `CHEXPERT_DISEASES`, which is used
# below but never defined in this notebook -- confirm.
%run ../../datasets/common/constants.py

In [None]:
# Dataset root directories, taken from the medai dataset modules.
from medai.datasets import iu_xray, mimic_cxr
IU_DIR = iu_xray.DATASET_DIR
MIMIC_DIR = mimic_cxr.DATASET_DIR

# Load data

## Load dicts

In [None]:
# Pick the dataset (and cleaned-reports version) to load; swap the commented
# line to switch. The version tag is part of the JSON file name.
dataset_dir, version = IU_DIR, 'v4-1'
# dataset_dir, version = MIMIC_DIR, 'v4-2'

_fpath = os.path.join(dataset_dir, 'reports', f'reports.clean.{version}.json')
# JSON is UTF-8 by specification; pass the encoding explicitly so the file is
# never decoded with the platform's default encoding.
with open(_fpath, encoding='utf-8') as f:
    REPORTS_DICT = json.load(f)
len(REPORTS_DICT)

## Load sentences

In [None]:
# Load the per-sentence CheXpert labels for the chosen dataset. Note that this
# rebinds the notebook-global `dataset_dir` also used by the reports-dict cell.
#dataset_dir = IU_DIR
dataset_dir = MIMIC_DIR

fpath = os.path.join(dataset_dir, 'reports', 'sentences_with_chexpert_labels.csv')
SENTENCES_DF = pd.read_csv(fpath)
SENTENCES_DF.head(3)

In [None]:
# List every sentence whose 'Lung Lesion' label equals 0.
d = SENTENCES_DF[SENTENCES_DF['Lung Lesion'].eq(0)]
d['sentence'].tolist()

## Load reports with chexpert labels

In [None]:
# Load the per-report CheXpert labels for the chosen dataset (rebinds the
# notebook-global `dataset_dir` and `fpath`).
# dataset_dir = IU_DIR
dataset_dir = MIMIC_DIR

fpath = os.path.join(dataset_dir, 'reports', 'reports_with_chexpert_labels.csv')
REPORTS_DF = pd.read_csv(fpath)
REPORTS_DF.head(3)

In [None]:
# Per-report tallies of each label value across the CheXpert disease columns.
# Counting is vectorized (one boolean comparison per value, summed along each
# row) instead of iterating rows in Python, which also keeps the cell working
# on an empty frame.
# NOTE(review): -2 / 0 / -1 / 1 are read as none / negative / uncertain /
# positive, following the names of the new columns -- confirm.
_labels = REPORTS_DF[CHEXPERT_DISEASES]
REPORTS_DF = REPORTS_DF.assign(
    n_none=_labels.eq(-2).sum(axis=1),
    n_neg=_labels.eq(0).sum(axis=1),
    n_unc=_labels.eq(-1).sum(axis=1),
    n_pos=_labels.eq(1).sum(axis=1),
)
# The study id is what remains of the filename after dropping its first 14
# characters and its last 4.
# NOTE(review): assumes a fixed-width directory prefix and a 4-character
# extension -- confirm for every dataset this is run on.
REPORTS_DF['study_id'] = [filename[14:-4] for filename in REPORTS_DF['filename']]
REPORTS_DF.head(2)

## Load MIMIC metadata and sectioned reports

In [None]:
# MIMIC master metadata table; later cells read its `study_id` and
# `image_fpath` columns.
_fpath = os.path.join(MIMIC_DIR, 'master_metadata.csv')
mimic_metadata = pd.read_csv(_fpath)
mimic_metadata.head(2)

In [None]:
# MIMIC reports split into sections; later cells read the `study`,
# `findings`, `impression` and `comparison` columns.
_fpath = os.path.join(MIMIC_DIR, 'reports', 'mimic_cxr_sectioned.csv')
mimic_sectioned = pd.read_csv(_fpath)
print(len(mimic_sectioned))
mimic_sectioned.head(2)

In [None]:
# Attach the CheXpert label columns to every sectioned report. The left join
# keeps all sectioned rows; the length printed here can be compared with the
# one printed by the previous cell to spot rows duplicated by the join.
# NOTE(review): the result overwrites `mimic_sectioned`, so re-running this
# cell merges REPORTS_DF on top of the already-merged frame.
mimic_sectioned = mimic_sectioned.merge(REPORTS_DF, left_on='study', right_on='study_id',
                                        how='left')
print(len(mimic_sectioned))
mimic_sectioned.head(3)

### Find an example

In [None]:
# NOTE(review): presumably provides `load_image`, used by the plotting cells
# below -- confirm.
%run ../../utils/images.py

In [None]:
# Narrow the merged MIMIC table down to candidate example reports, shortest
# findings first, and print how many survive all the filters.
d = mimic_sectioned
# Blank out only the text sections. Filling every column with '' would turn
# label/count columns that hold NaN (rows the left merge could not match) into
# mixed str/int object columns, and the `>=` filters below would then raise
# TypeError.
d = d.fillna({'findings': '', 'comparison': ''})
d = d.sort_values('findings', key=lambda x: x.str.len(), ascending=True)
# d = d.loc[d['comparison'].notnull()]
# d = d.loc[~d['comparison'].str.contains('None')]
# d = d.loc[d['comparison'] != '___.']
# d = d.loc[d['comparison'] != '___']
# At least one label equal to 0 and one equal to 1 in the same report.
d = d.loc[((d['n_neg'] >= 1) & (d['n_pos'] >= 1))]
d = d.loc[d['No Finding'] == -2]
d = d.loc[d['Support Devices'] == -2]
# Findings text must mention a prior study, a follow-up and the view/technique.
d = d.loc[d['findings'].str.contains(r'[Aa]gain|unchanged|comparison')]
d = d.loc[d['findings'].str.contains(r'follow[\-u]')]
d = d.loc[d['findings'].str.contains(r'PA|frontal|lateral|single|techn')]
print(len(d))

In [None]:
# Show the findings text of every row left in `d` by the filter cell.
list(d['findings'])

In [None]:
# `target` is the exact findings text to look up in `mimic_sectioned`; the
# commented-out assignments are alternative candidates.
# target = 'Chest radiograph ___ and chest CT ___'
# target =  'AP upright and lateral views of the chest provided.\n \n Suture material is noted projecting over the left upper lung as on prior\n compatible with prior resection.  There is focal opacity in the right lower\n lobe and left mid lung, could represent pneumonia though follow-up to\n resolution advised.  There is a retrocardiac opacity containing a fluid level\n most compatible with a hiatal hernia.  No large effusion or pneumothorax. \n Cardiomediastinal silhouette is unchanged.  Bony structures are intact.'
# target = 'Frontal and lateral views of the chest were obtained.  There is a\n subtle patchy opacity projecting over the right upper lobe, difficult to\n discern whether it may have been subtly present on the prior study, concerning\n for focus of infection.  Recommend followup to resolution to exclude\n underlying lesion.  The remainder of the lungs is clear.  No pleural effusion\n or pneumothorax is seen.  The cardiac and mediastinal silhouettes are stable. \n A single-lead left-sided AICD is again seen, unchanged in position.'
target = 'Single AP upright portable view of the chest was obtained.  The\n cardiomediastinal silhouette remains enlarged and similar in appearance since\n the prior study.  Interstitial pulmonary edema is again seen, fairly similar\n in severity as compared to the prior study.  No large pleural effusion or\n pneumothorax is seen.  A posterior left infrahilar opacity is seen, somewhat\n rounded in contour, not as well seen on the lateral view on ___, could be\n due to underlying consolidation; recommend followup to resolution to exclude\n an underlying nodular lesion.  The opacity measures approximately 2.4 cm.'

In [None]:
# All sectioned reports whose findings text is exactly `target`.
d = mimic_sectioned
rows = d[d['findings'].eq(target)].sort_values(
    by='findings',
    key=lambda texts: texts.str.len(),
)
print(len(rows))
rows

In [None]:
# Impression sections of the rows matched on findings.
list(rows['impression'])

In [None]:
# Narrow the findings matches further to rows with this exact impression text;
# the commented-out assignments are alternative candidates.
# target_impression = 'Mild pulmonary vascular congestion and bibasilar opacities, likely atelectasis\n but infection is not excluded.'
# target_impression = '1. PICC line positioned appropriately.\n 2. Mild cardiomegaly.\n 3. No signs of pneumonia.'
target_impression = 'Minimal patchy right lower lobe opacity which is concerning for infection in\n the correct clinical setting.'
rows2 = rows.loc[rows['impression'] == target_impression]
print(len(rows2))
rows2

In [None]:
# Sample:
# Study inspected by the cells below; the commented ids are other candidates.
# study_id = 's53031050'
# study_id = 's56273978'
# study_id = 's59372424' ## Figure 1 in intro
study_id = 's52756007'
# study_id = 's57385035'

In [None]:
# Take the first sectioned row for `study_id` and print its report sections.
d = mimic_sectioned
d = d[d['study'].eq(study_id)].iloc[0]
impression, findings, comparison, path = (
    d[_column] for _column in ('impression', 'findings', 'comparison', 'filename')
)
for _label, _value in (
    ('PATH: ', path),
    ('COMPARISON: ', comparison),
    ('FINDINGS: ', findings),
    ('IMPRESSION: ', impression),
):
    print(_label, _value)

In [None]:
# Image paths of the selected study. The metadata table is matched on the
# integer form of the id, i.e. `study_id` without its first character.
d = mimic_metadata
d = d.loc[d['study_id'] == int(study_id[1:])]
image_fpaths = list(d['image_fpath'])
len(image_fpaths)

In [None]:
# Draw every image of the study side by side in a single row.
n_rows = 1
n_cols = len(image_fpaths)
for index, filename in enumerate(image_fpaths):
    # Subplot positions are 1-based, hence index+1.
    plt.subplot(n_rows, n_cols, index+1)
    fpath = os.path.join(MIMIC_DIR, 'images', filename)
    image = load_image(fpath, 'RGB')
    plt.imshow(image)

## Search in reports_df

In [None]:
# Order the reports by text length, shortest first; the commented lines are
# optional pre-filters on the label counts / a single label.
d = REPORTS_DF
# d = d.loc[((d['n_neg'] >= 1) & (d['n_pos'] >= 1) & (d['n_unc'] >= 1))]
# d = d.loc[d['Pneumonia'] == 1]
d = d.sort_values('Reports', key=lambda x: x.str.len())
print(len(d))
d.head(2)

In [None]:
# Keep the reports whose text matches the regex `target` and show the first 20.
# `d` is whatever an earlier cell left in it, so re-running this cell keeps
# narrowing the same frame unless the line below is uncommented.
# d = REPORTS_DF
# target = r'clinical.setting.*follow'
target = r'blunt'
# na=False: a report with missing text counts as "no match" instead of putting
# NaN into the boolean mask, which .loc refuses to index with.
d = d.loc[d['Reports'].str.contains(target, na=False)]
# l = [r for r in d['Reports'] if re.search(target, r)]
l = list(d['Reports'])
l[:20]

In [None]:
# Full row (all columns) of the first matching report.
d.head(1)

## Search in dict

In [None]:
# Peek at one key to see which id format the loaded dict uses.
list(REPORTS_DICT.keys())[0]

In [None]:
# NOTE(review): this key has a different format from the '1547.xml' key used
# further down; each lookup presumably needs the matching dataset loaded into
# REPORTS_DICT -- confirm.
REPORTS_DICT['53911762']

In [None]:
def _is_none(txt):
    if not txt:
        return True
    txt = txt.lower().replace('.', '').strip()
    if txt.startswith('none'):
        return True
    if txt.startswith('no comparison'):
        return True
    if txt in ('none', 'nones', 'xxxx', 'none available', 'none clinical', 'no prior'):
        return True
    return False
def _contains_xxxx(txt):
    return 'xxxx' in txt.lower()

In [None]:
def condition(txt):
    """Return True when txt contains no 'xxxx' and is not an empty/"none" placeholder."""
    return not (_contains_xxxx(txt) or _is_none(txt))

In [None]:
# target_image = 'CXR3095_IM-1448-1001'
# Collect the keys of the reports whose comparison AND indication both pass
# `condition`; a missing or None field is treated as an empty string.
found = []
for filename, d in REPORTS_DICT.items():
#     if any(image.get('id') == target_image for image in d['images']):
#         found.append(filename)
    comparison, indication = (d.get(_field) or '' for _field in ('comparison', 'indication'))
    if not (condition(comparison) and condition(indication)):
        continue
    found.append(filename)
len(found), len(REPORTS_DICT)

In [None]:
# One line per selected report: "<key>, <comparison><TAB><TAB><indication>".
for f in found:
    d = REPORTS_DICT[f]
    comparison, indication = d['comparison'], d['indication']
    print(f'{f}, {comparison}\t\t{indication}')

In [None]:
# Inspect one full report entry by key.
REPORTS_DICT['1547.xml']

## Sentence examples

In [None]:
# List every sentence whose 'Cardiomegaly' label equals 0.
d = SENTENCES_DF[SENTENCES_DF['Cardiomegaly'].eq(0)]
d['sentence'].tolist()

# ImageCLEF samples

In [None]:
# Re-run so this section works on its own.
# NOTE(review): presumably provides `load_image` -- confirm.
%run ../../utils/images.py

In [None]:
# Derive the ImageCLEF directory from the IU X-ray path by substring swap.
# If IU_DIR does not contain 'iu-x-ray/dataset', str.replace returns it
# unchanged and IMAGECLEF_DIR silently points at the IU directory.
IMAGECLEF_DIR = IU_DIR.replace('iu-x-ray/dataset',
                               'imageclef/2020-2021-Datasets/Captioning/caption-prediction')

In [None]:
# Training captions: tab-separated, no header row, columns named here.
_fpath = os.path.join(IMAGECLEF_DIR, 'Training_Set_Caption.csv')
df = pd.read_csv(_fpath, sep='\t', header=None, names=['image_name', 'caption'])
df.head(2)

In [None]:
# Folder holding the training images referenced by `image_name`.
image_folder = os.path.join(IMAGECLEF_DIR, 'Training-Images')

In [None]:
# Pick one caption row by position and keep its image name and caption.
row = df.iloc[111]
image_name = row['image_name']
caption = row['caption']
row

In [None]:
# Load '<image_name>.jpg' from the training folder, print its caption and show it.
image = load_image(os.path.join(image_folder, f'{image_name}.jpg'), 'RGB')
print(caption)
plt.imshow(image)

In [None]:
# Example 1: synpic43648, iloc 10
# Example 2: synpic33642, iloc 140