# Imports

In [None]:
import torch
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

In [None]:
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
%run ../mimic_cxr.py

# Load metadata

In [None]:
FNAME_PREFIX = 'mimic-cxr-2.0.0'

In [None]:
fpath = os.path.join(DATASET_DIR, f'{FNAME_PREFIX}-metadata.csv')
metadata = pd.read_csv(fpath)
metadata.head()

In [None]:
# Load the chexpert labels csv; missing values become 0 and -1 values become 1.
# NOTE(review): -1 presumably marks an uncertain label — confirm against the dataset docs.
fpath = os.path.join(DATASET_DIR, f'{FNAME_PREFIX}-chexpert.csv')
chexpert_df = pd.read_csv(fpath).fillna(0).replace(-1, 1)
chexpert_df.head()

# Plot example

In [None]:
from PIL import Image

In [None]:
idx = -1001
row = metadata.iloc[idx]
row

In [None]:
subject_id = f'p{row["subject_id"]}'
study_id = f's{row["study_id"]}'
dicom_id = str(row['dicom_id'])
image_fname = f'{dicom_id}.jpg'
subfolder = subject_id[:3]
subfolder, subject_id, study_id, image_fname

In [None]:
image_fpath = os.path.join(DATASET_DIR, 'images', subfolder, subject_id, study_id, image_fname)

In [None]:
image = Image.open(image_fpath)
print(image.size)
plt.imshow(image, cmap='gray')

In [None]:
report_fpath = os.path.join(DATASET_DIR, 'reports', subfolder, subject_id, f'{study_id}.txt')

In [None]:
with open(report_fpath) as f:
    text = f.read()
print(text)

# Labels distribution

In [None]:
%run ../common/constants.py

In [None]:
chexpert_df.head()

In [None]:
# Every row of chexpert_df counts as one sample.
n_samples = len(chexpert_df)
# Distinct patients: the same subject_id can appear on several rows, so the
# column length would only repeat the row count.
n_patients = chexpert_df['subject_id'].nunique()

# Sum each disease column and order the (disease, total) pairs from the
# largest total to the smallest.
values = sorted(
    ((disease, chexpert_df[disease].sum()) for disease in CHEXPERT_DISEASES),
    key=lambda pair: pair[1],
    reverse=True,
)
labels, amounts = zip(*values)

In [None]:
# Bar chart of the per-disease totals (already sorted, largest first)
plt.bar(labels, amounts)
plt.xticks(rotation=90)
plt.ylabel('Number of studies')
# Leave head-room above the tallest bar for the percentage labels
plt.ylim(0, amounts[0] + 10000)
plt.title('Chexpert labels distribution')

# Write above each bar its share of all samples, as a rounded percentage
for position, total in enumerate(amounts):
    share = total / n_samples * 100
    plt.text(position, total + 1200, f'{share:.0f}%', ha='center')

# Create master csv

## Keep only studies with a report present

In [None]:
%run -n ../preprocess/mimic_cxr.py

In [None]:
reports_df = load_raw_reports_df()
reports_df.head()

In [None]:
# Integer study ids that have a report: drop the first character of each
# 'study' value and convert the rest to int.
studies_with_report = {int(study[1:]) for study in reports_df['study']}
len(studies_with_report)

## Check metadata vs chexpert discrepancies

* Only studies with a report are kept
* A few studies have no report but are still present in both the metadata and chexpert CSVs

In [None]:
len(metadata), len(chexpert_df)

In [None]:
studies1 = set(metadata['study_id'])
studies2 = set(chexpert_df['study_id'])
len(studies1), len(studies2), studies2.issubset(studies1)

In [None]:
studies11 = studies1.intersection(studies_with_report)
studies22 = studies2.intersection(studies_with_report)
len(studies11), len(studies22), studies11 == studies22

## Merge metadata and chexpert

In [None]:
metadata_filtered = metadata.loc[metadata['study_id'].isin(studies_with_report)]
len(metadata_filtered), len(set(metadata_filtered['study_id']))

In [None]:
chexpert_filtered = chexpert_df.loc[chexpert_df['study_id'].isin(studies_with_report)]
len(chexpert_filtered), len(set(chexpert_filtered['study_id']))

In [None]:
studies1 = set(metadata_filtered['study_id'])
studies2 = set(chexpert_filtered['study_id'])
assert studies1 == studies2

In [None]:
master_df = metadata_filtered.merge(
    chexpert_filtered, on=['study_id', 'subject_id'], how='inner')
len(master_df), len(set(master_df['study_id']))

In [None]:
master_df.head()

## Merge with split

In [None]:
fpath = os.path.join(DATASET_DIR, f'{FNAME_PREFIX}-split.csv')
split_df = pd.read_csv(fpath)
split_df.head()

In [None]:
master_df = master_df.merge(split_df, on=['dicom_id', 'study_id', 'subject_id'], how='inner')
len(master_df), len(set(master_df['study_id']))

In [None]:
master_df.head()

## Add filepaths

In [None]:
def get_filenames(subject_id, study_id, dicom_id):
    """Build the relative image and report paths of one sample.

    Args:
        subject_id: subject identifier, without the 'p' prefix.
        study_id: study identifier, without the 's' prefix.
        dicom_id: image identifier, used as the stem of the jpg file name.

    Returns:
        Tuple `(image_fpath, report_fpath)` of relative paths. Both start with
        a subfolder made of the first three characters of the prefixed
        subject id, followed by the prefixed subject id itself.
    """
    patient_dir = f'p{subject_id}'
    study_name = f's{study_id}'
    # Grouping folder: 'p' plus the first two characters of the subject id
    group_dir = patient_dir[:3]

    return (
        os.path.join(group_dir, patient_dir, study_name, f'{dicom_id}.jpg'),
        os.path.join(group_dir, patient_dir, f'{study_name}.txt'),
    )

In [None]:
# Compute the (image, report) relative paths row by row, then split the pairs
# into two parallel tuples.
id_columns = (master_df['subject_id'], master_df['study_id'], master_df['dicom_id'])
fpath_pairs = [
    get_filenames(subject, study, dicom)
    for subject, study, dicom in zip(*id_columns)
]
image_fpaths, report_fpaths = zip(*fpath_pairs)
len(image_fpaths), len(report_fpaths)

In [None]:
master_df['image_fpath'] = image_fpaths
master_df['report_fpath'] = report_fpaths

In [None]:
master_df.head()

## Check lateral or frontal

In [None]:
master_df.head()

### Check weird positions 

In [None]:
positions = list(Counter(master_df['ViewPosition']).keys())
positions

In [None]:
# Treat every entry of `positions` after the first five as a "weird" position.
# NOTE(review): `positions` comes from Counter keys, i.e. the order in which
# values first appear in master_df['ViewPosition'], so this slice depends on
# the row order of the data — confirm the cut-off still selects what is intended.
weird_pos = positions[5:]
weird_pos

In [None]:
# Columns kept for inspecting the samples
cols = [
    'image_fpath',
    'ViewPosition',
    'PerformedProcedureStepDescription',
    'ViewCodeSequence_CodeMeaning',
]
df = master_df[cols]
# Rows whose ViewPosition is one of the values listed in weird_pos
has_weird_pos = df['ViewPosition'].isin(weird_pos)
weird_images = df.loc[has_weird_pos]
print(len(weird_images))

In [None]:
weird_images.sort_values('ViewPosition')

In [None]:
' '.join(list(weird_images['image_fpath']))

In [None]:
FRONTAL_POSITIONS = ['PA', 'AP', 'AP AXIAL', 'LAO', 'LPO', 'RAO']

### Check nan positions

FIXME: For now, samples with `ViewPosition == nan` are not resolved: they may be either frontal or lateral

In [None]:
Counter(master_df['ViewPosition'])

In [None]:
# Columns kept for inspecting the samples
cols = [
    'image_fpath',
    'ViewPosition',
    'PerformedProcedureStepDescription',
    'ViewCodeSequence_CodeMeaning',
]
df = master_df[cols]
# Rows where ViewPosition is missing
lacks_position = df['ViewPosition'].isnull()
nan_positions = df.loc[lacks_position]
len(nan_positions)

In [None]:
nan_positions['ViewCodeSequence_CodeMeaning'].isnull().sum()

In [None]:
' '.join(list(nan_positions['image_fpath'])[:10])

## Save master csv

In [None]:
%run ../common/__init__.py

In [None]:
cols = ['dicom_id', 'subject_id', 'study_id',
        'image_fpath', 'report_fpath',
        'ViewPosition', 'split',
        ] + CHEXPERT_DISEASES

In [None]:
# Select the output columns and rewrite every 'validate' value as 'val'.
# The non-inplace replace returns a new frame; an in-place replace on a column
# selection of master_df can raise pandas' SettingWithCopyWarning and leaves it
# unclear which object was modified.
out_df = master_df[cols].replace('validate', 'val')
out_df.head()

In [None]:
len(out_df), len(set(out_df['subject_id'])), len(set(out_df['study_id']))

In [None]:
Counter(out_df['split'])

In [None]:
fpath = os.path.join(DATASET_DIR, 'master_metadata.csv')
out_df.to_csv(fpath, index=False)

# Preprocess reports

Clean and tokenize

## Debug tokenizer

In [None]:
%run ../preprocess/tokenize.py

In [None]:
text_to_tokens('5 p.m., asdf dr. asdf')

## Run preprocess

In [None]:
%run -n ../preprocess/mimic_cxr.py

In [None]:
reports, token_appearances, errors = preprocess_mimic_cxr()
len(reports), len(token_appearances)

## Some utils

In [None]:
REPORTS_DF = load_raw_reports_df()
REPORTS_DF.head()

In [None]:
fpath = os.path.join(DATASET_DIR, 'master_metadata.csv')
master_df = pd.read_csv(fpath)
master_df.head(2)

In [None]:
def load_raw_report(study_id):
    """Read the raw report text of one study from disk.

    Args:
        study_id: integer study id, or a string such as 's123'; for strings,
            's' characters at both ends are stripped before converting to int.

    Returns:
        The full contents of the report file, as a string.

    Raises:
        AssertionError: if master_df does not hold exactly one distinct
            report path for the study.
    """
    if isinstance(study_id, str):
        study_id = int(study_id.strip('s'))

    # All rows of the study; they are expected to share a single report path
    study_rows = master_df.loc[master_df['study_id'] == study_id]
    report_fpaths = list(study_rows['report_fpath'].unique())
    assert len(report_fpaths) == 1, f'Not 1 subject: {report_fpaths}'

    relative_fpath = report_fpaths[0]
    print(relative_fpath)

    # The stored path is relative to the 'raw-reports' folder of the dataset
    full_fpath = os.path.join(DATASET_DIR, 'raw-reports', relative_fpath)
    with open(full_fpath) as f:
        return f.read()

In [None]:
import re

In [None]:
def find_reports_with_tokens(tokens):
    """Collect the reports whose clean text matches any of the given patterns.

    Args:
        tokens: a regex pattern string, or an iterable of them; each one is
            passed to `re.search`.

    Returns:
        List with the values of the global `reports` whose 'clean_text'
        matches at least one pattern, each included at most once.
    """
    patterns = [tokens] if isinstance(tokens, str) else tokens

    matching = []
    for report in reports.values():
        text = report['clean_text']
        # any() stops at the first pattern that matches
        if any(re.search(pattern, text) for pattern in patterns):
            matching.append(report)

    return matching

## Check errors

### Check wrong reports

In [None]:
# Study ids listed under errors['tokens-empty'], each prefixed with 's'
studies = [f's{study}' for study in errors['tokens-empty']]
len(studies)

In [None]:
# For every listed study, print the raw report followed by the fields stored
# for it in REPORTS_DF.
for study_id in studies:
    print('=' * 60)
    print('Study ID: ', study_id)

    print(load_raw_report(study_id))

    print('-' * 30)

    # Exactly one REPORTS_DF row is expected per study
    matches = REPORTS_DF.loc[REPORTS_DF['study'] == study_id]
    assert len(matches) == 1, f'Not 1 study: {matches}'
    parsed = matches.iloc[0]
    for key in ('text', 'comparison', 'findings', 'impression', 'last_paragraph'):
        print(f'{key}: {parsed[key]}')

    print('=' * 60)

### Check tokens

In [None]:
# (token, value) pairs of token_appearances, largest value first
sorted(token_appearances.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
found = find_reports_with_tokens(r'\sNUMBER\s')
len(found)

In [None]:
found[4]

In [None]:
# (token, value) pairs of token_appearances, smallest value first
sorted(token_appearances.items(), key=lambda pair: pair[1])

# Calculate mean and std

In [None]:
%run ../mimic_cxr.py
%run ../../utils/images.py

In [None]:
# Distinct image paths of the rows whose split is 'train' in the master csv
fpath = os.path.join(DATASET_DIR, 'master_metadata.csv')
d = pd.read_csv(fpath)
is_train = d['split'] == 'train'
d = d.loc[is_train]
train_images = d['image_fpath'].unique().tolist()
len(train_images)

In [None]:
image_folder = os.path.join(DATASET_DIR, 'images')

In [None]:
%%capture output
%%time

mean, std = compute_mean_std(ImageFolderIterator(image_folder, train_images), show=True)
mean, std

In [None]:
mean, std

# Debug Dataset class

In [None]:
%run ../mimic_cxr.py
%run ../../utils/common.py

In [None]:
dataset = MIMICCXRDataset('test', sort_samples=True)
len(dataset)

In [None]:
item = dataset[4]
item.image.size(), item.report

In [None]:
item.image.min(), item.image.max()

In [None]:
plt.imshow(tensor_to_range01(item.image).permute(1, 2, 0))