# Imports

In [None]:
import torch
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

In [None]:
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
import pandas as pd
pd.options.display.max_columns = None

In [None]:
%run ../mimic_cxr.py

# Load metadata

In [None]:
FNAME_PREFIX = 'mimic-cxr-2.0.0'

In [None]:
fpath = os.path.join(DATASET_DIR, f'{FNAME_PREFIX}-metadata.csv')
metadata = pd.read_csv(fpath)
metadata.head()

In [None]:
fpath = os.path.join(DATASET_DIR, f'{FNAME_PREFIX}-chexpert.csv')
# Missing labels become 0 and -1 labels become 1
# (presumably -1 is the CheXpert "uncertain" marker, treated here as positive; confirm).
chexpert_df = pd.read_csv(fpath).fillna(0).replace(-1, 1)
chexpert_df.head()

# Plot example

In [None]:
from PIL import Image

In [None]:
idx = -1001
row = metadata.iloc[idx]
row

In [None]:
subject_id = f'p{row["subject_id"]}'
study_id = f's{row["study_id"]}'
dicom_id = str(row['dicom_id'])
image_fname = f'{dicom_id}.jpg'
subfolder = subject_id[:3]
subfolder, subject_id, study_id, image_fname

In [None]:
image_fpath = os.path.join(DATASET_DIR, 'images', subfolder, subject_id, study_id, image_fname)

In [None]:
image = Image.open(image_fpath)
print(image.size)
plt.imshow(image, cmap='gray')

In [None]:
report_fpath = os.path.join(DATASET_DIR, 'reports', subfolder, subject_id, f'{study_id}.txt')

In [None]:
with open(report_fpath) as f:
    text = f.read()
print(text)

# Pre-process

These steps only need to be run once.

## Create master csv

### Keep only studies with a report present

In [None]:
%run -n ../preprocess/mimic_cxr.py

In [None]:
reports_df = load_raw_reports_df()
reports_df.head()

In [None]:
# Drop the one-character prefix of every study name and keep the integer id.
studies_with_report = {int(study[1:]) for study in reports_df['study']}
len(studies_with_report)

### Check metadata vs chexpert discrepancies

* Only studies with a report are kept
* A few studies that are present in the metadata and CheXpert CSVs have no report

In [None]:
len(metadata), len(chexpert_df)

In [None]:
studies1 = set(metadata['study_id'])
studies2 = set(chexpert_df['study_id'])
len(studies1), len(studies2), studies2.issubset(studies1)

In [None]:
studies11 = studies1.intersection(studies_with_report)
studies22 = studies2.intersection(studies_with_report)
len(studies11), len(studies22), studies11 == studies22

### Merge metadata and chexpert

In [None]:
metadata_filtered = metadata.loc[metadata['study_id'].isin(studies_with_report)]
len(metadata_filtered), len(set(metadata_filtered['study_id']))

In [None]:
chexpert_filtered = chexpert_df.loc[chexpert_df['study_id'].isin(studies_with_report)]
len(chexpert_filtered), len(set(chexpert_filtered['study_id']))

In [None]:
studies1 = set(metadata_filtered['study_id'])
studies2 = set(chexpert_filtered['study_id'])
assert studies1 == studies2

In [None]:
master_df = metadata_filtered.merge(
    chexpert_filtered, on=['study_id', 'subject_id'], how='inner')
len(master_df), len(set(master_df['study_id']))

In [None]:
master_df.head()

### Merge with split

In [None]:
fpath = os.path.join(DATASET_DIR, f'{FNAME_PREFIX}-split.csv')
split_df = pd.read_csv(fpath)
split_df.head()

In [None]:
master_df = master_df.merge(split_df, on=['dicom_id', 'study_id', 'subject_id'], how='inner')
len(master_df), len(set(master_df['study_id']))

In [None]:
master_df.head()

### Add filepaths

In [None]:
def get_filenames(subject_id, study_id, dicom_id):
    """Build the relative image and report paths for one DICOM.

    Returns an ``(image_fpath, report_fpath)`` tuple. Both paths start with a
    folder named after the first three characters of ``p{subject_id}``,
    followed by the ``p{subject_id}`` folder itself. The image lives inside
    the ``s{study_id}`` folder; the report is ``s{study_id}.txt`` next to it.
    """
    patient_dir = f'p{subject_id}'
    study_dir = f's{study_id}'
    # Patient folders are grouped under a parent named by their 3-char prefix.
    base_dir = os.path.join(patient_dir[:3], patient_dir)

    return (
        os.path.join(base_dir, study_dir, f'{dicom_id}.jpg'),
        os.path.join(base_dir, f'{study_dir}.txt'),
    )

In [None]:
image_fpaths, report_fpaths = zip(*[
    get_filenames(*ids)
    for ids in zip(master_df['subject_id'], master_df['study_id'], master_df['dicom_id'])
])
len(image_fpaths), len(report_fpaths)

In [None]:
master_df['image_fpath'] = image_fpaths
master_df['report_fpath'] = report_fpaths

In [None]:
master_df.head()

### Check lateral or frontal

In [None]:
master_df.head()

#### Check weird positions 

In [None]:
positions = list(Counter(master_df['ViewPosition']).keys())
positions

In [None]:
# Everything after the first five distinct view positions is treated as "weird".
# NOTE(review): `positions` follows the order in which values first appear in master_df
# (Counter keeps insertion order), so this slice is tied to the current row order;
# the index 5 was presumably chosen by inspecting `positions` by eye.
weird_pos = positions[5:]
weird_pos

In [None]:
cols = ['image_fpath',
        'ViewPosition', 'PerformedProcedureStepDescription', 'ViewCodeSequence_CodeMeaning',
        # 'dicom_id',
       ]
df = master_df[cols]
weird_images = df.loc[df['ViewPosition'].isin(weird_pos)]
print(len(weird_images))

In [None]:
weird_images.sort_values('ViewPosition')

In [None]:
' '.join(list(weird_images['image_fpath']))

In [None]:
FRONTAL_POSITIONS = ['PA', 'AP', 'AP AXIAL', 'LAO', 'LPO', 'RAO']

#### Check nan positions

FIXME: Samples with `ViewPosition == nan` may be either frontal or lateral; for now this is left unresolved.

In [None]:
Counter(master_df['ViewPosition'])

In [None]:
cols = ['image_fpath',
        'ViewPosition', 'PerformedProcedureStepDescription', 'ViewCodeSequence_CodeMeaning',
        # 'dicom_id',
       ]
df = master_df[cols]
nan_positions = df.loc[df['ViewPosition'].isnull()]
len(nan_positions)

In [None]:
nan_positions['ViewCodeSequence_CodeMeaning'].isnull().sum()

In [None]:
' '.join(list(nan_positions['image_fpath'])[:10])

### Save master csv

In [None]:
%run ../common/__init__.py

In [None]:
cols = ['dicom_id', 'subject_id', 'study_id',
        'image_fpath', 'report_fpath',
        'ViewPosition', 'split',
        ] + CHEXPERT_DISEASES

In [None]:
# Select the output columns and rename the 'validate' value to 'val' wherever it occurs.
out_df = master_df[cols].replace('validate', 'val')
out_df.head()

In [None]:
len(out_df), len(set(out_df['subject_id'])), len(set(out_df['study_id']))

In [None]:
Counter(out_df['split'])

In [None]:
fpath = os.path.join(DATASET_DIR, 'master_metadata.csv')
out_df.to_csv(fpath, index=False)

## Clean and tokenize reports

### Review reports and tokens manually

In [None]:
import re

In [None]:
%run ../../utils/nlp.py
%run ../mimic_cxr.py

In [None]:
# `json` is only imported in a later section of this notebook; import it here so this
# cell does not depend on a name that a %run'd file may or may not provide.
import json

# Load the cleaned reports into `reports`.
reports_fname = os.path.join(DATASET_DIR, 'reports', 'reports.clean.v1.json')
with open(reports_fname, 'r') as f:
    reports = json.load(f)

In [None]:
# Count how often each whitespace-separated token occurs across all clean reports.
token_appearances = Counter()
for r in reports.values():
    token_appearances.update(r['clean_text'].split())
len(token_appearances)

In [None]:
def more_than_k_appearances(x):
    """Return the ``(token, count)`` pairs of ``token_appearances`` whose count is strictly above ``x``."""
    return [(k, v) for k, v in token_appearances.items() if v > x]

In [None]:
len(more_than_k_appearances(10))

#### Vocab

In [None]:
[(k, v) for k, v in token_appearances.items() if re.search(r'twe', k)]

In [None]:
sorted(more_than_k_appearances(10), key=lambda x: x[1], reverse=False)

#### Reports

In [None]:
def find_reports_with_tokens(tokens, search_in='clean_text', absent_in=None):
    """Return the reports whose ``search_in`` field matches any of ``tokens``.

    Args:
        tokens: a regex pattern, or a list of patterns; each one is passed to ``re.search``.
        search_in: key of the report field that is searched.
        absent_in: optional key of a second report field. When given, a pattern only
            counts as a hit if it does NOT match that field, e.g. to find tokens that
            show up in the clean text but not in the raw text.

    Returns:
        A list with the matching report dicts (values of the global ``reports``),
        each included at most once.
    """
    if isinstance(tokens, str):
        tokens = [tokens]

    found = []
    for report in reports.values():
        text = report[search_in]

        for token in tokens:
            if not re.search(token, text):
                continue
            # Skip the pattern when it is also present in the `absent_in` field.
            # (The previous check was inverted: it skipped reports where the pattern
            # was absent, i.e. it kept only those where it was present.)
            if absent_in is not None and re.search(token, report[absent_in]):
                continue
            found.append(report)
            break

    return found

In [None]:
found = find_reports_with_tokens(r'\b[kjhv]\b', absent_in='text')
len(found)

In [None]:
found

#### Sentences

In [None]:
sentences_appears = get_sentences_appearances(r['clean_text'] for r in reports.values())
sentences = list(sentences_appears)
len(sentences)

In [None]:
sentences[100:200]

In [None]:
found = [s for s in sentences if re.search(r'\A\W', s)]
len(found)

In [None]:
found

### Actually debug tokenizer

In [None]:
%run ../preprocess/tokenize.py

In [None]:
ts = text_to_tokens('M.D.')
ts

### Run preprocess

In [None]:
%run -n ../preprocess/mimic_cxr.py

In [None]:
# Takes around 1min
reports, token_appearances, errors = preprocess_mimic_cxr('v3')
len(reports), len(token_appearances)

### Check errors

In [None]:
REPORTS_DF = load_raw_reports_df()
REPORTS_DF.head()

In [None]:
fpath = os.path.join(DATASET_DIR, 'master_metadata.csv')
master_df = pd.read_csv(fpath)
master_df.head(2)

In [None]:
def load_raw_report(study_id):
    """Read and return the raw report text of one study.

    ``study_id`` may be an integer or a string such as ``'s123'``; leading and
    trailing ``'s'`` characters are stripped from strings before converting to int.
    The report path is looked up in the global ``master_df`` and printed, and the
    file is read from the ``raw-reports`` folder under ``DATASET_DIR``.
    Fails an assertion unless the study maps to exactly one report path.
    """
    if isinstance(study_id, str):
        study_id = int(study_id.strip('s'))

    rows = master_df.loc[master_df['study_id'] == study_id]
    report_fpaths = list(rows['report_fpath'].unique())
    assert len(report_fpaths) == 1, f'Not 1 subject: {report_fpaths}'

    relative_fpath = report_fpaths[0]
    print(relative_fpath)

    with open(os.path.join(DATASET_DIR, 'raw-reports', relative_fpath)) as f:
        return f.read()

In [None]:
# Prefix every id listed under errors['tokens-empty'] with 's'.
studies = [f's{s}' for s in errors['tokens-empty']]
len(studies)

In [None]:
# For every study in `studies`, print the raw report file followed by the sections
# stored for that study in REPORTS_DF, so both can be compared by eye.
for study_id in studies:
    print('=' * 60)
    print('Study ID: ', study_id)

    # load_raw_report also prints the relative path of the report file.
    report = load_raw_report(study_id)
    print(report)

    print('-' * 30)

    # Exactly one REPORTS_DF row is expected per study; `d` is then narrowed to that row.
    d = REPORTS_DF.loc[REPORTS_DF['study'] == study_id]
    ids = list(d.index)
    assert len(ids) == 1, f'Not 1 study: {d}'
    d = d.loc[ids[0]]
    for k in ['text', 'comparison', 'findings', 'impression', 'last_paragraph']:
        print(f'{k}: {d[k]}')

    print('=' * 60)

## Calculate mean and std

In [None]:
%run ../mimic_cxr.py
%run ../../utils/images.py

In [None]:
fpath = os.path.join(DATASET_DIR, 'master_metadata.csv')
d = pd.read_csv(fpath)
d = d.loc[d['split'] == 'train']
train_images = list(d['image_fpath'].unique())
len(train_images)

In [None]:
image_folder = os.path.join(DATASET_DIR, 'images')

In [None]:
%%capture output
%%time

mean, std = compute_mean_std(ImageFolderIterator(image_folder, train_images), show=True)
mean, std

In [None]:
mean, std

## Sentence2organ

In [None]:
import json

In [None]:
%run ../mimic_cxr.py
%run ../common/sentences2organs/compute.py
%run ../common/constants.py
%run ../../utils/nlp.py

In [None]:
reports_fname = os.path.join(DATASET_DIR, 'reports', 'reports.clean.v1.json')
with open(reports_fname, 'r') as f:
    reports = list(json.load(f).values())
len(reports)

In [None]:
sentences_appears = get_sentences_appearances(r['clean_text'] for r in reports)
sentences = list(sentences_appears)
len(sentences)

In [None]:
total_appearances = sum(sentences_appears.values())
total_appearances

In [None]:
organs_for_sentences, errors = find_organs_for_sentences(sentences, show=True)
len(organs_for_sentences), len(errors['all-empty'])

In [None]:
mimic_df = pd.DataFrame(organs_for_sentences, columns=JSRT_ORGANS)
mimic_df['sentences'] = sentences
mimic_df = mimic_df[['sentences'] + JSRT_ORGANS]
mimic_df.head()

### Check empty ones

In [None]:
error_appearances = {
    sentence: sentences_appears[sentence]
    for sentence in errors['all-empty']
}
n_errors = sum(error_appearances.values())
perc = n_errors / total_appearances * 100
f'{n_errors:,}', f'{total_appearances:,}', f'{perc:.0f}%'

In [None]:
l = sorted(error_appearances.items(), key=lambda x: x[1], reverse=True)
l[:10]

### Save sentence2organ in file

In [None]:
%run ../../utils/__init__.py
%run ../common/constants.py

In [None]:
from medai.datasets.iu_xray import DATASET_DIR as IU_DATASET_DIR

In [None]:
fpath = os.path.join(IU_DATASET_DIR, 'reports', 'sentences_with_organs_OLD.csv')
iu_df = pd.read_csv(fpath)
iu_df.head()

In [None]:
# Stack the IU X-ray sentences and the MIMIC sentences into one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent and yields the same rows with a fresh 0..n-1 index.
final_df = pd.concat([iu_df, mimic_df], ignore_index=True)
len(final_df), len(iu_df) + len(mimic_df)

In [None]:
# Collapse repeated sentences into a single row each.
# NOTE(review): GroupBy.last() keeps, per column, the last non-null value of the group,
# so rows that come later in final_df take precedence for duplicated sentences — confirm
# that this is the intended priority.
final_df = final_df.groupby('sentences').last()
# groupby moved 'sentences' into the index; turn it back into a regular column.
final_df.reset_index(drop=False, inplace=True)
print(len(final_df))
final_df.head()

In [None]:
fpath = os.path.join(WORKSPACE_DIR, 'sentences_with_organs.csv')
final_df.to_csv(fpath, index=False)

# Create mini-mimic

In [None]:
FPATH = os.path.join(DATASET_DIR, 'master_metadata.csv')
df = pd.read_csv(FPATH)
print(len(df))
df.head()

## Create random split

In [None]:
# `random` is not imported in any visible cell of this notebook, so import it here.
import random

# Collect the dicom_ids that make up the "mini" subset, sampling each split separately.
# NOTE(review): no seed is set, so every run of this cell picks a different subset.
is_mini = []
# Fraction of each split's usable images that goes into the mini subset.
percentages = {
    'train': 0.12,
    'val': 0.15,
    'test': 0.15,
}

for split in ('train', 'val', 'test'):
    sub_df = df.loc[df['split'] == split]

    # Do not use images without frontal-lateral definition
    sub_df = sub_df.dropna(axis=0, subset=['ViewPosition'])

    images = list(sub_df['dicom_id'])
    k = int(percentages[split] * len(images))

    print(f'Choosing {k:,} from {split}')
    is_mini.extend(random.sample(images, k))
len(is_mini)

## Add column to master_df

In [None]:
is_mini = set(is_mini)

# One flag per row of df: 1 when the row's dicom_id is in the mini subset, else 0.
is_mini_column = df.dicom_id.isin(is_mini).astype(int).tolist()
len(is_mini), len(is_mini_column)

In [None]:
df['mini'] = is_mini_column

In [None]:
df.to_csv(FPATH, index=False)

# Move mini-mimic to SSD

In [None]:
import os
from tqdm.auto import tqdm

In [None]:
FPATH = os.path.join(DATASET_DIR, 'master_metadata.csv')
# df = pd.read_csv(FPATH)
print(len(df))
df.head()

In [None]:
df = df[df.mini == 1]
len(df)

## Sum space needed

In [None]:
images_dir = os.path.join(DATASET_DIR, 'images')

# Add up the on-disk size of every selected image and convert bytes -> kbytes -> mbytes.
total_size = sum(
    os.stat(os.path.join(images_dir, image_path)).st_size
    for image_path in tqdm(df.image_fpath)
) / 1024 / 1024
total_size

In [None]:
total_size / 1024

## Copy images

In [None]:
from shutil import copyfile

In [None]:
src_dir = os.path.join(DATASET_DIR, 'images')
target_dir = os.path.join(DATASET_DIR_FAST, 'images')
for image_path in tqdm(df.image_fpath):
    src_fpath = os.path.join(src_dir, image_path)
    target_fpath = os.path.join(target_dir, image_path)
    os.makedirs(os.path.dirname(target_fpath), exist_ok=True)
    copyfile(src_fpath, target_fpath)

# Debug Dataset class

In [None]:
%run ../mimic_cxr.py
%run ../../utils/common.py

In [None]:
dataset = MIMICCXRDataset('test', sort_samples=True)
len(dataset)

In [None]:
item = dataset[4]
item.image.size(), item.report

In [None]:
item.image.min(), item.image.max()

In [None]:
plt.imshow(tensor_to_range01(item.image).permute(1, 2, 0))

# Plot distributions

In [None]:
%run ../mimic_cxr.py
%run ../../utils/common.py

In [None]:
train_dataset = MIMICCXRDataset('train')
train_dataset_mini = MIMICCXRDataset('train', mini=1)
val_dataset = MIMICCXRDataset('val')
val_dataset_mini = MIMICCXRDataset('val', mini=1)
len(train_dataset), len(train_dataset_mini), len(val_dataset), len(val_dataset_mini)

In [None]:
def get_dataset_name(dataset):
    """Return a display name for ``dataset``.

    The name is ``dataset.dataset_type``, prefixed with ``mini{_mini}-`` when
    ``dataset._mini`` is not ``None``.
    """
    mini = dataset._mini
    if mini is not None:
        return f'mini{mini}-{dataset.dataset_type}'
    return dataset.dataset_type

## Labels distributions

In [None]:
def plot_labels_distribution(dataset):
    """Draw a bar chart with the per-disease label totals of ``dataset``.

    The totals are the column sums of ``dataset.master_df[CHEXPERT_DISEASES]``,
    sorted from largest to smallest. Every bar is annotated with its total and
    with the percentage it represents over the number of rows in
    ``dataset.master_df``. Draws with ``plt`` on the current figure/axes.
    """
    amounts_by_disease = dataset.master_df[CHEXPERT_DISEASES].sum(
        axis=0).sort_values(ascending=False)

    plt.title(get_dataset_name(dataset), fontsize=20)
    plt.bar(amounts_by_disease.index, amounts_by_disease.values)
    plt.xticks(rotation=60, fontsize=15, ha='right')
    plt.ylabel('Amount of samples', fontsize=15)
    # Leave headroom above the tallest bar for the text annotations.
    plt.ylim(0, max(amounts_by_disease) * 1.15)
    y_padding = int(max(amounts_by_disease) * 0.03)

    n_samples = len(dataset.master_df)
    # Series.iteritems() was removed in pandas 2.0; Series.items() is the equivalent.
    for index, (_disease, amount) in enumerate(amounts_by_disease.items()):
        amount = int(amount)
        perc = amount / n_samples * 100
        plt.text(index, amount + y_padding, f'{amount:,}\n{perc:.0f}%', ha='center')

In [None]:
n_rows = 2
n_cols = 2
plt.figure(figsize=(15, n_rows * 8))

plt.subplot(n_rows, n_cols, 1)
plot_labels_distribution(train_dataset)

plt.subplot(n_rows, n_cols, 2)
plot_labels_distribution(val_dataset)

plt.subplot(n_rows, n_cols, 3)
plot_labels_distribution(train_dataset_mini)

plt.subplot(n_rows, n_cols, 4)
plot_labels_distribution(val_dataset_mini)

plt.tight_layout()

## Length distribution

In [None]:
def plot_n_words_distribution(dataset):
    """Plot a histogram of ``report_length``, using one entry per ``report_fpath``.

    Draws with ``plt`` on the current figure/axes, using 25 bins over the 0-200 range.
    """
    # Rows that share a report_fpath are reduced to a single entry.
    per_report = dataset.master_df.groupby('report_fpath').first()

    plt.title(f'Distribution of report-length ({get_dataset_name(dataset)})')
    plt.ylabel('Amount of images')
    plt.xlabel('Number of words')
    plt.hist(per_report['report_length'].values, bins=25, range=(0, 200))

In [None]:
n_rows = 2
n_cols = 2

plt.figure(figsize=(7 * n_cols, 5*n_rows))
plt.subplot(n_rows, n_cols, 1)
plot_n_words_distribution(train_dataset)

plt.subplot(n_rows, n_cols, 2)
plot_n_words_distribution(train_dataset_mini)

plt.subplot(n_rows, n_cols, 3)
plot_n_words_distribution(val_dataset)

plt.subplot(n_rows, n_cols, 4)
plot_n_words_distribution(val_dataset_mini)

## Frontal vs lateral distribution

In [None]:
def plot_view_position_distribution(dataset):
    """Draw a bar chart of how many images are frontal, lateral or of unknown view.

    Each ``ViewPosition`` value of ``dataset.master_df`` is mapped to one of three
    groups: ``'nan'`` when its string form is ``'nan'``, ``'frontal'`` when it is
    listed in ``_FRONTAL_POSITIONS``, and ``'lateral'`` otherwise. Bars are sorted
    from largest to smallest and annotated with the count and its percentage
    over ``len(dataset)``. Draws with ``plt`` on the current figure/axes.
    """
    def group_of(position):
        name = str(position)
        if name == 'nan':
            return 'nan'
        return 'frontal' if name in _FRONTAL_POSITIONS else 'lateral'

    grouped = Counter()
    for position, count in Counter(dataset.master_df['ViewPosition']).items():
        grouped[group_of(position)] += count

    plt.title(f'Frontal vs lateral ({get_dataset_name(dataset)})')
    plt.ylabel('Amount of images')

    # most_common() orders the groups from largest to smallest count.
    labels, counts = zip(*grouped.most_common())
    plt.bar(labels, counts)

    tallest = max(counts)
    # Leave headroom above the tallest bar for the text annotations.
    plt.ylim(0, tallest * 1.2)
    y_padding = tallest * 0.03

    n_samples = len(dataset)
    for bar_index, count in enumerate(counts):
        perc = count / n_samples * 100
        plt.text(bar_index, count + y_padding, f'{count:,}\n{perc:.2f}%', ha='center')

In [None]:
n_rows = 2
n_cols = 2

plt.figure(figsize=(n_cols * 7, n_rows * 5))

plt.subplot(n_rows, n_cols, 1)
plot_view_position_distribution(train_dataset)
plt.subplot(n_rows, n_cols, 2)
plot_view_position_distribution(train_dataset_mini)

plt.subplot(n_rows, n_cols, 3)
plot_view_position_distribution(val_dataset)
plt.subplot(n_rows, n_cols, 4)
plot_view_position_distribution(val_dataset_mini)