## Imports

In [None]:
import torch
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'

In [None]:
%run ../cxr14.py

## Split dataframe

### Check splits are correct

In [None]:
# Paths of the master label index and of the per-split label CSVs, all under DATASET_DIR.
(
    master_df_fpath,
    train_df_fpath,
    val_df_fpath,
    test_df_fpath,
) = (
    os.path.join(DATASET_DIR, csv_name)
    for csv_name in (
        'label_index.csv',
        'train_label.csv',
        'val_label.csv',
        'test_label.csv',
    )
)

In [None]:
# Load the master label index and collect the set of image file names it lists.
df = pd.read_csv(master_df_fpath)
master_images = set(df['FileName'])
# Number of distinct file names in the master index
print(len(master_images))
df.head()

In [None]:
def load_images_from_csv(fpath):
    """Return the set of values found in the 'FileName' column of the CSV at `fpath`."""
    file_names = pd.read_csv(fpath)['FileName']
    return set(file_names)

def load_images_from_txt(fpath):
    """Return the set of image names listed one per line in the text file at `fpath`.

    Surrounding whitespace is stripped from every line. Blank lines are skipped, so a
    trailing empty line does not add a spurious '' entry to the returned set.
    """
    with open(fpath, 'r') as f:
        # Iterate the file lazily instead of materializing every line with readlines()
        stripped_lines = (line.strip() for line in f)
        return {name for name in stripped_lines if name}

In [None]:
def check_same_images(split):
    """Tell whether a split's label CSV and its txt listing name the same images.

    Compares the 'FileName' set of `{split}_label.csv` with the lines of
    `splits/{split}.txt`, both located under DATASET_DIR.
    """
    csv_fpath = os.path.join(DATASET_DIR, f'{split}_label.csv')
    listing_fpath = os.path.join(DATASET_DIR, 'splits', f'{split}.txt')

    # The txt listing is read first, then the CSV
    listed_images = load_images_from_txt(listing_fpath)
    return listed_images == load_images_from_csv(csv_fpath)

In [None]:
# One boolean per split, in the order train, val, test
tuple(check_same_images(split) for split in ('train', 'val', 'test'))

### Create test-bbox splits

In [None]:
# Load the bounding-box annotations and discard every column whose name contains 'Unnamed'.
bbox_df = pd.read_csv(os.path.join(DATASET_DIR, 'BBox_List_2017.csv'))
unnamed_columns = [column for column in bbox_df.columns if 'Unnamed' in column]
bbox_df = bbox_df.drop(columns=unnamed_columns)
bbox_df.head()

In [None]:
# Distinct image names that appear in the bounding-box annotations.
# Sorted so that the order (and therefore the split file written from this list) is
# reproducible: iteration order of a set of strings can change between interpreter runs.
images_with_bbox = sorted(set(bbox_df['Image Index']))
len(images_with_bbox)

In [None]:
# Write the test-bbox split listing: one image name per line.
fpath = os.path.join(DATASET_DIR, 'splits', 'test-bbox.txt')
with open(fpath, 'w') as f:
    f.writelines(f'{image}\n' for image in images_with_bbox)

### Create all split

In [None]:
# Write the 'all' split listing: every file name of the master index, one per line.
fpath = os.path.join(DATASET_DIR, 'splits', 'all.txt')

with open(fpath, 'w') as f:
    # master_images is a set; sort it so the file content is identical on every run
    # (set iteration order for strings can change between interpreter runs).
    for image_name in sorted(master_images):
        f.write(f'{image_name}\n')

len(master_images)

## Calculate mean and std

In [None]:
%run ../cxr14.py
%run ../../utils/images.py

In [None]:
# Instantiate the dataset for the 'train' split and show its length.
dataset = CXR14Dataset('train')
len(dataset)

In [None]:
# File names of the training images, taken from the dataset's label index.
train_images = list(dataset.label_index['FileName'])
len(train_images)

In [None]:
# Compute mean and std over the training images.
# NOTE(review): `images_dir`, `compute_mean_std` and `ImageFolderIterator` are not defined in
# this notebook; presumably they come from the scripts loaded with %run — confirm.
mean, std = compute_mean_std(ImageFolderIterator(images_dir, train_images), show=True)
mean, std

In [None]:
# Print the computed statistics, mean first, each on its own line
print(mean, std, sep='\n')

## Load class `CXR14Dataset`

In [None]:
%run ../cxr14.py

In [None]:
# Training split with masks enabled (masks version 'v2').
kwargs = dict(
    dataset_type='train',
    masks=True,
    masks_version='v2',
)

dataset = CXR14Dataset(**kwargs)
len(dataset)

### Plot example

In [None]:
%run ../../utils/__init__.py
%run ../../utils/images.py

In [None]:
# Fetch one sample and inspect the sizes of its image and masks, plus its file name.
item = dataset[300]
item.image.size(), item.masks.size(), item.image_fname

In [None]:
# Side-by-side view of the sample: image on the left, its masks on the right.
fig = plt.figure(figsize=(15, 5))

image_ax = fig.add_subplot(1, 2, 1)
# Move the first dimension last before handing the image to imshow
image_ax.imshow(tensor_to_range01(item.image).permute(1, 2, 0))
image_ax.axis('off')

masks_ax = fig.add_subplot(1, 2, 2)
masks_ax.imshow(squeeze_masks(item.masks))
masks_ax.axis('off')

### Label distribution

In [None]:
# Work on the dataset's label index for the label statistics below.
df = dataset.label_index
df.head()

In [None]:
# Row-wise maximum over the disease columns, then summed over all rows.
# Assuming the labels are 0/1, this counts the images with at least one finding — TODO confirm.
some_disease = df[CXR14_DISEASES].max(axis=1)
some_disease.sum()

In [None]:
# Per-disease column sums, and the remaining rows for each disease.
# Assuming 0/1 labels, these are the positive and negative sample counts — TODO confirm.
pos_samples = df[CXR14_DISEASES].sum(axis=0)
neg_samples = len(df) - pos_samples
pos_samples, neg_samples

In [None]:
# Floor-divided ratio of negative to positive samples, per disease column
neg_samples // pos_samples

### CLAHE images

In [None]:
%run ../cxr14.py
%run ../../utils/__init__.py

In [None]:
# Settings shared by both datasets; they differ only in `images_version`.
kwargs = dict(
    dataset_type='test-bbox',
    norm_by_sample=True,
    masks=False,
)

dataset = CXR14Dataset(**kwargs)
clahe_dataset = CXR14Dataset(images_version='clahe', **kwargs)

len(dataset), len(clahe_dataset)

In [None]:
# Same sample index drawn from both datasets, shown side by side.
fig = plt.figure(figsize=(15, 5))

idx = 50

# Left panel: default images version
item = dataset[idx]
normal_ax = fig.add_subplot(1, 2, 1)
normal_ax.set_title('Normal')
normal_ax.imshow(tensor_to_range01(item.image).permute(1, 2, 0))
normal_ax.axis('off')

# Right panel: 'clahe' images version
item = clahe_dataset[idx]
clahe_ax = fig.add_subplot(1, 2, 2)
clahe_ax.set_title('CLAHE')
clahe_ax.imshow(tensor_to_range01(item.image).permute(1, 2, 0))
clahe_ax.axis('off')

### Debug organ-mask-reducing

In [None]:
%run ../cxr14.py

In [None]:
# 'test-bbox' split with per-sample normalization and masks enabled.
dataset = CXR14Dataset('test-bbox', norm_by_sample=True, masks=True)
len(dataset)

In [None]:
# Fetch one sample and inspect the size of its masks.
item = dataset[1]
item.masks.size()

In [None]:
# Show every mask of the sample in a single row.
plt.figure(figsize=(15, 5))

# One subplot column per mask instead of a hard-coded 4, so the cell keeps working
# when the sample carries a different number of masks.
n_masks = len(item.masks)
for i, m in enumerate(item.masks):
    plt.subplot(1, n_masks, i+1)
    plt.imshow(m)
    plt.axis('off')

In [None]:
# Reduce the sample's masks for the 'Edema' disease, print the resulting size and display it.
reduced_masks = dataset.reduce_masks_for_disease('Edema', item.masks)
print(reduced_masks.size())

plt.imshow(reduced_masks)

## Simplify bounding-boxes

Save a dict with the bounding-box information, keyed by image name and then by disease (faster than querying the bbox dataframe on-the-fly).

In [None]:
import json

In [None]:
# Path of the bounding-box annotations CSV
bbox_fpath = os.path.join(DATASET_DIR, 'BBox_List_2017.csv')

In [None]:
# Load the annotations and keep only the columns whose name does not contain 'Unnamed'.
df = pd.read_csv(bbox_fpath)
df = df[[column for column in df.columns if 'Unnamed' not in column]]
df.head()

In [None]:
# Disease-name replacements applied to the bbox CSV labels; names not listed are kept as-is.
# NOTE(review): presumably aligns the bbox label with the name used in the label index — confirm.
MAPPING = {
    'Infiltrate': 'Infiltration',
}

In [None]:
# Nested dict: image name -> disease name -> [x, y, w, h].
bbs_by_image = {}
# Disease names seen in the annotations (after applying MAPPING)
ds = set()

# Each row must hold exactly six values: image name, disease and the four box values.
for _, (image_name, disease, x, y, w, h) in df.iterrows():
    disease = MAPPING.get(disease, disease)
    ds.add(disease)

    # A later row for the same (image, disease) pair overwrites the earlier one
    bbs_by_image.setdefault(image_name, {})[disease] = [x, y, w, h]

In [None]:
# Persist the nested bounding-box dict as JSON under DATASET_DIR.
filepath = os.path.join(DATASET_DIR, 'bbox_by_image_by_disease.json')
with open(filepath, 'w') as f:
    json.dump(bbs_by_image, f)

## Plot patients vs n-images

In [None]:
from collections import Counter
import math

In [None]:
# Instantiate the dataset for the 'all' split and show its length.
dataset = CXR14Dataset('all')
len(dataset)

In [None]:
# Load the per-image metadata CSV (read below for view position, patient ID and finding labels).
df = pd.read_csv(os.path.join(DATASET_DIR, 'Data_Entry_2017.csv'))
df.head(2)

In [None]:
# Number of rows per 'View Position' value
Counter(df['View Position'])

In [None]:
# Row count next to the number of distinct 'Image Index' values (equal when no image repeats)
len(df), len(df['Image Index'].unique())

### Check patient variability

In [None]:
# Count the rows (images) of each 'Patient ID'; the counter's length is the number of patients.
n_images_by_patient = Counter(df['Patient ID'])
n_distinct_patients = len(n_images_by_patient)
n_distinct_patients, len(df)

In [None]:
# (patient ID, image count) pairs ordered from the most to the fewest images;
# patients with equal counts keep the order in which they were first counted.
patients_and_amounts = n_images_by_patient.most_common()
patients_and_amounts[:10]

In [None]:
# Split the sorted pairs into parallel tuples and plot the image count of each patient.
patients_ids, amounts = zip(*patients_and_amounts)
plt.plot(range(len(amounts)), amounts)
# The x axis is the patient's position after sorting by image count (descending),
# not the patient ID itself, so label it accordingly.
plt.xlabel('Patient rank (by N images, descending)')
plt.ylabel('N images')
plt.title('N images by patient (CXR14)')

In [None]:
# Cumulative image counts (patients ordered as in `amounts`), as a fraction of all rows in df.
amounts = np.array(amounts)
cum_amounts = np.cumsum(amounts) / len(df)
total_patients = len(amounts)

In [None]:
# Cumulative fraction of images against the fraction of patients considered.
patient_fractions = np.arange(len(cum_amounts)) / total_patients
plt.plot(patient_fractions, cum_amounts)
plt.title('N images vs N patients')
plt.ylabel('Fraction of images')
plt.xlabel('Fraction of patients')

In [None]:
# For several image fractions, report how many of the top patients are needed to exceed them.
for perc in [0.2, 0.3, 0.4, 0.45, 0.5, 0.8]:
    # argmax over the boolean array gives the first position whose cumulative fraction exceeds perc
    n_patients = np.argmax(cum_amounts > perc) + 1
    perc_patients = n_patients / total_patients * 100

    s1 = f'Top {n_patients:,} patients ({perc_patients:.1f}%)'
    # Percent formatting rounds; int(perc*100) truncates float error (e.g. 0.29 -> 28)
    s2 = f'{perc:.0%} of the images'
    print(f'{s1:<30} account for {s2}')
print(f'Total patients: {total_patients:,}')

### Check some images from the same patient

In [None]:
# Lookup table: 'Image Index' value -> 'View Position' value
cols = ['Image Index', 'View Position']
projection_by_image_id = df[cols].set_index('Image Index')['View Position'].to_dict()

In [None]:
# patient_id = 10007
patient_id = 13670
# File names are matched on the zero-padded patient ID followed by an underscore
patient_str = f'000{patient_id:05}_'
label_index = dataset.label_index
is_patient_image = label_index['FileName'].str.contains(patient_str)
rows = label_index.loc[is_patient_image]
# Show at most 15 images of the patient
indexes = list(rows.index)[:15]

# Grid of 3 columns with as many rows as needed
n_cols = 3
n_rows = math.ceil(len(indexes) / n_cols)
plt.figure(figsize=(5*n_cols, 5*n_rows))

for position, idx in enumerate(indexes, start=1):
    # NOTE(review): label-index values are used as dataset positions — confirm they coincide
    item = dataset[idx]
    image = item.image[0]

    # Names of the labels flagged for this sample, or 'NF' when there are none
    present_diseases = [d for d, present in zip(dataset.labels, item.labels) if present]
    findings = '|'.join(present_diseases) or 'NF'

    proj = projection_by_image_id.get(item.image_fname, '')

    plt.subplot(n_rows, n_cols, position)
    plt.imshow(image, cmap='gray')
    plt.title(f'{item.image_fname} - {findings} ({proj})')

In [None]:
# Set of distinct 'Finding Labels' values seen for each patient
labels_by_patient = df.groupby('Patient ID')['Finding Labels'].apply(set)
labels_by_patient.head(2)