# Imports

In [None]:
import os
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
import matplotlib
# Make 'white' the default figure face colour for the figures created below.
matplotlib.rcParams['figure.facecolor'] = 'white'

# Load one sample

In [None]:
%run ../common/__init__.py

In [None]:
# Dataset root folder, read from the environment; None when the variable is not set.
DATASET_DIR = os.getenv('DATASET_DIR_CHEXPERT')

In [None]:
# Other samples that can be swapped in by uncommenting one of them:
# name = 'train/patient00001/study1/view1_frontal.jpg'
# name = 'valid/patient64671/study1/view1_frontal.jpg'
name = 'train/patient29328/study1/view1_frontal.jpg'
# Full path of the sample: `name` is relative to DATASET_DIR.
image_fpath = os.path.join(DATASET_DIR, name)

In [None]:
# Open the sample with PIL; the RGB conversion is intentionally left disabled.
image = Image.open(image_fpath)# .convert('RGB')
# Display the image size and mode.
image.size, image.mode

In [None]:
# Convert the PIL image to a numpy array (rebinding `image`) and display its shape and dtype.
image = np.array(image)
image.shape, image.dtype

In [None]:
# Render the sample array.
plt.imshow(image)

# Calculate mean and std

In [None]:
%run ../../utils/images.py

In [None]:
# Load the training labels and drop the 'CheXpert-v1.0-small/' prefix wherever it
# appears, presumably so that paths end up relative to DATASET_DIR.
fpath = os.path.join(DATASET_DIR, 'train.csv')
d = pd.read_csv(fpath)
d = d.replace(r'CheXpert-v1.0-small/', '', regex=True)
d.head()

In [None]:
# Paths of all the training images, whatever their view.
train_images_all = d['Path'].tolist()
len(train_images_all)

In [None]:
# Paths of the training images whose 'Frontal/Lateral' value is 'Frontal'.
train_images_frontal = d.loc[d['Frontal/Lateral'] == 'Frontal', 'Path'].tolist()
len(train_images_frontal)

In [None]:
# Mean and std over every training image (all views).
# NOTE(review): `compute_mean_std` and `ImageFolderIterator` presumably come from the
# `%run` of utils/images.py; what `show=True` does is not visible here — confirm.
mean, std = compute_mean_std(ImageFolderIterator(DATASET_DIR, train_images_all), show=True)
mean, std

In [None]:
# Same statistics, restricted to the frontal training images.
# NOTE(review): the result is kept as returned by `compute_mean_std`; presumably a
# (mean, std) pair like in the all-images case — confirm.
stats_frontal = compute_mean_std(ImageFolderIterator(DATASET_DIR, train_images_frontal),
                                 show=True)
stats_frontal

# Load `ChexpertDataset` class

In [None]:
%run ../chexpert.py
%run ../../utils/common.py

In [None]:
# Build the 'train-val' dataset, asking for frontal images only and for masks,
# then display its length.
dataset = ChexpertDataset(dataset_type='train-val', frontal_only=True, masks=True)
len(dataset)

In [None]:
# Fetch one item (index -8; presumably counted from the end — confirm against
# ChexpertDataset.__getitem__), print its file name and display the tensor sizes.
item = dataset[-8]
print(item.image_fname)
item.image.size(), item.masks.size()

In [None]:
# One row with two panels: the image on the left, its masks on the right.
n_rows, n_cols = 1, 2
plt.figure(figsize=(8 * n_cols, 8 * n_rows))

# Left panel. permute(1, 2, 0) moves the first axis last, presumably turning a
# channel-first tensor into the channel-last layout imshow expects.
plt.subplot(n_rows, n_cols, 1)
plt.title(item.image_fname)
plt.imshow(tensor_to_range01(item.image).permute(1, 2, 0))

# Right panel.
plt.subplot(n_rows, n_cols, 2)
plt.imshow(item.masks)

# Exploratory analysis

In [None]:
%run ../chexpert.py

In [None]:
# Build the 'train' and 'val' datasets without the frontal-only restriction,
# then display both lengths.
train_dataset = ChexpertDataset(dataset_type='train', frontal_only=False)
val_dataset = ChexpertDataset(dataset_type='val', frontal_only=False)
len(train_dataset), len(val_dataset)

## Frontal vs lateral

Number of images per view (frontal vs. lateral)

In [None]:
def print_frontal_vs_lateral(dataset):
    """Print the number of images per 'Frontal/Lateral' value of `dataset`.

    The first line printed is `dataset.dataset_type`. Each following line shows
    one value found in `dataset.label_index['Frontal/Lateral']`, how many times
    it occurs, and that count as a percentage of `len(dataset)`.
    """
    view_counts = Counter(dataset.label_index['Frontal/Lateral'])
    print(dataset.dataset_type)

    total = len(dataset)
    for view, n_images in view_counts.items():
        share = n_images / total * 100
        print(f'\t{view}: {n_images:,} ({share:.1f}%)')

In [None]:
# Frontal/lateral breakdown of the train split, then of the val split.
print_frontal_vs_lateral(train_dataset)
print_frontal_vs_lateral(val_dataset)

## Diseases distribution

In [None]:
def amounts_by_disease(dataset):
    """Print the per-disease label totals of `dataset`, in ascending order.

    The columns named in CHEXPERT_DISEASES are summed over the rows of
    `dataset.label_index`. After a header line with `dataset.dataset_type`,
    each disease is printed with its total (truncated to int) and that total
    as a percentage of `len(dataset)`.
    """
    label_columns = dataset.label_index[CHEXPERT_DISEASES]
    totals = label_columns.sum(axis=0).sort_values()

    print(dataset.dataset_type)
    for disease, raw_total in totals.items():
        amount = int(raw_total)
        perc = amount / len(dataset) * 100
        print(f'\t{disease:<28}: {amount:<7,} ({perc:.1f}%)')

In [None]:
# Per-disease totals for the train split, then for the val split.
amounts_by_disease(train_dataset)
amounts_by_disease(val_dataset)

In [None]:
def plot_amounts_by_disease(dataset):
    """Draw a bar chart of the per-disease label totals on the current axes.

    Totals are the column sums of `dataset.label_index[CHEXPERT_DISEASES]`,
    sorted in ascending order. The chart is titled with `dataset.dataset_type`
    and the x tick labels are rotated by 90 degrees.
    """
    label_columns = dataset.label_index[CHEXPERT_DISEASES]
    amounts = label_columns.sum(axis=0).sort_values()

    plt.bar(amounts.keys(), amounts)
    plt.xticks(rotation=90)
    plt.ylabel('Amount')
    plt.title(dataset.dataset_type)

In [None]:
# Side-by-side bar charts: train split on the left, val split on the right.
plt.figure(figsize=(15, 5))

plt.subplot(1, 2, 1)
plot_amounts_by_disease(train_dataset)

plt.subplot(1, 2, 2)
plot_amounts_by_disease(val_dataset)

## Images by patient

In [None]:
import re

In [None]:
# Captures the digits that follow "patient" and precede a "/". The leading greedy
# `.*` means that, when several such segments exist, the last one is captured.
patient_pattern = re.compile(r'.*patient(\d+)\/.*')


def extract_patient(path):
    """Return the patient number embedded in `path`, as a string of digits.

    E.g. 'CheXpert-v1.0-small/valid/patient64541/study1/view1_frontal.jpg'
    gives '64541'. An empty string is returned when `path` holds no
    'patient<digits>/' segment.
    """
    found = patient_pattern.search(path)
    return found.group(1) if found else ''

In [None]:
# Load the train-val metadata, keeping only the columns used in this section.
cols = ['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA']
df = pd.read_csv(os.path.join(DATASET_DIR, 'train-val.csv'))[cols]
df.head(2)

In [None]:
# Derive the patient id from each image path. Only the 'Path' column is needed, so
# map over it directly instead of going through DataFrame.iterrows(), which builds
# a full Series object for every row just to read one field.
df['PatientID'] = df['Path'].map(extract_patient)
df.head(2)

In [None]:
# Number of rows (images) per patient id, and the number of distinct patient ids.
n_images_by_patient = Counter(df['PatientID'])
n_distinct_patients = len(n_images_by_patient)
# Display distinct patients next to the total number of images.
n_distinct_patients, len(df)

### Plot distribution

In [None]:
# (patient, n_images) pairs ordered from the most to the least imaged patient;
# display the ten largest.
patients_and_amounts = n_images_by_patient.most_common()
patients_and_amounts[:10]

In [None]:
# Split the sorted (patient, n_images) pairs into two parallel tuples and plot the
# counts in that (descending) order.
patients_ids, amounts = zip(*patients_and_amounts)
plt.plot(range(len(amounts)), amounts)
# NOTE(review): the x axis is the position in the sorted list, not an actual patient id.
plt.xlabel('Patient ID')
plt.ylabel('N images')
plt.title('N images by patient (Chexpert)')

In [None]:
# cum_amounts[i]: running sum of the first i + 1 entries of `amounts`, divided by
# the number of rows in `df`.
amounts = np.array(amounts)
cum_amounts = np.cumsum(amounts) / len(df)
total_patients = len(amounts)

In [None]:
# cum_amounts[i] is the fraction of images held by the first i + 1 patients, so
# its x coordinate is (i + 1) / total_patients. Counting patients from 0 shifted
# the whole curve by one patient (e.g. "0% of patients" already owning images).
plt.plot(np.arange(1, len(cum_amounts) + 1) / total_patients, cum_amounts)
plt.title('N images vs N patients')
plt.ylabel('Fraction of images')
plt.xlabel('Fraction of patients')

In [None]:
# For each image fraction, report how many of the top patients are needed to
# exceed it.
for perc in (0.2, 0.3, 0.4, 0.45, 0.5, 0.7, 0.8):
    # argmax over a boolean array gives the first True position, i.e. the first
    # cumulative fraction strictly above `perc`; adding 1 turns it into a count.
    n_patients = 1 + np.argmax(cum_amounts > perc)
    perc_patients = n_patients / total_patients * 100

    s1 = f'Top {n_patients:,} patients ({perc_patients:.1f}%)'
    s2 = f'{int(perc*100)}% of the images'
    print(f'{s1:<30} account for {s2}')

print(f'Total patients: {total_patients:,}')

### Check some images from the same patient

In [None]:
import math

In [None]:
%run ../common/constants.py

In [None]:
# Rebuild `dataset` on 'train-val' without the frontal-only restriction, then
# display its length.
dataset = ChexpertDataset(dataset_type='train-val', frontal_only=False)
len(dataset)

In [None]:
# Lookup table: image path (without the 'CheXpert-v1.0-small/' prefix) -> 'AP/PA' value.
cols = ['Path', 'AP/PA']
projection_by_image_id = {
    path.replace('CheXpert-v1.0-small/', ''): projection
    for path, projection in df[cols].set_index('Path')['AP/PA'].to_dict().items()
}

In [None]:
# Grid with up to 15 images of a single patient, each titled with its findings
# and its projection.
patient_id = 33155
patient_str = f'patient{patient_id:05}'
rows = dataset.label_index.loc[dataset.label_index['Path'].str.contains(patient_str)]

# Cap the number of panels.
indexes = list(rows.index)[:15]

n_cols = 3
n_rows = math.ceil(len(indexes) / n_cols)
plt.figure(figsize=(n_cols * 5, n_rows * 5))

for plt_idx, idx in enumerate(indexes):
    item = dataset[idx]
    # Keep only the first slice along axis 0 (presumably the first channel — confirm).
    image = item.image[0]

    # Short names of the labels flagged for this image; 'NF' when none is flagged.
    present_findings = [
        ABN_SHORTCUTS[d]
        for d, present in zip(dataset.labels, item.labels)
        if present
    ]
    findings = '|'.join(present_findings) or 'NF'

    # Empty string when the image is missing from the projection table.
    proj = projection_by_image_id.get(item.image_fname, '')

    plt.subplot(n_rows, n_cols, plt_idx + 1)
    plt.title(f'{findings} ({proj})')
    plt.imshow(image, cmap='gray')

### Projection by type of patient

In [None]:
# Replace missing values in `df` with the '--' placeholder, in place.
df.fillna('--', inplace=True)

In [None]:
def plot_proj_in_subset(condition, title):
    """Plot the 'AP/PA' distribution of the images of a subset of patients.

    Args:
        condition: callable applied to a patient's image count; the patient is
            kept when it returns a truthy value.
        title: title of the bar chart, drawn on the current axes.

    Returns:
        List with the ids of the selected patients.
    """
    patients = [
        patient_id
        for patient_id, n_images in n_images_by_patient.items()
        if condition(n_images)
    ]

    selected = set(patients)
    subset = df.loc[df['PatientID'].isin(selected)]
    images_by_proj = Counter(subset['AP/PA'])

    plt.bar(images_by_proj.keys(), images_by_proj.values())
    plt.title(title)
    plt.ylabel('N images')

    return patients

In [None]:
# Image-count threshold used to split the patients in two groups.
THRESH = 10

plt.figure(figsize=(14, 5))

# Left: patients with more than THRESH images.
plt.subplot(1, 2, 1)
pat1 = plot_proj_in_subset(lambda x: x > THRESH, f'Patients with > {THRESH}')

# Right: patients with at most THRESH images.
plt.subplot(1, 2, 2)
pat2 = plot_proj_in_subset(lambda x: x <= THRESH, f'Patients with <= {THRESH}')

In [None]:
# Sanity check: the two conditions are complementary, so no patient should be in both groups.
set(pat1).intersection(pat2)

## Hierarchical labels

In [None]:
# Rebuild `dataset` on 'train-val' without the frontal-only restriction, then
# display its length.
dataset = ChexpertDataset(dataset_type='train-val', frontal_only=False)
len(dataset)

In [None]:
# From here on `df` refers to the dataset's label table, replacing the frame
# loaded from the CSV earlier.
df = dataset.label_index
df.head(2)

In [None]:
def check(children, parent):
    """Print how `parent` is distributed where any of `children` equals 1.

    Args:
        children: list of column names of the module-level `df`.
        parent: column name whose values are counted.

    Returns:
        The rows of `df` in which at least one `children` column equals 1.
    """
    any_child_positive = (df[children] == 1).any(axis=1)
    subset = df.loc[any_child_positive]
    print(Counter(subset[parent]))
    return subset

In [None]:
# Values of 'Lung Opacity' among rows where any of the listed labels equals 1.
_ = check(['Consolidation', 'Pneumonia', 'Lung Lesion', 'Atelectasis'], 'Lung Opacity')

In [None]:
# Values of 'Enlarged Cardiomediastinum' among rows where 'Cardiomegaly' equals 1.
_ = check(['Cardiomegaly'], 'Enlarged Cardiomediastinum')

A sub-label marked as positive **does not** imply that its parent label is also marked as positive.