## Imports

In [1]:
import torch
import pandas as pd
import os
import matplotlib.pyplot as plt

In [2]:
import matplotlib
matplotlib.rcParams['figure.facecolor'] = 'white'

In [3]:
%run ../cxr14.py

## Check broken images in the DataFrame

In [5]:
# Load the master label index CSV from the dataset directory.
# DATASET_DIR is presumably defined by the `%run ../cxr14.py` cell above — confirm.
fpath = os.path.join(DATASET_DIR, 'label_index.csv')
df = pd.read_csv(fpath, header=0)  # row 0 of the file holds the column names
print(len(df))  # number of rows in the index
df.head()

112120


Unnamed: 0,FileName,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia
0,00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,00000001_002.png,0,1,1,0,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00000003_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [6]:
# Show the rows of the label index whose FileName appears in _BROKEN_IMAGES
# (presumably defined by the `%run ../cxr14.py` script — confirm).
df.loc[df['FileName'].isin(_BROKEN_IMAGES)]

Unnamed: 0,FileName,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia
27529,00007160_002.png,1,0,0,0,0,0,0,0,0,0,0,0,0,0


## Split dataframe

### Check splits are correct

In [None]:
# Paths to the master label index and to the per-split label CSVs,
# all located directly under DATASET_DIR.
master_df_fpath, train_df_fpath, val_df_fpath, test_df_fpath = (
    os.path.join(DATASET_DIR, csv_name)
    for csv_name in (
        'label_index.csv',
        'train_label.csv',
        'val_label.csv',
        'test_label.csv',
    )
)

In [None]:
# Re-read the master index and collect the set of all file names it lists
df = pd.read_csv(master_df_fpath)
master_images = set(df['FileName'])
print(len(master_images))  # number of distinct file names
df.head()

In [None]:
def load_images_from_csv(fpath):
    """Return the set of values found in the 'FileName' column of a CSV.

    Args:
        fpath: path of a CSV file that has a 'FileName' column.

    Returns:
        A set with the distinct entries of that column.
    """
    file_names = pd.read_csv(fpath)['FileName']
    return set(file_names)

def load_images_from_txt(fpath):
    """Return the set of image names listed one per line in a text file.

    Surrounding whitespace is stripped from every line. Blank (or
    whitespace-only) lines are skipped, so a trailing empty line does not
    add a spurious '' entry to the result.

    Args:
        fpath: path of the text file to read.

    Returns:
        A set with the distinct non-empty stripped lines.
    """
    with open(fpath, 'r') as f:
        # Iterate the file lazily instead of materializing readlines()
        stripped_lines = (line.strip() for line in f)
        return {name for name in stripped_lines if name}

In [None]:
def check_same_images(split):
    """Tell whether a split's text file and label CSV list the same images.

    Compares the names in `DATASET_DIR/splits/<split>.txt` with the
    'FileName' column of `DATASET_DIR/<split>_label.csv`.

    Args:
        split: split name used to build both file names (e.g. 'train').

    Returns:
        True when both sources contain exactly the same set of names.
    """
    # The text file is read first, then the CSV
    names_in_txt = load_images_from_txt(
        os.path.join(DATASET_DIR, 'splits', f'{split}.txt'),
    )
    names_in_csv = load_images_from_csv(
        os.path.join(DATASET_DIR, f'{split}_label.csv'),
    )
    return names_in_txt == names_in_csv

In [None]:
# Run the consistency check for each split; the cell displays a tuple of three booleans
check_same_images('train'), check_same_images('val'), check_same_images('test')

### Create test-bbox splits

In [None]:
# Read the bounding-box list and discard every column whose name contains 'Unnamed'
bbox_df = pd.read_csv(os.path.join(DATASET_DIR, 'BBox_List_2017.csv'))
bbox_df = bbox_df.drop(
    columns=[col for col in bbox_df.columns if 'Unnamed' in col],
)
bbox_df.head()

In [None]:
# Distinct image names that appear in the bounding-box table.
# Sorted so the order (and any file written from this list) is reproducible:
# the iteration order of a set of strings can change between interpreter runs.
images_with_bbox = sorted(set(bbox_df['Image Index']))
len(images_with_bbox)

In [None]:
# Write the names in images_with_bbox to splits/test-bbox.txt, one per line
fpath = os.path.join(DATASET_DIR, 'splits', 'test-bbox.txt')
with open(fpath, 'w') as f:
    f.writelines(f'{image}\n' for image in images_with_bbox)

### Create all split

In [None]:
fpath = os.path.join(DATASET_DIR, 'splits', 'all.txt')

# master_images is a set, whose iteration order for strings can differ between
# interpreter runs; writing in sorted order makes the file reproducible.
with open(fpath, 'w') as f:
    f.writelines(f'{image_name}\n' for image_name in sorted(master_images))

len(master_images)

## Calculate mean and std

In [None]:
%run ../cxr14.py
%run ../../utils/images.py

In [None]:
# Build the dataset for 'train' (the positional argument is presumably the
# dataset type / split name — confirm against CXR14Dataset).
dataset = CXR14Dataset('train')
len(dataset)

In [None]:
# File names listed in the dataset's label index, as a plain list
train_images = list(dataset.label_index['FileName'])
len(train_images)

In [None]:
# Compute mean and std over the training images.
# compute_mean_std / ImageFolderIterator presumably come from the
# `%run ../../utils/images.py` cell — confirm.
# NOTE(review): `images_dir` is not defined in any cell visible here; it must
# already exist in the kernel (e.g. set by a %run script) — confirm.
mean, std = compute_mean_std(ImageFolderIterator(images_dir, train_images), show=True)
mean, std

In [None]:
# Print the two statistics on separate lines
print(mean)
print(std)

## Load class `CXR14Dataset`

In [None]:
from tqdm.notebook import tqdm

In [None]:
%run ../cxr14.py
%run ../__init__.py

In [None]:
# Options forwarded to prepare_data_classification
kwargs = dict(
    dataset_name='cxr14',
    dataset_type='train',
    batch_size=40,
    masks=True,
)

dataloader = prepare_data_classification(**kwargs)
len(dataloader.dataset)

### Label distribution

In [None]:
# Label index of the dataset wrapped by the dataloader
df = dataloader.dataset.label_index
df.head()

In [None]:
# Row-wise maximum over the disease columns. Assuming the labels are 0/1,
# this is 1 for rows with at least one positive label, so the sum counts
# those rows — confirm the label encoding.
some_disease = df[CXR14_DISEASES].max(axis=1)
some_disease.sum()

In [None]:
# Column-wise sum of the disease columns (the number of positive rows per
# disease, assuming 0/1 labels) and its complement w.r.t. the row count.
pos_samples = df[CXR14_DISEASES].sum(axis=0)
neg_samples = len(df) - pos_samples
pos_samples, neg_samples

In [None]:
# Floor-divided ratio of negatives to positives, per disease column
neg_samples // pos_samples

### CLAHE images

In [None]:
%run ../cxr14.py
%run ../../utils/__init__.py

In [None]:
# Options shared by both dataset instances
kwargs = dict(
    dataset_type='test-bbox',
    norm_by_sample=True,
    masks=False,
)

# Same options twice; the second instance additionally asks for the 'clahe' image version
dataset = CXR14Dataset(**kwargs)
clahe_dataset = CXR14Dataset(images_version='clahe', **kwargs)

len(dataset), len(clahe_dataset)

In [None]:
plt.figure(figsize=(15, 5))

idx = 50

# Draw the same sample index from both datasets side by side. A single loop
# replaces the two copy-pasted plotting stanzas.
for position, (title, source) in enumerate(
        [('Normal', dataset), ('CLAHE', clahe_dataset)], start=1):
    item = source[idx]

    plt.subplot(1, 2, position)
    plt.title(title)
    # permute(1, 2, 0) moves the first axis to the end before plotting
    # (presumably CxHxW -> HxWxC — confirm); tensor_to_range01 presumably
    # rescales the values into [0, 1] — confirm.
    plt.imshow(tensor_to_range01(item.image).permute(1, 2, 0))
    plt.axis('off')

### Debug organ-mask-reducing

In [None]:
%run ../cxr14.py

In [None]:
# Dataset for 'test-bbox' with norm_by_sample and masks switched on
dataset = CXR14Dataset('test-bbox', norm_by_sample=True, masks=True)
len(dataset)

In [None]:
# Fetch one sample and show the size of its masks tensor
item = dataset[1]
item.masks.size()

In [None]:
plt.figure(figsize=(15, 5))

# One subplot per mask. The number of columns follows the number of masks
# instead of being fixed at 4, so items with a different mask count still plot.
n_masks = len(item.masks)
for position, m in enumerate(item.masks, start=1):
    plt.subplot(1, n_masks, position)
    plt.imshow(m)
    plt.axis('off')

In [None]:
# Reduce the sample's masks for the 'Edema' disease and inspect the result.
# NOTE(review): the exact reduction is defined by
# CXR14Dataset.reduce_masks_for_disease, which is not visible here.
reduced_masks = dataset.reduce_masks_for_disease('Edema', item.masks)
print(reduced_masks.size())

plt.imshow(reduced_masks)

In [None]:
# Iterate over every batch with a progress bar; the body does nothing, so this
# only exercises data loading. `dataloader` is whatever is currently bound in
# the kernel (it is created in the "Load class" section of this notebook).
for index, batch in enumerate(tqdm(dataloader)):
    pass

In [None]:
# Index of the last batch visited by the loop in the previous cell
index

In [None]:
# Fetch another sample for inspection
item = dataset[3]

In [None]:
# Flatten the image into 3 rows (presumably one per colour channel — confirm)
# and show the mean and std of each row.
sample = item.image.view(3, -1)
sample.mean(-1), sample.std(-1)

## Simplify bounding-boxes

Save a dict with the bounding-box (BB) information, keyed by image and then by disease (faster than querying the bbox DataFrame on the fly)

In [None]:
import json

In [None]:
# Path to the bounding-box CSV inside the dataset directory
bbox_fpath = os.path.join(DATASET_DIR, 'BBox_List_2017.csv')

In [None]:
# Read the bounding-box list and discard every column whose name contains 'Unnamed'
df = pd.read_csv(bbox_fpath)
df = df.drop(columns=[col for col in df.columns if 'Unnamed' in col])
df.head()

In [None]:
# Renames applied to disease names read from the bounding-box file;
# names without an entry here are kept unchanged by the loop that uses it.
MAPPING = {
    'Infiltrate': 'Infiltration',
}

In [None]:
# image name -> {disease name -> [x, y, w, h]}
bbs_by_image = {}
# every disease name seen (after applying MAPPING)
ds = set()

# itertuples yields plain tuples and avoids building a Series per row, as
# iterrows does. Unpacking assumes df has exactly six columns, in the order
# (image, disease, x, y, w, h), once the 'Unnamed' columns are dropped.
for image_name, disease, x, y, w, h in df.itertuples(index=False, name=None):
    disease = MAPPING.get(disease, disease)
    ds.add(disease)

    # NOTE(review): a repeated (image, disease) pair overwrites the earlier
    # box, so only the last one is kept.
    bbs_by_image.setdefault(image_name, {})[disease] = [x, y, w, h]

In [None]:
# Serialize the image -> disease -> box dict as JSON in the dataset directory
filepath = os.path.join(DATASET_DIR, 'bbox_by_image_by_disease.json')
with open(filepath, 'w') as f:
    json.dump(bbs_by_image, f)