# Imports

In [None]:
import os
from glob import glob
from collections import Counter
import pandas as pd
import numpy as np
import pydicom as pydcm

In [None]:
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['figure.facecolor'] = 'white'
matplotlib.rcParams['figure.figsize'] = (15, 5)

In [None]:
DATASET_DIR = '/mnt/workspace/covid-siim'
DICOM_DIR = '/mnt/data/covid-siim'

# Load metadata

## Utils

For pre-processing

In [None]:
TRAIN_IMAGE_PATHS = glob(os.path.join(DICOM_DIR, 'train/*/*/*.dcm'))
TEST_IMAGE_PATHS = glob(os.path.join(DICOM_DIR, 'test/*/*/*.dcm'))

In [None]:
# Map every image id (DICOM basename without '.dcm') to its path relative to
# DICOM_DIR, also without the '.dcm' suffix.
IMAGE_ID_TO_FPATH = {}

for fpath in TRAIN_IMAGE_PATHS + TEST_IMAGE_PATHS:
    imagename = os.path.basename(fpath).replace('.dcm', '')
    # The message must use `imagename`: the previous `filename` was undefined
    # here, so a duplicate raised NameError instead of this message.
    assert imagename not in IMAGE_ID_TO_FPATH, f'Repeated filename: {imagename}'

    # Strip the DICOM_DIR prefix, the extension and the leading slash
    image_fpath = fpath.replace(DICOM_DIR, '').replace('.dcm', '')
    if image_fpath.startswith('/'):
        image_fpath = image_fpath[1:]
    IMAGE_ID_TO_FPATH[imagename] = image_fpath

## Image-level labels

In [None]:
# Read the image-level CSV and attach each image's relative file path,
# looked up by the id with its '_image' suffix removed.
fpath = os.path.join(DATASET_DIR, 'train_image_level.csv')
df_image = pd.read_csv(fpath)
df_image['image_fpath'] = df_image['id'].map(
    lambda image_id: IMAGE_ID_TO_FPATH[image_id.replace('_image', '')]
)
df_image.head()

In [None]:
def chunks(l, n):
    """Lazily yield consecutive slices of ``l``, each holding ``n`` items.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    yield from (l[start:start + n] for start in range(0, len(l), n))

In [None]:
# Flatten the per-image label strings into one list of names.
# Every annotation is six whitespace-separated tokens; only the first is kept.
labels_with_bb = df_image['label'].tolist()
labels = [
    name
    for annotation in labels_with_bb
    for name, _score, _v0, _v1, _v2, _v3 in chunks(annotation.split(), 6)
]
len(labels)

In [None]:
Counter(labels)

## Study level labels

In [None]:
# Read the study-level CSV and drop the '_study' suffix from the ids.
fpath = os.path.join(DATASET_DIR, 'train_study_level.csv')
df_study = pd.read_csv(fpath)
df_study['id'] = df_study['id'].map(lambda study_id: study_id.replace('_study', ''))
df_study.head()

In [None]:
COVID_SIIM_DISEASES = [
    'Negative for Pneumonia',
    'Typical Appearance',
    'Indeterminate Appearance',
    'Atypical Appearance',
]

In [None]:
# Check if is multilabel or not
Counter(df_study[COVID_SIIM_DISEASES].sum(axis=1).values)

In [None]:
n_studies = len(df_study)
dist_labels = df_study[COVID_SIIM_DISEASES].sum(axis=0).sort_values(ascending=False)
dist_labels

In [None]:
# Bar chart of studies per label, each bar annotated with its count and
# its share of all studies.
counts = dist_labels.values

plt.figure(figsize=(10, 5))
plt.bar(dist_labels.index, counts)
plt.title('Labels distribution')
plt.ylabel('N studies')
plt.ylim(0, max(counts) * 1.10)

for position, count in enumerate(counts):
    share = count / n_studies * 100
    plt.text(position, count * 1.02, f'{count:,} ({share:.1f}%)', ha='center')

### Split studies in train/val

In [None]:
import random

In [None]:
studies = list(df_study['id'])
n_studies = len(studies)
assert n_studies == len(set(studies))
n_studies

In [None]:
val_split = 0.1
n_val_studies = int(n_studies * val_split)
n_train_studies = n_studies - n_val_studies
n_train_studies, n_val_studies

In [None]:
# Use a dedicated, seeded RNG so the split written to disk can be regenerated
# exactly; the unseeded module-level sampler gave a different split every run.
SPLIT_SEED = 1234
val_studies = random.Random(SPLIT_SEED).sample(studies, n_val_studies)

# Keep the original study order for the training list (a set difference
# yields an arbitrary order).
val_set = set(val_studies)
train_studies = [study for study in studies if study not in val_set]

assert len(val_set.intersection(train_studies)) == 0
assert len(train_studies) == n_train_studies
len(train_studies), len(val_studies)

In [None]:
def write_list_to_txt(arr, filepath, sep='\n'):
    """Write each string of ``arr`` to ``filepath``, followed by ``sep``.

    The file is opened in text write mode, so any existing content is replaced.
    """
    with open(filepath, 'w') as out:
        out.writelines(item + sep for item in arr)

In [None]:
# Persist the train/val study ids, one per line, under <DATASET_DIR>/splits.
splits_folder = os.path.join(DATASET_DIR, 'splits')
# The folder does not exist on a fresh dataset dir; open() would fail without it
os.makedirs(splits_folder, exist_ok=True)
write_list_to_txt(train_studies, os.path.join(splits_folder, 'train.txt'))
write_list_to_txt(val_studies, os.path.join(splits_folder, 'val.txt'))

## Merge into master_metadata.csv

In [None]:
df_study.head(2)

In [None]:
df_image.head(2)

In [None]:
set(df_study['id']) == set(df_image['StudyInstanceUID'])

In [None]:
# Copy of df_image where the 'id' column is replaced by 'image_id'
# (the same value without the '_image' suffix).
df_image_2 = df_image.assign(
    image_id=[image_id.replace('_image', '') for image_id in df_image['id']],
).drop(columns='id')

In [None]:
# Join image rows with their study-level labels, then drop the duplicated
# join key and rename the study column.
master_df = (
    df_image_2
    .merge(df_study, how='inner', left_on='StudyInstanceUID', right_on='id')
    .drop(columns='id')
    .rename(columns={'StudyInstanceUID': 'study_id'})
)
master_df.head(2)

In [None]:
assert len(master_df) == len(df_image)

### Add test images to metadata.csv

In [None]:
import re

In [None]:
# One placeholder metadata row per test image: ids, relative path, an empty
# box list, a fixed label string and zeros for the four study-level classes.
test_metadata = []
for image_fpath in TEST_IMAGE_PATHS:
    image_id = os.path.basename(image_fpath).replace('.dcm', '')
    # The study id is the folder directly below 'test/'
    study_match = re.search(r'.*test\/([a-z\d]+)\/.*', image_fpath)
    study_id = study_match.group(1)

    # Path relative to DICOM_DIR, without extension or leading slash
    fpath = image_fpath.replace(DICOM_DIR, '').replace('.dcm', '')
    fpath = fpath[1:] if fpath.startswith('/') else fpath

    placeholder_row = (image_id, study_id, fpath, [], 'none 1 0 0 1 1', 0, 0, 0, 0)
    test_metadata.append(placeholder_row)
len(test_metadata)

In [None]:
cols = ['image_id', 'study_id', 'image_fpath', 'boxes', 'label', *COVID_SIIM_DISEASES]
test_df = pd.DataFrame(test_metadata, columns=cols)
test_df.head()

In [None]:
# Stack the test rows below the train rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. ignore_index avoids duplicated index labels in the result, and
# sort=False keeps the column order of the first frame.
master_df = pd.concat([master_df, test_df], ignore_index=True, sort=False)
assert len(master_df) == len(df_image) + len(test_df)
len(master_df)

In [None]:
master_df.fillna("[]", inplace=True)

In [None]:
master_df.isnull().any(axis=0)

In [None]:
# Index (within COVID_SIIM_DISEASES) of the largest class column per row.
# Computed on the underlying array in one pass: this avoids a Python-level
# iterrows loop and always yields a position, never a column label.
master_df['disease'] = master_df[COVID_SIIM_DISEASES].values.argmax(axis=1)
master_df.head(2)

## Save original_size into metadata

In [None]:
# Record the pixel size of every PNG listed in master_df.
# PIL is imported here because `load_image` (used previously) is only defined
# by a `%run` cell further down, so this cell failed when run top to bottom.
from PIL import Image

original_heights = []
original_widths = []

for image_fpath in master_df['image_fpath']:
    path = os.path.join(DATASET_DIR, 'images', f"{image_fpath}.png")

    # Image.open is lazy: the size is available without decoding the pixels,
    # and the context manager closes the file handle afterwards.
    with Image.open(path) as image:
        width, height = image.size

    original_heights.append(height)
    original_widths.append(width)

master_df['original_height'] = original_heights
master_df['original_width'] = original_widths
master_df.head()

In [None]:
master_df.head()

In [None]:
master_df.to_csv(os.path.join(DATASET_DIR, 'metadata.csv'), index=False)

## Create bboxes_by_image_id

In [None]:
import json
import ast

In [None]:
# Build {image_id: boxes} for every row of master_df.
bboxes_by_image_id = {}
for image_id, boxes in zip(master_df['image_id'], master_df['boxes']):
    # Boxes stored as a string literal are parsed into Python objects;
    # values that are already lists (e.g. the test rows) are kept as they are.
    if isinstance(boxes, str):
        boxes = ast.literal_eval(boxes)

    bboxes_by_image_id[image_id] = boxes
len(bboxes_by_image_id)

In [None]:
with open(os.path.join(DATASET_DIR, 'bboxes.json'), 'w') as f:
    json.dump(bboxes_by_image_id, f, indent=2)

# Load one DICOM sample

In [None]:
# name = 'train/00086460a852/9e8302230c91/65761e66de9f.dcm'
name = 'test/00188a671292/3eb5a506ccf3/3dcdfc352a06.dcm'
fpath = os.path.join(DICOM_DIR, name)

In [None]:
# dcmread is the supported reader; read_file is a deprecated alias that was
# removed in pydicom 3.0.
dicom = pydcm.dcmread(fpath)
dicom

In [None]:
def dicom_to_np(dicom):
    """Return the pixel data of ``dicom`` after ``apply_voi_lut``.

    When ``PhotometricInterpretation`` is ``'MONOCHROME1'`` the values are
    flipped around the array maximum (``max - value``).
    """
    pixels = apply_voi_lut(dicom.pixel_array, dicom)

    if dicom.PhotometricInterpretation != 'MONOCHROME1':
        return pixels

    return np.amax(pixels) - pixels

In [None]:
def arr_to_uint8(data):
    """Linearly rescale ``data`` to the 0-255 range and return it as uint8.

    The minimum maps to 0 and the maximum to 255; fractional results are
    truncated by the uint8 cast. A constant array (max == min) maps to all
    zeros instead of dividing by zero.

    Args:
        data: non-empty array-like of numbers.

    Returns:
        ``np.ndarray`` of dtype ``uint8`` with the same shape as ``data``.
    """
    # Work in float64 so that subtracting the minimum cannot wrap around for
    # narrow or signed integer inputs (e.g. int16 spanning its full range).
    data = np.asarray(data, dtype=np.float64)
    data = data - np.min(data)

    peak = np.max(data)
    if peak > 0:
        # Skipped for constant input, where the division would be 0 / 0
        data = data / peak

    return (data * 255).astype(np.uint8)

In [None]:
data = dicom_to_np(dicom)
data.min(), data.max(), data.dtype

In [None]:
plt.subplot(1, 2, 1)
plt.imshow(data, cmap='gray')
plt.title('Np array')

plt.subplot(1, 2, 2)
plt.imshow(dicom.pixel_array, cmap='gray')
plt.title('Original DICOM')

# Transform all images into PNG

Needs to run with py2gdcm for some images, which otherwise throw this error:

`"The following handlers are available to decode the pixel data however they are missing required dependencies: GDCM (req. GDCM)"`

In [None]:
from tqdm.auto import tqdm
from PIL import Image
from collections import defaultdict

In [None]:
errors = defaultdict(list)
dtypes_by_image = dict()

In [None]:
# took ~1h for test split, ~6h for train split

In [None]:
# Convert every DICOM file into a PNG under <DATASET_DIR>/images-16bit,
# mirroring the folder layout of DICOM_DIR. Files that already have a PNG are
# skipped, and per-file failures are collected in `errors` so that one bad
# file does not abort the whole run.
for split in ('test', 'train'):
    fpaths = glob(os.path.join(DICOM_DIR, split, '*/*/*.dcm'))

    for fpath in tqdm(fpaths):
        # Get png fpath
        filename = fpath.replace(DICOM_DIR, '')
        if filename.startswith('/'):
            filename = filename[1:]
        out_fpath = os.path.join(DATASET_DIR, 'images-16bit', filename)
        out_fpath = out_fpath.replace('.dcm', '.png')

        if os.path.isfile(out_fpath):
            continue

        # Ensure folder exists
        out_folder = os.path.dirname(out_fpath)
        if not os.path.isdir(out_folder):
            os.makedirs(out_folder)
            # Cannot use exist_ok=True in python 2

        # dcmread replaces the deprecated read_file alias
        dicom = pydcm.dcmread(fpath)

        # Decode the pixels and transform to numpy. Accessing `pixel_array`
        # is what triggers decoding, so it must sit inside the try block for
        # decoding errors to be recorded instead of stopping the loop.
        try:
            # Save dtype
            dtypes_by_image[filename] = str(dicom.pixel_array.dtype)
            data = dicom_to_np(dicom)
        except Exception as e:
            errors['to-np'].append((fpath, e))
            continue

        # Choose image mode (8bit or 16bit)
        if data.dtype == np.uint8:
            mode = 'L'
        elif data.dtype == np.uint16:
            mode = 'I;16'
        else:
            # `dtype`, not `type`: ndarray has no `type` attribute
            errors['dtype'].append((fpath, data.dtype))
            continue

        image = Image.fromarray(data, mode=mode)

        # Save as png
        image.save(out_fpath, optimize=True)
        image.close()

In [None]:
from collections import Counter
import json

In [None]:
with open(os.path.join(DATASET_DIR, 'dtypes_by_image.json'), 'w') as f:
    json.dump(dtypes_by_image, f, indent=2)

In [None]:
Counter(dtypes_by_image.values())

In [None]:
unique_exceptions = set(str(e) for f, e in errors['to-np'])
unique_exceptions

In [None]:
error_fpaths = [f for f, e in errors['to-np']]
len(error_fpaths)

In [None]:
error_fpaths

# Save images-small

Save into folder images-256

In [None]:
from torchvision.transforms import Resize

In [None]:
%run ../../utils/images.py

In [None]:
# Write a 256x256 copy of every PNG under images/ into images-256/, keeping
# the same sub-folder layout. Existing targets are skipped.
resizer = Resize((256, 256))
images_root = os.path.join(DATASET_DIR, 'images')
small_root = os.path.join(DATASET_DIR, 'images-256')

for split in ('train', 'test'):
    fpaths = glob(os.path.join(images_root, split, '*/*/*.png'))

    for fpath in tqdm(fpaths):
        # Rebuild the path relative to images/ rather than string-replacing
        # '/images/', which would also rewrite any other matching component.
        target_fpath = os.path.join(small_root, os.path.relpath(fpath, images_root))

        if os.path.isfile(target_fpath):
            continue

        # Create folder
        target_folder = os.path.dirname(target_fpath)
        os.makedirs(target_folder, exist_ok=True)

        # Load large image
        src_image = load_image(fpath, 'RGB')

        # Resize to 256
        target_image = resizer(src_image)

        # Save
        target_image.save(target_fpath)

# Compute mean and std

In [None]:
%run ../../utils/images.py

In [None]:
TRAIN_IMAGE_PATHS = glob(os.path.join(DATASET_DIR, 'images', 'train/*/*/*.png'))
len(TRAIN_IMAGE_PATHS)

In [None]:
mean, std = compute_mean_std(
    ImageFolderIterator(os.path.join(DATASET_DIR, 'images'), TRAIN_IMAGE_PATHS),
    show=True,
)
mean, std

# Debug dataset class

## Definition

In [None]:
# Copy from tutoriales repo

## Usage

In [None]:
# Transform applied to the 16-bit images, followed by a dataset built on the
# 'test' split that reads from the 'images-16bit' folder.
# NOTE(review): `transforms`, `PILToTensorAndRange01`, `GrayTo3Channels` and
# `CovidSiimDataset` are not imported or defined anywhere in this notebook;
# presumably they come from the code the "Definition" cell says to copy in —
# confirm before running.
transform_16bit = transforms.Compose([
    PILToTensorAndRange01(), # Use instead of transforms.ToTensor(), see docstring
    GrayTo3Channels(), # Change to 3 channels (by default 16bit images have 1 channel)
    # NOTE(review): these constants look like the usual ImageNet mean/std, not
    # the dataset statistics computed earlier in this notebook — confirm intent.
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

dataset = CovidSiimDataset(
    'test', image_folder='images-16bit', image_format='I;16', transform=transform_16bit)
len(dataset)

In [None]:
image, labels, bboxes = dataset[10]

In [None]:
image.size(), image.min(), image.max()

In [None]:
plt.imshow(image[0], cmap='gray')