# Notebook 1 - Set Project Structure

### 1. Set up constants

In [1]:
from constants import *

### 2. Imports and notebook setup

In [2]:
# Set up multiple outputs for cells: with "all", every bare expression in a
# cell is displayed, not only the last one. Cells further down that end with
# several bare expressions rely on this setting to show all of their results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Printing with markdown
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the cell output instead of plain text."""
    rendered = Markdown(string)
    display(rendered)

In [3]:
# Default imports
import os
import random
import shutil
from send2trash import send2trash
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pylab as pl
from PIL import Image, ImageOps

In [4]:
# If the dataset folder is still present under its '../skin-cancer-mnist-ham10000'
# name, move it to DATA_ORIGINAL_DIR_PATH. Once moved, the check is False and
# re-running this cell does nothing.
downloaded_dir = '../skin-cancer-mnist-ham10000'
if os.path.isdir(downloaded_dir):
    os.rename(downloaded_dir, DATA_ORIGINAL_DIR_PATH)
    print('Renamed directory')

### 3. Directory setup & train/test/validation split

#### 3.1 Create directories from scratch

In [5]:
# Rebuild the processed-data tree from scratch: <processed>/<split>/<class>
processed_root = DATA_PROCESSED_DIR_PATH

if os.path.isdir(processed_root):
    # A tree from an earlier run exists: send it to the trash before rebuilding
    print('Reseting project directory structure...')
    send2trash(processed_root)

print('Building directory structure...')
os.mkdir(processed_root)
for split_name in SPLIT_DIRS:
    split_dir = os.path.join(processed_root, split_name)
    os.mkdir(split_dir)
    # One sub-folder per diagnostic class inside every split
    for diagnosis in DIAGNOSTIC_CLASSES:
        os.mkdir(os.path.join(split_dir, diagnosis))
print('Created successfully!')

Reseting project directory structure...
Building directory structure...
Created successfully!


#### 3.2 Train, test, validation split

In [6]:
# Load the metadata table and key it by image_id (one chained expression,
# so re-running the cell always starts from the file on disk)
metadata_path = os.path.join(DATA_ORIGINAL_DIR_PATH, 'HAM10000_metadata.csv')
df = pd.read_csv(metadata_path).set_index('image_id')

In [7]:
# Setting random seed so the split is reproducible
random.seed(100)

# We're splitting on lesion_id (not on image_id) to avoid data leakage:
# every image of a given lesion lands in the same split.
test_rate = 0.15
val_rate = 0.15

lesions = df.lesion_id.unique().tolist()
test_size = round(test_rate * len(lesions))
val_size = round(val_rate * len(lesions))

# Draw the combined hold-out pool first, then carve the test set out of it;
# whatever remains of the pool becomes the validation set.
test_val_lesions = random.sample(population=lesions, k=test_size + val_size)
test_lesions = random.sample(population=test_val_lesions, k=test_size)

# Set lookup keeps this filter linear; iterating over the pool list preserves
# the original ordering of val_lesions.
test_lesion_set = set(test_lesions)
val_lesions = [x for x in test_val_lesions if x not in test_lesion_set]

df_train = df[~df.lesion_id.isin(test_val_lesions)]
df_test = df[df.lesion_id.isin(test_lesions)]
df_val = df[df.lesion_id.isin(val_lesions)]

# Sanity checks: the validation pool has the requested size and the three
# frames partition the metadata table (no row lost, none counted twice).
assert len(val_lesions) == val_size
assert len(df_train) + len(df_test) + len(df_val) == len(df)

In [8]:
# Check how closely the class balance of the hold-out splits matches train.
# Each displayed series is a difference in percentage points per diagnosis:
# first train minus test, then train minus validation.
def dx_share(frame):
    """Percentage of rows of ``frame`` that fall in each ``dx`` class."""
    return 100 * frame.dx.value_counts() / len(frame)

train_pct, test_pct, val_pct = (dx_share(part) for part in (df_train, df_test, df_val))

train_pct - test_pct
train_pct - val_pct

akiec   -0.946704
bcc      0.524280
bkl     -0.282594
df       0.123424
mel     -0.076202
nv       0.496146
vasc     0.161650
Name: dx, dtype: float64

nv       0.182502
mel     -0.954642
bkl     -0.289346
bcc     -0.091907
akiec    0.841355
vasc     0.042418
df       0.269619
Name: dx, dtype: float64

#### 3.3 Copy images to appropriate directory

In [9]:
def resize_to_square(img):
    '''
    Pad an image to a square canvas; the pixels themselves are not rescaled.

    A border is added with ImageOps.expand (using its default fill) so that
    both sides equal the longer of the two original dimensions. The original
    content is centred; when the required padding is odd, the extra pixel
    goes to the right / bottom edge.

    Takes a PIL image (e.g. PIL.JpegImagePlugin.JpegImageFile) as input and
    returns the result of ImageOps.expand.
    '''
    width, height = img.size
    side = max(width, height)
    pad_w = side - width
    pad_h = side - height
    left, top = pad_w // 2, pad_h // 2
    # Border tuple order: (left, top, right, bottom)
    return ImageOps.expand(img, (left, top, pad_w - left, pad_h - top))

In [10]:
# Data directories
dir_part_1 = os.path.join(DATA_ORIGINAL_DIR_PATH, 'HAM10000_images_part_1')
dir_part_2 = os.path.join(DATA_ORIGINAL_DIR_PATH, 'HAM10000_images_part_2')

# List of images in each part
images_part_1 = os.listdir(dir_part_1)
images_part_2 = os.listdir(dir_part_2)
# Set copy for constant-time "is this file in part 1?" checks inside the loop
part_1_names = set(images_part_1)

# Image ids in the training, test and validation splits
train_images = df_train.index.unique()
test_images = df_test.index.unique()
val_images = df_val.index.unique()

# NOTE(review): the pairing below assumes SPLIT_DIRS is ordered
# (train, test, validation) -- confirm against the constants module.
for split, images in zip(SPLIT_DIRS, [train_images, test_images, val_images]):
    for img in images:
        img_name = img + '.' + IMG_FORMAT
        img_diagnosis = df.loc[img, 'dx']

        # Any file not found in part 1 is looked up in part 2
        origin = os.path.join(dir_part_1 if img_name in part_1_names else dir_part_2, img_name)
        destination = os.path.join(DATA_PROCESSED_DIR_PATH, split, img_diagnosis, img_name)

        # The context manager releases the source file handle after each
        # image instead of leaving thousands of files open during the loop.
        with Image.open(origin) as original_img:
            processed_img = resize_to_square(original_img)
            processed_img.save(destination)

In [11]:
# Check proportion for each class: for every split, print the share (in %) of
# each class's images that ended up in that split.

# Count the files once per (class, split); the counts do not depend on which
# split is being printed, so there is no need to re-list directories per split.
class_counts = [
    (cls, [len(os.listdir(os.path.join(DATA_PROCESSED_DIR_PATH, split, cls)))
           for split in SPLIT_DIRS])
    for cls in DIAGNOSTIC_CLASSES
]

for i, split_name in enumerate(SPLIT_DIRS):
    print(split_name)

    for cls, cls_sizes in class_counts:
        print(cls, cls_sizes[i] / sum(cls_sizes) * 100)
    print()

train
akiec 70.03058103975535
bcc 71.20622568093385
bkl 69.79071883530482
df 73.91304347826086
nv 70.4399701715138
vasc 71.83098591549296
mel 69.36208445642407

test
akiec 18.960244648318042
bcc 13.424124513618677
bkl 15.013648771610555
df 13.91304347826087
nv 14.660700969425802
vasc 13.380281690140844
mel 14.645103324348607

validation
akiec 11.009174311926607
bcc 15.369649805447471
bkl 15.195632393084624
df 12.173913043478262
nv 14.899328859060404
vasc 14.788732394366196
mel 15.992812219227314



In [12]:
# Number of images (metadata rows) in the train, test and validation splits
df_train.shape[0]
df_test.shape[0]
df_val.shape[0]

7044

1477

1494