# Notebook 1 - Set Project Structure

### 1. Setup constants

In [1]:
from constants import *

In [2]:
IMAGE_SIZE = 224

### 2. Imports and notebook setup

In [3]:
# Set up multiple outputs for cells
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Printing with markdown
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

In [4]:
# Default imports
import os
import random
import shutil
from send2trash import send2trash
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image, ImageOps
import cv2

In [5]:
if os.path.isdir('../skin-cancer-mnist-ham10000'):
    os.rename(src='../skin-cancer-mnist-ham10000', dst=DATA_ORIGINAL_DIR_PATH, src_dir_fd=None, dst_dir_fd=None)
    print('Renamed directory')

### 3. Directories setup & train test split

#### 3.1 Create directories from scratch

In [6]:
# Create directory structure
if os.path.isdir(DATA_PROCESSED_DIR_PATH):
    print('Reseting project directory structure...')
    send2trash(DATA_PROCESSED_DIR_PATH)

print('Building directory structure...')
os.mkdir(DATA_PROCESSED_DIR_PATH)

for i, split in enumerate(SPLIT_DIRS):
    os.mkdir(os.path.join(DATA_PROCESSED_DIR_PATH, split))
    
    if i == 0:
        for cls in CLASSES_2019:
            os.mkdir(os.path.join(DATA_PROCESSED_DIR_PATH, split, cls))
    else:
        os.mkdir(os.path.join(DATA_PROCESSED_DIR_PATH, split, 'mel'))
        os.mkdir(os.path.join(DATA_PROCESSED_DIR_PATH, split, 'other'))
        
print('Created successfully!')

Reseting project directory structure...
Building directory structure...
Created successfully!


#### 3.2 Train, test, validation split

In [7]:
df_2019 = pd.read_csv('../ISIC-2019/ISIC_2019_Training_Metadata.csv')
df_2019_target = pd.read_csv('../ISIC-2019/ISIC_2019_Training_GroundTruth.csv', 
                             index_col='image').drop('UNK', axis=1)

In [8]:
df_2019_target = df_2019_target.astype(int)

In [9]:
df = pd.merge(left=df_2019_target, right=df_2019[['image', 'lesion_id']], 
              on='image', how='left')

In [10]:
# Check how many images for each lesion
df.sum(axis=0)

image    ISIC_0000000ISIC_0000001ISIC_0000002ISIC_00000...
MEL                                                   4522
NV                                                   12875
BCC                                                   3323
AK                                                     867
BKL                                                   2624
DF                                                     239
VASC                                                   253
SCC                                                    628
dtype: object

In [11]:
def train_test_split_on_column(df, column_to_split, test_rate=0.15, val_rate=0.15, random_seed=40):
    # Setting random seed for the split
    random.seed(random_seed)
    
    num_of_missing = df[column_to_split].isnull().sum()
    fill_missing_values = ['missing_' + str(x) for x in range(num_of_missing)]
    
    null_indices = df[df[column_to_split].isna()].index
    for i, fill_label in zip(null_indices, fill_missing_values):
        df.loc[i, column_to_split] = fill_label
    
    values = df[column_to_split].unique().tolist()
    
    test_size = round(test_rate * len(values))
    val_size = round(val_rate * len(values))

    test_val_values = random.sample(population=values, k=test_size + val_size)
    test_values = random.sample(population=test_val_values, k=test_size)
    val_values = [x for x in test_val_values if x not in test_values]

    df_train = df[~df[column_to_split].isin(test_val_values)]
    df_test = df[df[column_to_split].isin(test_values)]
    df_val = df[df[column_to_split].isin(val_values)]
    
    return df_train, df_test, df_val

In [12]:
df_train, df_test, df_val = train_test_split_on_column(df, 'lesion_id')

In [13]:
# Check if the split was stratified
all_counts = df.drop(['image', 'lesion_id'], axis=1).sum(axis=0)
train_counts = df_train.drop(['image', 'lesion_id'], axis=1).sum(axis=0)
test_counts = df_test.drop(['image', 'lesion_id'], axis=1).sum(axis=0)
val_counts = df_val.drop(['image', 'lesion_id'], axis=1).sum(axis=0)

100 * train_counts.divide(all_counts)
100 * test_counts.divide(all_counts)
100 * val_counts.divide(all_counts)

# (train_counts + test_counts + val_counts).divide(all_counts)

MEL     70.919947
NV      70.392233
BCC     71.291002
AK      71.049596
BKL     68.750000
DF      68.200837
VASC    73.122530
SCC     71.974522
dtype: float64

MEL     14.329943
NV      14.485437
BCC     13.391514
AK      17.070358
BKL     15.967988
DF      18.410042
VASC    10.276680
SCC     15.127389
dtype: float64

MEL     14.750111
NV      15.122330
BCC     15.317484
AK      11.880046
BKL     15.282012
DF      13.389121
VASC    16.600791
SCC     12.898089
dtype: float64

#### 3.3 Copy and pre-process images to appropriate directory

In [14]:
for data_frame in [df, df_train, df_test, df_val]:
    # Set index
    data_frame.set_index('image', inplace=True)

    # Setup diagnostic column
    data_frame['dx'] = data_frame.drop('lesion_id', axis=1).idxmax(axis=1).str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
def resize_to_square(img, final_shape):
    '''
    Takes a PIL.JpegImagePlugin.JpegImageFile as input
    '''
    desired_size = max(img.size)
    old_size = img.size
    delta_w = desired_size - old_size[0]
    delta_h = desired_size - old_size[1]
    padding = (delta_w // 2, delta_h // 2, delta_w - (delta_w // 2), delta_h - (delta_h // 2))
    new_img = np.asarray(ImageOps.expand(img, padding))
    return cv2.resize(new_img, final_shape)

In [16]:
###########################################################
# TRANSFER IMAGES TO APROPIATE DIRECTORIES
###########################################################

# Data directory
dir_path = os.path.join(NEW_DATA_DIR_PATH, 'ISIC_2019_Training_Input')

for i, (split, data_frame_split, rate) in enumerate(zip(SPLIT_DIRS, [df_train, df_test, df_val], [.7, .15, .15])):
    print()
    print(split)
    for disease in CLASSES_2019:
        images = data_frame_split[data_frame_split['dx'] == disease].index.unique()

        if i != 0 and disease != 'mel':
            diag = 'other'
        else:
            diag = disease
        
#         allowed_shapes = [(1024, 1024, 3), (450, 600, 3), (680, 1024, 3)]
        allowed_shapes = [(1024, 1024, 3), (450, 600, 3)]
        counter = [0] * len(allowed_shapes)
        
        for img in images:
            img_name = img + '.' + IMG_FORMAT
            origin = os.path.join(dir_path, img_name)
            original_img = Image.open(origin)
            img_shape = np.asarray(original_img).shape
            
            if split != 'train' or img_shape in allowed_shapes:
                if img_shape not in allowed_shapes:
                    allowed_shapes.append(img_shape)
                    counter.append(0)
                    
                index = allowed_shapes.index(img_shape)
                
                # Limit number of images of each shape
                # limit = 2000 if disease == 'mel' else 1000
                limit = 1000
                if counter[index] >= limit:
                    continue
                
                counter[index] += 1
                
                destiny = os.path.join(DATA_PROCESSED_DIR_PATH, split, diag, img_name)

                original_img = Image.open(origin)
                processed_img = resize_to_square(original_img, (IMAGE_SIZE, IMAGE_SIZE))

                plt.imsave(destiny, processed_img)
        print(disease, counter)


train
ak [516, 100]
bcc [1000, 383]
bkl [783, 749]
df [75, 88]
nv [1000, 1000]
vasc [82, 103]
mel [1000, 761]
scc [319, 133]

test
ak [131, 17]
bcc [382, 63]
bkl [200, 168, 30, 6, 12, 1, 1, 1]
df [27, 17]
nv [567, 1000, 2, 86, 1, 1, 1, 1, 2, 11, 1, 1, 8, 19, 2, 94, 19, 2, 5, 5, 2, 2, 3, 6, 4]
vasc [11, 15]
mel [389, 168, 1, 20, 1, 1, 1, 1, 4, 3, 1, 1, 1, 1, 4, 39, 4, 1, 2, 1, 2, 1, 1]
scc [54, 41]

validation
ak [90, 13]
bcc [441, 68]
bkl [155, 182, 1, 41, 10, 10, 2]
df [22, 10]
nv [612, 1000, 1, 92, 1, 4, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 10, 1, 3, 2, 3, 91, 25, 14, 8, 10, 7, 6, 3, 1, 2, 1, 1, 1, 1]
vasc [18, 24]
mel [402, 184, 1, 22, 1, 1, 1, 1, 1, 1, 1, 36, 2, 1, 4, 1, 3, 3, 1]
scc [58, 23]


In [17]:
# Check proportion for each class
for i in range(len(SPLIT_DIRS)):
    print(SPLIT_DIRS[i])
    dirs = [x for x in os.listdir(os.path.join(DATA_PROCESSED_DIR_PATH, SPLIT_DIRS[i])) if x[0] != '.']
    for cls in dirs:
        print(cls, len([x for x in os.listdir(os.path.join(DATA_PROCESSED_DIR_PATH, SPLIT_DIRS[i], cls)) if x[0] != '.']))
    print()

train
df 163
ak 616
bkl 1532
vasc 185
nv 2000
bcc 1383
scc 452
mel 1761

test
other 3022
mel 648

validation
other 3080
mel 667

