# Notebook 2 - Offline Data Augmentation

### 1. Setup constants

In [1]:
# Project-wide constants. CLASSES_2019, TRAIN_PATH, DATA_PROCESSED_DIR_PATH and
# SPLIT_DIRS are used further down but never defined in this notebook, so they
# presumably come from this star import.
from constants import *

# NOTE(review): IMAGE_SIZE is not referenced anywhere else in this notebook, and
# it may shadow a value of the same name exported by `constants` — confirm.
IMAGE_SIZE = 224

### 2. Imports and notebook setup

In [2]:
# Set up multiple outputs for cells: with "all", every expression statement in
# a cell is displayed, not only the last one. This is why later cells assign
# unwanted return values to `_` instead of leaving them as bare expressions.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Printing with markdown
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)

In [3]:
# Default imports
import os
import random
import shutil
from send2trash import send2trash
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob

### 3. Data augmentation offline and save to directory

In [4]:
# Report how many non-hidden files each class has in the train set
for cls in CLASSES_2019:
    class_dir = os.path.join(TRAIN_PATH, cls)
    visible_files = [name for name in os.listdir(class_dir) if name[0] != '.']
    print(cls, len(visible_files))

ak 616
bcc 1383
bkl 1532
df 163
nv 2000
vasc 185
mel 1761
scc 452


In [5]:
def clean_vec(x):
    """Return the base file name of ``x`` without its leading numeric copy prefix.

    The path is split on '/' and only the last component is kept. If that
    component starts with a run of digits followed by at least one more
    character, the digits and the single character right after them (the
    separator, e.g. '_') are removed.

    Parameters
    ----------
    x : str
        File name or '/'-separated path.

    Returns
    -------
    str
        The base name with the prefix stripped. The base name is returned
        unchanged when it has no leading digits, is empty, or consists only
        of digits (there is no prefix/separator to strip in those cases).
    """
    name = x.split('/')[-1]
    prefix_len = 0
    # Bounds check: without it an empty or all-digit name indexes past the end.
    while prefix_len < len(name) and name[prefix_len].isnumeric():
        prefix_len += 1
    if prefix_len == 0 or prefix_len == len(name):
        return name
    # Skip the digits plus the one separator character that follows them.
    return name[prefix_len + 1:]

In [6]:
# ###########################################################
# # Augmentation for classes other than melanoma
# # (disabled: this whole cell is commented out and kept for reference only)
# ###########################################################

# NUM_AUG_IMAGES_WANTED = 1000

# np.random.seed(100)

# aug_classes = [x for x in CLASSES_2019 if x != 'mel']

# for cls in aug_classes:
#     images = [x for x in os.listdir(os.path.join(TRAIN_PATH, cls)) if x[0] != '.']
    
#     if len(images) < NUM_AUG_IMAGES_WANTED:
#         remainder = NUM_AUG_IMAGES_WANTED % len(images)
#         multiple = -1 + round((NUM_AUG_IMAGES_WANTED - remainder) / len(images))
#         extra_imgs = np.random.choice(images, size=remainder, replace=False)
        
#         if multiple > 0:
#             for img in images:
#                     for i in range(multiple):
#                         origin = os.path.join(TRAIN_PATH, cls, img)
#                         destiny = os.path.join(TRAIN_PATH, cls, str(i + 1) + '_' + img)
#                         _ = shutil.copyfile(origin, destiny)
        
#         if remainder > 0:
#             for img in extra_imgs:
#                 origin = os.path.join(TRAIN_PATH, cls, img)
#                 destiny = os.path.join(TRAIN_PATH, cls, str(multiple + 1) + '_' + img)
#                 _ = shutil.copyfile(origin, destiny)

In [7]:
###########################################################
# Shape Balanced Augmentation
#
# For every class, group the training images by the shape of the raw image
# with the same file name, then oversample each shape group (by copying files
# under an integer-prefixed name) until it holds `num_images_limit` images.
# Groups that already hold `num_images_limit` images or more are left as is.
#
# NOTE(review): the copies are written into the same class directory that is
# listed below, so re-running this cell after it has completed will likely
# fail (the prefixed copies presumably do not exist in `raw_input_dir`).
###########################################################
np.random.seed(100)

# Directory holding the raw images; shapes are read from here by file name.
raw_input_dir = '../ISIC-2019/ISIC_2019_Training_Input'
# Target number of images per (class, shape) group.
# num_images_limit = 2000 if cls == 'mel' else 1000
num_images_limit = 1000

# aug_classes = [x for x in CLASSES_2019 if x != 'mel']
aug_classes = CLASSES_2019
name_2_dim = {}  # image file name -> raw image shape; read by the shape-count check further down
for cls in aug_classes:
    cls_dir = os.path.join(TRAIN_PATH, cls)
    # os.listdir returns names in arbitrary order; sort them so the seeded
    # random choice below picks the same files on every run and machine.
    images = sorted(x for x in os.listdir(cls_dir) if x[0] != '.')

    # Create dictionary shape to images file name
    dim_2_files = {}
    for img_name in images:
        img_shape = plt.imread(os.path.join(raw_input_dir, img_name)).shape
        dim_2_files.setdefault(img_shape, []).append(img_name)
        name_2_dim[img_name] = img_shape

    for key, items in dim_2_files.items():
        if len(items) >= num_images_limit:
            continue

        # Every image gets `multiple` copies and `remainder` randomly chosen
        # images get one more, so that
        # len(items) * (multiple + 1) + remainder == num_images_limit.
        multiple, remainder = divmod(num_images_limit, len(items))
        multiple -= 1
        extra_items = np.random.choice(items, size=remainder, replace=False)

        for img in items:
            origin = os.path.join(cls_dir, img)
            for i in range(multiple):
                destiny = os.path.join(cls_dir, str(i + 1) + '_' + img)
                # `_ =` keeps the returned path from being echoed as cell output
                _ = shutil.copyfile(origin, destiny)

        for img in extra_items:
            origin = os.path.join(cls_dir, img)
            destiny = os.path.join(cls_dir, str(multiple + 1) + '_' + img)
            _ = shutil.copyfile(origin, destiny)

In [8]:
# Count the non-hidden files per class in the first split directory
split_root = os.path.join(DATA_PROCESSED_DIR_PATH, SPLIT_DIRS[0])
for cls in CLASSES_2019:
    n_files = sum(1 for name in os.listdir(os.path.join(split_root, cls)) if name[0] != '.')
    print(cls, n_files)

ak 2000
bcc 2000
bkl 2000
df 2000
nv 2000
vasc 2000
mel 2000
scc 2000


In [9]:
# Per class, tally how many files (copies included) there are of each image shape
split_root = os.path.join(DATA_PROCESSED_DIR_PATH, SPLIT_DIRS[0])
for cls in CLASSES_2019:
    listed = [x for x in os.listdir(os.path.join(split_root, cls)) if x[0] != '.']
    # Strip the numeric copy prefix so each name matches its key in name_2_dim
    images = [clean_vec(file_name) for file_name in listed]

    shape_2_counts = {}
    for img_name in images:
        img_shape = name_2_dim[img_name]
        shape_2_counts[img_shape] = shape_2_counts.get(img_shape, 0) + 1

    print(cls)
    print(shape_2_counts)
    print()

ak
{(1024, 1024, 3): 1000, (450, 600, 3): 1000}

bcc
{(450, 600, 3): 1000, (1024, 1024, 3): 1000}

bkl
{(450, 600, 3): 1000, (1024, 1024, 3): 1000}

df
{(450, 600, 3): 1000, (1024, 1024, 3): 1000}

nv
{(1024, 1024, 3): 1000, (450, 600, 3): 1000}

vasc
{(450, 600, 3): 1000, (1024, 1024, 3): 1000}

mel
{(1024, 1024, 3): 1000, (450, 600, 3): 1000}

scc
{(1024, 1024, 3): 1000, (450, 600, 3): 1000}



In [10]:
# Create the merged 'other' directory. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError once the directory is there.
os.makedirs(os.path.join(TRAIN_PATH, 'other'), exist_ok=True)

In [11]:
# Copy every non-hidden file of every augmented class into the shared 'other' directory
other_dir = os.path.join(TRAIN_PATH, 'other')
for cls in aug_classes:
    cls_dir = os.path.join(TRAIN_PATH, cls)
    for img in os.listdir(cls_dir):
        if img[0] == '.':
            continue
        origin = os.path.join(cls_dir, img)
        destiny = os.path.join(other_dir, img)
        # `_ =` keeps the returned path from being echoed as cell output
        _ = shutil.copyfile(origin, destiny)

In [12]:
# Send each per-class directory to the trash (its files were copied into
# 'other' by the previous cell)
for cls in aug_classes:
    class_dir = os.path.join(TRAIN_PATH, cls)
    send2trash(class_dir)

In [13]:
# Total number of non-hidden files in the merged 'other' directory
other_files = [name for name in os.listdir(os.path.join(TRAIN_PATH, 'other')) if name[0] != '.']
len(other_files)

16000