# Notebook 2 - Offline Data Augmentation

### 1. Setup constants

In [1]:
# Constants
data_original_dir = 'data_original'
data_processed_dir = 'data_processed'
split_dirs = ['train', 'test']
diagnostic_classes = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']
img_format = 'jpg'

num_aug_images_wanted = 1000

### 2. Imports and notebook setup

In [2]:
# Set up multiple outputs for cells
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Printing with markdown
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

In [3]:
# Default imports
import os
import random
import shutil
from send2trash import send2trash
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow import set_random_seed

In [4]:
# Magics
%matplotlib inline

In [5]:
data_processed_dir_path = os.path.join('..', data_processed_dir)
data_original_dir_path = os.path.join('..', data_original_dir)
train_path = os.path.join(data_processed_dir_path, split_dirs[0])
test_path = os.path.join(data_processed_dir_path, split_dirs[1])

### 3. Data augmentation offline and save to directory

In [6]:
# Check number of examples of each class in the train set
for cls in diagnostic_classes:
    print(cls, len(os.listdir(os.path.join(train_path, cls))))

nv 4723
mel 772
bkl 767
bcc 366
akiec 229
vasc 102
df 85


In [7]:
np.random.seed(100)
set_random_seed(100)

# Note that we are not augmenting class 'nv' which is the majority class
aug_classes = [x for x in diagnostic_classes if x != 'nv']

for cls in aug_classes:
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)
    
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    images = os.listdir(os.path.join(train_path, cls))

    for img in images:
            origin = os.path.join(train_path, cls, img)
            destiny = os.path.join(img_dir, img)
            _ = shutil.copyfile(origin, destiny)

    save_path = os.path.join(train_path, cls)
    
    datagen = ImageDataGenerator(rotation_range=180,
                                 width_shift_range=0.1,
                                 height_shift_range=0.1,
                                 zoom_range=0.1,
                                 horizontal_flip=True,
                                 vertical_flip=True,
                                 fill_mode='nearest',
#                                brightness_range=(0.9, 1.1)
                                )

    batch_size = 50

    aug_datagen = datagen.flow_from_directory(aug_dir,
                                              save_to_dir=save_path,
                                              save_format=img_format,
                                              target_size=(224, 224),
                                              batch_size=batch_size)
    
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted-num_files)/batch_size))

    # run the generator and create about 6000 augmented images
    for i in range(0, num_batches):
        imgs, labels = next(aug_datagen)
    
    # delete temporary directory with the raw image files
    send2trash(aug_dir)

Found 772 images belonging to 1 classes.
Found 767 images belonging to 1 classes.
Found 366 images belonging to 1 classes.
Found 229 images belonging to 1 classes.
Found 102 images belonging to 1 classes.
Found 85 images belonging to 1 classes.


In [8]:
# Check examples of each class
for cls in diagnostic_classes:
    print(cls, len(os.listdir(os.path.join(data_processed_dir_path, split_dirs[0], cls))))

nv 4723
mel 1022
bkl 1017
bcc 982
akiec 966
vasc 714
df 900
