# Preprocessing

Three things to accomplish (unsure of the best order, but this is the order we're using for now):
1. Scaling/downsampling to 224x224
2. Standardizing pixels (per image, standard deviation of 1 and mean of 0)
3. Grayscaling (maybe toss out in the future)

In [11]:
import os
import copy
import shutil
from importlib import reload
import pathlib
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf

import utils
import data
reload(utils);
reload(data);

In [12]:
# Set aside a small, reproducible sample of images for the playground.
img_idxs = utils.get_random_image_ids(20, random_seed=0)

playground_dir = os.path.join(utils.dir_name, 'playground')
# Subdirectory needed -- ideally subdir name is class name -- punt it to a different time
playground_images_dir = os.path.join(playground_dir, 'images')
# Creates playground_dir too, and is a no-op when the directories already exist.
os.makedirs(playground_images_dir, exist_ok=True)

source_dir = os.path.join(utils.dir_name, 'ISIC_2019_Training_Input')
for img_idx in img_idxs:
    file_name = f"{img_idx}.{utils.image_format}"
    # Raises FileNotFoundError when the image is missing from source_dir; this is left
    # fatal on purpose so the copied files and train_labels below cannot drift apart.
    shutil.copyfile(
        os.path.join(source_dir, file_name),
        os.path.join(playground_images_dir, file_name),
    )

train_labels = utils.get_label_of_image_id(img_idxs)
print(train_labels.head())

FileNotFoundError: [Errno 2] No such file or directory: 'isic_data/ISIC_2019_Training_Input/ISIC_0013425_downsampled.jpg'

## 1 - Scaling (included when loading into keras layer)

In [None]:
# Loading data the keras way
data_dir = pathlib.Path(utils.dir_name) / 'playground'

# Count images using the configured extension instead of a hard-coded 'jpg'.
image_count = len(list(data_dir.glob(f'*/*.{utils.image_format}')))
print(f"{image_count=}")

batch_size = 1
# 224x224 matches the target size stated in the preprocessing plan at the top.
img_height = 224
img_width = 224
random_seed = 123

# NOTE(review): an explicit `labels` list must follow the alphanumeric order of the image
# file paths. train_labels presumably follows the order of img_idxs -- confirm they agree.
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir,
    labels=train_labels.tolist(),
    validation_split=0.2,
    subset="training",
    seed=random_seed,
    image_size=(img_height, img_width),  # images are resized on load; default 256x256
    batch_size=batch_size,
    shuffle=False,
)

# Performance configurations
# NOTE: a file-backed cache persists between runs. Remove the cache files after changing
# anything above (e.g. the image size), otherwise stale cached data is served.
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache(str(data_dir / 'cache')).prefetch(buffer_size=AUTOTUNE)

In [None]:
# Plot the first nine resized images in a 3x3 grid, titled with their class names.
plt.figure(figsize=(10, 10))
for i, (images, labels) in enumerate(train_ds.take(9)):
    # Only the first image/label of each batch is shown.
    picture = images[0].numpy().astype("uint8")
    class_id = labels.numpy()[0]
    ax = plt.subplot(3, 3, i + 1)
    ax.imshow(picture)
    ax.set_title(data.SkinClass(class_id).name)
    ax.axis("off")

# Shape of the last image drawn above.
print("Image shape:", images[0].numpy().shape)

## 2 - Standardize data
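Not yet decided whether pixel values should be rescaled to [-1, 1] or [0, 1]; input welcome. Note that this step is a fixed rescaling of the raw 0-255 values, not the per-image standardization (mean 0, standard deviation 1) listed in the plan above.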

In [None]:
# Rescale raw channel values (RGB goes up to 255) onto the 0-to-1 range.
normalization_layer = tf.keras.layers.Rescaling(scale=1.0 / 255)
# Alternative for the -1-to-1 range:
# normalization_layer = tf.keras.layers.Rescaling(scale=1.0 / 127.5, offset=-1)

In [None]:
def _rescale_images(x, y):
    """Apply the rescaling layer to the images; labels pass through unchanged."""
    return normalization_layer(x), y


normalized_ds = train_ds.map(_rescale_images)

# Plot the first nine rescaled images in a 3x3 grid, titled with their class names.
plt.figure(figsize=(10, 10))
for i, (images, labels) in enumerate(normalized_ds.take(9)):
    class_id = labels.numpy()[0]
    ax = plt.subplot(3, 3, i + 1)
    ax.imshow(images[0].numpy())
    ax.set_title(data.SkinClass(class_id).name)
    ax.axis("off")

# Pull a single batch to check the value range after rescaling.
normalized_images = [batch for batch, _ in normalized_ds.take(1)]
first_image = normalized_images[0]
# The pixel values should now lie in `[0,1]`.
print("min:", np.min(first_image), "max:", np.max(first_image))

## 3 - Grayscale

In [None]:
def _to_grayscale(x, y):
    """Convert the RGB images to grayscale; labels pass through unchanged."""
    return tf.image.rgb_to_grayscale(x), y


grayscaled_ds = normalized_ds.map(_to_grayscale)

# Plot the first nine grayscale images in a 3x3 grid, titled with their class names.
plt.figure(figsize=(10, 10))
for i, (images, labels) in enumerate(grayscaled_ds.take(9)):
    class_id = labels.numpy()[0]
    ax = plt.subplot(3, 3, i + 1)
    ax.imshow(images[0].numpy(), cmap='gray')
    ax.set_title(data.SkinClass(class_id).name)
    ax.axis("off")

# Shape of the last image drawn above.
print("Image shape:", images[0].numpy().shape)

## Putting it all together
This is, theoretically, the only code we need.

In [13]:
# Re-import utils so the current version of the module is used, then ask it
# to select the playground subset.
playground_size = 1000  # bumping up to 1000 images
reload(utils)
utils.select_subset_for_playground(playground_size)

FileNotFoundError: [Errno 2] No such file or directory: 'isic_data/undersampled/MEL'

In [None]:
batch_size = 1
# 224x224 matches the target size stated in the preprocessing plan at the top.
img_height = 224
img_width = 224
random_seed = 123

# Classes are inferred from the subdirectory names, which must match class_names.
train_ds = tf.keras.utils.image_dataset_from_directory(
    'isic_data/playground/',  # switch to 'training' when ready
    class_names=sorted(data.SkinClass.__members__.keys()),
    validation_split=0.2,
    subset="training",  # switch to 'validation' for the 20%
    # Without shuffling, files are listed in sorted (class-by-class) order and the split
    # simply cuts off the tail, so the two subsets would cover different classes.
    # Shuffling with a fixed seed mixes the classes and keeps the 'training' and
    # 'validation' calls non-overlapping, provided both use the same seed.
    seed=random_seed,
    image_size=(img_height, img_width),  # images are resized on load; default 256x256
    batch_size=batch_size,
    shuffle=True,
)

# Optional performance configuration, disabled for now. A file-backed cache persists
# between runs, so remove the cache files after changing anything above.
# train_ds = train_ds.cache('isic_data/playground/cache').prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
def _on_images(transform):
    """Lift an image-only transform to a function over (images, labels) elements.

    Args:
        transform: Callable that takes an image batch and returns the transformed batch.

    Returns:
        A function suitable for `Dataset.map` that applies `transform` to the images
        and passes the labels through unchanged.
    """
    def apply(images, labels):
        return transform(images), labels

    return apply


# Dataset.map returns a new dataset and leaves train_ds untouched, so no copy is needed.
preprocessed_ds = train_ds
for step in (
    tf.keras.layers.Rescaling(1./255),  # 0-255 channel values -> 0 to 1
    tf.image.rgb_to_grayscale,
):
    # Each step is bound through _on_images rather than captured by a lambda, so the
    # mapped function never depends on a later value of the loop variable.
    preprocessed_ds = preprocessed_ds.map(_on_images(step))

In [None]:
# Visualize the preprocessed (rescaled, then grayscaled) data in a 3x3 grid.
plt.figure(figsize=(10, 10))
for i, (images, labels) in enumerate(preprocessed_ds.take(9)):
    # The label is used as an index into train_ds.class_names; convert the scalar
    # tensor to a plain int so the list lookup is explicit.
    skin_class = train_ds.class_names[int(labels[0])]
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(images[0].numpy(), cmap='gray')
    plt.title(skin_class)
    plt.axis("off")


# Shape of the last image drawn above.
print("Image shape:", images[0].numpy().shape)

In [None]:
# Describe dataset
# Print the dataset length; the `=` specifier echoes the expression text next to its value.
print(f"{len(preprocessed_ds)=}")
# Last expression of the cell, so the notebook displays the dataset's element spec.
preprocessed_ds.element_spec