In [None]:
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import img_to_array, array_to_img, img_to_array, load_img
import os
import random
import math
import shutil

In [None]:
# Root folders for each pipeline stage: raw input -> augmented -> balanced -> train/val/test split.
ORIGINAL_DATA_PATH = './data/original_images'
AUGMENTED_DATA_PATH = './data/augmented_images'
BALANCED_DATA_PATH = './data/balanced_images'
SPLIT_DATA_PATH = './data/images'

# Class names. Each one is also the name of a sub-folder inside every stage folder
# (for the split root: inside each of train/test/val).
CATEGORY_DIRECTORIES = ['Chickenpox', 'Cowpox', 'Healthy', 'HFMD', 'Measles', 'Monkeypox']

# Split fractions. split_dataset() reads only TRAIN_RATIO and VAL_RATIO; the test split gets
# whatever remains, so TEST_RATIO is not used by the code in this notebook.
TRAIN_RATIO = 0.8
TEST_RATIO = 0.1
VAL_RATIO = 0.1

# Sub-folder names created under SPLIT_DATA_PATH.
TRAIN_DIRECTORY = 'train'
TEST_DIRECTORY = 'test'
VAL_DIRECTORY = 'val'

SUB_DIRECTORIES = [TRAIN_DIRECTORY, TEST_DIRECTORY, VAL_DIRECTORY]

# macOS Finder metadata file; skipped when iterating over the original images.
DS_STORE = '.DS_Store'

# Upper bound on the number of images selected per category by create_balanced_dataset().
TARGET_COUNT = 1000

# Random-augmentation settings forwarded to ImageDataGenerator.
ROTATION_RANGE = 90
WIDTH_SHIFT_RANGE = 0.2
HEIGHT_SHIFT_RANGE = 0.2
SHEAR_RANGE = 0.2
ZOOM_RANGE = 0.2
HORIZONTAL_FLIP = True
FILL_MODE = 'nearest'

In [None]:
def count_images(dir_name, categories=None):
    """Print the number of JPEG images in each category folder and the overall total.

    Args:
        dir_name: Root folder that contains one sub-folder per category.
        categories: Iterable of sub-folder names to count. Defaults to
            CATEGORY_DIRECTORIES when omitted.

    Raises:
        FileNotFoundError: If a category folder does not exist (raised by os.listdir).
    """
    if categories is None:
        categories = CATEGORY_DIRECTORIES

    total = 0
    for category in categories:
        path = os.path.join(dir_name, category)
        # Compare the real extension case-insensitively so files such as "IMG.JPG" are counted.
        images = [
            img for img in os.listdir(path)
            if os.path.splitext(img)[1].lower() in ('.jpg', '.jpeg')
        ]
        # basename() honours the platform separator; splitting on '/' does not on Windows.
        print(f"Number of images in {os.path.basename(path)}: {len(images)}")
        total += len(images)
    print(f"Total image count: {total}")

In [None]:
# Report per-category image counts for the raw dataset.
count_images(ORIGINAL_DATA_PATH)

In [None]:
def copy_image(old_path, copy_path):
    """Copy a JPEG into ``copy_path`` with ``_ORIGINAL`` inserted before its extension.

    Files whose extension is not .jpg/.jpeg (compared case-insensitively) are ignored.

    Args:
        old_path: Path of the image to copy.
        copy_path: Existing destination folder.
    """
    # basename()/splitext() honour the platform separator. Splitting the joined path on "/"
    # would leave folder components in the new name on Windows.
    img_name = os.path.basename(old_path)
    stem, ext = os.path.splitext(img_name)
    if ext.lower() not in ('.jpg', '.jpeg'):
        return

    new_name = f"{stem}_ORIGINAL{ext}"
    # Copy under the tagged name; the source file is left in place.
    shutil.copy(old_path, os.path.join(copy_path, new_name))
    print(f"Renamed: {img_name} → {new_name}")

In [None]:
# Shared augmentation generator: datagen.flow(...) yields randomly transformed copies of its
# input, using the ranges defined in the configuration cell.
datagen = ImageDataGenerator(
    rotation_range=ROTATION_RANGE,
    width_shift_range=WIDTH_SHIFT_RANGE,
    height_shift_range=HEIGHT_SHIFT_RANGE,
    shear_range=SHEAR_RANGE,
    zoom_range=ZOOM_RANGE,
    horizontal_flip=HORIZONTAL_FLIP,
    fill_mode=FILL_MODE)

In [None]:
# Create one output folder per category under the augmented-data root.
# os.makedirs(..., exist_ok=True) also creates a missing parent folder and lets this cell be
# re-run; os.mkdir raises in both situations.
for category in CATEGORY_DIRECTORIES:
    path = os.path.join(AUGMENTED_DATA_PATH, category)
    os.makedirs(path, exist_ok=True)

In [None]:
# Create one output folder per category under the balanced-data root.
# os.makedirs(..., exist_ok=True) also creates a missing parent folder and lets this cell be
# re-run; os.mkdir raises in both situations.
for category in CATEGORY_DIRECTORIES:
    path = os.path.join(BALANCED_DATA_PATH, category)
    os.makedirs(path, exist_ok=True)

In [None]:
# Create <split root>/<train|test|val>/<category> for every combination.
# os.makedirs is required here: the intermediate train/test/val folders do not exist yet, and
# os.mkdir cannot create nested folders. exist_ok=True lets this cell be re-run.
for dataset in SUB_DIRECTORIES:
    for category in CATEGORY_DIRECTORIES:
        path = os.path.join(SPLIT_DATA_PATH, dataset, category)
        os.makedirs(path, exist_ok=True)

In [None]:
# Number of augmented variants drawn from the generator for every original image.
AUGMENTATIONS_PER_IMAGE = 16

for category in CATEGORY_DIRECTORIES:
    path_o = os.path.join(ORIGINAL_DATA_PATH, category)
    path_a = os.path.join(AUGMENTED_DATA_PATH, category)
    # Sorted so the processing order (and therefore the log) is the same on every run.
    for file in sorted(os.listdir(path_o)):
        source_file = os.path.join(path_o, file)
        # Skip macOS .DS_Store entries and anything that is not a regular file (e.g. sub-folders).
        if DS_STORE in file or not os.path.isfile(source_file):
            continue
        print(f'{path_o}/{file}')  # Show which file is being processed
        img = load_img(source_file)  # PIL image at its native size (no target_size is passed)
        # NumPy array; (height, width, channels) under Keras' default channels_last layout.
        x = img_to_array(img)
        # Prepend a batch axis of size 1: datagen.flow() operates on batches of images.
        x = x.reshape((1,) + x.shape)
        # Every batch pulled from .flow() writes one randomly transformed image into path_a.
        # splitext() keeps the prefix aligned with the stem copy_image() uses for the original.
        flow = datagen.flow(
            x,
            batch_size=1,
            save_to_dir=path_a,
            save_prefix=os.path.splitext(file)[0],
            save_format='jpg',
        )
        # The generator never ends on its own, so stop once enough batches have been drawn.
        for generated, _batch in enumerate(flow, start=1):
            if generated >= AUGMENTATIONS_PER_IMAGE:
                break
        # Keep the untouched original (tagged _ORIGINAL) next to its augmented variants.
        copy_image(source_file, path_a)

In [None]:
# Report per-category image counts after augmentation (augmented variants plus tagged originals).
count_images(AUGMENTED_DATA_PATH)

In [None]:
def create_balanced_dataset(seed=42):
    """Copy at most TARGET_COUNT images per category from the augmented to the balanced folder.

    Original images (files whose stem ends in ``_ORIGINAL``, the tag written by copy_image)
    are taken first; any shortfall is filled with a random sample of the remaining images.

    Args:
        seed: Seed for the random sample. With a fixed seed and an unchanged source folder the
            same files are chosen on every call, so re-running does not pile a different
            sample on top of an earlier one. Pass None for a fresh sample on each call.
    """
    # Private generator: seeding it does not disturb the global `random` state.
    rng = random.Random(seed)

    for category in CATEGORY_DIRECTORIES:
        source_path = os.path.join(AUGMENTED_DATA_PATH, category)
        target_path = os.path.join(BALANCED_DATA_PATH, category)
        os.makedirs(target_path, exist_ok=True)

        # Sorted because os.listdir returns entries in arbitrary order, which would defeat the seed.
        images = sorted(
            img for img in os.listdir(source_path)
            if os.path.splitext(img)[1].lower() in ('.jpg', '.jpeg')
        )

        original_images, other_images = [], []
        for img in images:
            is_original = os.path.splitext(img)[0].endswith("_ORIGINAL")
            (original_images if is_original else other_images).append(img)

        # Originals have priority; augmented images only fill the remaining quota.
        selected_images = original_images[:TARGET_COUNT]
        remaining_needed = TARGET_COUNT - len(selected_images)
        if remaining_needed > 0:
            selected_images.extend(
                rng.sample(other_images, min(remaining_needed, len(other_images)))
            )

        for img in selected_images:
            shutil.copy(os.path.join(source_path, img), os.path.join(target_path, img))

        print(f"{category}: {len(selected_images)} images copied to {target_path}")
    print("\n Balanced dataset created successfully!\n")

In [None]:
# Populate the balanced-data folders from the augmented images.
create_balanced_dataset()

In [None]:
# Report per-category image counts in the balanced dataset.
count_images(BALANCED_DATA_PATH)

In [None]:
def split_dataset(seed=42):
    """Copy every balanced image into the train, val or test folder of its category.

    For each category the JPEG files are shuffled, the first TRAIN_RATIO share goes to train,
    the next VAL_RATIO share to val, and the remainder to test.

    Args:
        seed: Seed for the shuffle. With a fixed seed and an unchanged balanced folder each
            file lands in the same split on every call, so re-running cannot leave one image
            in two splits. Pass None for a different split on each call.

    NOTE(review): the balanced folder mixes original images with their augmented variants and
    the split is made per file, so variants of one source image can end up in different splits.
    Consider splitting by source image if val/test metrics must be free of that leakage.
    """
    # Private generator: seeding it does not disturb the global `random` state.
    rng = random.Random(seed)

    for category in CATEGORY_DIRECTORIES:
        source_path = os.path.join(BALANCED_DATA_PATH, category)

        # Sorted because os.listdir returns entries in arbitrary order, which would defeat the seed.
        images = sorted(
            img for img in os.listdir(source_path)
            if os.path.splitext(img)[1].lower() in ('.jpg', '.jpeg')
        )
        rng.shuffle(images)

        # Slice boundaries; the test split takes whatever is left after train and val.
        train_split = int(len(images) * TRAIN_RATIO)
        val_split = int(len(images) * (TRAIN_RATIO + VAL_RATIO))

        train_images = images[:train_split]
        val_images = images[train_split:val_split]
        test_images = images[val_split:]

        # Copy each slice into <split root>/<split>/<category>, creating the folder if needed.
        for split_name, split_images in (
            (TRAIN_DIRECTORY, train_images),
            (VAL_DIRECTORY, val_images),
            (TEST_DIRECTORY, test_images),
        ):
            split_path = os.path.join(SPLIT_DATA_PATH, split_name, category)
            os.makedirs(split_path, exist_ok=True)
            for img in split_images:
                shutil.copy(os.path.join(source_path, img), os.path.join(split_path, img))

        print(f"{category}: {len(train_images)} train, {len(val_images)} val, {len(test_images)} test images")

    print("\n Dataset successfully split into train, val, and test. \n")

In [None]:
# Distribute the balanced images into the train/val/test folders.
split_dataset()