In [2]:
import os
import numpy as np
from src.config import IMAGE_SIZE, PROCESSED_DIR, RAW_DIR, SEED, TEST_DIR, TRAIN_DIR, VAL_DIR
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
from sklearn.model_selection import train_test_split
import shutil

In [3]:
# Random-augmentation settings. The mapping is unpacked as keyword arguments
# into ImageDataGenerator, so every key must be a valid constructor argument.
AUGMENTATION_CONFIG = dict(
    # Geometric transforms
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    shear_range=45,
    zoom_range=[0.8, 1.25],
    # Mirroring along both axes
    horizontal_flip=True,
    vertical_flip=True,
    # Photometric transform
    brightness_range=[0.1, 2],
    # How pixels exposed by a transform are filled in
    fill_mode='constant',
)

In [4]:
def create_directories():
    """Ensure the train, validation, test and processed output directories exist.

    Missing parent directories are created as needed; directories that
    already exist are left untouched.
    """
    for directory in (TRAIN_DIR, VAL_DIR, TEST_DIR, PROCESSED_DIR):
        os.makedirs(directory, exist_ok=True)

In [5]:
def augment_data(datagen, img, img_name, save_dir, count=5):
    """Generate ``count`` augmented variants of one image and save them to disk.

    Args:
        datagen: Generator object exposing ``flow(...)``; in this notebook an
            ``ImageDataGenerator``.
        img: Image accepted by ``img_to_array`` (e.g. the result of ``load_img``).
        img_name: Passed as ``save_prefix`` for the files written to ``save_dir``.
        save_dir: Directory passed as ``save_to_dir``.
        count: Number of augmented images to generate. Zero or a negative
            value generates nothing.
    """
    # Nothing requested: return before touching the generator. The previous
    # loop checked the counter only after a batch had already been produced
    # (and saved), so count <= 0 still wrote one image.
    if count <= 0:
        return

    # flow() expects a batch, so add a leading batch axis to the single image.
    sample = np.expand_dims(img_to_array(img), 0)
    batches = iter(
        datagen.flow(
            sample,
            batch_size=1,
            save_to_dir=save_dir,
            save_prefix=img_name,
            save_format='jpeg',
        )
    )

    # The flow iterator never ends on its own; each next() produces (and, via
    # save_to_dir, writes) one augmented image, so pull exactly `count` batches.
    generated = 0
    while generated < count:
        next(batches)
        generated += 1

In [6]:
# File-name suffixes (compared case-insensitively) treated as images. The
# leading dot matters: without it a name such as "notesjpg" would be accepted
# and later fail when saved, because it has no recognisable extension.
_IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')


def _collect_labelled_images(raw_dir):
    """Return parallel lists (image paths, labels) for images under raw_dir/<category>/."""
    images = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # sequence fed to the seeded split identical from run to run.
    for category in sorted(os.listdir(raw_dir)):
        category_path = os.path.join(raw_dir, category)
        if not os.path.isdir(category_path):
            continue
        for img_name in sorted(os.listdir(category_path)):
            if img_name.lower().endswith(_IMAGE_EXTENSIONS):
                images.append(os.path.join(category_path, img_name))
                labels.append(category)
    return images, labels


def _save_resized_copies(image_paths, image_labels, split_dir):
    """Resize each image to IMAGE_SIZE and save it as split_dir/<label>/<original file name>."""
    for img_path, label in zip(image_paths, image_labels):
        img = load_img(img_path, target_size=IMAGE_SIZE)
        save_dir = os.path.join(split_dir, label)
        os.makedirs(save_dir, exist_ok=True)
        img.save(os.path.join(save_dir, os.path.basename(img_path)))


def preprocess_and_augment_data():
    """Split the raw images into train/val/test sets and write them to disk.

    Every sub-directory of RAW_DIR is treated as a category whose name is the
    label of the images it contains. The images are split 80/20 into
    train/test, then 10% of the training part is held out for validation
    (both splits seeded with SEED).

    * Training images are resized to IMAGE_SIZE and passed to ``augment_data``;
      TRAIN_DIR/<label>/ receives only the augmented variants, not the
      resized original.
    * Validation and test images are resized to IMAGE_SIZE and saved under
      VAL_DIR/<label>/ and TEST_DIR/<label>/ with their original file names.

    Raises:
        ValueError: If no image files are found under RAW_DIR.
    """
    # Image data generator for augmentation
    datagen = ImageDataGenerator(**AUGMENTATION_CONFIG)

    images, labels = _collect_labelled_images(RAW_DIR)
    if not images:
        # train_test_split would also raise ValueError here, but with a
        # message that does not point at the real cause.
        raise ValueError(f"No .jpeg/.jpg/.png images found under {RAW_DIR!r}")

    # Split the dataset: 20% test, then 10% of the remainder for validation.
    train_images, test_images, train_labels, test_labels = train_test_split(
        images, labels, test_size=0.2, random_state=SEED
    )
    train_images, val_images, train_labels, val_labels = train_test_split(
        train_images, train_labels, test_size=0.1, random_state=SEED
    )

    # Augment and save training images
    for img_path, label in zip(train_images, train_labels):
        img = load_img(img_path, target_size=IMAGE_SIZE)
        save_dir = os.path.join(TRAIN_DIR, label)
        os.makedirs(save_dir, exist_ok=True)
        augment_data(datagen, img, os.path.basename(img_path), save_dir)

    # Validation and test images are only resized, never augmented.
    _save_resized_copies(val_images, val_labels, VAL_DIR)
    _save_resized_copies(test_images, test_labels, TEST_DIR)

    print("Preprocessing and augmentation completed.")

In [7]:
def main():
    """Run the pipeline: create the output directories, then preprocess and augment."""
    # Order matters: the output directories are created before any image is written.
    pipeline_steps = (create_directories, preprocess_and_augment_data)
    for step in pipeline_steps:
        step()

In [9]:
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == "__main__":
    main()

Preprocessing and augmentation completed.
