In [10]:
import tensorflow as tf
import os
import json
import shutil

In [11]:
# Input locations: the augmented label files and the images they describe.
label_dir = 'dataset/augmented_data/labels'
image_dir = 'dataset/augmented_data/images'

In [12]:
# Root folder for the preprocessed output, with one sub-folder per split.
preprocessed_dir = 'preprocessed_data'
train_dir, val_dir, test_dir = (
    os.path.join(preprocessed_dir, _split_name)
    for _split_name in ('train', 'val', 'test')
)

# Make sure every split folder exists; re-running the cell is harmless
# because exist_ok=True tolerates folders that are already there.
for _split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(_split_dir, exist_ok=True)

In [13]:
# Tunable constants for the preprocessing pipeline
BATCH_SIZE = 32
IMAGE_SIZE = (256, 256)  # target size passed to tf.image.resize

def preprocess_image(image):
    """Scale pixel values by 1/255 and resize the image to IMAGE_SIZE.

    Args:
        image: image tensor; it is cast to float32 before scaling.

    Returns:
        The float32 tensor produced by tf.image.resize at IMAGE_SIZE.
    """
    # Scaling happens before the resize, so interpolation runs on the
    # already-normalised values.
    normalised = tf.cast(image, tf.float32) / 255.0
    return tf.image.resize(normalised, IMAGE_SIZE)

In [14]:
def load_and_preprocess_image(image_path, label_path):
    """Read one JPEG from disk and run it through preprocess_image.

    Args:
        image_path: string tensor holding the path of the image file.
        label_path: path of the matching label file; passed through untouched.

    Returns:
        A tuple (image, label_path, file_name) where image is the
        preprocessed 3-channel image and file_name is image_path with
        everything up to and including the last '/' removed.
    """
    raw_bytes = tf.io.read_file(image_path)
    # channels=3 forces an RGB result
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    # The greedy pattern strips the directory part, leaving only the file name
    file_name = tf.strings.regex_replace(image_path, '.*/', '')
    return preprocess_image(decoded), label_path, file_name

In [15]:
def load_dataset(image_directory, label_directory):
    """Build a tf.data pipeline of (image, label_path, image_file_name) tuples.

    Images ('*.jpg') and labels ('*.json') are paired by their position in
    the two file listings.

    Args:
        image_directory: folder containing the '*.jpg' images.
        label_directory: folder containing the '*.json' label files.

    Returns:
        A tf.data.Dataset whose elements are produced by
        load_and_preprocess_image.
    """
    # list_files shuffles by default, and it would shuffle the two listings
    # independently, so zip() would pair each image with an arbitrary label.
    # shuffle=False keeps both listings in a deterministic order.
    # NOTE(review): positional pairing still assumes every image has exactly
    # one label file and that both folders list in the same relative order
    # (e.g. identical base names) - confirm against the dataset layout.
    image_paths = tf.data.Dataset.list_files(
        os.path.join(image_directory, '*.jpg'), shuffle=False)
    label_paths = tf.data.Dataset.list_files(
        os.path.join(label_directory, '*.json'), shuffle=False)
    # Pair each image path with the label path at the same position
    dataset = tf.data.Dataset.zip((image_paths, label_paths))
    # Decode and preprocess in parallel; map() keeps element order by default
    dataset = dataset.map(load_and_preprocess_image,
                          num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

In [16]:
# Split dataset into training, validation, and testing sets
def split_dataset(dataset, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=None):
    """Shuffle a dataset once and cut it into train/val/test subsets.

    Args:
        dataset: a dataset supporting len(), shuffle(), take() and skip()
            (e.g. a tf.data.Dataset with known cardinality; len() fails on
            datasets whose size is unknown).
        train_ratio: fraction of elements assigned to the training split.
        val_ratio: fraction of elements assigned to the validation split.
        test_ratio: fraction of elements assigned to the test split.
        seed: optional shuffle seed for a reproducible split.

    Returns:
        A tuple (train_dataset, val_dataset, test_dataset). Each size is
        int(ratio * len(dataset)), so a few trailing elements may be left
        out when the products are not whole numbers.
    """
    # Get dataset size
    dataset_size = len(dataset)
    # Shuffle once with a fixed order. By default shuffle() reshuffles on
    # every iteration, so the take/skip views below would each see a
    # different order and the splits would overlap. The splits are only
    # disjoint if the input dataset itself yields a stable order.
    # buffer_size is clamped to 1 so an empty dataset does not raise.
    dataset = dataset.shuffle(
        buffer_size=max(dataset_size, 1),
        seed=seed,
        reshuffle_each_iteration=False,
    )
    # Calculate split sizes
    train_size = int(train_ratio * dataset_size)
    val_size = int(val_ratio * dataset_size)
    test_size = int(test_ratio * dataset_size)
    # Carve consecutive, non-overlapping windows out of the shuffled order
    train_dataset = dataset.take(train_size)
    val_dataset = dataset.skip(train_size).take(val_size)
    test_dataset = dataset.skip(train_size + val_size).take(test_size)
    return train_dataset, val_dataset, test_dataset

In [17]:
# Build the paired image/label pipeline from the source folders
dataset = load_dataset(image_directory=image_dir, label_directory=label_dir)

# Cut it into train / validation / test subsets using the default ratios
train_dataset, val_dataset, test_dataset = split_dataset(dataset)

In [18]:
# Save preprocessed data for training, validation, and testing
def save_preprocessed_data(dataset, directory):
    """Write every image of a split to disk and copy its label file alongside.

    Images go to '<directory>/images' and labels to '<directory>/labels',
    both under their original file names.

    Args:
        dataset: iterable of (image, label_path, image_name) tuples, where
            image holds float pixel values in [0, 1] and the other two are
            string tensors.
        directory: output folder of the split; sub-folders are created as
            needed.
    """
    images_save_dir = os.path.join(directory, 'images')
    labels_save_dir = os.path.join(directory, 'labels')
    os.makedirs(images_save_dir, exist_ok=True)
    os.makedirs(labels_save_dir, exist_ok=True)
    for image, label_path, image_path in dataset:
        # String tensors come back as bytes; decode once and work with str
        image_name = os.path.basename(image_path.numpy().decode("utf-8"))
        label_src_path = label_path.numpy().decode("utf-8")
        label_name = os.path.basename(label_src_path)
        # Convert [0, 1] floats back to 8-bit pixels explicitly. save_img's
        # default scale=True min-max stretches each image on its own, which
        # changes contrast and writes a uniform image as all black.
        pixels = tf.image.convert_image_dtype(image, tf.uint8, saturate=True)
        tf.keras.preprocessing.image.save_img(
            os.path.join(images_save_dir, image_name), pixels.numpy(), scale=False)
        # Copy the label file unchanged under its original name
        shutil.copy(label_src_path, os.path.join(labels_save_dir, label_name))

# Write each split to its own output folder, in train / val / test order
for _split_data, _split_dir in (
    (train_dataset, train_dir),
    (val_dataset, val_dir),
    (test_dataset, test_dir),
):
    save_preprocessed_data(_split_data, _split_dir)