In [16]:
import os
import shutil
import random

In [17]:
# Seed Python's global RNG so every random.shuffle call draws the same sequence on each run
SEED = 42
random.seed(SEED)

In [18]:
# Directory that is listed and copied from when building the split
source_dir = "/exchange/dspro01/group3/data/images"

In [19]:
# Root directory of the split output; the train/val/test folders live beneath this path
data_dir = "/exchange/dspro01/group3/data/img_split"

In [20]:
# Destination directories, derived from data_dir so the split root is defined
# in exactly one place (the resulting paths are unchanged).
train_dir = os.path.join(data_dir, "train")
val_dir = os.path.join(data_dir, "val")
test_dir = os.path.join(data_dir, "test")

# Split ratios
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# The split loop gives the test set "whatever is left", so test_ratio is only
# honoured when the three shares add up to 1. Fail early if they are edited
# inconsistently (a tolerance is needed because the values are floats).
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, (
    "train_ratio + val_ratio + test_ratio must sum to 1"
)

In [21]:
# Number of consecutive entries of the image listing that are treated as one category
images_per_category = 3000

# Make sure the three split roots exist; directories that are already there are left alone
for split_root in (train_dir, val_dir, test_dir):
    os.makedirs(split_root, exist_ok=True)

In [22]:
# List all images in the source directory.
# Directory listings come back in arbitrary order, but the split assigns
# categories purely by position (consecutive windows of images_per_category
# names) and shuffles with a fixed seed. Sorting the names makes both the
# category windows and the shuffled result the same on every run.
# NOTE(review): this assumes the file names sort so that each category is
# contiguous -- confirm against the naming scheme of the data set.
with os.scandir(source_dir) as entries:
    # Keep regular files only: a sub-directory in the listing would make
    # shutil.copy fail later and would shift the category windows.
    all_images = sorted(entry.name for entry in entries if entry.is_file())
total_images = len(all_images)
num_categories = total_images // images_per_category

# Integer division silently drops an incomplete trailing window; make that visible.
leftover_images = total_images % images_per_category
if leftover_images:
    print(
        f"Warning: {leftover_images} image(s) do not fill a complete category "
        "and will not be copied."
    )

In [23]:
# Report the two totals that drive the per-category split, one per line
print(
    f'Nr of images = {total_images}',
    f'Nr of categories = {num_categories}',
    sep="\n",
)

Nr of images = 36000
Nr of categories = 12


In [24]:
# Split every category window into train / validation / test and copy the files
for category_index in range(num_categories):
    # Category k occupies positions [k * images_per_category, (k + 1) * images_per_category)
    start_index = images_per_category * category_index
    end_index = start_index + images_per_category

    # The slice is a fresh list, so shuffling it in place leaves all_images untouched
    category_images = all_images[start_index:end_index]
    random.shuffle(category_images)

    # Train and validation sizes are truncated to whole images; test receives the rest
    num_images = len(category_images)
    train_size = int(num_images * train_ratio)
    val_size = int(num_images * val_ratio)
    test_size = num_images - train_size - val_size
    val_end = train_size + val_size

    train_images = category_images[:train_size]
    val_images = category_images[train_size:val_end]
    test_images = category_images[val_end:]

    # Categories are numbered from 1 in the folder names
    category_name = f"category_{category_index + 1}"
    train_category_dir = os.path.join(train_dir, category_name)
    val_category_dir = os.path.join(val_dir, category_name)
    test_category_dir = os.path.join(test_dir, category_name)

    # Pair each destination folder with the file names assigned to it
    copy_plan = (
        (train_category_dir, train_images),
        (val_category_dir, val_images),
        (test_category_dir, test_images),
    )

    # Create all three category folders before any file is copied
    for target_dir, _ in copy_plan:
        os.makedirs(target_dir, exist_ok=True)

    # Copy each image into the folder of the split it was assigned to
    for target_dir, file_names in copy_plan:
        for file_name in file_names:
            shutil.copy(
                os.path.join(source_dir, file_name),
                os.path.join(target_dir, file_name),
            )

print("Data has been successfully organized into train, validation, and test sets by category.")


Data has been successfully organized into train, validation, and test sets by category.
