In [1]:
# Split the dataset into train and test folders
# with the same class distribution as the original dataset

import os
import shutil
import random
import pandas as pd


In [2]:

# Seed Python's built-in RNG so any use of the `random` module is reproducible.
random.seed(42)

# Root of the original dataset: one sub-folder per class.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
original_dataset_dir = '/Users/rz20505/Documents/training_year/applied_data_science/data/uob_image_set_resized/'

# Root folder that receives the train/test split.
base_dir = '/Users/rz20505/Documents/training_year/applied_data_science/data/processed/folder_as_class/train_test_split/'

# Start from a clean output tree so files from a previous run cannot linger
# in the new split. WARNING: this deletes everything under base_dir.
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)

# Locations of the two splits. The test split is stored in a folder named 'val'.
train_dir = os.path.join(base_dir, 'train')
test_dir = os.path.join(base_dir, 'val')

# os.makedirs creates base_dir and any missing parent folders as well;
# os.mkdir raised FileNotFoundError when an intermediate folder of base_dir
# did not exist yet. exist_ok=True keeps the call safe to repeat.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)



In [3]:
# Create one sub-folder per class inside the train and test directories.
# Each visible sub-folder of the original dataset directory is its own class.

# Keep only real directories with a non-hidden name: a stray top-level file
# would otherwise be treated as a class and break the later os.listdir() call
# on it. os.listdir returns entries in arbitrary order, so sort to make the
# class order deterministic.
classes = sorted(
    entry
    for entry in os.listdir(original_dataset_dir)
    if not entry.startswith('.')
    and os.path.isdir(os.path.join(original_dataset_dir, entry))
)

for class_name in classes:
    # Folder for this class in the train split
    train_class_dir = os.path.join(train_dir, class_name)
    os.makedirs(train_class_dir, exist_ok=True)

    # Folder for this class in the test split
    test_class_dir = os.path.join(test_dir, class_name)
    os.makedirs(test_class_dir, exist_ok=True)



In [4]:
# Index every image of the original dataset: one row per file, holding the
# file name and the class (folder) it belongs to.

# Collect plain records first and build the dataframe once at the end;
# concatenating a one-row frame per image copies the whole frame each time.
image_records = []

for class_name in classes:
    class_dir = os.path.join(original_dataset_dir, class_name)

    # os.listdir returns entries in arbitrary order; sorting fixes the row
    # order so that seeded sampling of this frame picks the same files on
    # every machine.
    images = sorted(os.listdir(class_dir))

    for image in images:
        # Skip hidden entries (e.g. macOS .DS_Store) and anything that is not
        # a regular file -- neither is an image, and a nested folder would
        # make shutil.copyfile fail when the split is copied.
        if image.startswith('.') or not os.path.isfile(os.path.join(class_dir, image)):
            continue
        image_records.append({'image_name': image, 'class': class_name})

# Passing `columns` keeps both columns present even when no image was found.
dataset = pd.DataFrame(image_records, columns=['image_name', 'class'])

In [5]:
# Split the dataset into train and test, class by class, using an 80/20 split.
# - A class with two or more images gets at least one image on each side.
# - A class with a single image ends up entirely in the test set.
# - A class without images contributes nothing to either set.

# Per-class pieces are collected here and concatenated once after the loop.
train_parts = []
test_parts = []

for class_name in classes:
    # All rows belonging to this class
    images = dataset[dataset['class'] == class_name]
    num_images = len(images)

    if num_images == 0:
        # Nothing to split. Without this guard the fallback below would call
        # sample(n=1) on an empty frame and raise ValueError.
        continue

    # 20% (rounded down) go to the test set, the rest to the train set
    num_test_images = int(num_images * 0.2)
    num_train_images = num_images - num_test_images

    # Draw the train rows; whatever is left over forms the test rows
    train_images = images.sample(n=num_train_images, random_state=42)
    test_images = images.drop(train_images.index)

    if len(test_images) == 0:
        # Fewer than five images round down to an empty test set:
        # move one image from the train set to the test set.
        test_images = train_images.sample(n=1, random_state=42)
        train_images = train_images.drop(test_images.index)

    # Only keep non-empty pieces (the train piece is empty for a
    # single-image class) so pd.concat never sees empty frames.
    if len(train_images) > 0:
        train_parts.append(train_images)
    if len(test_images) > 0:
        test_parts.append(test_images)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no class produced any rows.
split_columns = ['image_name', 'class']
train = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame(columns=split_columns)
test = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame(columns=split_columns)

In [6]:
# Materialise the split on disk: copy every image from its original class
# folder into the matching class folder of the train or test directory.

for class_name in classes:
    # File names assigned to this class in each split
    train_images = train.loc[train['class'] == class_name, 'image_name'].tolist()
    test_images = test.loc[test['class'] == class_name, 'image_name'].tolist()

    # Report classes that are missing from one side of the split
    if not train_images:
        print('No train images for class: {}'.format(class_name))

    if not test_images:
        print('No test images for class: {}'.format(class_name))

    # Source folder is the same for both splits; train files are copied
    # first, then test files.
    class_src_dir = os.path.join(original_dataset_dir, class_name)
    for split_dir, split_images in ((train_dir, train_images), (test_dir, test_images)):
        class_dst_dir = os.path.join(split_dir, class_name)
        for image in split_images:
            shutil.copyfile(
                os.path.join(class_src_dir, image),
                os.path.join(class_dst_dir, image),
            )

In [7]:
# Sanity check: report every class whose folder in the train or test
# directory contains no entries.

for class_name in classes:
    train_class_path = os.path.join(train_dir, class_name)
    test_class_path = os.path.join(test_dir, class_name)

    # List both folders before reporting anything
    train_images = os.listdir(train_class_path)
    test_images = os.listdir(test_class_path)

    train_is_empty = not train_images
    test_is_empty = not test_images

    if train_is_empty:
        print('The train directory for the class {} is empty'.format(class_name))
    if test_is_empty:
        print('The test directory for the class {} is empty'.format(class_name))