In [1]:
# This notebook is named train-test-split, but what it actually does is split the images into a training set and a validation set
import pandas as pd
import numpy as np
import os
import shutil

In [2]:
# Split the images in 20241005_image into a training set and a validation set.
# Only the image indexes are split here; the files themselves are moved in a later cell.

NUM_IMAGES = 410147    # number of images; indexes run from 1 to NUM_IMAGES inclusive
TRAIN_FRACTION = 0.7   # share of the indexes assigned to the training set
SEED = 42              # fixed seed so the same split is produced on every run

# Build the index array 1..NUM_IMAGES and shuffle it in place.
# A seeded generator is used because files are physically moved based on this
# split, so an unseeded shuffle could never be reproduced after a kernel restart.
array = np.arange(1, NUM_IMAGES + 1)
rng = np.random.default_rng(SEED)
rng.shuffle(array)

# Position in the shuffled array where the training part ends
split_index = int(len(array) * TRAIN_FRACTION)

# Split the shuffled indexes.
# NOTE: test_array holds the indexes of the validation set; the name is kept
# because the later cells refer to it.
train_array = array[:split_index]
test_array = array[split_index:]

# Sanity check: the two parts together must cover every index exactly once
assert len(train_array) + len(test_array) == NUM_IMAGES

# Print the sizes of the splits
print(f"Train array size: {len(train_array)}")
print(f"Test array size: {len(test_array)}")

Train array size: 287102
Test array size: 123045


In [3]:
# Inspect train_array: the first 70% of the shuffled index array (the training indexes)
train_array

array([ 80556, 174887, 133979, ...,  25676, 285013, 132346])

In [4]:
# Inspect test_array: the remaining shuffled indexes, used below to fill the validation folder
test_array

array([243701, 106699,  82642, ..., 222865, 317490, 341247])

In [5]:
# Train/validation split on disk: move every image out of image_folder into
# the training or validation folder, depending on which index array contains
# the index encoded at the start of its filename ("<index>_...").

image_folder = "20241005_image"
training_folder = "training_data"
validation_folder = "validation_data"

# exist_ok=True keeps folder creation safe when the cell is re-run
os.makedirs(training_folder, exist_ok=True)
os.makedirs(validation_folder, exist_ok=True)

# Convert arrays to sets for fast membership tests
train_set = set(train_array)
test_set = set(test_array)

# Count what is actually moved instead of assuming every index has exactly
# one file: on a re-run (source folder already emptied) or with missing or
# duplicate files, the array lengths would report the wrong numbers.
moved_train = 0
moved_validation = 0
skipped = []  # entries with no parsable index, or an index in neither set

for img in os.listdir(image_folder):
    # The index is the part of the filename before the first underscore
    index_part = img.split('_')[0]
    try:
        index = int(index_part)
    except ValueError:
        # Stray entry (e.g. a hidden system file): leave it in place rather
        # than aborting with the images only partially moved
        skipped.append(img)
        continue

    # Pick the destination folder for this index
    if index in train_set:
        destination = training_folder
    elif index in test_set:
        destination = validation_folder
    else:
        skipped.append(img)
        continue

    shutil.move(os.path.join(image_folder, img), os.path.join(destination, img))
    if destination == training_folder:
        moved_train += 1
    else:
        moved_validation += 1

print(f"Moved {moved_train} images to the training folder.")
print(f"Moved {moved_validation} images to the validation folder.")
if skipped:
    print(f"Skipped {len(skipped)} entries left in {image_folder} (first few: {skipped[:5]}).")

Moved 287102 images to the training folder.
Moved 123045 images to the validation folder.
