In [4]:
# The name of this file is train-test-split, not train-validation-split, because train-test-split is the common phrase
# This cell imports all the libraries used in this notebook
import pandas as pd
import numpy as np
import os
import shutil

In [2]:
# Split the images of 20241009_image_input_60 into a training set and a validation set.
# The split is done on image indexes (1..N_IMAGES); the files themselves are moved in a later cell.
# Naming note: the second part is called "test" here, but it is used as the validation set.

N_IMAGES = 123930      # number of images in 20241009_image_input_60
TRAIN_FRACTION = 0.7   # share of the indexes assigned to the training set
SEED = 42              # fixed seed so that Restart & Run All reproduces the same split

# Index array 1..N_IMAGES (inclusive); this is what gets shuffled.
array = np.arange(1, N_IMAGES + 1)

# Shuffle in place with a locally seeded generator rather than the global NumPy state,
# so the result does not depend on whatever else has touched np.random in this kernel.
# NOTE: a split produced by an earlier, unseeded run will not match this one.
rng = np.random.default_rng(SEED)
rng.shuffle(array)

# Position at which the shuffled indexes are cut (truncated towards zero)
split_index = int(len(array) * TRAIN_FRACTION)

# Everything before the cut is training, everything after is validation ("test")
train_array = array[:split_index]
test_array = array[split_index:]

# Sanity check: the two parts together cover every index exactly once
assert len(train_array) + len(test_array) == N_IMAGES

# Print the sizes of the splits
print(f"Train array size: {len(train_array)}")
print(f"Test array size: {len(test_array)}")

Train array size: 86751
Test array size: 37179


In [3]:
# Sanity check: display train_array to confirm the indexes were shuffled
# (NumPy elides the middle of a long array with "..." when displaying it)
train_array

array([ 79138,  39208, 117643, ...,   2777,   8400,   9662])

In [5]:
# Train-validation split of the image files.
# Each image is moved from image_folder into the training or the validation folder,
# depending on which index set (built in the split cell) its index belongs to.
# Naming note: test_array / test_set hold the indexes of the *validation* images.

training_folder = "training_data"
validation_folder = "validation_data"
image_folder = "20241009_image_input_60"

# exist_ok=True makes folder creation safe to re-run when the folders already exist
os.makedirs(training_folder, exist_ok=True)
os.makedirs(validation_folder, exist_ok=True)

# Convert arrays to sets for faster lookup
train_set = set(train_array)
test_set = set(test_array)

# Count what is actually moved instead of assuming every index has exactly one file
moved_train = 0
moved_validation = 0
skipped = []  # entries that could not be assigned to either folder

# Move images to their folders based on their indexes
for img in os.listdir(image_folder):
    # The index is the part of the filename before the first underscore
    index_part = img.split('_')[0]
    try:
        index = int(index_part)
    except ValueError:
        # Not an indexed image (e.g. a hidden or system entry): leave it in place
        # rather than aborting with the dataset half-moved.
        skipped.append(img)
        continue

    source_path = os.path.join(image_folder, img)
    if index in train_set:
        shutil.move(source_path, os.path.join(training_folder, img))
        moved_train += 1
    elif index in test_set:
        shutil.move(source_path, os.path.join(validation_folder, img))
        moved_validation += 1
    else:
        # Index outside the range covered by the split: leave the file in place
        skipped.append(img)

print(f"Moved {moved_train} images to the training folder.")
print(f"Moved {moved_validation} images to the validation folder.")
if skipped:
    print(f"Skipped {len(skipped)} entries left in {image_folder}: {skipped[:10]}")


Moved 86751 images to the training folder.
Moved 37179 images to the validation folder.
