In [2]:
import os
import json
import shutil
import random
from sklearn.model_selection import train_test_split

# Paths
data_folder = "data/archive/images"  # Folder containing the source images
output_folder = "data/high_quality"  # Output folder for train, valid, test subfolders

# Create one output directory per split (no error if it already exists)
for subset in ("train", "valid", "test"):
    os.makedirs(os.path.join(output_folder, subset), exist_ok=True)

# Load the JSON metadata. Note that all_data.json is read from the PARENT of
# the images folder ("../"), not from the images folder itself.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (JSON is conventionally UTF-8).
with open(os.path.join(data_folder, "../all_data.json"), "r", encoding="utf-8") as file:
    all_data = json.load(file)

# Convert JSON dictionary to a list of (key, data) tuples so it can be split
data_items = list(all_data.items())

# Split data into train, validation, and test sets.
# First hold out 20% for test; then take 25% of the remaining 80% for
# validation (0.25 * 0.8 = 0.2), giving a 60/20/20 split overall.
# The fixed random_state keeps the split reproducible between runs.
train_items, test_items = train_test_split(data_items, test_size=0.2, random_state=42)
train_items, valid_items = train_test_split(train_items, test_size=0.25, random_state=42)

# Helper function to save a subset
def save_subset(items, subset_name, src_dir=None, dest_root=None):
    """Write one data split to disk: its metadata JSON plus its image files.

    Args:
        items: Sequence of ``(key, data)`` pairs, where each ``data`` is a
            mapping that has a ``"file_name"`` entry giving the image path
            relative to the source folder.
        subset_name: Name of the sub-folder to write into (e.g. ``"train"``).
        src_dir: Folder the images are copied from. Defaults to the
            module-level ``data_folder``.
        dest_root: Folder under which ``subset_name`` is created. Defaults to
            the module-level ``output_folder``.

    Raises:
        KeyError: If an item's data has no ``"file_name"`` entry.
        FileNotFoundError: If a referenced source image does not exist.
    """
    # Only touch the module-level paths when the caller gave no override
    src_dir = data_folder if src_dir is None else src_dir
    dest_root = output_folder if dest_root is None else dest_root

    # Make the function self-sufficient: do not rely on the destination
    # folder having been created elsewhere
    subset_dir = os.path.join(dest_root, subset_name)
    os.makedirs(subset_dir, exist_ok=True)

    # Save subset JSON (rebuild the key -> data mapping from the pairs)
    subset_data = dict(items)
    with open(os.path.join(subset_dir, "all_data.json"), "w", encoding="utf-8") as f:
        json.dump(subset_data, f, indent=4)

    # Copy images to the subset folder
    for _, item in items:
        file_name = item["file_name"]
        dest_img_path = os.path.join(subset_dir, file_name)
        # file_name may include sub-directories; create them before copying
        os.makedirs(os.path.dirname(dest_img_path), exist_ok=True)
        # copy2 also attempts to preserve file metadata such as timestamps
        shutil.copy2(os.path.join(src_dir, file_name), dest_img_path)

# Write each split to its own sub-folder: training first, then validation,
# then test
for split_name, split_items in (
    ("train", train_items),
    ("valid", valid_items),
    ("test", test_items),
):
    save_subset(split_items, split_name)

print("Data has been split and organized successfully.")


Data has been split and organized successfully.
