In [2]:
import sys
import os

# Get the project root directory (parent of the notebooks folder).
# NOTE(review): this is derived from the kernel's current working directory,
# so it is only correct when the notebook is run from the notebooks folder.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the project root (not a `src` subfolder) to the Python path --
# presumably so that packages living under it can be imported; confirm.
# The membership guard keeps sys.path from growing every time this cell is
# re-run in the same kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

In [1]:
import os
import shutil
from sklearn.model_selection import train_test_split

full_data_dir = '../data/full_data/street-location-images/data_mapped/'

# Destination directories for processed data
train_dir = '../data/full_data/train/'
val_dir = '../data/full_data/val/'
test_dir = '../data/full_data/test/'

# Create processed directories if they don't exist
for dir_path in [train_dir, val_dir, test_dir]:
    os.makedirs(dir_path, exist_ok=True)

# List the source directory once. os.listdir returns entries in arbitrary,
# filesystem-dependent order, so sort them: without a stable input order the
# fixed random_state below would not produce the same split on every machine.
all_files = sorted(os.listdir(full_data_dir))
image_files = [f for f in all_files if f.endswith('.png')]
label_files = [f for f in all_files if f.endswith('.json')]

# Map base filename (without extension) -> (image_filename, label_filename),
# keeping only images that have a matching JSON label. The label name is built
# from the splitext base so it always agrees with the dictionary key, and the
# lookup goes through a set so pairing stays linear in the number of files.
label_lookup = set(label_files)
file_pairs = {}
for image_name in image_files:
    base_name = os.path.splitext(image_name)[0]
    label_name = base_name + '.json'
    if label_name in label_lookup:
        file_pairs[base_name] = (image_name, label_name)

# Fail with an actionable message instead of an opaque unpacking error. This
# is the state the source directory is in after this notebook has already
# been run once, because the files are moved rather than copied.
if not file_pairs:
    raise ValueError(
        f"No matching .png/.json pairs found in {full_data_dir!r}; "
        "the data may already have been split and moved."
    )

# Extract the image and label filenames as two aligned tuples
image_files, label_files = zip(*file_pairs.values())

# Split into train, validation, and test sets (80% train, 10% validation, 10% test):
# first hold out 20%, then cut that hold-out in half. Passing images and labels
# together keeps each image aligned with its label in every subset.
train_images, temp_images, train_labels, temp_labels = train_test_split(
    image_files, label_files, test_size=0.2, random_state=42
)
val_images, test_images, val_labels, test_labels = train_test_split(
    temp_images, temp_labels, test_size=0.5, random_state=42
)

# Sanity check: every pair landed in exactly one subset.
assert len(train_images) + len(val_images) + len(test_images) == len(image_files)
assert len(train_labels) + len(val_labels) + len(test_labels) == len(label_files)

# Helper function to move files
def move_files(file_list, src_dir, dest_dir):
    """Move each named file from ``src_dir`` into ``dest_dir``.

    Args:
        file_list: Iterable of bare file names (no directory component).
        src_dir: Directory the files currently live in.
        dest_dir: Directory the files are moved to, keeping the same name.

    Returns:
        None. Files are relocated one at a time with ``shutil.move``, in the
        order given, so an error part-way through leaves the earlier files
        already moved.
    """
    for name in file_list:
        shutil.move(os.path.join(src_dir, name), os.path.join(dest_dir, name))

# Destination folder for each subset, in train / val / test order.
split_destinations = (train_dir, val_dir, test_dir)

# Move the image files of every subset first ...
for image_batch, destination in zip((train_images, val_images, test_images), split_destinations):
    move_files(image_batch, full_data_dir, destination)

# ... then the label files, each into the same folder as its images.
for label_batch, destination in zip((train_labels, val_labels, test_labels), split_destinations):
    move_files(label_batch, full_data_dir, destination)

print("Data split and moved successfully!")


Data split and moved successfully!
