In [1]:
import os
import random
import shutil

In [2]:
def split_dataset(input_root, train_root, test_root, train_ratio=0.9, seed=None):
    """
    Splits a folder with subfolders and files into train and test sets, preserving the folder structure.

    Every file found under ``input_root`` is copied (not moved) into either
    ``train_root`` or ``test_root`` at the same relative path. The split is done
    over the flat list of all files, so it is not stratified per subfolder.

    Files that already exist under ``train_root`` / ``test_root`` are never removed.
    Re-running with a different shuffle therefore adds to the previous output and
    can place the same file in both sets; pass ``seed`` to make re-runs repeatable.

    Args:
        input_root (str): Path to the root folder containing the original dataset.
        train_root (str): Path to the root folder to save the training dataset.
        test_root (str): Path to the root folder to save the testing dataset.
        train_ratio (float): Ratio of files to include in the training set (default: 0.9).
            The number of training files is ``int(n_files * train_ratio)`` (rounded down).
        seed (int, optional): Seed for the shuffle. With a seed, the same input folder
            always yields the same split. With ``None`` (default) the global ``random``
            module state is used, as before.

    Raises:
        ValueError: If ``train_ratio`` is not within [0, 1].
        NotADirectoryError: If ``input_root`` does not exist or is not a directory.
    """
    # A negative ratio would produce a negative slice index (silently putting most
    # files in train) and a ratio above 1 would silently leave test empty.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # os.walk ignores a missing root and yields nothing, which would otherwise
    # report a "successful" split of zero files for a mistyped path.
    if not os.path.isdir(input_root):
        raise NotADirectoryError(
            f"input_root does not exist or is not a directory: {input_root!r}"
        )

    # Collect all files with their paths relative to input_root
    all_files = [
        os.path.relpath(os.path.join(root, file), input_root)
        for root, _, files in os.walk(input_root)
        for file in files
    ]

    # os.walk returns entries in filesystem order; sort first so the shuffle
    # always starts from the same sequence.
    all_files.sort()

    # Shuffle files and split into train and test
    if seed is None:
        random.shuffle(all_files)
    else:
        # A private generator keeps the global random state untouched.
        random.Random(seed).shuffle(all_files)
    split_index = int(len(all_files) * train_ratio)
    train_files = all_files[:split_index]
    test_files = all_files[split_index:]

    # Copy files to train and test directories
    for relative_path in train_files:
        copy_file_to_output(input_root, train_root, relative_path)

    for relative_path in test_files:
        copy_file_to_output(input_root, test_root, relative_path)

    print(f"Dataset split completed: {len(train_files)} files in train, {len(test_files)} files in test.")
# end split_dataset

def copy_file_to_output(input_root, output_root, relative_path):
    """
    Copy one file from the input tree into the output tree at the same relative location.

    Any missing parent folders of the destination are created first.

    Args:
        input_root (str): Root directory of the original dataset.
        output_root (str): Root directory for the output dataset.
        relative_path (str): Path of the file relative to the input root.
    """
    # Resolve the same relative location under both roots
    source, destination = (
        os.path.join(base, relative_path) for base in (input_root, output_root)
    )

    # Make sure the destination's parent folders exist (no error if they already do)
    destination_dir = os.path.dirname(destination)
    os.makedirs(destination_dir, exist_ok=True)

    # copy2 copies the contents and also tries to keep metadata such as the modification time
    shutil.copy2(source, destination)
# end copy_file_to_output

In [None]:
# Example usage
# Seed the global random module right before splitting: split_dataset shuffles the
# file list, and files are copied on top of whatever is already in the train/test
# folders. Without a fixed seed, re-running this cell would produce a different
# split each time and the same file could end up in both sets.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

input_root = '/media/datadisk/datasets/hooktheory_xmls_transposed'
train_root = '/media/datadisk/datasets/hooktheory_train'
test_root = '/media/datadisk/datasets/hooktheory_test'
split_dataset(input_root, train_root, test_root, train_ratio=0.9)

Dataset split completed: 13680 files in train, 1520 files in test.
