In [1]:
from pathlib import Path
import shutil

# Flatten the raw image tree into one folder per class:
# dataset/
# ├── class1/
# ├── class2/
# └── ...
#
# Every sub-directory of original_images/images is treated as a class. Its PNG
# files are searched recursively and copied (the originals stay in place) to
# dataset/<class>/<parent folder name>_<file name>.


dataset_dir = Path(".") / "dataset"
dataset_dir.mkdir(exist_ok=True)

# Sorted so classes are processed in the same order on every run.
classes = sorted(d for d in Path("original_images/images").glob("*") if d.is_dir())

max_sample_count_per_class = None # set to None to copy all samples

for _class in classes:
    class_name = _class.name
    class_dir = dataset_dir / class_name
    class_dir.mkdir(exist_ok=True)

    # Directory traversal order is not guaranteed to be stable, so sort the
    # matches before applying the cap; otherwise a capped run could select a
    # different subset of images each time. Non-file matches are ignored.
    images = sorted(img for img in _class.rglob("*.png") if img.is_file())
    if max_sample_count_per_class is not None:
        # Clamp at 0 so a negative cap copies nothing instead of slicing from the end.
        images = images[:max(max_sample_count_per_class, 0)]

    for img in images:
        # Prefix with the immediate parent folder name to keep files from
        # different sub-folders apart once they share a single class folder.
        parent_name = img.parent.name
        shutil.copy(img, class_dir / (parent_name + "_" + img.name))


### Split the dataset into train, test and validation

In [2]:
from sklearn.model_selection import train_test_split

# Destination roots for the three splits; each receives one sub-folder per class.
train_dir = Path("dataset_split/train")
test_dir = Path("dataset_split/test")
val_dir = Path("dataset_split/val")

for dir_path in [train_dir, test_dir, val_dir]:
    dir_path.mkdir(parents=True, exist_ok=True)

train_ratio = 0.6
test_ratio = 0.2
val_ratio = 0.2
# val_ratio is only used as part of a denominator below, so a typo in any of
# the three values would otherwise silently change the split proportions.
assert abs(train_ratio + test_ratio + val_ratio - 1.0) < 1e-9, "train/test/val ratios must sum to 1"

# Fixed seed: together with the sorted file list below, the same files are
# assigned to the same split on every run of this cell.
split_seed = 42

for class_dir in sorted(dataset_dir.iterdir()):
    if not class_dir.is_dir():
        continue

    # Sort so the input order handed to train_test_split is stable across runs.
    files = sorted(class_dir.iterdir())

    try:
        # First carve off the train portion, then divide the remainder
        # between validation and test according to their relative ratios.
        train_files, temp_files = train_test_split(
            files, test_size=(1 - train_ratio), random_state=split_seed
        )
        val_files, test_files = train_test_split(
            temp_files,
            test_size=(test_ratio / (test_ratio + val_ratio)),
            random_state=split_seed,
        )
    except ValueError as err:
        # train_test_split raises ValueError when there are too few samples to
        # fill both sides of a split. This also happens when the cell is
        # re-run, because the files were already moved out of dataset/.
        # Nothing has been moved for this class yet, so it is left untouched.
        print(f"Skipping class '{class_dir.name}' ({len(files)} files): {err}")
        continue

    split_assignments = (
        (train_dir, train_files),
        (test_dir, test_files),
        (val_dir, val_files),
    )
    for split_dir, split_files in split_assignments:
        target_dir = split_dir / class_dir.name
        target_dir.mkdir(parents=True, exist_ok=True)
        for file in split_files:
            # Files are moved, not copied: dataset/<class>/ is emptied as a result.
            shutil.move(str(file), str(target_dir / file.name))