In [13]:
import os
import shutil
import random
from typing import Dict
import pandas as pd


In [14]:
class DatasetCSVSplitter:
    def __init__(
        self,
        source_dir: str,
        split_ratio: Dict[str, float],
        seed: int = 42
    ):
        self.source_dir = source_dir
        self.split_ratio = split_ratio
        random.seed(seed)

    def split(self):
        records = []

        classes = [
            d for d in os.listdir(self.source_dir)
            if os.path.isdir(os.path.join(self.source_dir, d))
        ]

        for cls in classes:
            cls_path = os.path.join(self.source_dir, cls)
            images = [
                img for img in os.listdir(cls_path)
                if img.lower().endswith((".png", ".jpg", ".jpeg"))
            ]

            random.shuffle(images)
            total = len(images)

            train_end = int(total * self.split_ratio["train"])
            test_end = train_end + int(total * self.split_ratio["test"])

            splits = {
                "train": images[:train_end],
                "test": images[train_end:test_end],
                "eval": images[test_end:]
            }

            for split, imgs in splits.items():
                for img in imgs:
                    records.append({
                        "image_path": os.path.join(cls_path, img),
                        "class": cls,
                        "split": split
                    })

        return pd.DataFrame(records)


In [15]:
SOURCE_DIR = "/kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam"

SPLIT_RATIO = {
    "train": 0.7,
    "test": 0.2,
    "eval": 0.1
}

splitter = DatasetCSVSplitter(
    source_dir=SOURCE_DIR,
    split_ratio=SPLIT_RATIO
)

df_splits = splitter.split()
df_splits.head()


Unnamed: 0,image_path,class,split
0,/kaggle/input/spacenet-an-optimally-distribute...,planet,train
1,/kaggle/input/spacenet-an-optimally-distribute...,planet,train
2,/kaggle/input/spacenet-an-optimally-distribute...,planet,train
3,/kaggle/input/spacenet-an-optimally-distribute...,planet,train
4,/kaggle/input/spacenet-an-optimally-distribute...,planet,train


In [17]:
os.listdir("/kaggle/working/dataset/train")



['planet',
 'black hole',
 'asteroid',
 'constellation',
 'star',
 'nebula',
 'comet',
 'galaxy']