In [1]:
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# Source dataset root; handed to DatasetPartitioner as input_dir, which treats
# every immediate sub-folder as one class.
DATASET_PATH = "/kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam"

# Destination of the generated train/test/eval tree. DatasetPartitioner deletes
# and recreates this directory on every run.
OUTPUT_PATH = Path("/kaggle/working/celestial_dataset")

# Split proportions, one constant per line so each can be tuned independently.
TRAIN_RATIO = 0.7
TEST_RATIO = 0.2
EVAL_RATIO = 0.1

# Seed forwarded to the shuffling splits.
SEED = 42

In [3]:
class DatasetPartitioner:
    """Split a folder-per-class dataset into train/test/eval trees of symlinks.

    ``input_dir`` is expected to contain one sub-directory per class. Calling
    :meth:`organize` rebuilds ``output_dir`` as ``<split>/<class>/<file>``,
    where every entry is a symbolic link to the original file (nothing is
    copied).
    """

    SPLITS = ("train", "test", "eval")

    def __init__(
        self,
        input_dir: "str | os.PathLike[str]",
        output_dir: "str | os.PathLike[str]",
    ):
        """Store both locations as ``Path`` objects; no filesystem access happens here."""
        self.input_dir = Path(input_dir)
        self.output_dir = Path(output_dir)

    @staticmethod
    def _check_ratios(train_r, test_r, eval_r):
        """Reject ratios that cannot produce three non-degenerate splits.

        Raises:
            ValueError: if ``train_r`` is not strictly between 0 and 1, or if
                ``test_r`` / ``eval_r`` is not strictly positive (a zero sum
                would otherwise surface as ``ZeroDivisionError``).
        """
        if not 0 < train_r < 1:
            raise ValueError(
                f"train_r must be strictly between 0 and 1, got {train_r!r}"
            )
        if test_r <= 0 or eval_r <= 0:
            raise ValueError(
                "test_r and eval_r must both be positive, "
                f"got test_r={test_r!r}, eval_r={eval_r!r}"
            )

    def _check_directories(self):
        """Refuse input/output locations that overlap.

        ``_reset_output`` deletes ``output_dir`` recursively, so an output that
        equals or contains the input would destroy the source data, and an
        output nested inside the input would be listed as a class folder.

        Raises:
            ValueError: if either directory equals or contains the other.
        """
        src = self.input_dir.resolve()
        dst = self.output_dir.resolve()
        if src == dst or src in dst.parents or dst in src.parents:
            raise ValueError(
                f"input_dir ({src}) and output_dir ({dst}) must not overlap"
            )

    def _reset_output(self):
        """Delete ``output_dir`` if it exists, then recreate it empty."""
        if self.output_dir.exists():
            shutil.rmtree(self.output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _list_classes(self):
        """Return the sorted names of the immediate sub-directories of ``input_dir``."""
        return sorted(
            folder.name
            for folder in self.input_dir.iterdir()
            if folder.is_dir()
        )

    def _prepare_folders(self, classes):
        """Create ``output_dir/<split>/<class>`` for every split and class."""
        for split in self.SPLITS:
            for cls in classes:
                (self.output_dir / split / cls).mkdir(parents=True, exist_ok=True)

    def _partition_files(self, files, train_r, test_r, eval_r, seed):
        """Shuffle ``files`` and divide them into train/test/eval lists.

        ``train_r`` fixes the train share; the remaining files are divided
        between test and eval in the proportion ``test_r : eval_r``.

        Args:
            files: sequence of items to split (may be empty).
            train_r: fraction of ``files`` assigned to train, in (0, 1).
            test_r: relative weight of the test split, > 0.
            eval_r: relative weight of the eval split, > 0.
            seed: ``random_state`` passed to both shuffling splits.

        Returns:
            dict with keys ``"train"``, ``"test"`` and ``"eval"``.

        Raises:
            ValueError: for invalid ratios. ``train_test_split`` may also raise
                it when there are too few files for every split to be
                non-empty.
        """
        self._check_ratios(train_r, test_r, eval_r)

        # An empty class folder has nothing to split; return empty splits
        # instead of letting train_test_split fail halfway through a run.
        if len(files) == 0:
            return {"train": [], "test": [], "eval": []}

        train_set, temp_set = train_test_split(
            files,
            test_size=(1 - train_r),
            random_state=seed,
            shuffle=True
        )

        # Share of the held-out remainder that goes to the test split.
        test_fraction = test_r / (test_r + eval_r)
        test_set, eval_set = train_test_split(
            temp_set,
            train_size=test_fraction,
            random_state=seed,
            shuffle=True
        )

        return {
            "train": train_set,
            "test": test_set,
            "eval": eval_set
        }

    def organize(self, train_r=0.7, test_r=0.2, eval_r=0.1, seed=42):
        """Rebuild ``output_dir`` with symlinked train/test/eval splits per class.

        All validation and the class listing happen before the old output is
        deleted, so a bad argument or a missing ``input_dir`` cannot wipe a
        previously generated tree. One summary line per class is printed.

        Raises:
            ValueError: for invalid ratios or overlapping directories.
            OSError: if ``input_dir`` cannot be listed or a link cannot be made.
        """
        self._check_ratios(train_r, test_r, eval_r)
        self._check_directories()
        classes = self._list_classes()

        self._reset_output()
        self._prepare_folders(classes)

        for cls in classes:
            cls_path = self.input_dir / cls
            # iterdir() yields entries in arbitrary order; sorting makes the
            # seeded shuffle reproducible across machines and runs.
            files = sorted(f for f in cls_path.iterdir() if f.is_file())

            split_sets = self._partition_files(
                files, train_r, test_r, eval_r, seed
            )

            for split, file_list in split_sets.items():
                for file_path in file_list:
                    target = self.output_dir / split / cls / file_path.name
                    if not target.exists():
                        # A relative link source is interpreted relative to the
                        # link's own directory, so always link to the absolute
                        # path to avoid dangling links.
                        os.symlink(file_path.resolve(), target)

            print(
                f"[{cls}] → "
                f"Train: {len(split_sets['train'])}, "
                f"Test: {len(split_sets['test'])}, "
                f"Eval: {len(split_sets['eval'])}"
            )

In [4]:
# Build the partitioner from the notebook-level settings, then generate the
# symlinked train/test/eval tree (this deletes and recreates OUTPUT_PATH).
partitioner = DatasetPartitioner(
    input_dir=DATASET_PATH,
    output_dir=OUTPUT_PATH,
)
partitioner.organize(
    train_r=TRAIN_RATIO,
    test_r=TEST_RATIO,
    eval_r=EVAL_RATIO,
    seed=SEED,
)


[asteroid] → Train: 198, Test: 56, Eval: 29
[black hole] → Train: 459, Test: 131, Eval: 66
[comet] → Train: 291, Test: 83, Eval: 42
[constellation] → Train: 1086, Test: 310, Eval: 156
[galaxy] → Train: 2788, Test: 797, Eval: 399
[nebula] → Train: 834, Test: 238, Eval: 120
[planet] → Train: 1030, Test: 294, Eval: 148
[star] → Train: 2288, Test: 654, Eval: 327
