In [1]:
import os
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split

DATASET_PATH = "/kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam"
OUTPUT_PATH = Path("/kaggle/working/celestial_dataset")
TRAIN_RATIO, TEST_RATIO, EVAL_RATIO = 0.7, 0.2, 0.1
SEED = 42

In [8]:
class DatasetOrganizer:
    def __init__(self,source_path,dest_path):
        self.source_path = Path(source_path)
        self.dest_path = Path(dest_path)

    def create_structure(self,class_names):
        for split in ["train","test","eval"]:
            for cls in class_names:
                (self.dest_path / split / cls).mkdir(parents=True, exist_ok=True)

    def split_data(self,train_r,test_r,eval_r,seed):
        #1 cleanupp
        if self.dest_path.exists():
            shutil.rmtree(self.dest_path)
            
        class_names = [d.name for d in self.source_path.iterdir() if d.is_dir()]
        self.create_structure(class_names)
        #2 processing per class
        for cls in class_names:
            cls_dir = self.source_path/cls
            files=[f for f in cls_dir.iterdir() if f.is_file()]

            train_f, remainder = train_test_split(
                files, test_size=(1 - train_r), random_state=seed
            )
            relative_test_size = test_r / (test_r + eval_r)
            test_f, eval_f = train_test_split(
                remainder, train_size=relative_test_size, random_state=seed
            )

    #3 create symbolic links (virtual copies)
            split_map = {"train": train_f, "test": test_f, "eval": eval_f}
            for split_name, file_list in split_map.items():
                for f in file_list:
                    dest_file = self.dest_path / split_name / cls / f.name
                    os.symlink(f, dest_file)
            print(f"{cls}: {len(train_f)} Train | {len(test_f)} Test | {len(eval_f)} Eval")
                

In [9]:
organizer = DatasetOrganizer(DATASET_PATH, OUTPUT_PATH)
organizer.split_data(TRAIN_RATIO, TEST_RATIO, EVAL_RATIO, SEED)

planet: 1030 Train | 294 Test | 148 Eval
galaxy: 2788 Train | 797 Test | 399 Eval
black hole: 459 Train | 131 Test | 66 Eval
asteroid: 198 Train | 56 Test | 29 Eval
comet: 291 Train | 83 Test | 42 Eval
star: 2288 Train | 654 Test | 327 Eval
constellation: 1086 Train | 310 Test | 156 Eval
nebula: 834 Train | 238 Test | 120 Eval


In [10]:
# Verification
sample_path = next(OUTPUT_PATH.glob("train/*/*"))
print(f"Sample link: {sample_path}")
print(f"Points to: {os.readlink(sample_path)}")

Sample link: /kaggle/working/celestial_dataset/train/planet/planet_page_1_image_4_aug3_SwinIR_large.png
Points to: /kaggle/input/spacenet-an-optimally-distributed-astronomy-data/SpaceNet.FLARE.imam_alam/planet/planet_page_1_image_4_aug3_SwinIR_large.png
