In [1]:
import json
import os
import shutil
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

In [3]:
# --- Inputs -----------------------------------------------------------------
# Image folders of the two datasets to merge
dataset1_images = Path("../data/training/coco/images")
dataset2_images = Path("../data/training/fish")  # same structure

# COCO annotation file belonging to each dataset
dataset_1_json = Path("../data/training/coco/annotations.json")
dataset_2_json = Path("../data/training/coco/labels_fish-maze_2025-04-04-07-41-19.json")

# --- Outputs ----------------------------------------------------------------
# Everything produced by the merge lives under output_dir
output_dir = Path("../data/training/coco/merged_output")
output_images_dir = output_dir.joinpath("images")
output_json_path = output_dir.joinpath("annotations.json")

# Create the image folder (and any missing parents); a no-op when it already exists
output_images_dir.mkdir(exist_ok=True, parents=True)

In [4]:
def load_coco(json_path):
    """Read a COCO annotation file and return its parsed JSON content.

    Args:
        json_path: Location of the JSON file (anything ``open`` accepts).

    Returns:
        The decoded JSON document exactly as ``json.load`` produces it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses the
    # locale codec, which mis-decodes non-ASCII labels/file names on platforms
    # whose default is not UTF-8 (e.g. Windows).
    with open(json_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [5]:
# Parse both annotation files up front; coco1 is the base dataset, coco2 the one merged into it
coco1 = load_coco(json_path=dataset_1_json)
coco2 = load_coco(json_path=dataset_2_json)

In [8]:
# Accumulator for the merged COCO dataset.
# Categories are taken from the first dataset unchanged (this is the same list
# object as coco1["categories"]); both inputs are assumed to share them.
merged = dict(
    images=[],
    annotations=[],
    categories=coco1["categories"],
)

# Counters that hand out fresh ids, starting at 1, across every processed dataset
next_image_id = next_annotation_id = 1

# File names already claimed in the output image folder
used_filenames = set()

# Lookup: image id in a source dataset -> image id in the merged dataset
image_id_map = dict()

def _check_categories(coco_data):
    """Raise ``ValueError`` if ``coco_data`` declares a category id that ``merged`` lacks or names differently."""
    merged_names = {cat["id"]: cat.get("name") for cat in merged["categories"]}
    for cat in coco_data.get("categories", []):
        cat_id, cat_name = cat["id"], cat.get("name")
        if cat_id not in merged_names:
            raise ValueError(
                f"Category id {cat_id!r} ({cat_name!r}) is not present in the merged categories"
            )
        if merged_names[cat_id] != cat_name:
            raise ValueError(
                f"Category id {cat_id!r} is named {cat_name!r} in this dataset but "
                f"{merged_names[cat_id]!r} in the merged categories"
            )


def process_dataset(coco_data, image_root):
    """Append one COCO dataset to ``merged`` and copy its images to the output folder.

    Every image receives the next free id from ``next_image_id`` and a file
    name that is unique within ``used_filenames`` (``<stem>_<n><suffix>`` is
    tried on conflict). Only ``id``, ``file_name``, ``width`` and ``height``
    are carried over for an image. Every annotation is shallow-copied, given
    the next free id from ``next_annotation_id`` and re-pointed at the new id
    of its image.

    Args:
        coco_data: Parsed COCO document with ``"images"`` and ``"annotations"``
            lists (and optionally ``"categories"``).
        image_root: ``Path`` of the folder that the images' ``file_name``
            values are relative to.

    Raises:
        ValueError: If ``coco_data`` declares a category id that is missing
            from, or named differently in, ``merged["categories"]``.
        KeyError: If an annotation references an image id that this dataset
            does not list.
        OSError: If an image cannot be copied.

    Side effects:
        Mutates the module-level ``merged``, ``used_filenames``,
        ``image_id_map``, ``next_image_id`` and ``next_annotation_id``, and
        writes files below ``output_images_dir``. Calling it twice with the
        same dataset appends that dataset twice.
    """
    global next_image_id, next_annotation_id

    # category_id values are copied verbatim, so they are only meaningful if
    # this dataset agrees with the categories already stored in `merged`.
    _check_categories(coco_data)

    # Validate annotation -> image links before touching the file system, so a
    # broken dataset fails without leaving a half-copied output behind.
    source_image_ids = {img["id"] for img in coco_data["images"]}
    for ann in coco_data["annotations"]:
        if ann["image_id"] not in source_image_ids:
            raise KeyError(
                f"Annotation {ann.get('id')!r} references image_id "
                f"{ann['image_id']!r}, which is not listed in this dataset"
            )

    # Ids are only unique within one dataset, so resolve them through a map
    # that belongs to this call; the shared image_id_map may still hold
    # entries written while processing an earlier dataset.
    local_id_map = {}

    for img in tqdm(coco_data["images"], desc=f"Processing {image_root.name}"):
        original_name = img["file_name"]
        new_name = original_name
        stem = Path(original_name).stem
        suffix = Path(original_name).suffix

        # Resolve filename conflict
        counter = 1
        while new_name in used_filenames:
            new_name = f"{stem}_{counter}{suffix}"
            counter += 1

        used_filenames.add(new_name)
        new_img_id = next_image_id
        next_image_id += 1

        # Copy image to output dir; file_name may contain a sub-directory that
        # does not exist there yet.
        src_path = image_root / original_name
        dst_path = output_images_dir / new_name
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dst_path)

        # Add updated image entry
        merged["images"].append({
            "id": new_img_id,
            "file_name": new_name,
            "width": img["width"],
            "height": img["height"],
        })

        local_id_map[img["id"]] = new_img_id
        # Still recorded in the shared map for any cell that inspects it.
        image_id_map[img["id"]] = new_img_id

    for ann in coco_data["annotations"]:
        new_ann = ann.copy()
        new_ann["id"] = next_annotation_id
        new_ann["image_id"] = local_id_map[ann["image_id"]]
        next_annotation_id += 1
        merged["annotations"].append(new_ann)

In [9]:
# Order matters: the dataset processed first gets the lowest ids and keeps its
# original file names; later datasets are renamed on conflict.
process_dataset(coco_data=coco1, image_root=dataset1_images)
process_dataset(coco_data=coco2, image_root=dataset2_images)

Processing images: 100%|██████████| 6104/6104 [00:21<00:00, 277.75it/s]
Processing fish: 100%|██████████| 73/73 [00:00<00:00, 260.29it/s]


In [10]:
# Write the merged annotations next to the copied images
with output_json_path.open("w") as json_file:
    json.dump(merged, json_file, indent=2)

print(f"\n✅ Merged dataset saved to {output_dir}")


✅ Merged dataset saved to ..\data\training\coco\merged_output
