In [19]:
import json

# === FILE PATHS ===
# Three annotation exports that are merged further down in this script.
path1 = "instances_default_first.json"
path2 = "instances_default_second.json"
path3 = "instances_default_third.json"

# === LOAD FILES ===
# JSON text is UTF-8; pass the encoding explicitly so that non-ASCII file or
# category names are not decoded with the platform's locale encoding.
with open(path1, encoding="utf-8") as f:
    d1 = json.load(f)

with open(path2, encoding="utf-8") as f:
    d2 = json.load(f)

with open(path3, encoding="utf-8") as f:
    d3 = json.load(f)

print("Loaded all three JSON files.")

# === NORMALIZE FILENAMES ===
def normalize_filenames(dataset):
    """Strip one leading ``images/`` from every image's ``file_name``.

    The image entries in ``dataset["images"]`` are edited in place; names
    that do not start with ``images/`` are left untouched. Only a single
    leading occurrence is removed.

    Returns the same ``dataset`` object that was passed in.
    """
    prefix = "images/"
    prefix_len = len(prefix)

    for image in dataset["images"]:
        file_name = image["file_name"]
        if not file_name.startswith(prefix):
            continue
        image["file_name"] = file_name[prefix_len:]

    return dataset

# Only d3 is passed through the filename normalizer.
d3 = normalize_filenames(d3)
print("Filenames in d3 normalized.")

# === FIX CATEGORY IDS ===
# d1 is the reference: map each of its category names to its category ID.
ref_cat_name_to_id = {}
for category in d1["categories"]:
    ref_cat_name_to_id[category["name"]] = category["id"]

def fix_category_ids(dataset, ref_map, label="", ref_categories=None):
    """Rewrite a dataset's category IDs so they follow a reference mapping.

    Each annotation's ``category_id`` is translated via the dataset's own
    ``categories`` list to a category name, and then to the ID that
    ``ref_map`` assigns to that name. The dataset's ``categories`` list is
    replaced by the reference categories. ``dataset`` is modified in place.

    Args:
        dataset: Dict with ``"categories"`` (entries having ``"id"`` and
            ``"name"``) and ``"annotations"`` (entries having
            ``"category_id"``).
        ref_map: Mapping of category name -> reference category ID.
        label: Text used in the printed summary and in error messages.
        ref_categories: Optional list of reference category entries to
            install on the dataset. When omitted, the list is derived from
            ``ref_map`` (in its iteration order), keeping any extra fields
            of the dataset's own category entry with the same name.

    Returns:
        The same ``dataset`` object.

    Raises:
        KeyError: If an annotation uses a ``category_id`` that the dataset
            does not declare, or a category name missing from ``ref_map``.
            Nothing is modified in that case.
    """
    local_id_to_name = {c["id"]: c["name"] for c in dataset["categories"]}
    where = label or "dataset"

    # Resolve every annotation first so that a bad entry cannot leave the
    # dataset half-remapped.
    new_ids = []
    for ann in dataset["annotations"]:
        local_id = ann["category_id"]
        if local_id not in local_id_to_name:
            raise KeyError(
                f"{where}: annotation {ann.get('id')!r} uses category_id "
                f"{local_id!r}, which is not declared in its categories"
            )
        name = local_id_to_name[local_id]
        if name not in ref_map:
            raise KeyError(
                f"{where}: category {name!r} is not present in the reference mapping"
            )
        new_ids.append(ref_map[name])

    for ann, new_id in zip(dataset["annotations"], new_ids):
        ann["category_id"] = new_id

    # Install a private copy of the reference categories: no dependency on a
    # module-level dataset, and no list object shared between datasets.
    if ref_categories is not None:
        dataset["categories"] = [dict(c) for c in ref_categories]
    else:
        own_by_name = {c["name"]: c for c in dataset["categories"]}
        dataset["categories"] = [
            {**own_by_name.get(name, {"name": name}), "id": ref_id}
            for name, ref_id in ref_map.items()
        ]

    print(f"\n--- Category IDs for {label} ---")
    for c in dataset["categories"]:
        print(f"ID {c['id']}: {c['name']}")

    return dataset

# Run every dataset, the reference d1 included, through the category fixer.
# The datasets are processed in order: d1, then d2, then d3.
d1, d2, d3 = [
    fix_category_ids(dataset, ref_cat_name_to_id, label=tag)
    for dataset, tag in (
        (d1, "d1 (reference)"),
        (d2, "d2 (after fix)"),
        (d3, "d3 (after fix)"),
    )
]

print("\nCategory IDs normalized for all datasets.")

# === MERGE WITH PROPER ID REMAPPING ===
def merge_datasets(base_dataset, new_dataset, label=""):
    """Merge ``new_dataset`` into ``base_dataset`` with image/annotation ID remapping.

    Images are matched by ``file_name``. An image whose name already exists
    (in the base dataset, or earlier in ``new_dataset``) reuses the existing
    image ID; any other image is appended with a fresh ID above the current
    maximum. Every merged annotation gets a fresh annotation ID and has its
    ``image_id`` rewritten to the merged image ID.

    Annotations whose ``image_id`` does not belong to any image of
    ``new_dataset`` are reported and skipped: their stale ``image_id`` would
    otherwise be interpreted in the merged ID space and could attach the
    annotation to an unrelated image.

    Args:
        base_dataset: Dict with ``"images"`` and ``"annotations"`` lists;
            both lists are extended in place.
        new_dataset: Dict with ``"images"`` and ``"annotations"`` lists. It
            is not modified; merged entries are shallow copies.
        label: Prefix for the printed summary line.

    Returns:
        ``base_dataset`` (the same object).
    """
    # file_name -> image ID in the merged ID space. Newly added images are
    # registered too, so repeated file names inside new_dataset collapse
    # onto a single image.
    filename_to_id = {img["file_name"]: img["id"] for img in base_dataset["images"]}

    max_img_id = max((img["id"] for img in base_dataset["images"]), default=0)
    max_ann_id = max((ann["id"] for ann in base_dataset["annotations"]), default=0)

    # ID of an image inside new_dataset -> ID of that image after merging.
    image_id_mapping = {}
    images_to_add = []
    for img in new_dataset["images"]:
        file_name = img["file_name"]
        if file_name not in filename_to_id:
            max_img_id += 1
            filename_to_id[file_name] = max_img_id
            # Copy so that the caller's new_dataset keeps its own IDs.
            images_to_add.append({**img, "id": max_img_id})
        image_id_mapping[img["id"]] = filename_to_id[file_name]

    annotations_to_add = []
    for ann in new_dataset["annotations"]:
        old_image_id = ann["image_id"]
        if old_image_id not in image_id_mapping:
            print(
                f"WARNING: Annotation {ann.get('id')} references unknown "
                f"image_id {old_image_id}; skipped"
            )
            continue
        max_ann_id += 1
        annotations_to_add.append(
            {**ann, "id": max_ann_id, "image_id": image_id_mapping[old_image_id]}
        )

    base_dataset["images"].extend(images_to_add)
    base_dataset["annotations"].extend(annotations_to_add)

    print(f"{label}: Added {len(images_to_add)} new images, {len(annotations_to_add)} annotations")

    return base_dataset

# d1 seeds the merged dataset. Its image and annotation lists are copied
# (shallowly) so that extending the merged lists leaves d1's lists alone.
merged = dict(
    categories=d1["categories"],
    images=list(d1["images"]),
    annotations=list(d1["annotations"]),
)

# Fold the remaining datasets into the merged one, d2 first and then d3.
for incoming, tag in ((d2, "Merging d2"), (d3, "Merging d3")):
    merged = merge_datasets(merged, incoming, label=tag)

print("\nMerged all datasets with correct category + annotation + image IDs.")

# Write the intermediate merged CVAT file to disk.
output_path = "merged_cvat_all.json"
with open(output_path, "w") as f:
    json.dump(merged, f, indent=4)

print("Saved merged CVAT file →", output_path)

Loaded all three JSON files.
Filenames in d3 normalized.

--- Category IDs for d1 (reference) ---
ID 1: potted plant
ID 2: vase
ID 3: cup
ID 4: book
ID 5: chair

--- Category IDs for d2 (after fix) ---
ID 1: potted plant
ID 2: vase
ID 3: cup
ID 4: book
ID 5: chair

--- Category IDs for d3 (after fix) ---
ID 1: potted plant
ID 2: vase
ID 3: cup
ID 4: book
ID 5: chair

Category IDs normalized for all datasets.
Merging d2: Added 0 new images, 955 annotations
Merging d3: Added 0 new images, 1520 annotations

Merged all datasets with correct category + annotation + image IDs.
Saved merged CVAT file → merged_cvat_all.json
