In [1]:

!git clone https://github.com/YRIKKA/yrikka-btt-aistudio-2025.git

Cloning into 'yrikka-btt-aistudio-2025'...
remote: Enumerating objects: 1011, done.[K
remote: Total 1011 (delta 0), reused 0 (delta 0), pack-reused 1011 (from 3)[K
Receiving objects: 100% (1011/1011), 244.53 MiB | 42.49 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [2]:
import os
import json
import csv
import shutil
from tqdm import tqdm
from PIL import Image

def clean_dataset(images_dir, coco_json_path, out_dir):

    # Create output directory & images folder
    os.makedirs(os.path.join(out_dir, "images"), exist_ok=True)

    # -------------------------------------------
    # Step 1: Define canonical classes + mappings
    # -------------------------------------------
    canonical_classes = ["potted plant", "chair", "cup", "vase", "book"]
    class_mapping = {
        "##ted": "potted plant",
        "pot plant": "potted plant",
        "cup vase": "vase",
        "pot": "potted plant",
        "vase potted plant": "potted plant",
        "potted": "potted plant"
    }

    # -------------------------------------------
    # Step 2: Load COCO annotations
    # -------------------------------------------
    with open(coco_json_path) as f:
        coco = json.load(f)

    # Map canonical class names → new IDs (1 to N)
    name_to_new_id = {cls: i + 1 for i, cls in enumerate(canonical_classes)}

    # Original COCO category_id → category_name mapping
    category_id_to_name = {cat["id"]: cat["name"] for cat in coco.get("categories", [])}

    # -------------------------------------------
    # Step 3: Collect valid images + dimensions
    # -------------------------------------------
    image_id_to_info = {}
    for img in coco["images"]:
        file_path = os.path.join(images_dir, img["file_name"])
        if not os.path.exists(file_path):
            continue
        try:
            with Image.open(file_path) as im:
                width, height = im.size
        except:
            continue

        image_id_to_info[img["id"]] = {
            "file_name": img["file_name"],
            "path": file_path,
            "width": width,
            "height": height
        }

    # -------------------------------------------
    # Step 4: Validate and clean annotations
    # -------------------------------------------
    valid_annotations = []
    valid_image_ids = set()
    dropped_rows = []
    used_original_classes = []  # Track only class names actually present

    for ann in tqdm(coco["annotations"], desc="Validating annotations"):
        image_id = ann["image_id"]

        # Skip annotation if image is missing/unloadable
        if image_id not in image_id_to_info:
            dropped_rows.append(["Missing image", image_id, ann["id"], ann.get("bbox", "")])
            continue

        # Validate bbox
        info = image_id_to_info[image_id]
        img_w, img_h = info["width"], info["height"]
        x, y, w, h = ann["bbox"]

        if w <= 0 or h <= 0:
            dropped_rows.append(["Invalid bbox size", image_id, ann["id"], ann["bbox"]])
            continue

        # Clip box to image boundaries
        x = max(0, min(x, img_w))
        y = max(0, min(y, img_h))
        w = max(1, min(w, img_w - x))
        h = max(1, min(h, img_h - y))

        if w <= 0 or h <= 0:
            dropped_rows.append(["Box outside image", image_id, ann["id"], [x, y, w, h]])
            continue

        # Apply class mapping for noisy labels
        original_class = category_id_to_name.get(ann["category_id"], "Unknown")
        if original_class not in used_original_classes:
          used_original_classes.append(original_class)  # Only track used classes
        mapped_class = class_mapping.get(original_class, original_class)

        # Keep only mapped or canonical classes
        if mapped_class not in canonical_classes:
            dropped_rows.append(["Unmapped class", image_id, ann["id"], original_class])
            continue

        # Final annotation cleanup
        ann["category_id"] = name_to_new_id[mapped_class]
        ann["bbox"] = [x, y, w, h]
        valid_annotations.append(ann)
        valid_image_ids.add(image_id)

    # -------------------------------------------
    # Step 5: Keep only valid images
    # -------------------------------------------
    valid_images = [img for img in coco["images"] if img["id"] in valid_image_ids]

    # -------------------------------------------
    # Step 6: Save cleaned COCO JSON
    # -------------------------------------------
    cleaned_coco = {
        "images": valid_images,
        "annotations": valid_annotations,
        "categories": [{"id": i + 1, "name": cls} for i, cls in enumerate(canonical_classes)]
    }
    with open(os.path.join(out_dir, "cleaned_coco.json"), "w") as f:
        json.dump(cleaned_coco, f, indent=2)

    # Copy only validated images to the new folder
    for img in valid_images:
        src = os.path.join(images_dir, img["file_name"])
        dst = os.path.join(out_dir, "images", img["file_name"])
        if os.path.exists(src):
            shutil.copy2(src, dst)

    # -------------------------------------------
    # Step 7: Save dropped annotation report
    # -------------------------------------------
    with open(os.path.join(out_dir, "dropped_report.csv"), "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Reason", "ImageID", "AnnotationID", "Details"])
        for r in dropped_rows:
            writer.writerow(r)

    # -------------------------------------------
    # Step 8: Save real class mapping used
    # -------------------------------------------
    with open(os.path.join(out_dir, "class_mapping.csv"), "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Original Class (found)", "Mapped Class"])
        for orig in used_original_classes:
            mapped = class_mapping.get(orig, orig)
            if orig != mapped:
                writer.writerow([orig, mapped])

    print(f"✅ Cleaned dataset saved ({out_dir}): {len(valid_images)} images, {len(valid_annotations)} annotations")

# ---------------------------
# Run on both datasets
# ---------------------------
clean_dataset(
    images_dir="yrikka-btt-aistudio-2025/BTT_Data/852a64c6-4bd3-495f-8ff7-f5cc85e34316/images",
    coco_json_path="yrikka-btt-aistudio-2025/BTT_Data/852a64c6-4bd3-495f-8ff7-f5cc85e34316/coco.json",
    out_dir="cleaned_dataset1"
)

clean_dataset(
    images_dir="yrikka-btt-aistudio-2025/BTT_Data/8e0a5d2d-3ae0-4ff0-b6ee-2d85f7da4fee/images",
    coco_json_path="yrikka-btt-aistudio-2025/BTT_Data/8e0a5d2d-3ae0-4ff0-b6ee-2d85f7da4fee/coco.json",
    out_dir="cleaned_dataset2"
)



Validating annotations: 100%|██████████| 1668/1668 [00:00<00:00, 452499.78it/s]


✅ Cleaned dataset saved (cleaned_dataset1): 497 images, 1668 annotations


Validating annotations: 100%|██████████| 1708/1708 [00:00<00:00, 270620.70it/s]


✅ Cleaned dataset saved (cleaned_dataset2): 496 images, 1708 annotations


In [3]:
import os
import json
import shutil
from tqdm import tqdm

def merge_coco_datasets(cleaned_dir1, cleaned_dir2, output_dir):
    """
    Merges two cleaned COCO datasets (same categories) into one unified dataset.
    """

    os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)

    # Load both COCO annotations
    with open(os.path.join(cleaned_dir1, "cleaned_coco.json")) as f:
        coco1 = json.load(f)
    with open(os.path.join(cleaned_dir2, "cleaned_coco.json")) as f:
        coco2 = json.load(f)

    # Categories should be the same (because we already cleaned them)
    categories = coco1["categories"]

    # To avoid ID conflicts, find maximum existing IDs in dataset1
    max_image_id = max([img["id"] for img in coco1["images"]], default=0)
    max_ann_id = max([ann["id"] for ann in coco1["annotations"]], default=0)

    merged_images = []
    merged_annotations = []

    # Step 1: Copy all images + annotations from dataset1
    for img in coco1["images"]:
        src = os.path.join(cleaned_dir1, "images", img["file_name"])
        dst = os.path.join(output_dir, "images", img["file_name"])
        if os.path.exists(src):
            shutil.copy2(src, dst)
        merged_images.append(img)

    for ann in coco1["annotations"]:
        merged_annotations.append(ann)

    # Step 2: Append all from dataset2 but shift IDs
    for img in coco2["images"]:
        new_img = img.copy()
        new_img["id"] = img["id"] + max_image_id  # shift image ID
        src = os.path.join(cleaned_dir2, "images", img["file_name"])
        dst = os.path.join(output_dir, "images", img["file_name"])
        if os.path.exists(src):
            shutil.copy2(src, dst)
        merged_images.append(new_img)

    for ann in coco2["annotations"]:
        new_ann = ann.copy()
        new_ann["id"] = ann["id"] + max_ann_id  # shift annotation ID
        new_ann["image_id"] = ann["image_id"] + max_image_id  # link to shifted image ID
        merged_annotations.append(new_ann)

    # Step 3: Save merged COCO JSON
    merged_coco = {
        "images": merged_images,
        "annotations": merged_annotations,
        "categories": categories
    }

    with open(os.path.join(output_dir, "merged_coco.json"), "w") as f:
        json.dump(merged_coco, f, indent=2)

    print(f"✅ Merged dataset saved to {output_dir}:")
    print(f"   → Total Images: {len(merged_images)}")
    print(f"   → Total Annotations: {len(merged_annotations)}")


# ----------------------------
# Run After Your Cleaning Step
# ----------------------------
merge_coco_datasets(
    cleaned_dir1="cleaned_dataset1",
    cleaned_dir2="cleaned_dataset2",
    output_dir="merged_cleaned_dataset"
)


✅ Merged dataset saved to merged_cleaned_dataset:
   → Total Images: 993
   → Total Annotations: 3376


In [None]:
# from google.colab import files

# # Create a zip of the misclassified images folder
# shutil.make_archive("merged_cleaned_dataset/images", "zip", "merged_cleaned_dataset/images")


# # Download the zip


In [4]:
pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.230-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.18 (from ultralytics)
  Downloading ultralytics_thop-2.0.18-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.230-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.18-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.230 ultralytics-thop-2.0.18


In [5]:
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [6]:
import os
import shutil
import csv
import json
from collections import defaultdict
from ultralytics import YOLO

# -----------------------------
# Step 1. Run YOLO Predictions
# -----------------------------
model = YOLO("yolo11n.pt")

results = model.predict(
    source="merged_cleaned_dataset/images",
    save=True,
    save_txt=True,
    conf=0.5,
    classes=[58, 56, 41, 75, 73]  # COCO IDs for potted plant, chair, cup, vase, book
)

# -----------------------------
# Step 2. Class Mapping
# -----------------------------
coco_to_canonical = {58: 1, 56: 2, 41: 3, 75: 4, 73: 5}
canonical_names = {
    1: "potted plant", 2: "chair", 3: "cup", 4: "vase", 5: "book"
}

# -----------------------------
# Step 3. Load Ground Truth
# -----------------------------
with open("merged_cleaned_dataset/merged_coco.json") as f:
    gt = json.load(f)

image_id_to_ann = defaultdict(list)
for ann in gt["annotations"]:
    image_id_to_ann[ann["image_id"]].append(ann)

image_id_to_name = {img["id"]: img["file_name"] for img in gt["images"]}

# -----------------------------
# Step 4. IOU Function
# -----------------------------
def iou(boxA, boxB):
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0] + boxA[2], boxB[0] + boxB[2])
    yB = min(boxA[1] + boxA[3], boxB[1] + boxB[3])
    inter = max(0, xB - xA) * max(0, yB - yA)
    union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
    return inter / union if union > 0 else 0

# -----------------------------
# Step 5. Compare Predictions vs GT
# -----------------------------
misclassified_images = set()
report_rows = []
iou_threshold = 0.4

for result in results:
    file_name = os.path.basename(result.path)

    # Match GT image id
    image_id = next((img["id"] for img in gt["images"] if img["file_name"] == file_name), None)
    if image_id is None:
        continue

    gt_boxes = [(ann["bbox"], ann["category_id"]) for ann in image_id_to_ann[image_id]]

    preds = []
    for box in result.boxes:
        coco_id = int(box.cls.cpu().numpy()[0])
        if coco_id in coco_to_canonical:
            pred_class = coco_to_canonical[coco_id]
            xyxy = box.xyxy.cpu().numpy()[0]
            w, h = xyxy[2] - xyxy[0], xyxy[3] - xyxy[1]
            preds.append(([float(xyxy[0]), float(xyxy[1]), float(w), float(h)], pred_class))

    matched_preds = set()

    # GT vs Preds
    for gt_box, gt_cls in gt_boxes:
        for i, (pred_box, pred_cls) in enumerate(preds):
            if i not in matched_preds and iou(gt_box, pred_box) >= iou_threshold:
                matched_preds.add(i)
                if pred_cls != gt_cls:
                    misclassified_images.add(file_name)
                    report_rows.append([file_name, "Wrong Class",
                                        canonical_names[gt_cls],
                                        canonical_names[pred_cls],
                                        round(iou(gt_box, pred_box), 3)])

    # Extra Predictions = Missing GT annotations
    for i, (pred_box, pred_cls) in enumerate(preds):
        if i not in matched_preds:
            misclassified_images.add(file_name)
            report_rows.append([file_name, "Missing Annotation", "None",
                                canonical_names[pred_cls], 0.0])

# -----------------------------
# Step 6. Save Everything in One Folder
# -----------------------------
output_folder = "misclassified_dataset"
images_folder = os.path.join(output_folder, "images")
os.makedirs(images_folder, exist_ok=True)

# Create COCO JSON with only misclassified images
misclassified_gt = {
    "images": [img for img in gt["images"] if img["file_name"] in misclassified_images],
    "annotations": [ann for ann in gt["annotations"]
                    if image_id_to_name[ann["image_id"]] in misclassified_images],
    "categories": gt["categories"]
}

with open(os.path.join(output_folder, "misclassified_coco.json"), "w") as f:
    json.dump(misclassified_gt, f, indent=2)

# Copy misclassified images
for img in misclassified_gt["images"]:
    src = os.path.join("merged_cleaned_dataset/images", img["file_name"])
    dst = os.path.join(images_folder, img["file_name"])
    if os.path.exists(src):
        shutil.copy2(src, dst)

# Save CSV Report
with open(os.path.join(output_folder, "misclassified_report.csv"), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Image", "Reason", "GroundTruth", "Prediction", "IOU"])
    writer.writerows(report_rows)

print(f"✅ {len(misclassified_gt['images'])} misclassified images found.")
print(f"📁 All outputs saved inside '{output_folder}/'")



[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt': 100% ━━━━━━━━━━━━ 5.4MB 166.1MB/s 0.0s

image 1/993 /content/merged_cleaned_dataset/images/0067dc2e-4bb7-48a2-b9c5-9da631f86595_flux_1756081928.png: 640x640 1 potted plant, 1 vase, 8.3ms
image 2/993 /content/merged_cleaned_dataset/images/007abd03-fa5e-43d8-af0a-fbed58e17821_flux_1756081085.png: 640x640 (no detections), 8.3ms
image 3/993 /content/merged_cleaned_dataset/images/00f8644a-c4af-4097-a0bc-e8af706554ce_flux_1756082211.png: 640x640 1 vase, 8.2ms
image 4/993 /content/merged_cleaned_dataset/images/0142ef49-931a-46b3-a841-452224567d33_flux_1756079514.png: 640x640 1 chair, 8.2ms
image 5/993 /content/merged_cleaned_dataset/images/01a1e58d-cdf5-4de9-8c35-bf4a9e741350_flux_1756079529.png: 640x640 1 chair, 1 vase, 8.2ms
image 6/993 /content/merged_cleaned_dataset/images/02534685-590c-4899-a048-c9371666f4b6_flux_1756080506.png: 640x640 1 cup, 1 book, 8.2ms
image 7/993 /content/merge

In [7]:
import csv

wrong_class = 0
missing_annotation = 0

with open("misclassified_dataset/misclassified_report.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    for row in reader:
        reason = row[1]
        if reason == "Wrong Class":
            wrong_class += 1
        elif reason == "Missing Annotation":
            missing_annotation += 1

total = wrong_class + missing_annotation

print("Wrong Class:", wrong_class)
print("Missing Annotation:", missing_annotation)
print("Total Flagged:", total)

Wrong Class: 27
Missing Annotation: 277
Total Flagged: 304


In [8]:
import json

with open("misclassified_dataset/misclassified_coco.json") as f:
    coco = json.load(f)

total_annotations = len(coco["annotations"])
print("Total annotations in misclassified COCO JSON:", total_annotations)

Total annotations in misclassified COCO JSON: 825


In [10]:
#-----------------------------
# Merging the cvat export with original merged dataset
# upload merged_cvat_all.json before running this cell.
#-----------------------------

import json, copy, hashlib
import math

# ------------------------------
# Helper: IoU for two COCO bboxes
# bbox = [x, y, w, h]
# ------------------------------
def iou(b1, b2):
    x1, y1, w1, h1 = b1
    x2, y2, w2, h2 = b2

    xa = max(x1, x2)
    ya = max(y1, y2)
    xb = min(x1 + w1, x2 + w2)
    yb = min(y1 + h1, y2 + h2)

    inter_w = xb - xa
    inter_h = yb - ya
    if inter_w <= 0 or inter_h <= 0:
        return 0.0

    inter_area = inter_w * inter_h
    area1 = w1 * h1
    area2 = w2 * h2
    union_area = area1 + area2 - inter_area

    return inter_area / union_area if union_area > 0 else 0.0


# ------------------------------
# Helper: Hash annotation to detect exact duplicates
# ------------------------------
def hash_ann(a):
    txt = json.dumps({
        "category_id": a.get("category_id"),
        "bbox": a.get("bbox"),
        "segmentation": a.get("segmentation"),
        "area": a.get("area"),
        "iscrowd": a.get("iscrowd")
    }, sort_keys=True)
    return hashlib.md5(txt.encode()).hexdigest()


# ------------------------------
# Helper: category name → object
# ------------------------------
def index_by_name(cats):
    return {c["name"]: c for c in cats}


# ------------------------------
# Main function
# ------------------------------
def fix_and_merge(all_path, corrected_path, out_path, verbose=True,
                  iou_duplicate_threshold=0.90):

    # load both files
    with open(all_path, "r") as f:
        all_coco = json.load(f)
    with open(corrected_path, "r") as f:
        corr_coco = json.load(f)

    # final merged structure
    final = {
        "images": [],
        "annotations": [],
        "categories": [],
        "info": all_coco.get("info", {}),
        "licenses": all_coco.get("licenses", [])
    }

    # --------------------------------------------------------------------
    # STEP 1 — Merge categories (by name)
    # --------------------------------------------------------------------
    cats1 = all_coco.get("categories", [])
    cats2 = corr_coco.get("categories", [])

    final["categories"] = copy.deepcopy(cats1)
    name_to_cat = index_by_name(cats1)
    next_cat_id = max([c["id"] for c in cats1], default=0) + 1

    cat_map = {}

    for c in cats2:
        name = c["name"]
        if name in name_to_cat:
            cat_map[c["id"]] = name_to_cat[name]["id"]
        else:
            new_c = copy.deepcopy(c)
            new_c["id"] = next_cat_id
            final["categories"].append(new_c)
            name_to_cat[name] = new_c
            cat_map[c["id"]] = next_cat_id
            next_cat_id += 1

    # --------------------------------------------------------------------
    # STEP 2 — Merge images by file_name
    # --------------------------------------------------------------------
    img1 = all_coco.get("images", [])
    img2 = corr_coco.get("images", [])

    final["images"] = copy.deepcopy(img1)
    fname_to_id = {img["file_name"]: img["id"] for img in img1}
    next_img_id = max([img["id"] for img in img1], default=0) + 1

    img_map = {}

    for img in img2:
        fn = img["file_name"]
        if fn in fname_to_id:
            img_map[img["id"]] = fname_to_id[fn]
        else:
            new_img = copy.deepcopy(img)
            new_img["id"] = next_img_id
            final["images"].append(new_img)
            fname_to_id[fn] = next_img_id
            img_map[img["id"]] = next_img_id
            next_img_id += 1

    # --------------------------------------------------------------------
    # STEP 3 — Remove wrong annotations
    # --------------------------------------------------------------------
    corrected_filenames = {img["file_name"] for img in img2}
    corrected_image_ids = {
        fname_to_id[f] for f in corrected_filenames if f in fname_to_id
    }

    all_ann = all_coco.get("annotations", [])
    cleaned_all_ann = [
        ann for ann in all_ann
        if ann["image_id"] not in corrected_image_ids
    ]

    # --------------------------------------------------------------------
    # STEP 4 — Add corrected annotations (with ID remap)
    # --------------------------------------------------------------------
    final["annotations"] = cleaned_all_ann
    next_ann_id = max([a["id"] for a in cleaned_all_ann], default=0) + 1

    # store annotations by (image, category) for IoU dup detection
    ann_index = {}
    for a in cleaned_all_ann:
        key = (a["image_id"], a["category_id"])
        ann_index.setdefault(key, []).append(a)

    ann_hashes = {hash_ann(a) for a in cleaned_all_ann}

    added = 0
    skipped_iou = 0

    for ann in corr_coco.get("annotations", []):
        new_ann = copy.deepcopy(ann)
        new_ann["id"] = next_ann_id
        new_ann["image_id"] = img_map[new_ann["image_id"]]
        new_ann["category_id"] = cat_map[new_ann["category_id"]]

        key = (new_ann["image_id"], new_ann["category_id"])

        # ---- Step A: Exact duplicate detection
        h = hash_ann(new_ann)
        if h in ann_hashes:
            continue

        # ---- Step B: Near-duplicate IoU filtering
        existing_list = ann_index.get(key, [])
        is_duplicate = False
        for ex in existing_list:
            if iou(ex["bbox"], new_ann["bbox"]) > iou_duplicate_threshold:
                is_duplicate = True
                skipped_iou += 1
                break

        if is_duplicate:
            continue

        # add annotation
        ann_hashes.add(h)
        final["annotations"].append(new_ann)
        ann_index.setdefault(key, []).append(new_ann)
        next_ann_id += 1
        added += 1

    print(f"Added corrected annotations: {added}")
    print(f"Skipped near-duplicate boxes (IoU): {skipped_iou}")

    # --------------------------------------------------------------------
    # Save
    # --------------------------------------------------------------------
    with open(out_path, "w") as f:
        json.dump(final, f, indent=2)

    print("✅ Merged JSON saved as", out_path)

all_json_path = "merged_cleaned_dataset/merged_coco.json"
corrected_json_path = "merged_cvat_all.json"
output_path = "cleaned_merged_coco.json"

fix_and_merge(all_json_path, corrected_json_path, output_path, verbose=True)

print("Done!")


Added corrected annotations: 2048
Skipped near-duplicate boxes (IoU): 91
✅ Merged JSON saved as cleaned_merged_coco.json
Done!


In [11]:
import json

# Load the COCO JSON file
with open("cleaned_merged_coco.json") as f:  # replace with your JSON file path
    coco = json.load(f)

# Count total annotations
total_annotations = len(coco["annotations"])
print("Total annotations in COCO JSON:", total_annotations)

# count total images too
total_images = len(coco["images"])
print("Total images in COCO JSON:", total_images)

Total annotations in COCO JSON: 4599
Total images in COCO JSON: 993


In [12]:
import os
import shutil
import csv
import json
from collections import defaultdict
from ultralytics import YOLO

# -----------------------------
# Paths
# -----------------------------
images_input_folder = "merged_cleaned_dataset/images"
coco_json_path = "cleaned_merged_coco.json"
output_folder = "new_misclassified_dataset"  # single output folder
images_output_folder = os.path.join(output_folder, "images")
os.makedirs(images_output_folder, exist_ok=True)

# -----------------------------
# Step 1. Run YOLO Predictions
# -----------------------------
model = YOLO("yolo11n.pt")

results = model.predict(
    source=images_input_folder,
    save=True,
    conf=0.5,
    classes=[58, 56, 41, 75, 73]  # COCO IDs for potted plant, chair, cup, vase, book
)

# -----------------------------
# Step 2. Class Mapping
# -----------------------------
coco_to_canonical = {58: 1, 56: 2, 41: 3, 75: 4, 73: 5}
canonical_names = {
    1: "potted plant", 2: "chair", 3: "cup", 4: "vase", 5: "book"
}

# -----------------------------
# Step 3. Load Ground Truth
# -----------------------------
with open(coco_json_path) as f:
    gt = json.load(f)

image_id_to_ann = defaultdict(list)
for ann in gt["annotations"]:
    image_id_to_ann[ann["image_id"]].append(ann)

image_id_to_name = {img["id"]: img["file_name"] for img in gt["images"]}

# -----------------------------
# Step 4. IOU Function
# -----------------------------
def iou(boxA, boxB):
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0] + boxA[2], boxB[0] + boxB[2])
    yB = min(boxA[1] + boxA[3], boxB[1] + boxB[3])
    inter = max(0, xB - xA) * max(0, yB - yA)
    union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
    return inter / union if union > 0 else 0

# -----------------------------
# Step 5. Compare Predictions vs GT
# -----------------------------
misclassified_images = set()
report_rows = []
iou_threshold = 0.4

for result in results:
    file_name = os.path.basename(result.path)

    # Match GT image id
    image_id = next((img["id"] for img in gt["images"] if img["file_name"] == file_name), None)
    if image_id is None:
        continue

    gt_boxes = [(ann["bbox"], ann["category_id"]) for ann in image_id_to_ann[image_id]]

    preds = []
    for box in result.boxes:
        coco_id = int(box.cls.cpu().numpy()[0])
        if coco_id in coco_to_canonical:
            pred_class = coco_to_canonical[coco_id]
            xyxy = box.xyxy.cpu().numpy()[0]
            w, h = xyxy[2] - xyxy[0], xyxy[3] - xyxy[1]
            preds.append(([float(xyxy[0]), float(xyxy[1]), float(w), float(h)], pred_class))

    matched_preds = set()

    # GT vs Preds
    for gt_box, gt_cls in gt_boxes:
        for i, (pred_box, pred_cls) in enumerate(preds):
            if i not in matched_preds and iou(gt_box, pred_box) >= iou_threshold:
                matched_preds.add(i)
                if pred_cls != gt_cls:
                    misclassified_images.add(file_name)
                    report_rows.append([file_name, "Wrong Class",
                                        canonical_names[gt_cls],
                                        canonical_names[pred_cls],
                                        round(iou(gt_box, pred_box), 3)])

    # Extra Predictions = Missing GT annotations
    for i, (pred_box, pred_cls) in enumerate(preds):
        if i not in matched_preds:
            misclassified_images.add(file_name)
            report_rows.append([file_name, "Missing Annotation", "None",
                                canonical_names[pred_cls], 0.0])

# -----------------------------
# Step 6. Create COCO JSON with misclassified images
# -----------------------------
misclassified_gt = {
    "images": [img for img in gt["images"] if img["file_name"] in misclassified_images],
    "annotations": [ann for ann in gt["annotations"]
                    if image_id_to_name[ann["image_id"]] in misclassified_images],
    "categories": gt["categories"]
}

with open(os.path.join(output_folder, "misclassified_coco.json"), "w") as f:
    json.dump(misclassified_gt, f, indent=2)

# -----------------------------
# Step 7. Copy misclassified images to new_dataset/images
# -----------------------------
for img in misclassified_gt["images"]:
    src = os.path.join(images_input_folder, img["file_name"])
    dst = os.path.join(images_output_folder, img["file_name"])
    if os.path.exists(src):
        shutil.copy2(src, dst)

# -----------------------------
# Step 8. Save CSV report
# -----------------------------
with open(os.path.join(output_folder, "misclassified_report.csv"), "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Image", "Reason", "GroundTruth", "Prediction", "IOU"])
    writer.writerows(report_rows)

print(f"✅ {len(misclassified_gt['images'])} misclassified images found.")
print(f"📁 All outputs saved inside '{output_folder}/'")


image 1/993 /content/merged_cleaned_dataset/images/0067dc2e-4bb7-48a2-b9c5-9da631f86595_flux_1756081928.png: 640x640 1 potted plant, 1 vase, 8.2ms
image 2/993 /content/merged_cleaned_dataset/images/007abd03-fa5e-43d8-af0a-fbed58e17821_flux_1756081085.png: 640x640 (no detections), 8.4ms
image 3/993 /content/merged_cleaned_dataset/images/00f8644a-c4af-4097-a0bc-e8af706554ce_flux_1756082211.png: 640x640 1 vase, 8.2ms
image 4/993 /content/merged_cleaned_dataset/images/0142ef49-931a-46b3-a841-452224567d33_flux_1756079514.png: 640x640 1 chair, 8.2ms
image 5/993 /content/merged_cleaned_dataset/images/01a1e58d-cdf5-4de9-8c35-bf4a9e741350_flux_1756079529.png: 640x640 1 chair, 1 vase, 8.2ms
image 6/993 /content/merged_cleaned_dataset/images/02534685-590c-4899-a048-c9371666f4b6_flux_1756080506.png: 640x640 1 cup, 1 book, 8.2ms
image 7/993 /content/merged_cleaned_dataset/images/02de1ef0-1130-473d-9d2a-425b231ebdd2_flux_1756080597.png: 640x640 1 chair, 2 potted plants, 8.8ms
image 8/993 /content/m

In [13]:
import csv

wrong_class = 0
missing_annotation = 0

with open("new_misclassified_dataset/misclassified_report.csv", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    for row in reader:
        reason = row[1]
        if reason == "Wrong Class":
            wrong_class += 1
        elif reason == "Missing Annotation":
            missing_annotation += 1

total = wrong_class + missing_annotation

print("Wrong Class:", wrong_class)
print("Missing Annotation:", missing_annotation)
print("Total Flagged:", total)

Wrong Class: 32
Missing Annotation: 59
Total Flagged: 91


In [None]:
# this is done because when we run yolo model it creates a seperate
# predict/ folder in runs/detect/ folder every single time and can cause confusion

# The below code deletes the detect folder if we are running the yolo model for second time and so on.

# !rm -rf runs/detect

In [14]:
# add image width and height in json file
from pathlib import Path
from PIL import Image
from tqdm import tqdm

coco_path = 'cleaned_merged_coco.json'
images_dir = 'merged_cleaned_dataset/images'

# Load COCO JSON
with open(coco_path, 'r') as f:
    coco_data = json.load(f)

print("Adding image dimensions...")
added_count = 0  # counter for images updated

for img in tqdm(coco_data['images']):
    if 'height' not in img or 'width' not in img:
        img_path = Path(images_dir) / img['file_name']
        try:
            with Image.open(img_path) as im:
                img['width'], img['height'] = im.size
                added_count += 1
        except Exception as e:
            print(f"Error with {img['file_name']}: {e}")

# Save updated JSON
with open(coco_path, 'w') as f:
    json.dump(coco_data, f, indent=4)

print(f"✅ Done! Added width and height for {added_count} images.")

Adding image dimensions...


100%|██████████| 993/993 [00:00<00:00, 16438.38it/s]


✅ Done! Added width and height for 993 images.


In [15]:
# reindex json file
import json

coco_path = "cleaned_merged_coco.json"

with open(coco_path, "r") as f:
    coco = json.load(f)

# Sort and reindex categories from 0
sorted_cats = sorted(coco["categories"], key=lambda x: x["id"])
id_map = {cat["id"]: i for i, cat in enumerate(sorted_cats)}
print("ID mapping:", id_map)

# Update category IDs in annotations
for ann in coco["annotations"]:
    ann["category_id"] = id_map[ann["category_id"]]

# Update categories list
for cat in coco["categories"]:
    cat["id"] = id_map[cat["id"]]

# Save updated file
reindexed_path = "merged_cleaned_dataset/merged_coco_reindexed.json"
with open(reindexed_path, "w") as f:
    json.dump(coco, f, indent=4)

print(f" Reindexed COCO file saved to {reindexed_path}")


ID mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4}
 Reindexed COCO file saved to merged_cleaned_dataset/merged_coco_reindexed.json


In [16]:
import json

coco_path = "merged_cleaned_dataset/merged_coco_reindexed.json"

with open(coco_path, "r") as f:
    coco = json.load(f)

valid_cat_ids = set(range(5))  # your reindexed categories: 0-4
invalid_annotations = []

for ann in coco["annotations"]:
    bbox = ann.get("bbox")
    cat_id = ann.get("category_id")

    # Check category_id
    cat_ok = cat_id in valid_cat_ids

    # Check bbox: 4 numbers, no None, all >=0, width/height > 0
    bbox_ok = (
        isinstance(bbox, list) and
        len(bbox) == 4 and
        all(isinstance(v, (int, float)) for v in bbox) and
        all(v >= 0 for v in bbox) and
        bbox[2] > 0 and bbox[3] > 0
    )

    if not (cat_ok and bbox_ok):
        invalid_annotations.append({
            "image_id": ann.get("image_id"),
            "category_id": cat_id,
            "bbox": bbox
        })

if invalid_annotations:
    print(f"⚠️ Found {len(invalid_annotations)} invalid annotations:")
    for i, ann in enumerate(invalid_annotations[:10]):
        print(f"{i+1}: {ann}")
else:
    print("✅ All annotations look good!")


✅ All annotations look good!


In [17]:
def coco_to_yolo_custom(coco_json_path, output_dir, image_dir):
    """
    Convert COCO format to YOLO format
    """
    # Load COCO annotations
    with open(coco_json_path, 'r') as f:
        coco = json.load(f)

    # Create output directory
    labels_dir = Path(output_dir) / 'labels'
    labels_dir.mkdir(parents=True, exist_ok=True)

    # Create image_id to filename mapping
    image_info = {img['id']: img for img in coco['images']}

    # Group annotations by image_id
    annotations_by_image = {}
    for ann in coco['annotations']:
        img_id = ann['image_id']
        if img_id not in annotations_by_image:
            annotations_by_image[img_id] = []
        annotations_by_image[img_id].append(ann)

    print(f"Converting {len(image_info)} images...")

    # Convert each image's annotations
    converted = 0
    for img_id, img_data in tqdm(image_info.items()):
        img_width = img_data['width']
        img_height = img_data['height']
        filename = img_data['file_name']

        # Get label filename (same as image but .txt)
        label_filename = Path(filename).stem + '.txt'
        label_path = labels_dir / label_filename

        # Get annotations for this image
        anns = annotations_by_image.get(img_id, [])

        # Write YOLO format annotations
        with open(label_path, 'w') as f:
            for ann in anns:
                category_id = ann['category_id']
                bbox = ann['bbox']  # [x, y, width, height] in pixels

                # Convert to YOLO format (normalized center x, center y, width, height)
                x, y, w, h = bbox
                x_center = (x + w / 2) / img_width
                y_center = (y + h / 2) / img_height
                norm_width = w / img_width
                norm_height = h / img_height

                # Write: class x_center y_center width height
                f.write(f"{category_id} {x_center:.6f} {y_center:.6f} {norm_width:.6f} {norm_height:.6f}\n")

        converted += 1

    print(f"Converted {converted} images to YOLO format")
    print(f"Labels saved to: {labels_dir}")

    return labels_dir

# Run the conversion
coco_to_yolo_custom(
    coco_json_path='merged_cleaned_dataset/merged_coco_reindexed.json',
    output_dir='YOLO_dataset_final',
    image_dir='merged_cleaned_dataset/images'
)

Converting 993 images...


100%|██████████| 993/993 [00:00<00:00, 3574.76it/s]

Converted 993 images to YOLO format
Labels saved to: YOLO_dataset_final/labels





PosixPath('YOLO_dataset_final/labels')

In [18]:
# copy images to yolo_dataset folder
# Source and destination
source_images = Path('merged_cleaned_dataset/images')
dest_images = Path('YOLO_dataset_final/images')

# Create destination if it doesn't exist
dest_images.mkdir(parents=True, exist_ok=True)

# Copy all images
print("Copying images...")
image_files = list(source_images.glob('*'))

for img in tqdm(image_files):
    if img.is_file():
        shutil.copy2(img, dest_images / img.name)

print(f"Done! Copied {len(image_files)} images to {dest_images}")

Copying images...


100%|██████████| 993/993 [00:00<00:00, 1403.00it/s]

Done! Copied 993 images to YOLO_dataset_final/images





In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# split the data into train test
from pathlib import Path
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm

dataset_dir = Path('YOLO_dataset_final')
images_dir = dataset_dir / 'images'
labels_dir = dataset_dir / 'labels'

# create train/val directories
(dataset_dir / 'images' / 'train').mkdir(parents=True, exist_ok=True)
(dataset_dir / 'images' / 'val').mkdir(parents=True, exist_ok=True)
(dataset_dir / 'labels' / 'train').mkdir(parents=True, exist_ok=True)
(dataset_dir / 'labels' / 'val').mkdir(parents=True, exist_ok=True)

# get image files to a list
print("Finding images...")
image_files = [f for f in images_dir.glob('*') if f.is_file() and f.suffix.lower() in ['.jpg', '.jpeg', '.png']]
print(f"Found {len(image_files)} images")

# split to train/val
train_imgs, val_imgs = train_test_split(
    image_files,
    train_size=0.8,
    random_state=101
)

print(f"\nSplit:")
print(f"  Train: {len(train_imgs)} images")
print(f"  Val:   {len(val_imgs)} images")

#move train files
print("\nMoving train files...")
for img in tqdm(train_imgs):
    # Move image
    shutil.move(str(img), str(dataset_dir / 'images' / 'train' / img.name))

    # Move corresponding label
    label = labels_dir / f'{img.stem}.txt'
    if label.exists():
        shutil.move(str(label), str(dataset_dir / 'labels' / 'train' / label.name))

#move val files
print("Moving val files...")
for img in tqdm(val_imgs):
    # Move image
    shutil.move(str(img), str(dataset_dir / 'images' / 'val' / img.name))

    # Move corresponding label
    label = labels_dir / f'{img.stem}.txt'
    if label.exists():
        shutil.move(str(label), str(dataset_dir / 'labels' / 'val' / label.name))

# clean directory
if images_dir.exists() and not any(images_dir.iterdir()):
    images_dir.rmdir()
if labels_dir.exists() and not any(labels_dir.iterdir()):
    labels_dir.rmdir()

print("\n Dataset split complete!")



Finding images...
Found 993 images

Split:
  Train: 794 images
  Val:   199 images

Moving train files...


100%|██████████| 794/794 [00:00<00:00, 12876.11it/s]


Moving val files...


100%|██████████| 199/199 [00:00<00:00, 11983.73it/s]


 Dataset split complete!





In [21]:
import json
import yaml
from pathlib import Path

def create_yolo_yaml(coco_json_path, yolo_dataset_dir, yaml_filename='data.yaml'):
    """
    Create YAML config file for YOLO training with train/val split
    """
    # Load COCO annotations to get categories
    with open(coco_json_path, 'r') as f:
        coco = json.load(f)

    # Extract and sort categories by id
    categories = sorted(coco['categories'], key=lambda x: x['id'])

    print("📋 Category Information:")
    print("-" * 50)
    for cat in categories:
        print(f"ID {cat['id']}: {cat['name']}")
    print("-" * 50)
    print(f"Total categories: {len(categories)}\n")

    # Create names dictionary (id: name)
    names = {cat['id']: cat['name'] for cat in categories}

    # Get absolute path to dataset
    dataset_path = Path(yolo_dataset_dir).resolve()

    # Create YAML structure with val instead of test
    yaml_data = {
        'path': str(dataset_path),  # dataset root dir
        'train': 'images/train',     # train images (relative to 'path')
        'val': 'images/val',         # val images (relative to 'path')

        'nc': len(categories),       # number of classes
        'names': names               # class names
    }

    # Save YAML file
    yaml_path = Path(yolo_dataset_dir) / yaml_filename
    with open(yaml_path, 'w') as f:
        yaml.dump(yaml_data, f, default_flow_style=False, sort_keys=False)

    print(f"✅ YAML file created: {yaml_path}")
    print("\n📄 YAML Contents:")
    print("-" * 50)
    with open(yaml_path, 'r') as f:
        print(f.read())
    print("-" * 50)

    return yaml_path

# Recreate YAML with correct val path
create_yolo_yaml(
    coco_json_path='merged_cleaned_dataset/merged_coco_reindexed.json',
    yolo_dataset_dir='YOLO_dataset_final',
    yaml_filename='data.yaml'
)

📋 Category Information:
--------------------------------------------------
ID 0: potted plant
ID 1: chair
ID 2: cup
ID 3: vase
ID 4: book
--------------------------------------------------
Total categories: 5

✅ YAML file created: YOLO_dataset_final/data.yaml

📄 YAML Contents:
--------------------------------------------------
path: /content/YOLO_dataset_final
train: images/train
val: images/val
nc: 5
names:
  0: potted plant
  1: chair
  2: cup
  3: vase
  4: book

--------------------------------------------------


PosixPath('YOLO_dataset_final/data.yaml')

In [22]:
results = model.train(
    data='YOLO_dataset_final/data.yaml',
    epochs=50
)

Ultralytics 8.3.230 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=YOLO_dataset_final/data.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=50, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective=0.0, plots

In [23]:
import os

root = "YOLO_dataset_final"

folders = [
    os.path.join(root, "images/test"),
    os.path.join(root, "labels/test"),
]

for folder in folders:
    os.makedirs(folder, exist_ok=True)
    print(f"Created: {folder}")

print("\nTest folder structure is ready. You can upload your test images and labels manually!")


Created: YOLO_dataset_final/images/test
Created: YOLO_dataset_final/labels/test

Test folder structure is ready. You can upload your test images and labels manually!


In [25]:
import yaml

yaml_path = "YOLO_dataset_final/data.yaml"   # path to your existing yaml
test_path = "images/test"                    # <-- correct relative path

# Load existing YAML
with open(yaml_path, "r") as f:
    data = yaml.safe_load(f)

# Add/update the test field
data["test"] = test_path

# Save updated YAML
with open(yaml_path, "w") as f:
    yaml.dump(data, f, sort_keys=False)

print("Updated data.yaml successfully!")

Updated data.yaml successfully!


In [27]:
from ultralytics import YOLO

model = YOLO("runs/detect/train/weights/best.pt")  # your fine-tuned model
results = model.val(data="YOLO_dataset_final/data.yaml", split="test")

Ultralytics 8.3.230 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1686.7±561.0 MB/s, size: 145.7 KB)
[K[34m[1mval: [0mScanning /content/YOLO_dataset_final/labels/test... 200 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 200/200 244.3it/s 0.8s
[34m[1mval: [0mNew cache created: /content/YOLO_dataset_final/labels/test.cache
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 13/13 1.0it/s 12.8s
                   all        200       1061      0.466      0.369      0.349      0.197
          potted plant        111        221      0.555      0.697      0.658      0.377
                 chair         68        101      0.324      0.347      0.246      0.122
                   cup         77        107      0.521       0.29      0.291      0.186
              

In [29]:
from ultralytics import YOLO

model = YOLO("yolo11n.pt")  # baseline pretrained model
results = model.val(data="YOLO_dataset_final/data.yaml", split="test")

Ultralytics 8.3.230 🚀 Python-3.12.12 torch-2.9.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
YOLO11n summary (fused): 100 layers, 2,616,248 parameters, 0 gradients, 6.5 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1823.4±615.3 MB/s, size: 1886.7 KB)
[K[34m[1mval: [0mScanning /content/YOLO_dataset_final/labels/test.cache... 200 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 200/200 116.8Kit/s 0.0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 13/13 1.3it/s 10.3s
                   all        200       1061      0.061      0.119     0.0401     0.0233
                person        111        221     0.0338       0.24     0.0231      0.013
               bicycle         68        101      0.107      0.188     0.0876     0.0438
                   car         77        107     0.0269      0.112     0.0158    0.00899
            motorcycle        122        231     0.0519     0.0346     0.0279     0.0178
