# Scripts to clean the original dataset used in "Bacterial-fungicidal vine disease detection with proximal aerial images", also called "InfectedLeaves"

## COCO json cleaning

- Deleting ids that are not present in the images folder
- Deleting ids that have an area below 1
- Rearrange id numbers (ids are reassigned sequentially)
- Scale bounding boxes to match images real size (3840x2160)

In [None]:
import os
import re
import json
from pathlib import Path

# -------- CONFIG ----------
# Folder holding the images, the COCO file to clean, and where the result goes.
images_folder = "../../data/datasets/VeryReduced_InfectedLeaves/images"
input_json = "../../data/datasets/VeryReduced_InfectedLeaves/annotations/_annotations.coco.fixed.json"
output_json = "../../data/datasets/VeryReduced_InfectedLeaves/annotations/_annotations.coco-cleaned.fixed.json"

# Scaling factors (from 640x360 → 3840x2160), horizontal and vertical
scale_x, scale_y = 3840 / 640, 2160 / 360

# Renumber kept images sequentially, starting at 1.
reassign_image_ids = True
# Renumber kept annotations sequentially, starting at 1.
reassign_annotation_ids = True
# When renumbering, also store the previous id under "original_id".
keep_original_ids = False
# Store the uncleaned file name under "original_file_name".
keep_original_fname = False
# ---------------------------


def clean_filename(fname: str) -> str:
    """Drop the ``_jpg.rf.<hex>`` marker that sits in front of the extension.

    A name ending in ``_jpg.rf.<hex digits>.<jpg|jpeg|png>`` (matched
    case-insensitively; presumably the suffix added by a Roboflow export) is
    reduced to ``<stem>.<ext>``, with the extension kept exactly as written.
    Any other name is returned unchanged.
    """
    suffix = re.search(r"_jpg\.rf\.[0-9a-f]+\.(jpg|jpeg|png)$", fname, flags=re.IGNORECASE)
    if suffix is None:
        return fname
    # "$" also matches just before a final newline, so re-attach whatever
    # follows the match to keep the result identical to a substitution.
    return f"{fname[:suffix.start()]}.{suffix.group(1)}{fname[suffix.end():]}"

# Read the source COCO annotations into memory.
with open(input_json, encoding="utf-8") as f:
    coco = json.load(f)

# Collect the names of the regular files in the images folder, cleaned and
# normalised (lower-cased, surrounding whitespace removed) so they can be
# matched against the file names stored in the JSON.
p = Path(images_folder)
existing_files = set()
for entry in p.iterdir():
    if not entry.is_file():
        continue
    existing_files.add(clean_filename(entry.name).lower().strip())

# Filter images and create mapping old_id -> new_id
# An image entry is kept only when its cleaned, lower-cased file name matches a
# file found in the images folder; every other entry is recorded in
# dropped_images under its original (basename, stripped) file name.
oldid_to_newid = {}
kept_images = []
dropped_images = []

next_img_id = 1
for img in coco.get("images", []):
    original_fname = os.path.basename(img.get("file_name", "")).strip()
    clean_fname = clean_filename(original_fname)

    if clean_fname.lower() not in existing_files:
        dropped_images.append(original_fname)
        continue

    # Shallow copy so the entry inside the loaded JSON is left untouched.
    img_copy = img.copy()
    if keep_original_fname:
        img_copy["original_file_name"] = img_copy["file_name"]
    img_copy["file_name"] = clean_fname

    # The annotation boxes are multiplied by (scale_x, scale_y) when they are
    # remapped, so the declared image size has to be scaled the same way;
    # otherwise the output pairs full-resolution boxes with the old, smaller
    # width/height.
    # NOTE(review): assumes the JSON width/height describe the same frame the
    # input boxes are expressed in — confirm against the input file.
    for dim_key, factor in (("width", scale_x), ("height", scale_y)):
        if dim_key in img_copy:
            img_copy[dim_key] = round(img_copy[dim_key] * factor)

    old_id = img["id"]
    if reassign_image_ids:
        if keep_original_ids:
            img_copy["original_id"] = old_id
        img_copy["id"] = next_img_id
        oldid_to_newid[old_id] = next_img_id
        next_img_id += 1
    else:
        # Identity mapping: annotations still need to find their image.
        oldid_to_newid[old_id] = old_id

    kept_images.append(img_copy)

# Filter & remap annotations
# An annotation is dropped when its image was dropped, or when its stored area
# is below 1 (a missing "area" counts as 0). Survivors get the remapped image
# id, rescaled geometry and, optionally, a fresh sequential id.
kept_annotations = []
dropped_annotations = []
dropped_small_annotations = []
next_ann_id = 1
for ann in coco.get("annotations", []):
    old_img_id = ann["image_id"]
    if old_img_id not in oldid_to_newid:
        dropped_annotations.append(ann.get("id"))
        continue

    # The threshold applies to the area as stored in the input, i.e. before
    # any rescaling.
    if ann.get("area", 0) < 1:
        dropped_small_annotations.append(ann.get("id"))
        continue

    ann_copy = ann.copy()
    ann_copy["image_id"] = oldid_to_newid[old_img_id]

    # Rescale bbox ([x, y, width, height])
    if "bbox" in ann_copy:
        x, y, w, h = ann_copy["bbox"]
        x *= scale_x
        y *= scale_y
        w *= scale_x
        h *= scale_y
        ann_copy["bbox"] = [x, y, w, h]

        # Update area as well
        ann_copy["area"] = w * h

    # Rescale polygon segmentations too, so they stay in the same frame as the
    # bbox. Polygons are flat [x1, y1, x2, y2, ...] lists: even positions are
    # x coordinates, odd positions are y coordinates. Anything that is not a
    # non-empty list of polygons (e.g. an empty list or an RLE dict) is left
    # as it is.
    segmentation = ann_copy.get("segmentation")
    if isinstance(segmentation, list) and segmentation:
        ann_copy["segmentation"] = [
            [coord * (scale_y if i % 2 else scale_x) for i, coord in enumerate(polygon)]
            if isinstance(polygon, list)
            else polygon
            for polygon in segmentation
        ]
        # With polygons the area describes the mask rather than the box, so
        # scale the stored value instead of replacing it with the box area.
        ann_copy["area"] = ann["area"] * scale_x * scale_y

    # optionally reassign annotation id
    if reassign_annotation_ids:
        if keep_original_ids:
            ann_copy["original_id"] = ann["id"]
        ann_copy["id"] = next_ann_id
        next_ann_id += 1

    kept_annotations.append(ann_copy)

# Assemble the output: the metadata sections are carried over from the input
# (falling back to an empty container when absent), while images and
# annotations are the filtered lists built above.
filtered_coco = {
    section: coco.get(section, fallback)
    for section, fallback in (("info", {}), ("licenses", []), ("categories", []))
}
filtered_coco["images"] = kept_images
filtered_coco["annotations"] = kept_annotations

# Write it out as indented UTF-8 JSON, leaving non-ASCII characters unescaped.
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(filtered_coco, f, ensure_ascii=False, indent=2)

# Summary: one count line per category, followed by up to ten examples
# whenever that category actually dropped something.
summary_rows = (
    (f"Images kept: {len(kept_images)}", None, None),
    (
        f"Images dropped (not found): {len(dropped_images)}",
        "Dropped image files (examples):",
        dropped_images,
    ),
    (f"Annotations kept: {len(kept_annotations)}", None, None),
    (
        f"Annotations dropped (missing images): {len(dropped_annotations)}",
        "Dropped annotation ids (examples):",
        dropped_annotations,
    ),
    (
        f"Annotations dropped (area < 1 px): {len(dropped_small_annotations)}",
        "Dropped small annotation ids (examples):",
        dropped_small_annotations,
    ),
)
for count_line, examples_label, dropped_items in summary_rows:
    print(count_line)
    if dropped_items:
        print(examples_label, dropped_items[:10])
print(f"✅ Rescaled bboxes saved to {output_json}")


## Slice windows with Sahi

In [None]:
from sahi.slicing import slice_coco

# Slice the dataset described by the cleaned annotation file (the same path the
# cleaning cell writes to) into 640x640 windows, requesting a 0.2 overlap ratio
# along both axes, with "sliced" as the output directory.
# NOTE(review): the layout and file names under "sliced" are decided by sahi —
# confirm they match the "sliced/annotations/sliced.json" and "sliced/images/"
# paths read by the YOLO conversion cell.
coco_dict, coco_path = slice_coco(
    coco_annotation_file_path="../../data/datasets/VeryReduced_InfectedLeaves/annotations/_annotations.coco-cleaned.fixed.json",
    image_dir="../../data/datasets/VeryReduced_InfectedLeaves/images/",
    slice_height=640,
    slice_width=640,
    overlap_height_ratio=0.2,
    overlap_width_ratio=0.2,
    # presumably skips windows that contain no annotation — verify in the sahi docs
    ignore_negative_samples=True,
    # NOTE(review): check whether sahi appends its own suffix to this name
    output_coco_annotation_file_name="slice_coco_annotations.json",
    output_dir="sliced"
)

## Convert the original COCO-style dataset to the format expected by YOLO

In [None]:
import json
import os
import re
import random
from pathlib import Path

# Config
coco_json = Path("sliced/annotations/sliced.json")  # sliced COCO annotations to convert
images_dir = Path("sliced/images/")  # folder holding the sliced images
output_dir = Path("veryreduced-yolodrone-sliced")  # root of the generated YOLO dataset
train_ratio = 0.8  # 80% train, 20% val

# Make directories (images/ and labels/, each with a train and a val split)
for _subdir in ("images/train", "images/val", "labels/train", "labels/val"):
    (output_dir / _subdir).mkdir(parents=True, exist_ok=True)

# Load COCO annotations
# Decode explicitly as UTF-8 instead of the platform's locale encoding, the
# same way the cleaning cell reads and writes its JSON files.
with open(coco_json, "r", encoding="utf-8") as f:
    coco = json.load(f)

# Function to clean filenames
def clean_filename(filename: str) -> str:
    """Replace everything from the first "_jpg" to the end of the line with ".jpg".

    The pattern is not anchored to a particular suffix: any "_jpg" occurrence
    triggers the replacement, and the result always ends in ".jpg" whatever
    the original extension was. Names without "_jpg" are returned unchanged.
    """
    marker = re.compile(r"_jpg.*")
    return marker.sub(".jpg", filename)

# Per-image lookups keyed by image id: cleaned file name and (width, height).
image_id_to_filename = {}
image_id_to_size = {}
for image_entry in coco["images"]:
    image_key = image_entry["id"]
    image_id_to_filename[image_key] = clean_filename(image_entry["file_name"])
    image_id_to_size[image_key] = (image_entry["width"], image_entry["height"])

# Categories: strip surrounding whitespace from the names and give categories
# that share a name the same zero-based class index, numbered in order of
# first appearance.
name_to_index = {}
cat_id_to_newid = {}
for category in coco["categories"]:
    label = category["name"].strip()
    cat_id_to_newid[category["id"]] = name_to_index.setdefault(label, len(name_to_index))
cleaned_names = list(name_to_index)

categories = cat_id_to_newid
class_names = cleaned_names

# Start every known image with an empty list, so images without annotations
# still get an (empty) entry.
annotations_per_image = {image_key: [] for image_key in image_id_to_filename}

for annotation in coco["annotations"]:
    owner_id = annotation["image_id"]
    frame_w, frame_h = image_id_to_size[owner_id]

    # COCO boxes are [x_min, y_min, width, height] in pixels; a YOLO row is
    # [class, x_center, y_center, width, height] with the geometry divided by
    # the image size.
    left, top, box_w, box_h = annotation["bbox"]
    yolo_row = [
        categories[annotation["category_id"]],
        (left + box_w / 2) / frame_w,
        (top + box_h / 2) / frame_h,
        box_w / frame_w,
        box_h / frame_h,
    ]
    annotations_per_image[owner_id].append(yolo_row)

# Random train/val split over the image ids (unseeded, so it differs per run).
image_ids = list(image_id_to_filename)
random.shuffle(image_ids)
split_idx = int(len(image_ids) * train_ratio)
train_ids = image_ids[:split_idx]
val_ids = image_ids[split_idx:]

# Helper to link images and write annotations
def process_split(ids, split):
    """Link one split's images into the YOLO tree and write their label files.

    For every id in *ids* the image is looked up in ``images_dir``, symlinked
    (by absolute path) into ``output_dir/images/<split>/`` and its rows from
    ``annotations_per_image`` are written, one per line, to
    ``output_dir/labels/<split>/<stem>.txt``. Images missing from
    ``images_dir`` are reported and skipped; no label file is written for them.

    The function can be re-run: a link (or file) already present at the
    destination is replaced instead of raising ``FileExistsError``.
    NOTE(review): the train/val shuffle is unseeded, so a re-run may assign an
    image to the other split; links from earlier runs in that other split are
    not removed here — clear ``output_dir`` first if that matters.

    Args:
        ids: Image ids to process; each must be a key of
            ``image_id_to_filename`` and ``annotations_per_image``.
        split: Name of the split sub-directory (the script uses "train" and
            "val"); the directories must already exist.
    """
    for img_id in ids:
        filename = image_id_to_filename[img_id]
        src_img = images_dir / filename
        dst_img = output_dir / f"images/{split}/{filename}"

        if not src_img.exists():
            print(f"⚠️ Warning: Image not found {src_img}, skipping.")
            continue

        # Symlink image. is_symlink() is checked as well because exists()
        # is False for a dangling link, which would still block os.symlink.
        if dst_img.is_symlink() or dst_img.exists():
            dst_img.unlink()
        os.symlink(src_img.absolute(), dst_img)

        # Write label: floats with six decimals, everything else via str().
        label_file = output_dir / f"labels/{split}/{Path(filename).stem}.txt"
        with open(label_file, "w") as f:
            for ann in annotations_per_image[img_id]:
                f.write(" ".join([f"{a:.6f}" if isinstance(a, float) else str(a) for a in ann]) + "\n")

# Populate both splits, train first.
for split_name, split_ids in (("train", train_ids), ("val", val_ids)):
    process_split(split_ids, split_name)

# Write data.yaml: split locations, class count and the class-name list
# (rendered with Python's list formatting).
yaml_content = f"""train: images/train
val: images/val

nc: {len(class_names)}
names: {class_names}
"""

(output_dir / "data.yaml").write_text(yaml_content)

print(f"✅ Conversion complete! YOLO dataset ready at: {output_dir}")