# Script that converts the original COCO-annotated drone dataset into the dataset layout expected by YOLO

In [None]:
import json
import os
import re
import random
from pathlib import Path

# Config
# Input: the COCO-format annotation file and the directory with the source images.
coco_json = "../../data/datasets/InfectedLeaves_v14/annotations/coco/_annotations.coco.json"
images_dir = Path("../../data/datasets/InfectedLeaves_v14/images/")
# Output: root directory of the generated YOLO dataset.
output_dir = Path("../../data/datasets/yolo_drone/")
train_ratio = 0.8  # 80% train, 20% val

# Make directories (idempotent: existing directories are left as they are)
for subdir in ("images/train", "images/val", "labels/train", "labels/val"):
    (output_dir / subdir).mkdir(parents=True, exist_ok=True)

# Load COCO annotations
# The encoding is spelled out so that parsing does not depend on the platform's
# default text encoding, which is not UTF-8 everywhere.
with open(coco_json, "r", encoding="utf-8") as f:
    coco = json.load(f)

# Function to clean filenames
def clean_filename(filename: str) -> str:
    """Return *filename* with everything from ``_jpg`` onward replaced by ``.jpg``.

    Names that do not contain ``_jpg`` are returned unchanged.
    """
    export_suffix = re.compile(r"_jpg.*")
    return export_suffix.sub(".jpg", filename)

# Build lookup tables: one pass over the image records fills both the
# id -> cleaned file name map and the id -> (width, height) map.
image_id_to_filename = {}
image_id_to_size = {}
for image_entry in coco["images"]:
    entry_id = image_entry["id"]
    image_id_to_filename[entry_id] = clean_filename(image_entry["file_name"])
    image_id_to_size[entry_id] = (image_entry["width"], image_entry["height"])

# Clean and deduplicate category names.
# Names are compared after stripping surrounding whitespace; each distinct name
# gets the next free index in order of first appearance, and every original
# category id is mapped to the index of its stripped name.
name_to_index = {}
cleaned_names = []
cat_id_to_newid = {}
for category in coco["categories"]:
    label = category["name"].strip()
    if label not in name_to_index:
        name_to_index[label] = len(cleaned_names)
        cleaned_names.append(label)
    cat_id_to_newid[category["id"]] = name_to_index[label]

# Aliases used by the rest of the script.
categories = cat_id_to_newid
class_names = cleaned_names

# Collect annotations per image.
# Every known image starts with an empty list, so images without any
# annotation are still present in the mapping.
annotations_per_image = {image_id: [] for image_id in image_id_to_filename}

for annotation in coco["annotations"]:
    owner_id = annotation["image_id"]
    frame_w, frame_h = image_id_to_size[owner_id]

    # COCO box layout: [x_min, y_min, width, height]
    left, top, box_w, box_h = annotation["bbox"]

    # YOLO row: class index, then box centre and box size, each divided by
    # the matching image dimension.
    yolo_row = [
        categories[annotation["category_id"]],
        (left + box_w / 2) / frame_w,
        (top + box_h / 2) / frame_h,
        box_w / frame_w,
        box_h / frame_h,
    ]
    annotations_per_image[owner_id].append(yolo_row)

# Shuffle and split dataset
# A private, seeded generator makes the split repeat from run to run. With an
# unseeded shuffle every run draws a different split, which makes results hard
# to reproduce and, if output from an earlier run is still on disk, can leave
# the same image in both the train and the val folders.
split_seed = 42
image_ids = list(image_id_to_filename.keys())
random.Random(split_seed).shuffle(image_ids)
# The first `train_ratio` share (rounded down) is train, the remainder is val.
split_idx = int(len(image_ids) * train_ratio)
train_ids, val_ids = image_ids[:split_idx], image_ids[split_idx:]

# Helper to symlink images and write YOLO label files
def process_split(ids, split):
    """Link the images of one split into the output tree and write their labels.

    For each image id the source image under ``images_dir`` is symlinked to
    ``output_dir/images/<split>/`` and a ``.txt`` file with the same stem is
    written to ``output_dir/labels/<split>/``. Each label line holds the
    values of one annotation separated by spaces; floats are written with six
    decimals, everything else via ``str``.

    Images whose source file does not exist are reported on stdout and
    skipped; no link and no label file is created for them. A destination
    entry that already exists is replaced, so the function can be run again
    on the same output directory.

    Args:
        ids: Iterable of image ids (keys of ``image_id_to_filename``).
        split: Name of the split sub-directory, e.g. ``"train"`` or ``"val"``.
    """
    for img_id in ids:
        filename = image_id_to_filename[img_id]
        src_img = images_dir / filename
        dst_img = output_dir / f"images/{split}/{filename}"

        if not src_img.exists():
            print(f"⚠️ Warning: Image not found {src_img}, skipping.")
            continue

        # Symlink image. os.symlink raises FileExistsError when the target
        # name is taken, so remove a leftover entry first. is_symlink() is
        # checked as well because exists() is False for a dangling link.
        if dst_img.is_symlink() or dst_img.exists():
            dst_img.unlink()
        os.symlink(src_img.absolute(), dst_img)

        # Write label
        label_file = output_dir / f"labels/{split}/{Path(filename).stem}.txt"
        with open(label_file, "w", encoding="utf-8") as f:
            for ann in annotations_per_image[img_id]:
                fields = [f"{a:.6f}" if isinstance(a, float) else str(a) for a in ann]
                f.write(" ".join(fields) + "\n")

# Process train and val splits
process_split(train_ids, "train")
process_split(val_ids, "val")

# Write data.yaml
# The class names are serialised with json.dumps: a JSON array of
# double-quoted strings is also a valid YAML flow sequence, whereas Python's
# list repr is not reliably valid YAML once a name contains quotes or
# backslashes. ensure_ascii=False keeps non-ASCII names readable, which is why
# the file is written explicitly as UTF-8 below.
names_yaml = json.dumps(class_names, ensure_ascii=False)
yaml_content = f"""train: images/train
val: images/val

nc: {len(class_names)}
names: {names_yaml}
"""

with open(output_dir / "data.yaml", "w", encoding="utf-8") as f:
    f.write(yaml_content)

print(f"✅ Conversion complete! YOLO dataset ready at: {output_dir}")