# In [3]:
"""
README: YOLO City-wise and Class-wise Model Evaluation for Custom Split

This script:
- Evaluates a YOLO model on the test split for multiple cities (using city-specific data.yaml files),
- Collects and saves per-class metrics (Precision, Recall, mAP50, mAP50-95) for each city,
- Exports a summary CSV per city, and an aggregated CSV for all cities, including a final row with the unweighted mean over all city/class rows.

How to use:
- Set 'model_name', 'weights_path', and the dataset base path as needed.
- Define your city list and folder conventions.
- Run the script or call the function for the desired split/model.
- CSV outputs are written under /PATH/TO/runs/detect/{model_name}/city_results_{split_ratio}/ (one sub-folder per city, plus the aggregated CSV).

Requirements:
- ultralytics
- torch
- pandas

Author: Bahadir Akin Akgul
Date: 13.07.2025
"""

from pathlib import Path
from typing import Iterable, Optional

import pandas as pd
import torch
from ultralytics import YOLO

# Column layout shared by the per-city CSV files and the aggregated CSV.
_METRIC_COLUMNS = ['Precision', 'Recall', 'mAP50', 'mAP50-95']
_RESULT_COLUMNS = ['Model', 'Split', 'City', 'Class'] + _METRIC_COLUMNS


def _collect_class_rows(results, model_name: str, split_ratio: str, city: str) -> list:
    """Build one CSV row per class that the validator actually evaluated.

    The per-class arrays on ``results.box`` (``p``, ``r``, ``all_ap``, ``ap``)
    are positional: entry ``i`` belongs to class ``results.box.ap_class_index[i]``,
    not to class id ``i``. Classes without any instance in the evaluated split
    are absent from those arrays, so indexing them by class id would either
    mislabel rows or raise ``IndexError``.

    Args:
        results: Object returned by ``model.val()``; must expose ``names``
            (class id -> class name) and ``box``.
        model_name: Value written to the ``Model`` column.
        split_ratio: Value written to the ``Split`` column.
        city: Value written to the ``City`` column.

    Returns:
        A list of dicts keyed by ``_RESULT_COLUMNS``; empty when no class was
        evaluated.
    """
    box = results.box
    class_names = results.names
    precision, recall = box.p, box.r
    ap_per_iou, ap_mean = box.all_ap, box.ap

    rows = []
    for position, class_id in enumerate(box.ap_class_index):
        rows.append({
            'Model': model_name,
            'Split': split_ratio,
            'City': city,
            'Class': class_names[int(class_id)],
            'Precision': round(float(precision[position]), 3),
            'Recall': round(float(recall[position]), 3),
            # Column 0 of all_ap is the AP at IoU threshold 0.50.
            'mAP50': round(float(ap_per_iou[position][0]), 3),
            'mAP50-95': round(float(ap_mean[position]), 3),
        })
    return rows


def evaluate_model_on_split(
    split_ratio: str,
    model_name: str,
    *,
    cities: Optional[Iterable[str]] = None,
):
    """Evaluate a trained YOLO model on the test split of every city.

    For each city the model is validated against that city's ``data.yaml``,
    the per-class metrics are written to ``<city>_class_results.csv`` inside
    the city's result folder, and finally all rows (plus an ``ALL_MEAN`` row
    holding the unweighted mean over every city/class row) are written to
    ``city_class_results_{split_ratio}.csv``.

    Args:
        split_ratio: Label of the train/test split (e.g. ``'70-30'``); used in
            folder names, file names and the ``Split`` column.
        model_name: Name of the run folder under ``/PATH/TO/runs/detect`` that
            contains ``weights/best.pt``.
        cities: City identifiers to evaluate. Each one selects the dataset
            folder ``roadtr-YYYYMMDD-{city}``. Defaults to Istanbul, Paris,
            Munich and Marseille.
    """
    # === Paths ===
    weights_path = f'/PATH/TO/runs/detect/{model_name}/weights/best.pt'
    city_results_base = Path(f'/PATH/TO/runs/detect/{model_name}/city_results_{split_ratio}')
    city_dataset_base = Path('/PATH/TO/dataset-root')

    # === List of cities ===
    if cities is None:
        cities = ['istanbul', 'paris', 'munich', 'marseille']
    else:
        cities = list(cities)  # materialise once; a generator could only be consumed a single time

    city_results_base.mkdir(parents=True, exist_ok=True)

    # === Load model
    model = YOLO(weights_path)

    # === Collect all results
    all_results = []

    for city in cities:
        print(f"\nTesting {city.upper()} ({split_ratio})...")

        # Dataset/data.yaml path for the city
        city_data_yaml = city_dataset_base / f'roadtr-YYYYMMDD-{city}' / 'data.yaml'

        # Output directory for this city
        city_save_dir = city_results_base / city
        city_save_dir.mkdir(parents=True, exist_ok=True)

        # model.val() — use test split.
        # `save_dir` is not honored as a val() override (the run log recorded
        # with this script shows artifacts under runs/detect/<name>), so the
        # destination is routed through `project`/`name`: plots end up in
        # <city_save_dir>/<name>.
        results = model.val(
            data=str(city_data_yaml),
            split='test',
            imgsz=1024,
            batch=64,
            device=[0, 1],        # Adjust as needed
            project=str(city_save_dir),
            name=f'{model_name}-{split_ratio}-{city}',
            save_json=False,
            verbose=True,
            plots=True,
            conf=0.001,
            rect=True,
        )

        # Collect per-class metrics
        city_result_rows = _collect_class_rows(results, model_name, split_ratio, city)
        all_results.extend(city_result_rows)

        # City-specific class results CSV (explicit columns keep the header
        # even when no class was evaluated)
        city_df = pd.DataFrame(city_result_rows, columns=_RESULT_COLUMNS)
        city_csv_path = city_save_dir / f'{city}_class_results.csv'
        city_df.to_csv(city_csv_path, index=False)
        print(f"Class results for {city.upper()} saved to: {city_csv_path}")

        print(f"Finished testing {city.upper()} ({split_ratio})")

    # Combine all cities results
    df = pd.DataFrame(all_results, columns=_RESULT_COLUMNS)

    # Add overall mean row; skipped when there is nothing to average so an
    # empty city list does not produce a NaN row.
    if not df.empty:
        mean_row = {
            'Model': model_name,
            'Split': split_ratio,
            'City': 'ALL',
            'Class': 'ALL_MEAN',
        }
        for column in _METRIC_COLUMNS:
            mean_row[column] = round(df[column].mean(), 3)
        df = pd.concat([df, pd.DataFrame([mean_row])], ignore_index=True)
        print("\nOverall mean row added.")

    # Save all results
    csv_path = city_results_base / f'city_class_results_{split_ratio}.csv'
    df.to_csv(csv_path, index=False)

    print(f"All city/class results for split {split_ratio} saved to '{csv_path}'")


# === Usage Example ===
# Uncomment and edit as needed to run for your experiments:

# evaluate_model_on_split('65-35', 'yolo-10-65-35-batch-17')
# evaluate_model_on_split('65-35', 'exp_6535')
# evaluate_model_on_split('95-5', 'yolo-10-95-52')
# evaluate_model_on_split('95-5', 'exp_955')
# evaluate_model_on_split('75-25', 'yolo-10-75-25-2')
# evaluate_model_on_split('75-25', 'yolo-8-75-252')
# evaluate_model_on_split('70-30', 'yolov10-70-30')
# evaluate_model_on_split('70-30', 'yolo-8-75-252')



🔵 ISTANBUL (70-30) test ediliyor...
Ultralytics 8.3.91 🚀 Python-3.10.15 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
                                                       CUDA:1 (Tesla P100-PCIE-16GB, 16269MiB)
Model summary (fused): 112 layers, 43,608,921 parameters, 0 gradients, 164.8 GFLOPs


[34m[1mval: [0mScanning /truba/home/baakgul/roadtr-14032025-istanbul/test/labels.cache... 751 images, 0 backgrounds, 0 corrupt: 100%|██████████| 751/751 [00:00<?, ?it/s]




                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 12/12 [00:27<00:00,  2.33s/it]


                   all        751      18766      0.827      0.653        0.7      0.505
            pedestrian        560       8383       0.74      0.349      0.417      0.209
                  road        750        804      0.935       0.93      0.948      0.815
               vehicle        739       9579      0.806       0.68      0.735       0.49
Speed: 0.8ms preprocess, 25.8ms inference, 0.0ms loss, 0.8ms postprocess per image
Results saved to runs/detect/yolo-8-75-252-70-30-istanbul
📄 ISTANBUL için class sonuçları: /arf/home/baakgul/runs/detect/yolo-8-75-252/city_results_70-30/istanbul/istanbul_class_results.csv
✅ ISTANBUL (70-30) test tamamlandı!

🔵 PARIS (70-30) test ediliyor...
Ultralytics 8.3.91 🚀 Python-3.10.15 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
                                                       CUDA:1 (Tesla P100-PCIE-16GB, 16269MiB)


[34m[1mval: [0mScanning /truba/home/baakgul/roadtr-14032025-paris/test/labels.cache... 48 images, 0 backgrounds, 0 corrupt: 100%|██████████| 48/48 [00:00<?, ?it/s]




                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:02<00:00,  2.94s/it]


                   all         48       1374      0.744      0.663      0.689      0.485
            pedestrian         42        598      0.578      0.363      0.384      0.188
                  road         48         58       0.88      0.882      0.912      0.781
               vehicle         48        718      0.774      0.744      0.771      0.488
Speed: 0.2ms preprocess, 24.1ms inference, 0.0ms loss, 1.0ms postprocess per image
Results saved to runs/detect/yolo-8-75-252-70-30-paris
📄 PARIS için class sonuçları: /arf/home/baakgul/runs/detect/yolo-8-75-252/city_results_70-30/paris/paris_class_results.csv
✅ PARIS (70-30) test tamamlandı!

🔵 MUNIH (70-30) test ediliyor...
Ultralytics 8.3.91 🚀 Python-3.10.15 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
                                                       CUDA:1 (Tesla P100-PCIE-16GB, 16269MiB)


[34m[1mval: [0mScanning /truba/home/baakgul/roadtr-14032025-munih/test/labels.cache... 245 images, 1 backgrounds, 0 corrupt: 100%|██████████| 245/245 [00:00<?, ?it/s]




                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 4/4 [00:10<00:00,  2.66s/it]


                   all        245       2656      0.794      0.796      0.817      0.561
            pedestrian         97        431      0.661      0.639      0.638       0.32
                  road        244        269      0.944      0.943      0.976       0.82
               vehicle        236       1956      0.777      0.807      0.836      0.541
Speed: 2.1ms preprocess, 26.3ms inference, 0.0ms loss, 1.9ms postprocess per image
Results saved to runs/detect/yolo-8-75-252-70-30-munih
📄 MUNIH için class sonuçları: /arf/home/baakgul/runs/detect/yolo-8-75-252/city_results_70-30/munih/munih_class_results.csv
✅ MUNIH (70-30) test tamamlandı!

🔵 MARSILYA (70-30) test ediliyor...
Ultralytics 8.3.91 🚀 Python-3.10.15 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-16GB, 16269MiB)
                                                       CUDA:1 (Tesla P100-PCIE-16GB, 16269MiB)


[34m[1mval: [0mScanning /truba/home/baakgul/roadtr-14032025-marsilya/test/labels.cache... 273 images, 0 backgrounds, 0 corrupt: 100%|██████████| 273/273 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 5/5 [00:11<00:00,  2.32s/it]


                   all        273       4969      0.819      0.695      0.728      0.509
            pedestrian        144       1247      0.675      0.331      0.385      0.169
                  road        265        269      0.971      0.989      0.981      0.859
               vehicle        272       3453      0.812      0.764      0.817        0.5
Speed: 1.9ms preprocess, 26.2ms inference, 0.0ms loss, 0.8ms postprocess per image
Results saved to runs/detect/yolo-8-75-252-70-30-marsilya
📄 MARSILYA için class sonuçları: /arf/home/baakgul/runs/detect/yolo-8-75-252/city_results_70-30/marsilya/marsilya_class_results.csv
✅ MARSILYA (70-30) test tamamlandı!

📊 Toplam ortalama eklendi.
🏁 70-30 split için tüm şehir class sonuçları '/arf/home/baakgul/runs/detect/yolo-8-75-252/city_results_70-30/city_class_results_70-30.csv' dosyasına kaydedildi!
