# YOLO Model Testing and Evaluation

This notebook provides comprehensive testing and evaluation of trained YOLO models on the BDD100K dataset.

## Features:
- ✅ Load fine-tuned models
- ✅ Run inference on test set
- ✅ Comprehensive metrics (mAP, precision, recall, F1)
- ✅ Per-class performance analysis
- ✅ Confusion matrix visualization
- ✅ Speed benchmarking (FPS, latency)
- ✅ Sample predictions visualization
- ✅ PDF report generation

## Workflow:
1. Import libraries and configuration
2. Load fine-tuned model
3. Verify test dataset
4. Run model evaluation
5. Analyze performance metrics
6. Generate visualizations
7. Create comprehensive PDF report

## 1. Import Libraries

In [None]:
# Install required libraries (uncomment if running in Colab)
# !pip install -q ultralytics pyyaml reportlab pillow

import os
import sys
import yaml
import json
import torch
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from datetime import datetime
from tqdm import tqdm
import pandas as pd
import random
import shutil
import time
import glob
from typing import Dict, Tuple, Any
from matplotlib.patches import Rectangle

# YOLO imports
from ultralytics import YOLO
from ultralytics.utils.torch_utils import get_flops

# PDF generation imports
from reportlab.lib.pagesizes import letter, A4
from reportlab.lib import colors
from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image, PageBreak
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_CENTER, TA_LEFT
from PIL import Image as PILImage

import warnings
warnings.filterwarnings('ignore')

# Configure matplotlib for notebook display
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (15, 10)

# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('‚úì Libraries imported successfully')
print(f'‚úì Device: {device}')
if device == 'cuda':
    print(f'  GPU: {torch.cuda.get_device_name(0)}')
    print(f'  CUDA Version: {torch.version.cuda}')
    print(f'  Available Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB')

## 2. Configuration

In [None]:
# Base directories: the project root is the parent of the current working directory.
BASE_DIR = Path.cwd().parent

# Alternative: pin the project root explicitly.
# BASE_DIR = Path("/computer_vision_yolo")

# Model to evaluate, given without the .pt extension.
# YOLO model versions that are fully supported with ultralytics
# (sizes: n=nano, s=small, m=medium, l=large, x=extra-large):
#   YOLOv8:  'yolov8n', 'yolov8s', 'yolov8m', 'yolov8l', 'yolov8x'
#   YOLOv9:  'yolov9s', 'yolov9m', 'yolov9l', 'yolov9x'
#   YOLOv10: 'yolov10n', 'yolov10s', 'yolov10m', 'yolov10l', 'yolov10x'
#   YOLO11:  'yolo11n', 'yolo11s', 'yolo11m', 'yolo11l', 'yolo11x'
#   YOLO12:  'yolo12n', 'yolo12s', 'yolo12m', 'yolo12l', 'yolo12x'
MODEL_NAME = "yolov8n"

MODELS_DIR = BASE_DIR.joinpath('models', MODEL_NAME)
TMP_DIR = BASE_DIR.joinpath('tmp', MODEL_NAME)
RUNS_DIR = BASE_DIR.joinpath('yolo_test', 'runs')

# Dataset selection - keep exactly one option active.
# Option 1: full dataset (~100k images)
# YOLO_DATASET_ROOT = BASE_DIR / 'bdd100k_yolo'
# Option 2: limited dataset (representative samples - for quick testing)
YOLO_DATASET_ROOT = BASE_DIR.joinpath('bdd100k_yolo_limited')
DATA_YAML_PATH = YOLO_DATASET_ROOT.joinpath('data.yaml')

# Split to evaluate: 'train', 'val', or 'test'
USED_DATA_SPLIT = "test"

# Images and labels of the chosen split live in parallel folders.
IMAGES_DIR = YOLO_DATASET_ROOT.joinpath('images', USED_DATA_SPLIT)
LABELS_DIR = YOLO_DATASET_ROOT.joinpath('labels', USED_DATA_SPLIT)

# Fail early, with a hint on how to build the dataset, when data.yaml is missing.
if not DATA_YAML_PATH.exists():
    raise FileNotFoundError(
        f"Dataset not found: {DATA_YAML_PATH}\n\n"
        f"Please run the dataset preparation script first:\n"
        f"  python3 process_bdd100k_to_yolo_dataset.py\n"
    )

# Run identifiers: the on-disk run name is stable across executions, while the
# W&B run name additionally carries a timestamp.
USED_DATASET = YOLO_DATASET_ROOT.name
RUN_TIMESTAMP = datetime.now().strftime('%Y%m%d_%H%M%S')
RUN_NAME = '_'.join([MODEL_NAME, USED_DATASET, USED_DATA_SPLIT])
W_B_RUN_NAME = '_'.join([RUN_NAME, RUN_TIMESTAMP])

# Create the output and scratch directories.
RUN_DIR = RUNS_DIR / RUN_NAME
for required_dir in (RUN_DIR, TMP_DIR, RUNS_DIR):
    required_dir.mkdir(parents=True, exist_ok=True)

# Load class names from data.yaml and build deterministic lookups.
with open(DATA_YAML_PATH, 'r', encoding='utf-8') as f:
    # An empty YAML document parses to None; treat it as "no configuration".
    dataset_config = yaml.safe_load(f) or {}

raw_names = dataset_config.get('names', {})
if isinstance(raw_names, dict):
    # Mapping form: {id: name}; ids may arrive as strings.
    id_name_pairs = [(int(class_id), name) for class_id, name in raw_names.items()]
elif isinstance(raw_names, list):
    # Sequence form: the position is the class id.
    id_name_pairs = list(enumerate(raw_names))
else:
    raise ValueError('Unsupported class name structure in data.yaml')

if not id_name_pairs:
    raise ValueError(f'No class names found in {DATA_YAML_PATH}')

# Order by class id. Other cells rely on list(CLASS_NAMES.values()) and on
# range(NUM_CLASSES) being aligned with the ids, which a mapping listed out of
# order in data.yaml would otherwise break.
CLASS_NAMES = dict(sorted(id_name_pairs, key=lambda pair: pair[0]))

NUM_CLASSES = len(CLASS_NAMES)

# For the same reason the ids must be exactly 0..NUM_CLASSES-1.
if list(CLASS_NAMES) != list(range(NUM_CLASSES)):
    raise ValueError(
        f'Class ids in {DATA_YAML_PATH} must be contiguous 0..{NUM_CLASSES - 1}, '
        f'got {list(CLASS_NAMES)}'
    )

# Fixed seed so every class keeps the same drawing colour between runs.
rng = np.random.default_rng(42)
COLORS = {
    class_id: tuple(int(channel) for channel in rng.integers(40, 255, size=3))
    for class_id in CLASS_NAMES
}
CLASS_NAME_TO_ID = {name: class_id for class_id, name in CLASS_NAMES.items()}

# Print the effective configuration as a "label: value" table between rules.
summary_rule = '=' * 80
summary_fields = [
    ('Model', MODEL_NAME),
    ('Dataset', YOLO_DATASET_ROOT.name),
    ('Split', USED_DATA_SPLIT),
    ('Classes', NUM_CLASSES),
    ('Class Names', list(CLASS_NAMES.values())),
    ('Device', device),
    ('Images dir', IMAGES_DIR),
    ('Labels dir', LABELS_DIR),
    ('Run directory', RUN_DIR),
]

print(summary_rule)
print('CONFIGURATION SUMMARY')
print(summary_rule)
for field_name, field_value in summary_fields:
    print(f'{field_name}: {field_value}')
print(summary_rule)

## 3. Load YOLO Model

In [None]:
# Load the YOLO weights; when no local copy exists, let ultralytics download
# them and try to keep a copy under MODELS_DIR.
model_path = MODELS_DIR / f'{MODEL_NAME}.pt'

if model_path.exists():
    model = YOLO(str(model_path))
    print(f'‚úì Model loaded from {model_path}')
else:
    print(f'Model not found at {model_path}')
    print(f'Downloading {MODEL_NAME} ...')

    try:
        # Request the explicit "<name>.pt" checkpoint for every model family.
        # Appending the suffix only for names starting with "yolov11"/"yolov12"
        # never applied to the documented "yolo11*"/"yolo12*" names.
        MODEL_NAME_n = MODEL_NAME if MODEL_NAME.endswith('.pt') else f'{MODEL_NAME}.pt'
        model = YOLO(MODEL_NAME_n)

        # Create models directory
        MODELS_DIR.mkdir(parents=True, exist_ok=True)

        # Keeping a local copy is best-effort: the model is already loaded, so
        # a failure here is reported but does not abort the notebook.
        try:
            if hasattr(model, 'save'):
                model.save(str(model_path))
                print(f'‚úì Model downloaded and saved to {model_path}')
                print(f'  Size: {model_path.stat().st_size / (1024*1024):.1f} MB')
            else:
                # Fallback: copy the downloaded checkpoint from the ultralytics
                # cache/config folders, taking the first match.
                cache_patterns = [
                    str(Path.home() / '.cache' / 'ultralytics' / '**' / f'{MODEL_NAME}.pt'),
                    str(Path.home() / '.config' / 'Ultralytics' / '**' / f'{MODEL_NAME}.pt'),
                ]

                for pattern in cache_patterns:
                    cache_paths = glob.glob(pattern, recursive=True)
                    if cache_paths:
                        shutil.copy(cache_paths[0], model_path)
                        print(f'‚úì Model downloaded and saved to {model_path}')
                        print(f'  Size: {model_path.stat().st_size / (1024*1024):.1f} MB')
                        break
                else:
                    # No cached file found: keep using the in-memory model.
                    print(f'‚úì Model loaded from ultralytics cache')
                    print(f'  Note: Model is in cache, not copied to {model_path}')
                    print(f'  This is normal and the model will work correctly')
        except Exception as save_error:
            print(f'‚ö†Ô∏è  Could not save model to custom location: {save_error}')
            print(f'‚úì Model loaded successfully from ultralytics cache')

    except Exception as e:
        # Without a model nothing else in the notebook can run: report and re-raise.
        print(f'\n‚ùå Error downloading model: {e}')
        raise

# Get model information: parameter count and on-disk checkpoint size
# (size is reported as 0 when the checkpoint was not saved to model_path).
model_params = sum(p.numel() for p in model.model.parameters())
model_size_mb = model_path.stat().st_size / (1024*1024) if model_path.exists() else 0

# FLOPs for a 640x640 input (standard YOLO input size).
# `get_flops` expects the image size as an int (or [h, w] list), not a full
# (batch, channels, h, w) shape, and its result is already expressed in GFLOPs,
# so it must not be divided by 1e9 again.
try:
    flops_gflops = float(get_flops(model.model, imgsz=640))
except Exception as e:
    print(f'‚ö†Ô∏è  Could not calculate FLOPs: {e}')
    flops_gflops = 0

print(f'\nüìä Model Information:')
print(f'  Model: {MODEL_NAME}')
print(f'  Classes in model: {len(model.names)}')
print(f'  Task: {model.task}')
print(f'  Parameters: {model_params / 1e6:.1f}M')
print(f'  Model Size: {model_size_mb:.1f} MB')
print(f'  FLOPs (640x640): {flops_gflops:.2f} GFLOPs')


## 4. Load Dataset

In [None]:
# Every .jpg/.png image of the selected split, in sorted order.
image_files = sorted([*IMAGES_DIR.glob('*.jpg'), *IMAGES_DIR.glob('*.png')])

# Keep only images that have a same-stem YOLO label file, then derive the
# matching label paths from that filtered list.
valid_images = [img for img in image_files if (LABELS_DIR / f'{img.stem}.txt').exists()]
label_files = sorted(LABELS_DIR / f'{img.stem}.txt' for img in valid_images)

print(f'‚úì Dataset loaded')
print(f'  Total images: {len(image_files)}')
print(f'  Images with labels: {len(valid_images)}')
print(f'  Label files: {len(label_files)}')

# Optional per-image metadata used for attribute-based analysis.
METADATA_DIR = YOLO_DATASET_ROOT / 'representative_json'
PERFORMANCE_FILE = METADATA_DIR / f'{USED_DATA_SPLIT}_performance_analysis.json'

if not PERFORMANCE_FILE.exists():
    # Metadata is optional: continue with empty lookups.
    print(f'\n‚ö†Ô∏è  Performance metadata not found: {PERFORMANCE_FILE}')
    print(f'  Attribute-based analysis will not be available')
    print(f'  Run: python3 process_bdd100k_to_yolo_dataset.py')
    performance_data = None
    image_attributes = {}
else:
    with open(PERFORMANCE_FILE, 'r') as f:
        performance_data = json.load(f)

    print(f'\n‚úì Performance metadata loaded: {PERFORMANCE_FILE.name}')
    print(f'  Images with attributes: {performance_data["total_images"]}')
    print(f'  Attributes available: weather, scene, timeofday')
    print(f'  Per-image class distribution available')

    # Index the per-image records by their 'basename' for quick lookup.
    image_attributes = {img['basename']: img for img in performance_data['images']}

## 6. Run Official YOLO Validation

### IMPORTANT: Optimized Validation Approach
This notebook uses **YOLO's official validation method** to calculate all metrics, confusion matrix, and predictions in a single pass. This approach:
- ✅ **Faster**: Single validation pass instead of multiple loops
- ✅ **No Duplicates**: Uses YOLO's built-in validation logic
- ✅ **Official Metrics**: Provides mAP, precision, recall directly from YOLO
- ✅ **Confusion Matrix**: Extracted from YOLO validation results

In [None]:
# Run official YOLO validation with W&B tracking
print('=' * 80)
print('RUNNING OFFICIAL YOLO VALIDATION WITH W&B TRACKING')
print('=' * 80)

# Stage the split under TMP_DIR in the images/<split> + labels/<split> layout
# that the generated data.yaml below points to.
# NOTE(review): TMP_DIR depends on MODEL_NAME only and existing entries are
# kept, so files staged from a previously selected dataset stay in place -
# confirm whether the folder should be cleared when the dataset changes.
validation_dataset_root = TMP_DIR / 'yolo_validation_dataset'
validation_images_dir = validation_dataset_root / 'images' / USED_DATA_SPLIT
validation_labels_dir = validation_dataset_root / 'labels' / USED_DATA_SPLIT

# Create directories
validation_images_dir.mkdir(parents=True, exist_ok=True)
validation_labels_dir.mkdir(parents=True, exist_ok=True)


def _link_or_copy(source_file: Path, target_file: Path) -> None:
    """Expose source_file at target_file via a symlink, falling back to a copy.

    An existing target is left untouched; a dangling symlink is replaced.
    """
    if target_file.is_symlink() and not target_file.exists():
        # Broken link (its source has moved or was deleted): rebuild it.
        target_file.unlink()
    if target_file.exists():
        return
    try:
        target_file.symlink_to(source_file)
    except (OSError, NotImplementedError):
        # Symlinks may be unavailable (permissions, filesystem): copy instead.
        shutil.copy2(source_file, target_file)


print(f'Setting up validation dataset structure...')

# Link images
for image_pattern in ('*.jpg', '*.png'):
    for img_file in IMAGES_DIR.glob(image_pattern):
        _link_or_copy(img_file, validation_images_dir / img_file.name)

# Link labels (classes.txt is a class listing, not an annotation file)
for label_file in LABELS_DIR.glob('*.txt'):
    if label_file.name == 'classes.txt':
        continue
    _link_or_copy(label_file, validation_labels_dir / label_file.name)

print(f'‚úì Validation dataset prepared')
print(f'  Images: {len(list(validation_images_dir.glob("*")))}')
print(f'  Labels: {len(list(validation_labels_dir.glob("*.txt")))}')

# Create data.yaml file describing the staged dataset
data_yaml_path = validation_dataset_root / 'data.yaml'
iou_threshold = 0.5
data_yaml_content = f"""path: {validation_dataset_root}
train: images/train
val: images/{USED_DATA_SPLIT}
test: images/test

nc: {NUM_CLASSES}
names: {list(CLASS_NAMES.values())}
"""

with open(data_yaml_path, 'w', encoding='utf-8') as f:
    f.write(data_yaml_content)

print(f'‚úì Created data.yaml at: {data_yaml_path}')

# Optional experiment tracking. After this block `wandb` is either the imported
# module (a run was started) or None (package missing or init failed); later
# code checks `if wandb:` before logging.
try:
    import wandb

    tracking_config = dict(
        model=MODEL_NAME,
        dataset=USED_DATASET,
        split=USED_DATA_SPLIT,
        iou_threshold=iou_threshold,
        num_classes=NUM_CLASSES,
        model_params=model_params,
        model_size_mb=model_size_mb,
        flops_gflops=flops_gflops,
    )
    wandb.init(project="yolo-bdd100k-validation", name=W_B_RUN_NAME, config=tracking_config)

    print(f'\n‚úì Weights & Biases initialized: {W_B_RUN_NAME}')
    print(f'  Project: yolo-bdd100k-validation')
    print(f'  Run: {W_B_RUN_NAME}')
except ImportError:
    # The package is not installed: run without tracking.
    print('\n‚ö†Ô∏è  Weights & Biases not available. Install with: pip install wandb')
    wandb = None
except Exception as e:
    # Any other start-up failure also disables tracking.
    print(f'\n‚ö†Ô∏è  W&B initialization error: {e}')
    print('  Continuing without W&B tracking...')
    wandb = None

# Run validation with timing
print('\nRunning YOLO validation...')
start_time = time.time()

validation_results = model.val(
    data=str(data_yaml_path),
    split=USED_DATA_SPLIT,
    device=device,
    save_json=False,
    save_txt=False,
    conf=0.001,
    iou=iou_threshold,
    verbose=True,
    plots=True,
    project=str(RUN_DIR),
    name='yolo_validation'
)

end_time = time.time()
total_time = end_time - start_time  # wall-clock duration of the whole val() call

num_images = len(list(validation_images_dir.glob("*.jpg"))) + len(list(validation_images_dir.glob("*.png")))

# Extract speed metrics from YOLO validation. The validator reports
# milliseconds per image for each stage in `speed`; prefer those over the
# wall-clock average, which also covers everything else val() does (e.g. the
# plots requested above). Fall back to wall-clock time if they are unavailable.
yolo_speed = getattr(validation_results, 'speed', None)
per_image_ms = 0.0
if isinstance(yolo_speed, dict):
    per_image_ms = sum(
        float(yolo_speed.get(stage) or 0.0)
        for stage in ('preprocess', 'inference', 'postprocess')
    )

if per_image_ms > 0:
    avg_inference_time = per_image_ms / 1000.0  # seconds per image
else:
    avg_inference_time = total_time / num_images if num_images > 0 else 0
fps = 1 / avg_inference_time if avg_inference_time > 0 else 0

# GFLOPs: `speed` holds timings only, so the value computed when the model was
# loaded is kept. It is recomputed here only when that value is zero
# (get_flops takes the image size and returns GFLOPs directly).
if flops_gflops:
    print(f'\n‚úì Using pre-calculated GFLOPs: {flops_gflops:.2f} GFLOPs')
else:
    try:
        flops_gflops = float(get_flops(model.model, imgsz=640))
        print(f'\nGFLOPs recomputed after validation: {flops_gflops:.2f} GFLOPs')
    except Exception as e:
        print(f'\nCould not calculate FLOPs: {e}')
        flops_gflops = 0

# Aggregate detection metrics as reported by the YOLO validator.
box_results = validation_results.box
yolo_metrics = dict(
    precision=float(box_results.mp),
    recall=float(box_results.mr),
    map50=float(box_results.map50),
    map50_95=float(box_results.map),
    fitness=float(validation_results.fitness),
)

# Per-class metrics keyed by class name, plus zero-initialised TP/FP/FN
# counters for every class the validator reported.
yolo_class_metrics = {}
class_tp = {}
class_fp = {}
class_fn = {}

# A missing attribute is treated like an empty index list: nothing to extract.
evaluated_class_ids = getattr(box_results, 'ap_class_index', ())
if len(evaluated_class_ids) > 0:
    for position, raw_class_id in enumerate(evaluated_class_ids):
        cls_id = int(raw_class_id)

        # Each metric array is indexed by position in ap_class_index; a
        # position beyond the end of an array falls back to 0.0.
        per_class_values = {}
        for metric_key, metric_series in (
            ('precision', box_results.p),
            ('recall', box_results.r),
            ('ap50', box_results.ap50),
            ('ap50_95', box_results.ap),
        ):
            if position < len(metric_series):
                per_class_values[metric_key] = float(metric_series[position])
            else:
                per_class_values[metric_key] = 0.0

        yolo_class_metrics[CLASS_NAMES.get(cls_id, f'class_{cls_id}')] = per_class_values

        class_tp[cls_id] = 0
        class_fp[cls_id] = 0
        class_fn[cls_id] = 0

# Extract confusion matrix from YOLO validation results; fall back to an
# all-zero matrix when the validator did not provide one.
_cm_holder = getattr(validation_results, 'confusion_matrix', None)
confusion_matrix = getattr(_cm_holder, 'matrix', None)
if confusion_matrix is None:
    confusion_matrix = np.zeros((NUM_CLASSES, NUM_CLASSES), dtype=int)

# Verify confusion matrix exists
if confusion_matrix.size > 0:
    print(f'\n‚úì Confusion matrix extracted successfully')
    print(f'  Shape: {confusion_matrix.shape}')
    print(f'  Diagonal sum (correct): {np.trace(confusion_matrix)}')
else:
    print(f'\n‚ö†Ô∏è  Warning: Confusion matrix is empty')

# Calculate TP, FP, FN from confusion matrix.
# Ultralytics indexes the matrix as [predicted class, true class] (with an
# extra trailing "background" row/column), therefore:
#   row i    = everything predicted as class i  -> row sum - TP    = FP
#   column i = every ground-truth of class i    -> column sum - TP = FN
n_rows, n_cols = confusion_matrix.shape
for i in range(NUM_CLASSES):
    row_ok = i < n_rows
    col_ok = i < n_cols
    diagonal = confusion_matrix[i, i] if (row_ok and col_ok) else 0

    class_tp[i] = int(diagonal)
    class_fp[i] = int(confusion_matrix[i, :].sum() - diagonal) if row_ok else 0
    class_fn[i] = int(confusion_matrix[:, i].sum() - diagonal) if col_ok else 0

# Minimal per-image records (path only) kept for the report-generation code:
# all staged .jpg files first, then all .png files.
results_data = (
    [{'image_path': staged_image} for staged_image in validation_images_dir.glob("*.jpg")]
    + [{'image_path': staged_image} for staged_image in validation_images_dir.glob("*.png")]
)

# Console summary of accuracy and speed.
results_rule = '=' * 80
print('\n' + results_rule)
print('OFFICIAL YOLO VALIDATION RESULTS')
print(results_rule)
print(f"Precision (mean): {yolo_metrics['precision']:.4f}")
print(f"Recall (mean):    {yolo_metrics['recall']:.4f}")
print(f"mAP@0.5:          {yolo_metrics['map50']:.4f}")
print(f"mAP@0.5:0.95:     {yolo_metrics['map50_95']:.4f}")
print(f"Fitness:          {yolo_metrics['fitness']:.4f}")
print(f'\n‚ö° Performance Metrics:')
print(f'  Total Time: {total_time:.2f}s')
print(f'  Average Inference Time: {avg_inference_time*1000:.2f}ms per image')
print(f'  FPS (Frames Per Second): {fps:.2f}')
print(results_rule)

# Log comprehensive metrics to Weights & Biases (skipped when tracking is off).
if wandb:
    try:
        wandb_payload = {
            # Accuracy metrics
            "metrics/precision": yolo_metrics['precision'],
            "metrics/recall": yolo_metrics['recall'],
            "metrics/mAP@0.5": yolo_metrics['map50'],
            "metrics/mAP@0.5:0.95": yolo_metrics['map50_95'],
            "metrics/fitness": yolo_metrics['fitness'],

            # Performance metrics
            "performance/total_time_seconds": total_time,
            "performance/avg_inference_time_ms": avg_inference_time * 1000,
            "performance/fps": fps,
            "performance/images_processed": num_images,

            # Model info
            "model/parameters_millions": model_params / 1e6,
            "model/size_mb": model_size_mb,
            "model/flops_gflops": flops_gflops
        }

        # Per-class metrics go into the same payload so the whole evaluation is
        # recorded as one step instead of one step per class.
        for class_name, metrics in yolo_class_metrics.items():
            wandb_payload[f"class/{class_name}/precision"] = metrics['precision']
            wandb_payload[f"class/{class_name}/recall"] = metrics['recall']
            wandb_payload[f"class/{class_name}/ap50"] = metrics['ap50']
            wandb_payload[f"class/{class_name}/ap50_95"] = metrics['ap50_95']

        wandb.log(wandb_payload)

        print(f'\n‚úì Metrics logged to Weights & Biases')
        # Use the URL reported by the active run: a link built from project and
        # run name alone lacks the entity and run id and does not resolve.
        run_url = getattr(wandb.run, 'url', None)
        if run_url:
            print(f'  Dashboard: {run_url}')
        else:
            print(f'  Project: yolo-bdd100k-validation, run: {W_B_RUN_NAME}')
    except Exception as e:
        # Tracking problems must not abort the evaluation.
        print(f'\n‚ö†Ô∏è  Error logging to W&B: {e}')

# Persist accuracy, speed and model facts as JSON inside the run directory.
# Values are cast to built-in float/int before serialisation.
yolo_results_path = RUN_DIR / 'yolo_validation_metrics.json'
validation_summary = {
    'overall_metrics': yolo_metrics,
    'per_class_metrics': yolo_class_metrics,
    'performance': {
        'total_time_seconds': float(total_time),
        'avg_inference_time_ms': float(avg_inference_time * 1000),
        'fps': float(fps),
        'images_processed': int(num_images)
    },
    'model_info': {
        'parameters': int(model_params),
        'model_size_mb': float(model_size_mb),
        'flops_gflops': float(flops_gflops)
    }
}
with open(yolo_results_path, 'w') as f:
    json.dump(validation_summary, f, indent=2)

print(f'\n‚úì YOLO validation results saved to: {yolo_results_path}')
print(f'‚úì YOLO validation plots saved to: {RUN_DIR / "yolo_validation"}')
print(f'‚úì Confusion matrix extracted and will be visualized in next cell')
print(f'  Diagonal sum (correct predictions): {np.trace(confusion_matrix)}')
print(f'  Total predictions: {confusion_matrix.sum()}')

# Close the W&B run when tracking is active; a failure is only reported.
if wandb:
    try:
        wandb.finish()
        print(f'\n‚úì Weights & Biases run completed successfully')
    except Exception as e:
        print(f'\n‚ö†Ô∏è  Error finishing W&B run: {e}')


## 7. Display Results Summary

In [None]:
def _ratio_or_zero(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is not positive."""
    return numerator / denominator if denominator > 0 else 0


# Per-class table: counts from the confusion matrix, AP from YOLO validation.
metrics_data = []
for class_id in sorted(CLASS_NAMES.keys()):
    n_tp = class_tp.get(class_id, 0)
    n_fp = class_fp.get(class_id, 0)
    n_fn = class_fn.get(class_id, 0)

    class_precision = _ratio_or_zero(n_tp, n_tp + n_fp)
    class_recall = _ratio_or_zero(n_tp, n_tp + n_fn)
    class_f1 = _ratio_or_zero(2 * class_precision * class_recall, class_precision + class_recall)
    class_ap50 = yolo_class_metrics.get(CLASS_NAMES[class_id], {}).get('ap50', 0.0)

    metrics_data.append({
        'Class': CLASS_NAMES[class_id],
        'TP': n_tp,
        'FP': n_fp,
        'FN': n_fn,
        'Precision': class_precision,
        'Recall': class_recall,
        'F1-Score': class_f1,
        'mAP@0.5': class_ap50
    })

df_metrics = pd.DataFrame(metrics_data)

# Overall precision/recall/F1 computed from the summed per-class counts.
total_tp = sum(class_tp.values())
total_fp = sum(class_fp.values())
total_fn = sum(class_fn.values())

overall_precision = _ratio_or_zero(total_tp, total_tp + total_fp)
overall_recall = _ratio_or_zero(total_tp, total_tp + total_fn)
overall_f1 = _ratio_or_zero(2 * overall_precision * overall_recall, overall_precision + overall_recall)

summary_divider = '=' * 80
print(summary_divider)
print('YOLOv8 VALIDATION RESULTS SUMMARY')
print(summary_divider)
print(f'\nDataset: {USED_DATASET} - {USED_DATA_SPLIT} split')
print(f'Images processed: {len(list(validation_images_dir.glob("*")))}')
print(f'IoU Threshold: {iou_threshold}')
print(f'\nOVERALL METRICS (From YOLO Validation):')
print(f'  Precision: {yolo_metrics["precision"]:.4f}')
print(f'  Recall:    {yolo_metrics["recall"]:.4f}')
print(f'  mAP@0.5:   {yolo_metrics["map50"]:.4f}')
print(f'  mAP@0.5:0.95: {yolo_metrics["map50_95"]:.4f}')
print(f'\nOVERALL METRICS (From Confusion Matrix):')
print(f'  Precision: {overall_precision:.4f}')
print(f'  Recall:    {overall_recall:.4f}')
print(f'  F1-Score:  {overall_f1:.4f}')
print(f'\nPER-CLASS METRICS (including mAP@0.5):')
print(df_metrics.to_string(index=False))
print('\n' + summary_divider)

## 8. Visualize Metrics

In [None]:
# Figure 1: Core Metrics (Precision, Recall, F1-Score) plus overall outcome counts
fig1, axes1 = plt.subplots(2, 2, figsize=(18, 12))
ax_precision, ax_recall, ax_f1, ax_counts = axes1.flatten()

# The three per-class panels share one layout: horizontal bars sorted by the
# plotted column, on a fixed 0..1 axis.
per_class_panels = (
    (ax_precision, 'Precision', '#5BC0EB'),
    (ax_recall, 'Recall', '#F25F5C'),
    (ax_f1, 'F1-Score', '#9BC53D'),
)
for panel_ax, metric_column, bar_color in per_class_panels:
    ordered_rows = df_metrics.sort_values(metric_column)
    panel_ax.barh(ordered_rows['Class'], ordered_rows[metric_column], color=bar_color)
    panel_ax.set_title(f'{metric_column} by Class', fontweight='bold', fontsize=16)
    panel_ax.set_xlabel(metric_column, fontweight='bold')
    panel_ax.set_xlim(0, 1)
    panel_ax.grid(axis='x', alpha=0.3)

# Fourth panel: overall TP / FP / FN counts, each bar annotated with its value.
outcome_counts = [total_tp, total_fp, total_fn]
ax_counts.bar(['TP', 'FP', 'FN'], outcome_counts, color=['#177E89', '#ED6A5A', '#F4A259'])
ax_counts.set_title('Overall Detection Outcomes', fontweight='bold', fontsize=16)
ax_counts.set_ylabel('Count', fontweight='bold')
label_offset = max(total_tp, total_fp, total_fn)*0.01  # small gap above each bar
for outcome_bar in ax_counts.patches:
    bar_height = outcome_bar.get_height()
    ax_counts.text(outcome_bar.get_x() + outcome_bar.get_width()/2, bar_height + label_offset,
                   f'{int(bar_height)}', ha='center', fontweight='bold')
ax_counts.grid(axis='y', alpha=0.3)

plt.tight_layout()

# Save Figure 1 (written before show() so the saved file has the rendered figure)
metrics_fig_path = RUN_DIR / 'core_metrics_charts.png'
plt.savefig(metrics_fig_path, dpi=150, bbox_inches='tight')
plt.show()

print(f'‚úì Core metrics visualization saved: {metrics_fig_path}')

# Figure 2: mAP Metrics - per-class mAP@0.5 (left) and overall scores (right)
fig2, axes2 = plt.subplots(1, 2, figsize=(18, 6))
ax_map, ax_overall = axes2.flatten()

# Left panel: classes ordered by their mAP@0.5
ap_column = 'mAP@0.5'
map_sorted = df_metrics.sort_values(ap_column)
ax_map.barh(map_sorted['Class'], map_sorted[ap_column], color='#B388EB')
ax_map.set_title('mAP@0.5 by Class', fontweight='bold', fontsize=16)
ax_map.set_xlabel(ap_column, fontweight='bold')
ax_map.set_xlim(0, 1)
ax_map.grid(axis='x', alpha=0.3)

# Right panel: confusion-matrix-based precision/recall/F1 next to YOLO's mAP values
overall_plot_values = {
    'Precision': overall_precision,
    'Recall': overall_recall,
    'F1-Score': overall_f1,
    'mAP@0.5': yolo_metrics['map50'],
    'mAP@0.5:0.95': yolo_metrics['map50_95']
}
ax_overall.bar(overall_plot_values.keys(), overall_plot_values.values(), color='#FFA630')
ax_overall.set_ylim(0, 1)
ax_overall.set_title('Overall Metrics', fontweight='bold', fontsize=16)
ax_overall.set_ylabel('Score', fontweight='bold')
# Print each score slightly above its bar.
for bar_position, score in enumerate(overall_plot_values.values()):
    ax_overall.text(bar_position, score + 0.02, f'{score:.3f}', ha='center', fontweight='bold')
ax_overall.grid(axis='y', alpha=0.3)

plt.tight_layout()

# Save Figure 2
map_fig_path = RUN_DIR / 'map_metrics_charts.png'
plt.savefig(map_fig_path, dpi=150, bbox_inches='tight')
plt.show()

print(f'‚úì mAP metrics visualization saved: {map_fig_path}')

## 8.5. Confusion Matrix

In [None]:
# Visualize confusion matrix from YOLO validation - centered and compact.
# Only the NUM_CLASSES x NUM_CLASSES class block is drawn; any extra
# background row/column of the matrix is left out.
fig, ax = plt.subplots(figsize=(10, 8))

# Draw each cell manually with solid colors
for i in range(NUM_CLASSES):
    for j in range(NUM_CLASSES):
        value = confusion_matrix[i, j]

        # Determine cell color
        if value == 0:
            cell_color = 'white'    # Empty cell
        elif i == j:
            cell_color = '#00A676'  # Correct predictions
        else:
            cell_color = '#D7263D'  # Misclassifications
        rect = Rectangle((j - 0.5, i - 0.5), 1, 1,
                         facecolor=cell_color,
                         edgecolor='black',
                         linewidth=1.5)
        ax.add_patch(rect)

        # Annotate non-empty cells; counts are shown as integers even when the
        # matrix is stored with a floating-point dtype.
        if value > 0:
            text_color = 'white' if i == j else '#F7F7F7'
            ax.text(j, i, f'{int(value)}', ha='center', va='center',
                    color=text_color, fontsize=9, fontweight='bold')

# Set axis limits and properties (y axis inverted so row 0 is at the top)
ax.set_xlim(-0.5, NUM_CLASSES - 0.5)
ax.set_ylim(NUM_CLASSES - 0.5, -0.5)
ax.set_aspect('equal')

# Set ticks and labels with smaller font.
# Ultralytics indexes the matrix as [predicted class, true class]; rows are
# drawn along the y axis and columns along the x axis, so the y axis is the
# predicted class and the x axis is the true class.
class_labels = [CLASS_NAMES[i] for i in range(NUM_CLASSES)]
ax.set_xticks(np.arange(NUM_CLASSES))
ax.set_yticks(np.arange(NUM_CLASSES))
ax.set_xticklabels(class_labels, fontsize=8, fontweight='bold', rotation=45, ha='right')
ax.set_yticklabels(class_labels, fontsize=8, fontweight='bold')
ax.set_xlabel('True Class', fontweight='bold', fontsize=11)
ax.set_ylabel('Predicted Class', fontweight='bold', fontsize=11)
ax.set_title(f'Confusion Matrix ({MODEL_NAME} validation)', fontweight='bold', fontsize=13)
ax.grid(False)

# Center the confusion matrix in the figure
plt.tight_layout()

# Save confusion matrix figure for PDF report
confusion_matrix_path = RUN_DIR / 'confusion_matrix.png'
plt.savefig(confusion_matrix_path, dpi=150, bbox_inches='tight')
plt.show()

print("(Green = Correct Predictions, Red = Incorrect Predictions, White = No Predictions)")
print(f'‚úì Confusion matrix visualized (from {MODEL_NAME} validation)')
print(f'  Diagonal sum (correct predictions): {np.trace(confusion_matrix)}')
print(f'  Total predictions: {confusion_matrix.sum()}')
print(f'  Saved to: {confusion_matrix_path}')

## 9. Detailed Comparison: Ground Truth vs Predictions

In [None]:
# Folder inside the run directory for the sample comparison outputs.
comparisons_dir = RUN_DIR.joinpath('sample_comparisons')
comparisons_dir.mkdir(exist_ok=True, parents=True)

def draw_ground_truth(img_path: Path, label_path: Path,
                      class_names: Dict[int, str],
                      colors: Dict[int, Tuple[int, int, int]]) -> Tuple[np.ndarray, int]:
    """Render the YOLO-format annotations of one image.

    Each label line is read as ``class_id x_center y_center width height`` with
    the box values scaled by the image width/height. Lines with fewer than five
    fields are ignored, and a missing label file yields an un-annotated image.

    Args:
        img_path: Image to load with OpenCV.
        label_path: Label file belonging to the image.
        class_names: Class ID -> display name; unknown IDs are shown as
            ``class_<id>``.
        colors: Class ID -> color tuple applied to the BGR image; unknown IDs
            are drawn in white.

    Returns:
        ``(image, count)``: the annotated image converted to RGB and the
        number of boxes drawn.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
    """
    canvas = cv2.imread(str(img_path))
    if canvas is None:
        raise FileNotFoundError(f'Image not found: {img_path}')
    img_h, img_w = canvas.shape[:2]

    # Read and tokenise all annotation rows up front.
    annotation_rows = []
    if label_path.exists():
        with open(label_path, 'r') as f:
            annotation_rows = [row.strip().split() for row in f]

    font = cv2.FONT_HERSHEY_SIMPLEX
    n_objects = 0
    for fields in annotation_rows:
        if len(fields) < 5:
            continue
        n_objects += 1

        class_id = int(fields[0])
        cx, cy, box_w, box_h = map(float, fields[1:5])
        # Centre/size (relative) -> corner coordinates (pixels).
        left = int((cx - box_w / 2) * img_w)
        top = int((cy - box_h / 2) * img_h)
        right = int((cx + box_w / 2) * img_w)
        bottom = int((cy + box_h / 2) * img_h)

        color = tuple(int(c) for c in colors.get(class_id, (255, 255, 255)))
        cv2.rectangle(canvas, (left, top), (right, bottom), color, 3)

        # Filled caption strip above the box; black text on light colors,
        # white text on dark ones.
        caption = class_names.get(class_id, f'class_{class_id}')
        (caption_w, caption_h), baseline = cv2.getTextSize(caption, font, 0.7, 2)
        strip_top = max(0, top - caption_h - baseline - 6)
        cv2.rectangle(canvas, (left, strip_top), (left + caption_w + 8, top), color, -1)
        text_color = (0, 0, 0) if sum(color) > 500 else (255, 255, 255)
        cv2.putText(canvas, caption, (left + 4, top - 4), font, 0.7, text_color, 2)

    return cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB), n_objects

def draw_predictions_with_consistent_colors(result: Any,
                                           colors: Dict[int, Tuple[int, int, int]],
                                           class_names: Dict[int, str]) -> np.ndarray:
    """Draw model predictions using same palette as ground truth.

    Every detection is outlined in its class color and tagged with a filled
    label reading "<class name> <confidence>". The tag normally sits directly
    above the box; when the box is too close to the top edge for that, the
    tag is moved down just enough to stay inside the frame so the text is not
    cut off.

    Args:
        result: YOLO prediction result object. Its ``orig_img`` is copied and
            treated as BGR (or single-channel); ``boxes`` may be None or empty.
        colors: Mapping from class ID to BGR color tuple
        class_names: Mapping from class ID to class name

    Returns:
        Annotated RGB image
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.7
    text_thickness = 2

    img_bgr = result.orig_img.copy()
    if img_bgr.ndim == 2:
        # Promote single-channel input so colored overlays can be drawn.
        img_bgr = cv2.cvtColor(img_bgr, cv2.COLOR_GRAY2BGR)

    boxes = result.boxes if result.boxes is not None else ()
    for box in boxes:
        x1, y1, x2, y2 = map(int, box.xyxy[0].tolist())
        conf = float(box.conf[0])
        class_id = int(box.cls[0])

        # Unknown class IDs fall back to white.
        color = tuple(int(c) for c in colors.get(class_id, (255, 255, 255)))
        cv2.rectangle(img_bgr, (x1, y1), (x2, y2), color, 3)

        label = f"{class_names.get(class_id, f'class_{class_id}')} {conf:.2f}"
        (label_w, label_h), baseline = cv2.getTextSize(label, font, font_scale, text_thickness)

        # The tag's bottom edge is the box top, unless that would push the tag
        # above row 0; then the tag is anchored to the top of the frame.
        tag_height = label_h + baseline + 6
        tag_bottom = max(y1, tag_height)
        cv2.rectangle(img_bgr,
                      (x1, tag_bottom - tag_height),
                      (x1 + label_w + 8, tag_bottom), color, -1)

        # Black text on bright tag colors, white text on dark ones.
        text_color = (0, 0, 0) if sum(color) > 500 else (255, 255, 255)
        cv2.putText(img_bgr, label, (x1 + 4, tag_bottom - 4), font,
                    font_scale, text_color, text_thickness)

    return cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

def add_attributes_text(img_rgb: np.ndarray, attrs: Dict[str, Any]) -> np.ndarray:
    """Stamp a weather/scene/time caption onto the bottom edge of an image.

    Args:
        img_rgb: Input RGB image (left untouched; a converted copy is drawn on)
        attrs: Dictionary containing attribute information; the keys
            'weather', 'scene' and 'timeofday' are read, each defaulting to
            'unknown'

    Returns:
        RGB image with the caption drawn over a darkened strip, or the input
        image itself when ``attrs`` is empty
    """
    if not attrs:
        return img_rgb

    caption = (
        f"Weather: {attrs.get('weather', 'unknown')} | "
        f"Scene: {attrs.get('scene', 'unknown')} | "
        f"Time: {attrs.get('timeofday', 'unknown')}"
    )

    face, scale, stroke = cv2.FONT_HERSHEY_SIMPLEX, 0.6, 2
    (_, glyph_h), descent = cv2.getTextSize(caption, face, scale, stroke)

    # OpenCV drawing is done in BGR; convert back before returning.
    canvas = cv2.cvtColor(img_rgb.copy(), cv2.COLOR_RGB2BGR)
    rows, cols = canvas.shape[:2]

    # Darken a full-width strip at the bottom: paint it black on a copy, then
    # blend that copy (60%) with the untouched canvas (40%) in place.
    shaded = canvas.copy()
    cv2.rectangle(shaded, (0, rows - glyph_h - descent - 20), (cols, rows), (0, 0, 0), -1)
    cv2.addWeighted(shaded, 0.6, canvas, 0.4, 0, canvas)

    # White caption, 10px in from the left and 15px up from the bottom edge.
    cv2.putText(canvas, caption, (10, rows - 15), face, scale, (255, 255, 255), stroke)

    return cv2.cvtColor(canvas, cv2.COLOR_BGR2RGB)

def render_comparison(idx: int, img_path: Path,
                     gt_img: np.ndarray, gt_count: int,
                     pred_img: np.ndarray, pred_count: int,
                     attrs: Dict[str, Any]) -> plt.Figure:
    """Build a two-panel figure: ground truth on the left, prediction on the right.

    Args:
        idx: Comparison index (shown in the figure title)
        img_path: Path to the original image (only its file name is shown)
        gt_img: Ground truth annotated image (RGB)
        gt_count: Number of ground truth objects
        pred_img: Prediction annotated image (RGB)
        pred_count: Number of predicted objects
        attrs: Image attributes (weather, scene, time); missing keys are
            shown as 'unknown'

    Returns:
        Matplotlib figure object
    """
    fig, axes = plt.subplots(1, 2, figsize=(20, 10))

    panels = (
        (gt_img, f'Ground Truth ({gt_count} objects)'),
        (pred_img, f'Prediction ({pred_count} objects)'),
    )
    for ax, (picture, caption) in zip(axes, panels):
        ax.imshow(picture)
        ax.set_title(caption, fontweight='bold', fontsize=14)
        ax.axis('off')

    # Figure-level title: file name plus the environmental context.
    weather, scene, timeofday = (
        attrs.get(key, 'unknown') for key in ('weather', 'scene', 'timeofday')
    )
    headline = f'Comparison #{idx}: {img_path.name}\n{weather} | {scene} | {timeofday}'
    fig.suptitle(headline, fontsize=16, fontweight='bold')

    plt.tight_layout()
    return fig

def analyze_comparison(gt_count: int, det_count: int, 
                      result: Any, attrs: Dict[str, Any],
                      class_names: Dict[int, str]) -> str:
    """Summarize one ground-truth/prediction comparison as text.

    The summary has up to four parts: a count comparison, the detected class
    breakdown with confidence statistics (or a "no objects" note), the
    environmental context, and an optional hint when fewer objects were
    detected than labeled at night or in rainy/snowy weather.

    Args:
        gt_count: Number of ground truth objects
        det_count: Number of detected objects
        result: YOLO prediction result; only ``result.boxes`` is read, and
            each box's ``cls[0]`` and ``conf[0]``
        attrs: Image attributes
        class_names: Mapping from class ID to class name

    Returns:
        Analysis string, one finding per line
    """
    # --- Count comparison -------------------------------------------------
    delta = det_count - gt_count
    if delta == 0:
        headline = f"‚úì Perfect count match: {det_count} objects detected (same as ground truth)"
    elif delta > 0:
        headline = f"‚ö†Ô∏è Over-detection: {det_count} objects detected vs {gt_count} ground truth (difference: +{delta})"
    else:
        headline = f"‚ö†Ô∏è Under-detection: {det_count} objects detected vs {gt_count} ground truth (difference: -{-delta})"
    report = [headline]

    # --- Class breakdown and confidence statistics --------------------------
    if not result.boxes:
        report.append("‚ö†Ô∏è No objects detected in this image")
    else:
        per_class = {}
        scores = []
        for det in result.boxes:
            class_id = int(det.cls[0])
            name = class_names.get(class_id, f'class_{class_id}')
            per_class[name] = per_class.get(name, 0) + 1
            scores.append(float(det.conf[0]))

        breakdown = ', '.join(f'{name}={count}' for name, count in sorted(per_class.items()))
        report.append(f"üìä Detected classes: {breakdown}")

        if scores:
            avg_conf = sum(scores) / len(scores)
            min_conf = min(scores)
            max_conf = max(scores)
        else:
            avg_conf = min_conf = max_conf = 0
        report.append(f"üéØ Confidence: avg={avg_conf:.2f}, min={min_conf:.2f}, max={max_conf:.2f}")

    # --- Environmental context ----------------------------------------------
    if attrs:
        weather = attrs.get('weather', 'unknown')
        scene = attrs.get('scene', 'unknown')
        timeofday = attrs.get('timeofday', 'unknown')
        report.append(f"üåç Context: {weather} weather, {scene}, {timeofday}")

        # Hints only apply when fewer objects were found than labeled; the
        # night hint takes precedence over the weather hint.
        missed_objects = det_count < gt_count
        if missed_objects and timeofday == 'night':
            report.append("üí° Lower detection in night conditions - typical challenge for vision models")
        elif missed_objects and weather in ('rainy', 'snowy'):
            report.append("üí° Adverse weather may affect detection performance")

    return '\n'.join(report)

# Produce up to 12 ground-truth vs prediction figures, print a textual
# analysis for each, and persist both the figures and the analyses.
num_comparisons = min(12, len(valid_images))
comparison_image_paths = []
comparison_analyses = []  # Store analyses for report

if num_comparisons == 0:
    print('‚ö†Ô∏è  No labeled images available for comparison generation.')
else:
    # Sample without replacement only when there are more candidates than needed.
    sample_images = random.sample(valid_images, num_comparisons) if len(valid_images) > num_comparisons else valid_images
    print(f'Generating {len(sample_images)} comparison figures...')

    # Pass 1: run inference on every sampled image and keep the first result.
    sample_results = []
    for img_path in tqdm(sample_images, desc='Running inference for comparisons'):
        result = model(str(img_path), verbose=False, device=device)[0]
        sample_results.append({'image_path': img_path,
                               'result': result,
                               'num_detections': len(result.boxes)})

    # Pass 2: render, save and analyze each comparison (numbered from 1).
    for idx, sample_data in enumerate(sample_results, 1):
        img_path = sample_data['image_path']
        label_path = LABELS_DIR / f'{img_path.stem}.txt'
        attrs = image_attributes.get(img_path.stem, {})

        gt_img, gt_count = draw_ground_truth(img_path, label_path, CLASS_NAMES, COLORS)
        gt_img = add_attributes_text(gt_img, attrs)

        pred_img = draw_predictions_with_consistent_colors(sample_data['result'], COLORS, CLASS_NAMES)
        pred_img = add_attributes_text(pred_img, attrs)

        fig = render_comparison(idx, img_path, gt_img, gt_count,
                                pred_img, sample_data['num_detections'], attrs)
        comparison_path = comparisons_dir / f'comparison_{idx:02d}.png'
        # Save through the figure object rather than relying on pyplot's
        # notion of the "current" figure.
        fig.savefig(comparison_path, dpi=150, bbox_inches='tight')
        comparison_image_paths.append(comparison_path)
        plt.show()

        # Generate and display analysis for this comparison
        analysis = analyze_comparison(gt_count, sample_data['num_detections'],
                                      sample_data['result'], attrs, CLASS_NAMES)
        print(f'\nüìù Analysis for Comparison #{idx}:')
        print('-' * 80)
        print(analysis)
        print('-' * 80)

        # Store analysis for report
        comparison_analyses.append({
            'comparison_id': idx,
            'image_name': img_path.name,
            'analysis': analysis
        })

        plt.close(fig)

    # Save all comparison analyses to a text file. The analyses contain
    # non-ASCII symbols, so the encoding is pinned to UTF-8 instead of the
    # platform default (which cannot encode them on some systems).
    analyses_file = comparisons_dir / 'comparison_analyses.txt'
    with open(analyses_file, 'w', encoding='utf-8') as f:
        f.write('YOLO VALIDATION - COMPARISON ANALYSES\n')
        f.write('=' * 80 + '\n')
        f.write(f'Model: {MODEL_NAME}\n')
        f.write(f'Dataset: {USED_DATASET} - {USED_DATA_SPLIT} split\n')
        f.write(f'Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')
        f.write('=' * 80 + '\n\n')

        for comp in comparison_analyses:
            f.write(f'Comparison #{comp["comparison_id"]}: {comp["image_name"]}\n')
            f.write('-' * 80 + '\n')
            f.write(comp['analysis'] + '\n')
            f.write('\n' + '=' * 80 + '\n\n')

    print('=' * 80)
    print(f'‚úì Generated {len(comparison_image_paths)} individual comparison figures')
    print(f'  Saved to: {comparisons_dir}')
    print(f'‚úì Comparison analyses saved to: {analyses_file.name}')

## 10. Attribute-Based Performance Analysis

Analyze model performance across different environmental conditions and scenes using the metadata.

In [None]:
# Break the evaluation down by weather, scene, time of day and class, print the
# per-group metrics, and save a 2x2 grid of F1 bar charts.
if performance_data and image_attributes:
    print("=" * 80)
    print("ATTRIBUTE-BASED PERFORMANCE ANALYSIS")
    print("=" * 80)

    results_by_weather: Dict[str, list] = {}
    results_by_scene: Dict[str, list] = {}
    results_by_timeofday: Dict[str, list] = {}
    results_by_class: Dict[str, list] = {}

    # Bucket every result by the attributes of its image (looked up by file stem).
    for result in results_data:
        basename = Path(result['image_path']).stem
        attrs = image_attributes.get(basename, {})
        weather = attrs.get('weather', 'unknown')
        scene = attrs.get('scene', 'unknown')
        timeofday = attrs.get('timeofday', 'unknown')
        classes_present = attrs.get('classes_present', [])

        results_by_weather.setdefault(weather, []).append(result)
        results_by_scene.setdefault(scene, []).append(result)
        results_by_timeofday.setdefault(timeofday, []).append(result)
        for cls in classes_present:
            results_by_class.setdefault(cls, []).append(result)

    def calculate_group_metrics(group_results):
        """Calculate metrics for a group of results using per-class confusion matrix data.

        For every image in the group and every class listed in its
        'classes_present' attribute, the class's TP/FP/FN counts (looked up by
        class ID in class_tp/class_fp/class_fn) are added, multiplied by the
        image's object count for that class (default 1).

        NOTE(review): the TP/FP/FN values depend only on the class, not on the
        image, so the resulting precision/recall are a weighting of class-level
        counts rather than counts of matches inside this group - confirm that
        this is the intended definition.

        Returns:
            Dict with 'count', 'precision', 'recall', 'f1', 'tp', 'fp', 'fn',
            or None when the group is empty.
        """
        if not group_results:
            return None
        total_tp = total_fp = total_fn = 0
        for result in group_results:
            basename = Path(result['image_path']).stem
            attrs = image_attributes.get(basename, {})
            classes_present = attrs.get('classes_present', [])
            objects_per_class = attrs.get('objects_per_class', {})
            for class_name in classes_present:
                class_id = CLASS_NAME_TO_ID.get(class_name)
                if class_id is None:
                    # Class name without a known ID: nothing to look up.
                    continue
                tp = class_tp.get(class_id, 0)
                fp = class_fp.get(class_id, 0)
                fn = class_fn.get(class_id, 0)
                obj_count = objects_per_class.get(class_name, 1)
                total_tp += tp * obj_count
                total_fp += fp * obj_count
                total_fn += fn * obj_count
        # Each ratio falls back to 0 when its denominator is 0.
        precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        return {
            'count': len(group_results),
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'tp': total_tp,
            'fp': total_fp,
            'fn': total_fn
        }

    def print_group_stats(title, metrics_dict):
        """Print statistics for a group of metrics organized by some attribute.

        Args:
            title: Section heading to print
            metrics_dict: Mapping from attribute value to its list of results

        Returns:
            Mapping from attribute value to the metrics computed for it, so
            callers can reuse them without recomputing.
        """
        print("\n" + "=" * 80)
        print(title)
        print("=" * 80)
        computed = {}
        for key, group in sorted(metrics_dict.items()):
            metrics = calculate_group_metrics(group)
            if not metrics:
                continue
            computed[key] = metrics
            print(f"\n{key.upper()}:")
            print(f"  Images: {metrics['count']}")
            print(f"  Precision: {metrics['precision']:.3f} (TP={metrics['tp']}, FP={metrics['fp']})")
            print(f"  Recall: {metrics['recall']:.3f} (TP={metrics['tp']}, FN={metrics['fn']})")
            print(f"  F1 Score: {metrics['f1']:.3f}")
        return computed

    # Keep the computed metrics so the charts below do not recompute them.
    weather_metrics = print_group_stats("PERFORMANCE BY WEATHER CONDITION", results_by_weather)
    scene_metrics = print_group_stats("PERFORMANCE BY SCENE TYPE", results_by_scene)
    timeofday_metrics = print_group_stats("PERFORMANCE BY TIME OF DAY", results_by_timeofday)

    class_metrics = {}
    for cls, group in results_by_class.items():
        metrics = calculate_group_metrics(group)
        if metrics:
            class_metrics[cls] = metrics
    # Rank classes by the number of images they appear in.
    top_classes = sorted(class_metrics.items(), key=lambda x: x[1]['count'], reverse=True)[:10]
    print("\n" + "=" * 80)
    print("PERFORMANCE BY CLASS (Top 10 by image count)")
    print("=" * 80)
    for cls, metrics in top_classes:
        print(f"\n{cls.upper()}:")
        print(f"  Images: {metrics['count']}")
        print(f"  Precision: {metrics['precision']:.3f}")
        print(f"  Recall: {metrics['recall']:.3f}")
        print(f"  F1 Score: {metrics['f1']:.3f}")

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    def plot_horizontal(ax, labels, values, title, cmap):
        """Plot horizontal bar chart for attribute-based performance.

        Draws one bar per label with its value printed next to it; when there
        are no labels the axis is replaced by a 'No data' note.
        """
        if not labels:
            ax.text(0.5, 0.5, 'No data', ha='center', va='center')
            ax.set_axis_off()
            return
        bars = ax.barh(labels, values, color=cmap(np.linspace(0.4, 0.8, len(labels))))
        for bar, value in zip(bars, values):
            ax.text(value + 0.02, bar.get_y() + bar.get_height() / 2,
                    f'{value:.3f}', va='center', fontsize=9)
        ax.set_xlim(0, 1.05)
        ax.set_xlabel('F1 Score')
        ax.set_title(title, fontweight='bold')

    # Bars follow the order in which each attribute value was first seen.
    weather_labels = list(results_by_weather.keys())
    weather_values = [weather_metrics[w]['f1'] for w in weather_labels]
    plot_horizontal(axes[0, 0], weather_labels, weather_values, 'Performance by Weather', plt.cm.Blues)

    scene_labels = list(results_by_scene.keys())
    scene_values = [scene_metrics[s]['f1'] for s in scene_labels]
    plot_horizontal(axes[0, 1], scene_labels, scene_values, 'Performance by Scene', plt.cm.Greens)

    time_labels = list(results_by_timeofday.keys())
    time_values = [timeofday_metrics[t]['f1'] for t in time_labels]
    plot_horizontal(axes[1, 0], time_labels, time_values, 'Performance by Time of Day', plt.cm.Oranges)

    class_labels = [c[0] for c in top_classes[:8]]
    class_values = [c[1]['f1'] for c in top_classes[:8]]
    plot_horizontal(axes[1, 1], class_labels, class_values, 'Performance by Class (Top 8)', plt.cm.Purples)

    plt.tight_layout()
    attribute_perf_path = RUN_DIR / 'attribute_performance.png'
    plt.savefig(attribute_perf_path, dpi=150, bbox_inches='tight')
    plt.show()

    print("\n‚úì Saved attribute performance visualization:", attribute_perf_path)
    print("=" * 80)
    print("‚úì ATTRIBUTE-BASED ANALYSIS COMPLETE")
    print("=" * 80)
else:
    print("=" * 80)
    print("Performance metadata not loaded")
    print("‚ö†Ô∏è  Attribute-based analysis not available")
    print("=" * 80)

## 11. Report

In [None]:
# Start the PDF report: document shell, shared styles, title and run-info table.
pdf_report_path = RUN_DIR / 'report.pdf'

# A4 page with the same margin value (30) on all four sides.
page_margins = {'rightMargin': 30, 'leftMargin': 30, 'topMargin': 30, 'bottomMargin': 30}
doc = SimpleDocTemplate(str(pdf_report_path), pagesize=A4, **page_margins)

# Flowables are collected in `story` in reading order.
story = []
styles = getSampleStyleSheet()

# Centered document title, derived from the stock Heading1 style.
title_style = ParagraphStyle(
    'CustomTitle',
    parent=styles['Heading1'],
    alignment=TA_CENTER,
    fontSize=24,
    spaceAfter=30,
    textColor=colors.HexColor('#2c3e50'),
)

# Section heading, derived from the stock Heading2 style.
heading_style = ParagraphStyle(
    'CustomHeading',
    parent=styles['Heading2'],
    fontSize=16,
    spaceBefore=20,
    spaceAfter=12,
    textColor=colors.HexColor('#34495e'),
)

story.append(Paragraph('YOLO Validation Report', title_style))
story.append(Spacer(1, 12))

# Run/model facts shown as a two-column "label: value" table.
info_data = [
    [label, value]
    for label, value in (
        ('Model:', MODEL_NAME),
        ('Model Size:', f'{model_size_mb:.1f} MB'),
        ('Parameters:', f'{model_params / 1e6:.1f}M'),
        ('FLOPs (640x640):', f'{flops_gflops:.2f} GFLOPs'),
        ('Run Name:', RUN_NAME),
        ('W&B Run Name:', W_B_RUN_NAME),
        ('Timestamp:', datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
        ('Dataset:', f'{USED_DATASET} - {USED_DATA_SPLIT} split'),
        ('Images Processed:', str(len(results_data))),
        ('IoU Threshold:', str(iou_threshold)),
    )
]

# Cell ranges used by the style commands below.
whole_table = ((0, 0), (-1, -1))
label_column = ((0, 0), (0, -1))

info_table = Table(info_data, colWidths=[2.2*inch, 3.8*inch])
info_table.setStyle(TableStyle([
    ('BACKGROUND', *whole_table, colors.HexColor('#ecf0f1')),
    ('TEXTCOLOR', *whole_table, colors.black),
    ('ALIGN', *whole_table, 'LEFT'),
    ('FONTNAME', *label_column, 'Helvetica-Bold'),
    ('FONTSIZE', *whole_table, 10),
    ('BOTTOMPADDING', *whole_table, 8),
    ('TOPPADDING', *whole_table, 8),
    ('GRID', *whole_table, 1, colors.white),
]))
story.append(info_table)
story.append(Spacer(1, 20))

# Two "Metric / Value" tables: inference speed, then overall accuracy.
def _metric_value_table(rows, header_fill, body_fill):
    """Build a two-column (3in + 3in) table with a bold colored header row."""
    table = Table(rows, colWidths=[3*inch, 3*inch])
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), header_fill),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTSIZE', (0, 0), (-1, 0), 12),
        ('FONTSIZE', (0, 1), (-1, -1), 10),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
        ('TOPPADDING', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 1), (-1, -1), body_fill),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))
    return table


# Inference speed (green header).
story.append(Paragraph('Inference Performance', heading_style))
perf_data = [['Metric', 'Value']]
perf_data.append(['Total Execution Time', f'{total_time:.2f}s'])
perf_data.append(['Average Inference Time', f'{avg_inference_time*1000:.2f}ms per image'])
perf_data.append(['FPS (Frames Per Second)', f'{fps:.2f}'])

perf_table = _metric_value_table(perf_data, colors.HexColor('#27ae60'), colors.HexColor('#d5f4e6'))
story.append(perf_table)
story.append(Spacer(1, 20))

# Overall accuracy (blue header).
story.append(Paragraph('Overall Accuracy Metrics', heading_style))
acc_data = [['Metric', 'Value']]
acc_data.extend([
    ['Precision', f'{overall_precision:.4f}'],
    ['Recall', f'{overall_recall:.4f}'],
    ['F1-Score', f'{overall_f1:.4f}'],
    ['mAP@0.5', f'{yolo_metrics["map50"]:.4f}'],
    ['mAP@0.5:0.95', f'{yolo_metrics["map50_95"]:.4f}'],
    ['True Positives', str(total_tp)],
    ['False Positives', str(total_fp)],
    ['False Negatives', str(total_fn)],
])

acc_table = _metric_value_table(acc_data, colors.HexColor('#3498db'), colors.beige)
story.append(acc_table)
story.append(Spacer(1, 20))

# Embed the two metric charts saved earlier in RUN_DIR on a fresh page.
story.append(PageBreak())
story.append(Paragraph('Performance Visualizations', heading_style))
story.append(Spacer(1, 12))

core_metrics_path = RUN_DIR / 'core_metrics_charts.png'
map_metrics_path = RUN_DIR / 'map_metrics_charts.png'

# (sub-heading, image file, name used in the warning, "missing" note, gap after)
chart_sections = (
    ('Core Metrics (Precision, Recall, F1-Score)', core_metrics_path,
     'core metrics chart', 'Core metrics chart not found.', 12),
    ('mAP Metrics', map_metrics_path,
     'mAP metrics chart', 'mAP metrics chart not found.', 20),
)

for chart_heading, chart_path, chart_label, missing_note, gap_after in chart_sections:
    story.append(Paragraph(chart_heading, ParagraphStyle('SubHeading', parent=styles['Normal'], fontSize=12, textColor=colors.HexColor('#34495e'), spaceAfter=8, spaceBefore=8)))

    if not chart_path.exists():
        story.append(Paragraph(missing_note, styles['Normal']))
    else:
        try:
            # Read the pixel size so the 7in-wide image keeps its aspect ratio.
            with PILImage.open(chart_path) as chart_img:
                px_w, px_h = chart_img.size
            target_w = 7 * inch
            target_h = target_w * (px_h / px_w)
            story.append(Image(str(chart_path), width=target_w, height=target_h))
        except Exception as e:
            # Best effort: fall back to a fixed 7in x 5in placement.
            print(f'Warning: Could not load {chart_label} with PIL: {e}')
            story.append(Image(str(chart_path), width=7*inch, height=5*inch))

    story.append(Spacer(1, gap_after))

# Per-class table: custom TP/FP/FN-based metrics plus YOLO's mAP@0.5.
story.append(PageBreak())
story.append(Paragraph('Per-Class Performance', heading_style))


def _per_class_row(metrics_row):
    """Format one df_metrics row as a list of table cell strings."""
    class_label = metrics_row['Class']
    # Classes missing from the YOLO per-class metrics are shown with 0.0.
    ap50 = yolo_class_metrics.get(class_label, {}).get('ap50', 0.0)
    counts = [str(metrics_row[column]) for column in ('TP', 'FP', 'FN')]
    scores = [f"{metrics_row[column]:.4f}" for column in ('Precision', 'Recall', 'F1-Score')]
    return [class_label, *counts, *scores, f"{ap50:.4f}"]


table_data = [['Class', 'TP', 'FP', 'FN', 'Precision', 'Recall', 'F1-Score', 'mAP@0.5']]
table_data.extend(_per_class_row(row) for _, row in df_metrics.iterrows())

# One wide name column, three narrow count columns, four score columns.
column_widths = [1.0*inch] + [0.5*inch] * 3 + [0.8*inch] * 4
per_class_table = Table(table_data, colWidths=column_widths)
per_class_table.setStyle(TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#3498db')),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('FONTSIZE', (0, 0), (-1, 0), 8),
    ('FONTSIZE', (0, 1), (-1, -1), 7),
    ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
    ('TOPPADDING', (0, 0), (-1, -1), 6),
    # Alternate white / light grey body rows.
    ('ROWBACKGROUNDS', (0, 1), (-1, -1), [colors.white, colors.lightgrey]),
    ('GRID', (0, 0), (-1, -1), 1, colors.black),
]))
story.append(per_class_table)
story.append(Spacer(1, 20))

# Confusion matrix page: two summary lines followed by the saved heatmap image.
story.append(PageBreak())
story.append(Paragraph('Confusion Matrix', heading_style))
for summary_line in (
    f'Correct Predictions (Diagonal Sum): {np.trace(confusion_matrix)}',
    f'Total Matched Predictions: {confusion_matrix.sum()}',
):
    story.append(Paragraph(summary_line, styles['Normal']))
story.append(Spacer(1, 12))

confusion_matrix_img_path = RUN_DIR / 'confusion_matrix.png'
if not confusion_matrix_img_path.exists():
    story.append(Paragraph('Confusion matrix image not found.', styles['Normal']))
else:
    try:
        with PILImage.open(confusion_matrix_img_path) as matrix_img:
            px_w, px_h = matrix_img.size
        height_per_width = px_h / px_w

        # Start 6.5in wide; if that makes the image taller than 7in, pin the
        # height to 7in and shrink the width to keep the aspect ratio.
        draw_w = 6.5 * inch
        draw_h = draw_w * height_per_width
        if draw_h > 7 * inch:
            draw_h = 7 * inch
            draw_w = draw_h / height_per_width
        story.append(Image(str(confusion_matrix_img_path), width=draw_w, height=draw_h))
    except Exception as e:
        # Best effort: fall back to a fixed square placement.
        print(f'Warning: Could not load confusion matrix with PIL: {e}')
        story.append(Image(str(confusion_matrix_img_path), width=6.5*inch, height=6.5*inch))

story.append(Spacer(1, 20))

# Sample predictions section: embed every saved comparison figure, several per page.
comparisons_dir = RUN_DIR / 'sample_comparisons'
comparisons_dir_found = comparisons_dir.exists()
comparison_files = sorted(comparisons_dir.glob('comparison_*.png')) if comparisons_dir_found else []

# The section always opens on a new page with the same heading.
story.append(PageBreak())
story.append(Paragraph('Sample Predictions: Ground Truth vs Predictions', heading_style))

if not comparisons_dir_found:
    story.append(Paragraph('Comparison directory not found.', styles['Normal']))
elif not comparison_files:
    story.append(Paragraph('No comparison images found.', styles['Normal']))
else:
    story.append(Paragraph(f'{len(comparison_files)} detailed side-by-side comparisons', styles['Normal']))
    story.append(Spacer(1, 12))

    for position, comparison_path in enumerate(comparison_files):
        # Force a new page before every 3rd image (but not before the first).
        if position and position % 3 == 0:
            story.append(PageBreak())

        try:
            with PILImage.open(comparison_path) as comparison_img:
                px_w, px_h = comparison_img.size
            height_per_width = px_h / px_w

            # Start 7in wide; cap the height at 3.5in and narrow the image
            # accordingly so the aspect ratio is preserved.
            draw_w = 7 * inch
            draw_h = draw_w * height_per_width
            if draw_h > 3.5 * inch:
                draw_h = 3.5 * inch
                draw_w = draw_h / height_per_width

            story.append(Image(str(comparison_path), width=draw_w, height=draw_h))
            story.append(Spacer(1, 8))
        except Exception as e:
            # Best effort: fall back to a fixed 7in x 3in placement.
            print(f'Warning: Could not load {comparison_path.name} with PIL: {e}')
            story.append(Image(str(comparison_path), width=7*inch, height=3*inch))
            story.append(Spacer(1, 8))

# Closing note pointing at the extra YOLO plots.
story.extend([
    Spacer(1, 12),
    Paragraph('Additional validation plots available in: yolo_validation folder', styles['Normal']),
])

# Footer: two centered grey lines, each with its own named style.
story.append(Spacer(1, 30))
footer_lines = (
    ('Footer', 'Generated by YOLO Quick Test Notebook'),
    ('Footer2', 'BDD100K Dataset - Computer Vision Project'),
)
for footer_style_name, footer_text in footer_lines:
    footer_style = ParagraphStyle(footer_style_name, parent=styles['Normal'],
                                  alignment=TA_CENTER, textColor=colors.grey)
    story.append(Paragraph(footer_text, footer_style))

# Render all collected flowables into the PDF file.
doc.build(story)

# Export every metric to metrics_data.json so runs can be compared later.
json_report_path = RUN_DIR / 'metrics_data.json'

# Per-class custom metrics, keyed by class name and cast to plain Python
# numbers so they serialize cleanly.
per_class_custom = {
    row['Class']: {
        'true_positives': int(row['TP']),
        'false_positives': int(row['FP']),
        'false_negatives': int(row['FN']),
        'precision': float(row['Precision']),
        'recall': float(row['Recall']),
        'f1_score': float(row['F1-Score']),
    }
    for _, row in df_metrics.iterrows()
}

# Sections are assembled one by one; their order here is their order in the file.
comparison_data = {}

comparison_data['metadata'] = {
    'model_name': MODEL_NAME,
    'run_name': RUN_NAME,
    'wb_run_name': W_B_RUN_NAME,
    'timestamp': datetime.now().isoformat(),
    'dataset': USED_DATASET,
    'data_split': USED_DATA_SPLIT,
    'images_processed': len(results_data),
    'iou_threshold': iou_threshold,
    'num_classes': NUM_CLASSES,
}

comparison_data['model_info'] = {
    'parameters': int(model_params),
    'model_size_mb': float(model_size_mb),
    'flops_gflops': float(flops_gflops),
}

comparison_data['performance'] = {
    'total_time_seconds': float(total_time),
    'avg_inference_time_ms': float(avg_inference_time * 1000),
    'fps': float(fps),
    'images_processed': int(len(results_data)),
}

comparison_data['custom_metrics'] = {
    'overall': {
        'precision': float(overall_precision),
        'recall': float(overall_recall),
        'f1_score': float(overall_f1),
        'true_positives': int(total_tp),
        'false_positives': int(total_fp),
        'false_negatives': int(total_fn),
    },
    'per_class': per_class_custom,
}

comparison_data['yolo_official_metrics'] = {
    'overall': yolo_metrics,
    'per_class': yolo_class_metrics,
}

comparison_data['confusion_matrix'] = {
    'matrix': confusion_matrix.tolist(),
    'diagonal_sum': int(np.trace(confusion_matrix)),
    'total_predictions': int(confusion_matrix.sum()),
}

comparison_data['class_names'] = CLASS_NAMES

# Save JSON file
with open(json_report_path, 'w') as f:
    json.dump(comparison_data, f, indent=2)

# Inspect the comparison folder once: number of figures and whether the
# analyses text file was written.
comparisons_dir = RUN_DIR / 'sample_comparisons'
if comparisons_dir.exists():
    num_comparison_images = sum(1 for _ in comparisons_dir.glob('comparison_*.png'))
    analyses_file_exists = (comparisons_dir / 'comparison_analyses.txt').exists()
else:
    num_comparison_images = 0
    analyses_file_exists = False

# Build the console summary line by line, then print it in order.
banner = '=' * 80
summary_lines = [
    banner,
    '‚úì COMPREHENSIVE REPORT GENERATED',
    banner,
    f'\nAll outputs saved to: {RUN_DIR}',
]

# --- Generated files --------------------------------------------------------
summary_lines += [
    '\nGenerated files:',
    f'  üìÑ PDF Report: report.pdf ({pdf_report_path.stat().st_size / 1024:.2f} KB)',
    f'  üìä Metrics JSON: metrics_data.json ({json_report_path.stat().st_size / 1024:.2f} KB)',
    '  üìà YOLO Validation: yolo_validation_metrics.json',
    '  üñºÔ∏è  Confusion Matrix: confusion_matrix.png',
    '  üìä Core Metrics Chart: core_metrics_charts.png',
    '  üìä mAP Metrics Chart: map_metrics_charts.png',
    f'  üñºÔ∏è  Sample Comparisons: sample_comparisons/ ({num_comparison_images} images)',
]
if analyses_file_exists:
    summary_lines.append('  üìù Comparison Analyses: sample_comparisons/comparison_analyses.txt')
summary_lines.append('  üìÅ YOLO Outputs: yolo_validation/ (plots, curves, etc.)')

# --- Report contents --------------------------------------------------------
summary_lines += [
    '\nReport contents:',
    f'  - Model Information (Size: {model_size_mb:.1f}MB, Params: {model_params/1e6:.1f}M, FLOPs: {flops_gflops:.2f}G)',
    f'  - Performance Metrics (FPS: {fps:.2f}, Avg: {avg_inference_time*1000:.2f}ms)',
    '  - Overall accuracy metrics (Precision, Recall, F1, mAP)',
    '  - Per-class performance with IoU metrics',
    '  - Core metrics visualization (Precision, Recall, F1, Detection Outcomes)',
    '  - mAP metrics visualization (mAP@0.5, Overall Metrics)',
    '  - Confusion matrix visualization',
    f'  - {num_comparison_images} individual side-by-side prediction comparisons (full-width, high-resolution)',
]
if analyses_file_exists:
    summary_lines.append('  - Detailed analyses for each comparison (saved to text file)')
summary_lines += [
    '  - Custom & YOLO official metrics',
    '\nüíæ Metrics saved to JSON for future comparison',
    f'üìä Weights & Biases tracking: {W_B_RUN_NAME}',
    banner,
]

for summary_line in summary_lines:
    print(summary_line)