In [None]:
%load_ext autoreload

%autoreload 2

from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last one.
# This is why plotting calls further down assign their return value to `_`.
InteractiveShell.ast_node_interactivity = "all"

# YOLO Model Variant Comparison

Compare YOLO26 (n/s/m/l/x) and two legacy YOLOv8 variants (s, x) on the same broadcast footage.
We evaluate detection quality (count, confidence distribution), inference speed,
and visual output to pick the best speed-accuracy trade-off for player tracking.

Performance reference: [Ultralytics YOLO26 docs](https://docs.ultralytics.com/models/yolo26/)

## Imports

In [None]:
import time

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pyrootutils
from matplotlib.patches import Rectangle

from football_tracking_demo.config import load_config
from football_tracking_demo.detector import PlayerDetector

## Parameters

In [None]:
# Locate the project root (marked by pyproject.toml) and keep the handle so that
# data and config paths can be built relative to it.
# NOTE(review): the boolean flags are forwarded to pyrootutils as-is; confirm their
# exact side effects (env var, .env loading, sys.path, cwd) in the pyrootutils docs.
root = pyrootutils.setup_root(
    search_from=".",
    indicator="pyproject.toml",
    project_root_env_var=True,
    dotenv=True,
    pythonpath=True,
    cwd=True,
)

# Input video and pipeline configuration, stored as plain strings.
VIDEO_PATH = str(root.joinpath("data", "match.mp4"))
CONFIG_PATH = str(root.joinpath("config", "config.yaml"))

# Model variants to compare, one row per variant:
#   (name, params, FLOPs in billions, COCO mAP, CPU ms/img, GPU ms/img)
#
# YOLO26 performance (COCO val, 640px): https://docs.ultralytics.com/models/yolo26/
# YOLOv8 is kept as a legacy baseline for comparison.
# mAP and CPU/GPU figures are sourced from the Ultralytics official docs.
# NOTE(review): the YOLOv8 gpu_ms values look like they were measured on different
# GPU hardware than the YOLO26 ones - confirm before comparing gpu_ms across families.
_VARIANT_TABLE = [
    # --- YOLO26 (recommended) ---
    ("yolo26n", "2.4M", 5.4, 40.9, 38.9, 1.7),
    ("yolo26s", "9.5M", 20.7, 48.6, 87.2, 2.5),
    ("yolo26m", "20.4M", 68.2, 53.1, 220.0, 4.7),
    ("yolo26l", "24.8M", 86.4, 55.0, 286.2, 6.2),
    ("yolo26x", "55.7M", 193.9, 57.5, 525.8, 11.8),
    # --- YOLOv8 legacy ---
    ("yolov8s", "11.2M", 28.6, 44.9, 128.4, 1.2),
    ("yolov8x", "68.2M", 257.8, 53.9, 479.1, 3.5),
]

# Expand each row into the dict layout the rest of the notebook reads.
# The weights file is always "<name>.pt".
MODEL_VARIANTS = [
    {
        "name": name,
        "file": f"{name}.pt",
        "params": params,
        "flops_b": flops_b,
        "map": coco_map,
        "cpu_ms": cpu_ms,
        "gpu_ms": gpu_ms,
    }
    for name, params, flops_b, coco_map, cpu_ms, gpu_ms in _VARIANT_TABLE
]

# Frames shown in the side-by-side visual comparison: every 1500th frame, 0..6000.
SAMPLE_FRAME_INDICES = list(range(0, 6001, 1500))

# Number of leading video frames used for the speed and count sweeps.
SWEEP_N_FRAMES = 100

# One confidence threshold shared by every model, so that differences come from
# the model itself rather than from per-model tuning.
CONF_THRESHOLD = 0.10

## Load Config & Sample Frames

Extract sample frames from the video for visual comparison.

In [None]:
config = load_config(CONFIG_PATH)

# Set lookup keeps the per-frame membership test cheap while scanning the video.
wanted_indices = set(SAMPLE_FRAME_INDICES)
max_idx = max(wanted_indices)

cap = cv2.VideoCapture(VIDEO_PATH)
if not cap.isOpened():
    cap.release()
    # Fail here with a clear message instead of in a later cell with a KeyError.
    raise OSError(f"Could not open video: {VIDEO_PATH}")

frames = {}
try:
    # Frames are read sequentially from the start; only the wanted ones are kept.
    for i in range(max_idx + 1):
        ret, frame = cap.read()
        if not ret:
            break
        if i in wanted_indices:
            frames[i] = frame
finally:
    # Release the capture even if reading raises.
    cap.release()

print(f"Loaded {len(frames)} sample frames: {sorted(frames.keys())}")

# Later cells index `frames` by sample index, so surface any gap immediately.
missing_indices = sorted(wanted_indices - frames.keys())
if missing_indices:
    print(f"WARNING: video ended before sample frames {missing_indices} could be read")

## Build Detectors for Each Model Variant

Create one `PlayerDetector` per YOLO variant. All other settings (confidence, NMS, HUD mask, playing field filter) are held constant so only the model variant (architecture family and size) varies. Weights are loaded from / cached in `checkpoints/`.

In [None]:
# Map variant name -> detector; only the weights file differs between entries.
detectors = {}
for variant in MODEL_VARIANTS:
    print(f"Loading {variant['name']} ({variant['params']} params)...")

    # Everything except the weights file comes from the shared config or the
    # notebook-level confidence threshold.
    detection_cfg = config["detection"]
    hud_cfg = config["hud_mask"]
    detector_kwargs = dict(
        model_name=variant["file"],
        model_dir=detection_cfg.get("model_dir", "checkpoints"),
        conf_threshold=CONF_THRESHOLD,
        iou_threshold=detection_cfg["nms_iou_threshold"],
        device=detection_cfg["device"],
        hud_top=hud_cfg["top_percent"],
        hud_bottom=hud_cfg["bottom_percent"],
        hud_enabled=hud_cfg["enabled"],
        shape_filter_config=config.get("detection_shape_filter"),
        field_mask_config=config.get("playing_field_mask"),
    )
    detectors[variant["name"]] = PlayerDetector(**detector_kwargs)

print(f"\nLoaded {len(detectors)} model variants")

## Visual Comparison per Frame

For each sample frame show all model variants side-by-side with bounding boxes. Larger models should produce more confident and spatially accurate detections, especially for distant / partially occluded players.

In [None]:
def draw_dets_on_ax(ax, frame_bgr, detections, title=""):
    """Show a frame on ``ax`` and overlay one labelled box per detection.

    Args:
        ax: Matplotlib axis to draw on; its axis decorations are switched off.
        frame_bgr: Image in OpenCV BGR channel order (converted to RGB for display).
        detections: Iterable of sequences whose first five entries are
            ``x1, y1, x2, y2, conf``. Trailing entries are ignored, matching
            how the rest of the notebook indexes detections
            (``det[:4]``, ``d[4]``).
        title: Title placed above the axis.
    """
    _ = ax.imshow(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    for det in detections:
        # Slice instead of strict unpacking so detections carrying extra
        # fields do not raise.
        x1, y1, x2, y2, conf = det[:5]
        rect = Rectangle(
            (x1, y1),
            x2 - x1,
            y2 - y1,
            linewidth=1.5,
            edgecolor="lime",
            facecolor="none",
        )
        _ = ax.add_patch(rect)
        # Confidence label sits just above the top-left corner of the box.
        _ = ax.text(
            x1,
            y1 - 3,
            f"{conf:.2f}",
            color="lime",
            fontsize=6,
            bbox=dict(boxstyle="round,pad=0.15", facecolor="black", alpha=0.6),
        )
    _ = ax.set_title(title, fontsize=10)
    _ = ax.set_axis_off()


variant_names = [v["name"] for v in MODEL_VARIANTS]

n = len(variant_names)
for idx, frame in sorted(frames.items()):
    # squeeze=False always returns a 2-D axes array, so a single variant needs
    # no special case.
    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5), squeeze=False)

    for ax, name in zip(axes[0], variant_names):
        dets = detectors[name].detect_and_filter(frame)
        draw_dets_on_ax(ax, frame, dets, title=f"{name}  ({len(dets)} dets)")

    _ = fig.suptitle(f"Frame {idx}  —  conf={CONF_THRESHOLD}", fontsize=13, y=1.01)
    plt.tight_layout()
    plt.show()
    # One wide figure per sample frame: close it so figures do not pile up in memory.
    plt.close(fig)

## Inference Speed Benchmark

Time each model over N frames and report mean / std inference time per frame. This includes HUD masking, YOLO forward pass, NMS, and playing field filtering — the full `detect_and_filter` call.

In [None]:
speed_results = {name: [] for name in variant_names}

# Read the first SWEEP_N_FRAMES frames once; every model is timed on the same frames.
cap = cv2.VideoCapture(VIDEO_PATH)
bench_frames = []
try:
    for _ in range(SWEEP_N_FRAMES):
        ret, frame = cap.read()
        if not ret:
            break
        bench_frames.append(frame)
finally:
    # Release the capture even if reading raises.
    cap.release()

if not bench_frames:
    # Without this guard the warm-up below fails with a bare IndexError.
    raise RuntimeError(f"No frames could be read from {VIDEO_PATH}; nothing to benchmark")

print(f"Benchmarking {len(bench_frames)} frames per model...\n")

for name in variant_names:
    det = detectors[name]
    # Warm-up pass (first inference is slower due to model init)
    _ = det.detect_and_filter(bench_frames[0])

    times = []
    for frame in bench_frames:
        t0 = time.perf_counter()
        _ = det.detect_and_filter(frame)
        t1 = time.perf_counter()
        times.append((t1 - t0) * 1000)  # ms

    speed_results[name] = times
    arr = np.array(times)
    print(
        f"{name:>10}  mean={arr.mean():.1f} ms  std={arr.std():.1f} ms  fps={1000 / arr.mean():.1f}"
    )

In [None]:
fig, ax = plt.subplots(figsize=(10, 5))

positions = range(len(variant_names))
means = [np.mean(speed_results[n]) for n in variant_names]
stds = [np.std(speed_results[n]) for n in variant_names]

bars = ax.bar(positions, means, yerr=stds, capsize=5, color="steelblue", alpha=0.8)
_ = ax.set_xticks(positions)
_ = ax.set_xticklabels(
    [f"{n}\n({v['params']})" for n, v in zip(variant_names, MODEL_VARIANTS)]
)
_ = ax.set_ylabel("Inference Time (ms / frame)")
_ = ax.set_title("Inference Speed per Model Variant")
_ = ax.grid(True, axis="y", alpha=0.3)
# Extra headroom so the FPS labels above the tallest error bar stay inside the axes.
_ = ax.margins(y=0.12)

# Annotate FPS above each error bar. The offset is in points, so the label
# clears the cap whatever the millisecond scale of the y axis.
for bar, m, s in zip(bars, means, stds):
    _ = ax.annotate(
        f"{1000 / m:.1f} fps",
        (bar.get_x() + bar.get_width() / 2, m + s),
        textcoords="offset points",
        xytext=(0, 4),
        ha="center",
        fontsize=10,
    )

plt.tight_layout()

# Create the output directory first; savefig does not create missing parents.
# NOTE(review): assumes the working directory is the project root (setup_root is
# called with cwd=True), so this is the same location as the relative "outputs/".
output_dir = root / "outputs"
output_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(output_dir / "model_speed_comparison.png")

plt.show()

## Detection Count Sweep

Run each model variant over N frames and plot the number of filtered detections per frame. A better model should give a stable count close to the expected number of visible players (~15-25 for a wide broadcast view).

In [None]:
# Per-variant list of filtered detection counts, one entry per benchmark frame.
det_counts = {name: [] for name in variant_names}

# Frame-major order: every variant processes a frame before the next frame is taken.
for frame in bench_frames:
    for name, counts in det_counts.items():
        counts.append(len(detectors[name].detect_and_filter(frame)))

fig, ax = plt.subplots(figsize=(14, 5))
for name, counts in det_counts.items():
    _ = ax.plot(counts, label=name, alpha=0.8)

_ = ax.set(
    xlabel="Frame",
    ylabel="Detection Count (after filtering)",
    title=f"Detections per Frame by Model Variant  (conf={CONF_THRESHOLD})",
)
_ = ax.legend()
_ = ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Confidence Distribution per Model

Histogram of detection confidences on a single frame, overlaid for all variants. Larger models should push real-player confidences higher, making it easier to separate players from noise with a threshold.

In [None]:
# TODO: change this plot to a density plot using seaborn
sample_idx = SAMPLE_FRAME_INDICES[1]
if sample_idx not in frames:
    # The video may be shorter than the requested sample index.
    raise KeyError(
        f"Sample frame {sample_idx} was not loaded (available: {sorted(frames)})"
    )
sample_frame = frames[sample_idx]

# Use raw detections (before filtering) to see full confidence range
confs_by_model = {
    name: [d[4] for d in detectors[name].detect(sample_frame)]
    for name in variant_names
}

# Share one set of bin edges across all models. Letting each hist() call pick
# its own edges gives every model a different bar width, which makes the
# overlaid counts incomparable.
pooled_confs = [c for confs in confs_by_model.values() for c in confs]
bin_edges = np.histogram_bin_edges(pooled_confs, bins=25)

fig, ax = plt.subplots(figsize=(12, 5))

for name, confs in confs_by_model.items():
    _ = ax.hist(
        confs,
        bins=bin_edges,
        alpha=0.4,
        label=f"{name} ({len(confs)} dets)",
        edgecolor="black",
    )

_ = ax.axvline(
    x=CONF_THRESHOLD,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"threshold={CONF_THRESHOLD}",
)
_ = ax.set_xlabel("Confidence")
_ = ax.set_ylabel("Count")
_ = ax.set_title(f"Detection Confidence Distribution — Frame {sample_idx}")
_ = ax.legend()
_ = ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Mean Confidence per Model

Box plot of per-frame mean confidence across the sweep. Higher mean confidence with low variance indicates a model that is consistently sure about its detections.

In [None]:
# Per-variant list of mean detection confidence, one entry per benchmark frame.
mean_confs = {name: [] for name in variant_names}

for frame in bench_frames:
    for name in variant_names:
        dets = detectors[name].detect_and_filter(frame)
        # len() works for lists and arrays alike; a bare truth test on a
        # multi-element NumPy array raises ValueError.
        if len(dets) > 0:
            mean_confs[name].append(np.mean([d[4] for d in dets]))
        else:
            # No detections: record 0.0 so every model has one value per frame.
            mean_confs[name].append(0.0)

fig, ax = plt.subplots(figsize=(10, 5))
data = [mean_confs[name] for name in variant_names]
bp = ax.boxplot(data, patch_artist=True)
# Tick labels are set separately because the `labels` keyword of boxplot is
# deprecated in recent Matplotlib; this form works on old and new releases.
_ = ax.set_xticklabels(variant_names)

colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(variant_names)))
for patch, color in zip(bp["boxes"], colors):
    patch.set_facecolor(color)

# Assign to `_` like the other plotting cells so the Text reprs are not echoed.
_ = ax.set_ylabel("Mean Detection Confidence")
_ = ax.set_title("Per-Frame Mean Confidence by Model Variant")
_ = ax.grid(True, axis="y", alpha=0.3)
plt.tight_layout()
plt.show()

## Summary Statistics

Aggregate table comparing all variants across key metrics: detection count (mean/std), mean confidence, and inference speed.

In [None]:
# Column titles with their right-aligned widths; rows below use the same widths.
summary_columns = [
    ("Model", 10),
    ("Params", 8),
    ("FLOPs(B)", 9),
    ("mAP", 6),
    ("Dets/fr", 8),
    ("Std", 6),
    ("Mean Conf", 10),
    ("ms/fr", 8),
    ("FPS", 8),
]
print(" ".join(f"{title:>{width}}" for title, width in summary_columns))
print("-" * 88)

for variant in MODEL_VARIANTS:
    name = variant["name"]
    counts = np.array(det_counts[name])
    confidences = np.array(mean_confs[name])
    mean_ms = np.array(speed_results[name]).mean()
    cells = [
        f"{name:>10}",
        f"{variant['params']:>8}",
        f"{variant['flops_b']:>9.1f}",
        f"{variant['map']:>6.1f}",
        f"{counts.mean():>8.1f}",
        f"{counts.std():>6.1f}",
        f"{confidences.mean():>10.3f}",
        f"{mean_ms:>8.1f}",
        # FPS is the reciprocal of the mean per-frame time in milliseconds.
        f"{1000 / mean_ms:>8.1f}",
    ]
    print(" ".join(cells))

## Speed vs Accuracy Trade-off

Scatter plot of mean inference time (x-axis) vs mean detection count (y-axis). The ideal model sits in the top-left: high detection count (good recall) and low inference time (fast). Each point is annotated with its parameter count and COCO mAP, and bubble size encodes FLOPs, to help identify the best trade-off.

In [None]:
# TODO: use seaborn for this plot
fig, ax = plt.subplots(figsize=(11, 6))

# Color YOLO26 differently from YOLOv8
family_styles = {
    True: ("YOLO26", "steelblue"),
    False: ("YOLOv8 (legacy)", "tomato"),
}
# Families that already have a legend entry. Only the first point of each
# family is labelled, so the legend does not depend on specific variant names
# being present in MODEL_VARIANTS.
labelled_families = set()

for variant in MODEL_VARIANTS:
    name = variant["name"]
    x = np.mean(speed_results[name])
    y = np.mean(det_counts[name])
    family, color = family_styles[name.startswith("yolo26")]
    label = None if family in labelled_families else family
    labelled_families.add(family)

    _ = ax.scatter(
        x,
        y,
        s=variant["flops_b"] * 1.5,  # bubble area scales with FLOPs
        alpha=0.75,
        zorder=5,
        color=color,
        label=label,
    )
    _ = ax.annotate(
        f"{name}\n({variant['params']}, mAP {variant['map']})",
        (x, y),
        textcoords="offset points",
        xytext=(10, 5),
        fontsize=8,
    )

_ = ax.set_xlabel("Mean Inference Time (ms / frame)")
_ = ax.set_ylabel("Mean Detections / Frame")
_ = ax.set_title("Speed vs Detection Count  (bubble size = FLOPs)")
_ = ax.legend()
_ = ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## Unique Detections: What Do Larger Models Find That Smaller Ones Miss?

Compare the smallest (nano) and largest (xlarge) model on a sample frame. Green boxes = both models agree. Blue boxes = only the larger model detects. Red boxes = only the smaller model detects. This highlights the recall gain from using a bigger backbone.

In [None]:
def iou(box_a, box_b):
    """Intersection-over-union of two boxes given as [x1, y1, x2, y2, ...].

    Only the first four entries of each box are read. Returns 0.0 when the
    union area is not positive.
    """

    def _area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    # Corners of the overlap rectangle (may be inverted when the boxes are disjoint).
    left = max(box_a[0], box_b[0])
    top = max(box_a[1], box_b[1])
    right = min(box_a[2], box_b[2])
    bottom = min(box_a[3], box_b[3])

    # Clamp each side at zero so disjoint boxes contribute no overlap.
    overlap = max(0, right - left) * max(0, bottom - top)
    total = _area(box_a) + _area(box_b) - overlap

    if total > 0:
        return overlap / total
    return 0.0


def match_detections(dets_a, dets_b, iou_thresh=0.5):
    """Pair up detections from two models by box overlap.

    Each detection in ``dets_a`` (taken in order) is paired with the
    still-unpaired detection in ``dets_b`` that overlaps it most, provided
    the IoU is at least ``iou_thresh``. Every detection is used at most once.

    Args:
        dets_a: Sequence of detections ``[x1, y1, x2, y2, ...]``.
        dets_b: Sequence of detections in the same layout.
        iou_thresh: Minimum IoU for two boxes to count as the same object.

    Returns:
        ``(common_a, only_a, only_b)``: the entries of ``dets_a`` that found
        a partner, the entries of ``dets_a`` that did not, and the entries of
        ``dets_b`` that were never paired. Each list keeps input order.
    """
    matched_a, matched_b = set(), set()
    for i, da in enumerate(dets_a):
        # Take the best remaining partner rather than the first one above the
        # threshold, so an earlier box cannot steal a worse-fitting match.
        best_j, best_overlap = None, -1.0
        for j, db in enumerate(dets_b):
            if j in matched_b:
                continue
            overlap = iou(da, db)
            if overlap >= iou_thresh and overlap > best_overlap:
                best_j, best_overlap = j, overlap
        if best_j is not None:
            matched_a.add(i)
            matched_b.add(best_j)

    # Index-ordered scans keep the output order independent of set iteration order.
    common_a = [dets_a[i] for i in range(len(dets_a)) if i in matched_a]
    only_a = [dets_a[i] for i in range(len(dets_a)) if i not in matched_a]
    only_b = [dets_b[j] for j in range(len(dets_b)) if j not in matched_b]
    return common_a, only_a, only_b


# Pick the extremes by FLOPs instead of relying on list order. With the current
# MODEL_VARIANTS this selects yolo26n and yolov8x, the same pair as first/last.
# NOTE(review): the largest model is a legacy YOLOv8 variant, so this comparison
# mixes model family and size; switch to "yolo26x" for a same-family comparison.
small_name = min(MODEL_VARIANTS, key=lambda v: v["flops_b"])["name"]
large_name = max(MODEL_VARIANTS, key=lambda v: v["flops_b"])["name"]

for idx, frame in sorted(frames.items()):
    dets_small = detectors[small_name].detect_and_filter(frame)
    dets_large = detectors[large_name].detect_and_filter(frame)
    common, only_small, only_large = match_detections(dets_small, dets_large)

    fig, ax = plt.subplots(figsize=(14, 8))
    _ = ax.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # (detections, edge color, line style), drawn in this order:
    # agreed boxes solid, model-specific boxes dashed.
    box_groups = (
        (common, "lime", "-"),
        (only_small, "red", "--"),
        (only_large, "dodgerblue", "--"),
    )
    for group, edge_color, line_style in box_groups:
        for det in group:
            x1, y1, x2, y2 = det[:4]
            _ = ax.add_patch(
                Rectangle(
                    (x1, y1),
                    x2 - x1,
                    y2 - y1,
                    lw=2,
                    ec=edge_color,
                    fc="none",
                    ls=line_style,
                )
            )

    _ = ax.set_title(
        f"Frame {idx} — green=both ({len(common)})  "
        f"red={small_name} only ({len(only_small)})  "
        f"blue={large_name} only ({len(only_large)})",
        fontsize=12,
    )
    _ = ax.set_axis_off()
    plt.tight_layout()
    plt.show()
    # One large figure per sample frame: close it so figures do not pile up in memory.
    plt.close(fig)