
# RBK Football2025 — YOLO Setup (Merged Multi-Match)

This notebook is tailored for the dataset structure you described:

```
Football2025/
  RBK-AALESUND/
    aalesund_cvat_overlay.mp4
    aalesund.mp4
    annotations.xml
    data/
      images/train/*.png
    labels/
      train/*.txt
    data.yaml
    train.txt
  RBK-BODO/
  RBK-FREDRIKSTAD/
  RBK-HamKam/
  RBK-VIKING/
```

It will:
1. Auto-discover the `RBK-*` folders under `Football2025`.
2. Collect images from `data/images/train` and matched labels from `labels/train`.
3. Create a **merged YOLO dataset** with `images/train`, `images/val`, `labels/train`, `labels/val` using **symlinks** (no copying).
4. Write a `data.yaml` pointing to these merged splits.
5. Provide training/eval cells for Ultralytics YOLO.


## 0) Environment check

In [1]:

# %pip install --upgrade ultralytics opencv-python pandas pyyaml matplotlib

import os, sys, json
from pathlib import Path

# Optional dependency probe: report installed versions without aborting the
# notebook when a package is missing or fails to load.
try:
    import torch
    print("Torch:", torch.__version__)
    print("CUDA available:", torch.cuda.is_available())
except Exception as torch_err:
    # Best-effort on purpose: the import error is reported, not raised.
    print("PyTorch not available yet:", torch_err)

try:
    import ultralytics
    from ultralytics import YOLO
    print("Ultralytics:", ultralytics.__version__)
except Exception as yolo_err:
    # Same policy as above; training cells need this import to succeed later.
    print("Ultralytics not available yet:", yolo_err)


Torch: 2.9.1
CUDA available: False
Ultralytics not available yet: dlopen(/Users/olejacobmellgren/football_env/lib/python3.12/site-packages/cv2/cv2.abi3.so, 0x0002): Library not loaded: @loader_path/libaom.3.12.1.dylib
  Referenced from: <3AE50086-BBA4-3A54-88C5-5C602E06B65F> /Users/olejacobmellgren/football_env/lib/python3.12/site-packages/cv2/.dylibs/libavif.16.3.0.dylib
  Reason: tried: '/Users/olejacobmellgren/football_env/lib/python3.12/site-packages/cv2/.dylibs/libaom.3.12.1.dylib' (no such file), '/usr/lib/libaom.3.12.1.dylib' (no such file, not in dyld cache)Library not loaded: @loader_path/libaom.3.12.1.dylib
  Referenced from: <B2C96617-93FD-3626-B71B-B0C15D095AE8> /Users/olejacobmellgren/football_env/lib/python3.12/site-packages/cv2/.dylibs/libavformat.61.7.100.dylib
  Reason: tried: '/Users/olejacobmellgren/football_env/lib/python3.12/site-packages/cv2/.dylibs/libaom.3.12.1.dylib' (no such file), '/usr/lib/libaom.3.12.1.dylib' (no such file, not in dyld cache)Library not loa

## 1) Paths & discovery

In [2]:
from pathlib import Path
import random, numpy as np

# Local working directory that holds the merged dataset and its data.yaml.
WORK_DIR = Path.home().joinpath("Documents", "football_analysis", "football_yolo_workdir").resolve()
MERGED_ROOT = WORK_DIR.joinpath("merged_yolo")
DATA_YAML_PATH = WORK_DIR.joinpath("football_merged_data.yaml")

print("WORK_DIR:", WORK_DIR)
print("MERGED_ROOT:", MERGED_ROOT)
print("DATA_YAML_PATH:", DATA_YAML_PATH)

# Class names in class-id order.
NAMES = ['player', 'referee', 'ball']

# Seed every RNG we may touch so shuffles and splits are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Torch is optional at this point; seed it only when it can be imported.
try:
    import torch
    torch.manual_seed(SEED)
except Exception:
    pass


WORK_DIR: /Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir
MERGED_ROOT: /Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir/merged_yolo
DATA_YAML_PATH: /Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir/football_merged_data.yaml


## 2) Merge per-match folders into a single YOLO dataset

In [3]:
# ONLY RUN THIS CELL ONCE TO BUILD MERGED DATASET

import os, re, shutil, random
from pathlib import Path
from typing import List, Tuple

def has_yolo_unit_dir(d: Path) -> bool:
    """Tell whether ``d`` is laid out like one YOLO dataset unit.

    A unit must contain both ``data/images/train`` and ``labels/train``.
    """
    required = (d / 'data' / 'images' / 'train', d / 'labels' / 'train')
    return all(entry.exists() for entry in required)

def find_match_roots(root: Path) -> List[Path]:
    """List the direct sub-directories of ``root`` named ``RBK-*``, sorted."""
    match_dirs: List[Path] = []
    for entry in sorted(root.iterdir()):
        if not entry.is_dir():
            continue
        if entry.name.startswith('RBK-'):
            match_dirs.append(entry)
    return match_dirs

def find_dataset_units(root: Path) -> List[Tuple[Path, str]]:
    """
    Find all usable dataset 'units' below the RBK-* match folders of ``root``.

    A unit is a directory that itself contains data/images/train and
    labels/train (see ``has_yolo_unit_dir``). Either the RBK-* folder is a
    unit, or units are nested somewhere below it
    (e.g. RBK-BODO/part1/RBK_BODO_PART1).

    Returns a list of (unit_dir, unit_tag). The tag is used as the path prefix
    in the merged dataset and as the lookup key back to the unit, so it must
    be unique:
      * the RBK-* folder itself is tagged with its own name;
      * a nested unit is tagged '<RBK-folder>/<unit dir name>';
      * if several nested units under one RBK-* folder share a directory name,
        those are tagged with their full path relative to the RBK-* folder
        instead, so they cannot collide.

    Units are discovered in sorted order so the result (and any seeded shuffle
    applied to it) does not depend on filesystem listing order. Each physical
    directory is returned at most once.
    """
    units: List[Tuple[Path, str]] = []
    seen_dirs = set()  # resolved unit directories already emitted

    def _add(unit_dir: Path, tag: str) -> None:
        resolved = unit_dir.resolve()
        if resolved in seen_dirs:
            return
        seen_dirs.add(resolved)
        units.append((unit_dir, tag))

    for mdir in find_match_roots(root):
        # Case A: the RBK-* folder itself is a unit
        if has_yolo_unit_dir(mdir):
            _add(mdir, mdir.name)

        # Case B: nested units at any depth below the RBK-* folder
        nested = sorted(
            sub for sub in mdir.rglob('*')
            if sub.is_dir() and has_yolo_unit_dir(sub)
        )

        # Count leaf names so we only fall back to the longer tag on a clash.
        name_counts = {}
        for sub in nested:
            name_counts[sub.name] = name_counts.get(sub.name, 0) + 1

        for sub in nested:
            if name_counts[sub.name] > 1:
                tail = sub.relative_to(mdir).as_posix()
            else:
                tail = sub.name
            _add(sub, f"{mdir.name}/{tail}")

    return units

def pair_images_labels(img_dir: Path, lbl_dir: Path) -> List[Tuple[Path, Path]]:
    """
    Pair every image under ``img_dir`` with its YOLO label file under ``lbl_dir``.

    Images are found recursively; an image at ``<img_dir>/a/b.png`` is paired
    with ``<lbl_dir>/a/b.txt``. Images without a label file are skipped.

    The extension check is case-insensitive (.png/.jpg/.jpeg), so mixed-case
    suffixes such as ``.Png`` are not silently dropped.

    Returns a list of (image_path, label_path) sorted by image path.
    """
    image_exts = {'.png', '.jpg', '.jpeg'}
    pairs: List[Tuple[Path, Path]] = []
    for img_path in sorted(img_dir.rglob('*')):
        if not img_path.is_file() or img_path.suffix.lower() not in image_exts:
            continue
        rel = img_path.relative_to(img_dir)  # preserve any subfolders
        lbl_candidate = lbl_dir / rel.with_suffix('.txt')
        # Require a regular file: a directory named "*.txt" is not a label.
        if lbl_candidate.is_file():
            pairs.append((img_path, lbl_candidate))
    return pairs

def symlink(src: Path, dst: Path):
    """
    Create (or replace) a symbolic link at ``dst`` pointing to ``src``.

    Parent directories of ``dst`` are created as needed. An existing file or
    link at ``dst`` (including a dangling link) is removed first.

    The link target is stored as an absolute path. A relative target would be
    interpreted relative to the directory containing ``dst``, not the current
    working directory, and would leave a dangling link.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    # is_symlink() covers dangling links, for which exists() is False.
    if dst.exists() or dst.is_symlink():
        dst.unlink()
    os.symlink(os.path.abspath(src), dst)

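# --- Discover all dataset units (handles RBK-BODO/part{1,2,3}/RBK_BODO_PART*) ---
# NOTE: the build steps below are commented out so the merged dataset is not rebuilt on every run.
# To rebuild, uncomment them and first define DATA_ROOT (the Football2025 folder); it is not set in the cells above.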
# dataset_units = find_dataset_units(DATA_ROOT)
# print("Discovered units:")
# for udir, tag in dataset_units:
#     print("  ", tag, "->", udir)

# # --- Build (img, lbl, tag) triples across all units ---
# all_triples = []
# for unit_dir, tag in dataset_units:
#     img_dir = unit_dir / 'data' / 'images' / 'train'
#     lbl_dir = unit_dir / 'labels' / 'train'
#     pairs = pair_images_labels(img_dir, lbl_dir)
#     print(f"{tag}: {len(pairs)} pairs")
#     # keep tag so we can build unique relative names
#     all_triples.extend([(ip, lp, tag) for (ip, lp) in pairs])

# print("Total pairs:", len(all_triples))

# # --- Global 90/10 split ---
# random.seed(42)
# random.shuffle(all_triples)
# n_total = len(all_triples)
# n_val = max(1, int(0.1 * n_total))
# val_triples = all_triples[:n_val]
# train_triples = all_triples[n_val:]

# # --- Prepare merged dirs ---
# img_train = MERGED_ROOT / 'images' / 'train'
# img_val   = MERGED_ROOT / 'images' / 'val'
# lbl_train = MERGED_ROOT / 'labels' / 'train'
# lbl_val   = MERGED_ROOT / 'labels' / 'val'
# for d in [img_train, img_val, lbl_train, lbl_val]:
#     d.mkdir(parents=True, exist_ok=True)

def rel_name_from_triple(img_path: Path, unit_dir: Path, unit_tag: str) -> str:
    """
    Build the relative path an image gets inside the merged dataset.

    The result is the unit_tag (e.g., 'RBK-BODO/RBK_BODO_PART1') followed by
    the image's path relative to the unit's data/images/train folder, joined
    with forward slashes.
    """
    images_root = unit_dir.joinpath('data', 'images', 'train')
    inner = img_path.relative_to(images_root).as_posix()
    return "{}/{}".format(unit_tag, inner)

# --- Symlink into merged dataset ---
def link_split(triples, split_name: str):
    """
    Symlink every (image, label, tag) triple into the merged dataset split.

    Links are created under MERGED_ROOT/images/<split_name>/ and
    MERGED_ROOT/labels/<split_name>/. The unit directory for a triple is
    recovered by matching its tag against the global ``dataset_units`` list;
    when no unit matches, the image is placed flat under '<tag>/'.
    """
    for img_path, lbl_path, tag in triples:
        # First unit whose tag matches, or None when the tag is unknown.
        unit_dir = next((udir for udir, utag in dataset_units if utag == tag), None)

        rel_name = (
            f"{tag}/{img_path.name}"
            if unit_dir is None
            else rel_name_from_triple(img_path, unit_dir, tag)
        )
        rel_label = Path(rel_name).with_suffix('.txt')

        image_link = MERGED_ROOT / 'images' / split_name / rel_name
        label_link = MERGED_ROOT / 'labels' / split_name / rel_label
        symlink(img_path, image_link)
        symlink(lbl_path, label_link)

# link_split(train_triples, 'train')
# link_split(val_triples, 'val')

# print(f"Train pairs: {len(train_triples)} | Val pairs: {len(val_triples)}")


## 3) Write `data.yaml` for Ultralytics

In [4]:
import json, yaml

def _expected_yaml():
    """
    Build the dataset description expected in data.yaml.

    Returns a dict with the merged dataset root ('path'), the train and val
    image folders, and the class-id -> class-name mapping ('names'). The
    mapping is derived from the NAMES constant, in list order, so the class
    list is defined in one place only.
    """
    return {
        'path': MERGED_ROOT.as_posix(),
        'train': (MERGED_ROOT / 'images' / 'train').as_posix(),
        'val':   (MERGED_ROOT / 'images' / 'val').as_posix(),
        'names': dict(enumerate(NAMES)),
    }

# Keep data.yaml in sync with the local paths: create it when missing,
# otherwise rewrite it only if one of the expected keys differs.
if not DATA_YAML_PATH.exists():
    data_yaml = _expected_yaml()
    with open(DATA_YAML_PATH, 'w') as f:
        yaml.safe_dump(data_yaml, f, sort_keys=False)
    print("Wrote", DATA_YAML_PATH)
else:
    with open(DATA_YAML_PATH) as f:
        data_yaml = yaml.safe_load(f) or {}
    expected = _expected_yaml()

    # Keys whose stored value is absent or out of date; other keys are kept.
    stale_keys = [k for k, v in expected.items() if data_yaml.get(k) != v]
    for k in stale_keys:
        data_yaml[k] = expected[k]
    needs_write = bool(stale_keys)

    if not needs_write:
        print("Loaded existing data.yaml:", DATA_YAML_PATH)
    else:
        with open(DATA_YAML_PATH, 'w') as f:
            yaml.safe_dump(data_yaml, f, sort_keys=False)
        print("Updated data.yaml with local paths:", DATA_YAML_PATH)

print(json.dumps(data_yaml, indent=2))



Wrote /Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir/football_merged_data.yaml
{
  "path": "/Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir/merged_yolo",
  "train": "/Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir/merged_yolo/images/train",
  "val": "/Users/olejacobmellgren/Documents/football_analysis/football_yolo_workdir/merged_yolo/images/val",
  "names": {
    "0": "player",
    "1": "referee",
    "2": "ball"
  }
}


## 4) Quick stats

In [5]:
from collections import Counter

def class_counts_for_existing_pairs(root: Path, split="train"):
    """
    Print how many labelled images a split has and its box count per class.

    Only images (.png/.jpg/.jpeg, any case) under ``root/images/<split>`` that
    have a matching ``.txt`` under ``root/labels/<split>`` are counted. The
    class id is the first token of each non-blank label line.
    """
    images_root = root / "images" / split
    labels_root = root / "labels" / split
    image_suffixes = {".png", ".jpg", ".jpeg"}

    per_class = Counter()
    paired = 0
    for image_path in images_root.rglob("*.*"):
        if image_path.suffix.lower() in image_suffixes:
            label_path = labels_root / image_path.relative_to(images_root).with_suffix(".txt")
            if label_path.exists():
                paired += 1
                with open(label_path) as handle:
                    rows = (raw.strip() for raw in handle)
                    # int(float(...)) also accepts ids written as "1.0"
                    per_class.update(int(float(row.split()[0])) for row in rows if row)
    print(f"[{split}] paired images: {paired} | boxes by class:", dict(per_class))

# Report pair and per-class box counts for both merged splits.
for split_name in ("train", "val"):
    class_counts_for_existing_pairs(MERGED_ROOT, split_name)



[train] paired images: 10640 | boxes by class: {0: 239341, 1: 9452, 2: 68}
[val] paired images: 1721 | boxes by class: {0: 38695, 1: 1521, 2: 12}


## 5) Train YOLO

## 6) Evaluate (Precision, Recall, mAP@50, mAP@[0.5:0.95])


## Notes
- We used **symlinks** to avoid duplicating data. If your environment disallows symlinks, switch `symlink(...)` to copy files instead.
- We performed a global **90/10 train/val split**. If you prefer per-match splits or using the existing `train.txt` files, we can adapt easily.
- The labels are assumed to already be in **YOLO detection format** with classes `[0=player, 1=referee, 2=ball]`. If not, we can parse `annotations.xml` (CVAT) and convert explicitly.
