<a href="https://colab.research.google.com/github/mehri-satari/Data-Mining-Course-Project/blob/main/change(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================================
# STEP 0 — Colab installs + imports + Drive mount
# ============================================================

# --- Colab installs (run once) ---
!pip -q install ultralytics opencv-python pillow tqdm
!pip -q install av2

from google.colab import drive
drive.mount('/content/drive')

import os
import math
import json
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Tuple, List, Optional, Any, Set

import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

from ultralytics import YOLO
from av2.utils import io as io_utils


In [None]:
# Log (scene) ids used to locate the dataset on Drive. Each id is looked up as
# a directory name; the first entry is the one searched for recursively, the
# rest are only checked as siblings of any match.
LOG_IDS = [
    "0526e68e-2ff1-3e53-b0f8-45df02e45a93",
    "04973bcf-fc64-367c-9642-6d6c5f363b61",
    "03fba633-8085-30bc-b675-687a715536ac",
    "03b2cf2d-fb61-36fe-936f-36bbf197a8ac",
    "0322b098-7e42-34db-bcec-9a4d072191e9",
    "022af476-9937-3e70-be52-f65420d52703",
    "01bb304d-7bd8-35f8-bbef-7086b688e35e",
    "00a6ffc1-6ce9-3bc3-a060-6006e9893a1a",
]
from pathlib import Path

def find_av2_root_by_log_ids(
    log_ids,
    search_roots=(Path("/content/drive/MyDrive"), Path("/content/drive/Shareddrives")),
    max_matches_per_root=5
) -> Path:
    """
    Locate the directory that holds all the given AV2 log folders.

    The first log id is searched for recursively under each search root. For
    every directory found with that name, its parent is accepted as the AV2
    root only if *every* log id exists next to it and contains
    ``annotations.feather``, ``calibration/intrinsics.feather`` and
    ``calibration/egovehicle_SE3_sensor.feather``.

    Args:
        log_ids: Iterable of log directory names (any iterable, consumed once).
        search_roots: Directories to search, as ``Path`` or ``str``. Roots
            that do not exist are skipped.
        max_matches_per_root: Stop collecting candidate directories in a root
            after this many matches for the first log id.

    Returns:
        AV2_ROOT such that ``AV2_ROOT/<log_id>/annotations.feather`` exists
        for every requested log id.

    Raises:
        ValueError: If ``log_ids`` is empty.
        FileNotFoundError: If no candidate root passes validation.
    """
    log_ids = list(log_ids)
    if not log_ids:
        # Fail with a clear message instead of an IndexError on log_ids[0].
        raise ValueError("log_ids must contain at least one log id.")

    # Files (relative to a log directory) that identify a usable AV2 log.
    required_files = (
        Path("annotations.feather"),
        Path("calibration") / "intrinsics.feather",
        Path("calibration") / "egovehicle_SE3_sensor.feather",
    )

    def _looks_like_av2_log(log_dir: Path) -> bool:
        return log_dir.is_dir() and all((log_dir / rel).exists() for rel in required_files)

    first = log_ids[0]

    for root in search_roots:
        root = Path(root)
        if not root.exists():
            continue

        # Find candidate directories named like the first log id
        candidates = []
        for p in root.rglob(first):
            if p.is_dir():
                candidates.append(p)
                if len(candidates) >= max_matches_per_root:
                    break

        for log_dir in candidates:
            av2_root = log_dir.parent
            # Validate: all logs exist side by side and look like AV2 logs
            if all(_looks_like_av2_log(av2_root / lid) for lid in log_ids):
                return av2_root

    raise FileNotFoundError(
        "Could not locate the AV2 root folder in mounted Drive.\n"
        "Make sure the shared folder is added as a Shortcut into MyDrive or is in a Shared Drive, "
        "and that the log folders contain annotations.feather and calibration/*.feather."
    )

# Search the default roots for the dataset; find_av2_root_by_log_ids raises
# FileNotFoundError (stopping the cell) when no directory passes validation.
AV2_ROOT = find_av2_root_by_log_ids(LOG_IDS)
print("✅ Found AV2_ROOT:", AV2_ROOT)


In [None]:
# ============================================================
# STEP 1 — Configure logs + thresholds + output
# ============================================================

# Dataset root. Keep the root that the discovery cell already found and
# validated (if that cell ran in this session); only fall back to the
# conventional Drive location when no AV2_ROOT has been set yet.
# To force a specific location, assign `AV2_ROOT = Path("...")` here instead.
AV2_ROOT = Path(globals().get("AV2_ROOT", "/content/drive/MyDrive/Argoverse2"))

# Logs to export; each must be a directory directly under AV2_ROOT.
LOG_IDS = [
    "0526e68e-2ff1-3e53-b0f8-45df02e45a93",
    "04973bcf-fc64-367c-9642-6d6c5f363b61",
    "03fba633-8085-30bc-b675-687a715536ac",
    "03b2cf2d-fb61-36fe-936f-36bbf197a8ac",
    "0322b098-7e42-34db-bcec-9a4d072191e9",
    "022af476-9937-3e70-be52-f65420d52703",
    "01bb304d-7bd8-35f8-bbef-7086b688e35e",
    "00a6ffc1-6ce9-3bc3-a060-6006e9893a1a",
]

# pruning thresholds requested
TAU_LIST = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

# output root
OUT_ROOT = Path("/content/drive/MyDrive/av2_redundancy_yolo_multi_logs")

# global split seed
SPLIT_SEED = 7

# max allowable timestamp mismatch when pairing annotation timestamp to image timestamp (ns)
MAX_TS_DIFF_NS = 50_000_000  # 50ms

# If you also want to train YOLO for each tau, set TRAIN_MODELS=True.
# (This can take a long time.)
# NOTE(review): no cell visible in this notebook reads TRAIN_MODELS, and the
# training cell assigns EPOCHS/IMGSZ/BATCH again itself -- confirm whether
# these four values are still meant to control anything.
TRAIN_MODELS = False
EPOCHS = 30
IMGSZ = 640
BATCH = 16


In [None]:
# ============================================================
# STEP 2 — Geometry utilities (same logic as your notebook)
# ============================================================

def quat_to_rotmat(qw, qx, qy, qz) -> np.ndarray:
    """
    Convert a quaternion given as (w, x, y, z) into a 3x3 rotation matrix.

    The quaternion is normalised first (a tiny constant is added to the norm
    so an all-zero input does not divide by zero).
    """
    quat = np.array([qw, qx, qy, qz], dtype=np.float64)
    quat = quat / (np.linalg.norm(quat) + 1e-12)
    w, x, y, z = quat

    # Pairwise products used by the closed-form matrix below.
    xx, yy, zz = x * x, y * y, z * z
    xy, xz, yz = x * y, x * z, y * z
    zw, yw, xw = z * w, y * w, x * w

    row0 = [1 - 2 * (yy + zz), 2 * (xy - zw), 2 * (xz + yw)]
    row1 = [2 * (xy + zw), 1 - 2 * (xx + zz), 2 * (yz - xw)]
    row2 = [2 * (xz - yw), 2 * (yz + xw), 1 - 2 * (xx + yy)]
    return np.array([row0, row1, row2], dtype=np.float64)

@dataclass
class SE3:
    """Rigid transform made of a rotation matrix ``R`` and a translation ``t``."""

    R: np.ndarray  # 3x3
    t: np.ndarray  # (3,)

    def inverse(self) -> "SE3":
        """Return the inverse transform: rotation R^T, translation -(R^T) t."""
        rot_transposed = self.R.T
        return SE3(R=rot_transposed, t=-rot_transposed @ self.t)

    def transform_points(self, pts: np.ndarray) -> np.ndarray:
        """Apply the transform to row-vector points ``pts`` of shape (N, 3)."""
        rotated = pts @ self.R.T
        offset = self.t.reshape(1, 3)
        return rotated + offset

@dataclass
class CameraIntrinsics:
    """Focal lengths and principal point of one camera, as used by the projection helpers in this notebook."""

    fx: float  # focal length along image x (filled from the "fx_px" column)
    fy: float  # focal length along image y (filled from the "fy_px" column)
    cx: float  # principal point x (filled from the "cx_px" column)
    cy: float  # principal point y (filled from the "cy_px" column)

def build_intrinsics_dict(intr_df: pd.DataFrame) -> Dict[str, CameraIntrinsics]:
    """
    Turn the intrinsics table into a ``{sensor_name: CameraIntrinsics}`` map.

    Raises:
        ValueError: If any of the required columns is absent.
    """
    for required in ("sensor_name", "fx_px", "fy_px", "cx_px", "cy_px"):
        if required not in intr_df.columns:
            raise ValueError(f"Missing intrinsics column: {required}")

    # One entry per row; a repeated sensor name keeps the last row seen.
    return {
        str(rec["sensor_name"]): CameraIntrinsics(
            fx=float(rec["fx_px"]),
            fy=float(rec["fy_px"]),
            cx=float(rec["cx_px"]),
            cy=float(rec["cy_px"]),
        )
        for _, rec in intr_df.iterrows()
    }

def build_extrinsics_dict(extr_df: pd.DataFrame) -> Dict[str, SE3]:
    """
    Turn the extrinsics table into a ``{sensor_name: SE3}`` map.

    Two layouts are understood: quaternion + translation columns
    (qw/qx/qy/qz and tx_m/ty_m/tz_m), or a single column holding 16 values
    that reshape to a 4x4 matrix.

    Raises:
        ValueError: If 'sensor_name' is missing, or if a row has to be
            converted and neither layout is present.
    """
    if "sensor_name" not in extr_df.columns:
        raise ValueError("extrinsics missing 'sensor_name'")

    columns = extr_df.columns
    pose_fields = ("qw", "qx", "qy", "qz", "tx_m", "ty_m", "tz_m")
    has_quat_pose = all(field in columns for field in pose_fields)
    # First matrix-style column that exists, if any (only used without quaternions).
    matrix_col = next(
        (c for c in ("T_egovehicle_sensor", "egovehicle_SE3_sensor", "transform_matrix") if c in columns),
        None,
    )

    poses: Dict[str, SE3] = {}
    for _, rec in extr_df.iterrows():
        sensor = str(rec["sensor_name"])

        if has_quat_pose:
            rotation = quat_to_rotmat(float(rec["qw"]), float(rec["qx"]), float(rec["qy"]), float(rec["qz"]))
            translation = np.array(
                [float(rec["tx_m"]), float(rec["ty_m"]), float(rec["tz_m"])], dtype=np.float64
            )
            poses[sensor] = SE3(R=rotation, t=translation)
            continue

        # The format error is only raised once there is a row to convert.
        if matrix_col is None:
            raise ValueError(
                "Extrinsics format not recognized. Expected qw/qx/qy/qz + tx_m/ty_m/tz_m OR a 4x4 matrix column."
            )
        homogeneous = np.array(rec[matrix_col], dtype=np.float64).reshape(4, 4)
        poses[sensor] = SE3(R=homogeneous[:3, :3], t=homogeneous[:3, 3])

    return poses

def get_col(available_cols, candidates):
    """
    Return the first name in ``candidates`` that is present in ``available_cols``.

    Raises:
        KeyError: If none of the candidates is available.
    """
    for name in candidates:
        if name in available_cols:
            break
    else:
        # Loop finished without a hit.
        raise KeyError(f"None of {candidates} found in columns.")
    return name

def cuboid_corners_ego(row: pd.Series) -> np.ndarray:
    """
    Return the 8 corners (shape (8, 3)) of the cuboid described by an annotation row.

    Centre, size and orientation are read from the row under any of several
    accepted column names; the corners are the half-extent offsets rotated by
    the row's quaternion and shifted to the centre. The result is in the same
    frame as the row's centre/quaternion.

    Raises:
        KeyError: If a required quantity is present under none of its aliases.
    """
    available = set(row.index)

    def field(*aliases) -> float:
        # Resolve the first alias present on the row and read it as a float.
        return float(row[get_col(available, list(aliases))])

    center = np.array(
        [
            field("center_x", "tx_m", "x", "translation_x"),
            field("center_y", "ty_m", "y", "translation_y"),
            field("center_z", "tz_m", "z", "translation_z"),
        ],
        dtype=np.float64,
    )
    half_extent = np.array(
        [
            field("length_m", "length") / 2,
            field("width_m", "width") / 2,
            field("height_m", "height") / 2,
        ],
        dtype=np.float64,
    )
    rotation = quat_to_rotmat(
        field("qw", "rotation_qw"),
        field("qx", "rotation_qx"),
        field("qy", "rotation_qy"),
        field("qz", "rotation_qz"),
    )

    # Sign pattern of each corner: top face first (+h), then bottom face (-h).
    signs = np.array([
        [ 1,  1,  1],
        [ 1, -1,  1],
        [-1, -1,  1],
        [-1,  1,  1],
        [ 1,  1, -1],
        [ 1, -1, -1],
        [-1, -1, -1],
        [-1,  1, -1],
    ], dtype=np.float64)
    corners_local = signs * half_extent

    return (corners_local @ rotation.T) + center.reshape(1, 3)

@dataclass
class Box2D:
    """Axis-aligned image box plus its BCS value, as produced by bbox_and_bcs_from_cuboid."""

    xmin: float  # left edge after clipping to the image
    ymin: float  # top edge after clipping to the image
    xmax: float  # right edge after clipping to the image
    ymax: float  # bottom edge after clipping to the image
    bcs: float   # clipped box area divided by the unclipped projected box area

def project_points_to_image(pts_cam: np.ndarray, intr: CameraIntrinsics) -> np.ndarray:
    """
    Project camera-frame points (N, 3) with the intrinsics ``intr``.

    Returns an (N, 3) array of [u, v, z], where z is the input depth passed
    through unchanged. No filtering is done here: points with z <= 0 are
    projected as well, so callers must check depth themselves.
    """
    eps = 1e-9  # keeps the division finite when z is exactly 0
    depth = pts_cam[:, 2]
    safe_depth = depth + eps
    u = intr.fx * (pts_cam[:, 0] / safe_depth) + intr.cx
    v = intr.fy * (pts_cam[:, 1] / safe_depth) + intr.cy
    return np.column_stack((u, v, depth))

def bbox_and_bcs_from_cuboid(
    corners_ego: np.ndarray,
    intr: CameraIntrinsics,
    T_sensor_ego: SE3,   # ego -> camera
    img_w: int,
    img_h: int
) -> Optional[Box2D]:
    """
    Project a cuboid into one camera and return its clipped 2D box with a BCS value.

    Only corners deeper than 0.1 along camera z contribute to the box. BCS is
    the area of the box after clipping to [0, img_w-1] x [0, img_h-1] divided
    by the area of the unclipped box.

    Returns:
        A Box2D, or None when fewer than two corners are in front of the
        camera, the unclipped box is degenerate (area <= 1e-6), or the
        clipped box covers at most one pixel of area.
    """
    min_depth = 0.1
    cam_pts = T_sensor_ego.transform_points(corners_ego)

    # Entire cuboid behind (or too close to) the camera.
    if np.all(cam_pts[:, 2] <= min_depth):
        return None

    projected = project_points_to_image(cam_pts, intr)
    in_front = projected[:, 2] > min_depth
    if in_front.sum() < 2:
        return None

    us = projected[in_front, 0]
    vs = projected[in_front, 1]
    u_lo, u_hi = float(us.min()), float(us.max())
    v_lo, v_hi = float(vs.min()), float(vs.max())

    area_full = max(0.0, u_hi - u_lo) * max(0.0, v_hi - v_lo)
    if area_full <= 1e-6:
        return None

    x_cap = float(img_w - 1)
    y_cap = float(img_h - 1)

    def clamp(value: float, upper: float) -> float:
        return max(0.0, min(upper, value))

    left, right = clamp(u_lo, x_cap), clamp(u_hi, x_cap)
    top, bottom = clamp(v_lo, y_cap), clamp(v_hi, y_cap)

    area_clip = max(0.0, right - left) * max(0.0, bottom - top)
    bcs = float(area_clip / area_full)

    # Box lies (almost) completely outside the image.
    if area_clip <= 1.0:
        return None

    return Box2D(xmin=left, ymin=top, xmax=right, ymax=bottom, bcs=bcs)



In [None]:
# ============================================================
# STEP 3 — Camera image indexing + overlap pairs (same logic)
# ============================================================

def find_camera_root(scene_path: Path) -> Path:
    """
    Return the first existing camera directory under ``scene_path``.

    Tried in order: sensors/cameras, sensor/cameras, cameras.

    Raises:
        FileNotFoundError: If none of the three locations exists.
    """
    candidates = [
        scene_path / "sensors" / "cameras",
        scene_path / "sensor" / "cameras",
        scene_path / "cameras",
    ]
    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        raise FileNotFoundError(f"Cannot find camera root under {scene_path} (tried: {candidates})")
    return found

def parse_timestamp_from_filename(p: Path) -> Optional[int]:
    """
    Return the integer encoded by the file stem, or None if the stem is not a number.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because ``isdigit``
    also accepts characters such as superscript digits that ``int()`` cannot
    parse, which would raise ValueError instead of returning None.
    """
    stem = p.stem
    return int(stem) if stem.isdecimal() else None

def index_images(cam_root: Path, cameras: List[str]) -> Dict[str, Dict[int, Path]]:
    """
    Build ``{camera: {timestamp: image_path}}`` for the given cameras.

    Only *.jpg, *.jpeg and *.png files whose stem parses as a timestamp are
    indexed. Every requested camera gets a key; a camera whose directory is
    missing maps to an empty dict.
    """
    patterns = ("*.jpg", "*.jpeg", "*.png")
    index: Dict[str, Dict[int, Path]] = {}

    for cam in cameras:
        frames: Dict[int, Path] = {}
        cam_dir = cam_root / cam
        if cam_dir.exists():
            for pattern in patterns:
                for img_path in cam_dir.glob(pattern):
                    stamp = parse_timestamp_from_filename(img_path)
                    if stamp is not None:
                        # Same timestamp under a later pattern replaces the earlier file.
                        frames[stamp] = img_path
        index[cam] = frames

    return index

def nearest_timestamp(target: int, available_sorted: List[int], max_diff_ns: int = 50_000_000) -> Optional[int]:
    """
    Return the entry of ``available_sorted`` closest to ``target``.

    Args:
        target: Timestamp to match.
        available_sorted: Candidate timestamps in ascending order.
        max_diff_ns: Largest accepted absolute difference.

    Returns:
        The closest timestamp as an int, or None when the list is empty or
        the closest entry is further away than ``max_diff_ns``. When the two
        neighbours are equally close, the later one is returned.
    """
    # Local import: bisect finds the insertion point in O(log n) without
    # copying the whole list into a NumPy array on every call.
    from bisect import bisect_left

    if not available_sorted:
        return None

    pos = bisect_left(available_sorted, target)
    neighbours = []
    if pos < len(available_sorted):
        neighbours.append(int(available_sorted[pos]))      # first entry >= target
    if pos > 0:
        neighbours.append(int(available_sorted[pos - 1]))  # last entry < target

    wanted = int(target)
    best = min(neighbours, key=lambda ts: abs(ts - wanted))
    return best if abs(best - wanted) <= max_diff_ns else None

# overlap helpers
def wrap_pi(a: float) -> float:
    """Wrap the angle ``a`` (radians) into the half-open interval [-pi, pi)."""
    full_turn = 2 * np.pi
    shifted = a + np.pi
    return float(shifted % full_turn - np.pi)

def fov_segments(center: float, hfov: float):
    """
    Express the angular window ``center +/- hfov/2`` as intervals within [-pi, pi].

    Returns one (start, end) tuple, or two tuples when the window crosses the
    +/-pi seam and has to be split there.
    """
    half = hfov/2
    start = wrap_pi(center - half)
    end = wrap_pi(center + half)
    if start > end:
        # Window straddles the seam: split into the part up to +pi and the part from -pi.
        return [(start, np.pi), (-np.pi, end)]
    return [(start, end)]

def seg_overlap(s1, s2) -> float:
    """Length of the intersection of two (start, end) intervals; 0.0 if they do not overlap."""
    shared = min(s1[1], s2[1]) - max(s1[0], s2[0])
    return shared if shared > 0.0 else 0.0

def circular_overlap(center1: float, hfov1: float, center2: float, hfov2: float) -> float:
    """
    Angular overlap (radians) between two horizontal fields of view.

    Each field of view is split into seam-free intervals and all pairwise
    intersections are added up; the total is capped at the narrower of the
    two fields of view.
    """
    first_segments = fov_segments(center1, hfov1)
    second_segments = fov_segments(center2, hfov2)

    total = 0.0
    for seg_a in first_segments:
        for seg_b in second_segments:
            total += seg_overlap(seg_a, seg_b)

    narrower = min(hfov1, hfov2)
    return float(min(total, narrower))

def camera_yaw_center_in_ego(T_ego_sensor: SE3) -> float:
    """
    Yaw (radians) of the camera's viewing direction after rotating it by ``T_ego_sensor.R``.

    The viewing direction is taken to be +Z in the camera frame; the yaw is
    atan2(y, x) of the rotated vector.
    """
    # assumes camera forward is +Z in camera frame
    optical_axis = np.array([0.0, 0.0, 1.0], dtype=np.float64)
    axis_in_ego = T_ego_sensor.R @ optical_axis
    x_component, y_component = axis_in_ego[0], axis_in_ego[1]
    return float(np.arctan2(y_component, x_component))

def hfov_from_intrinsics(intr: CameraIntrinsics, img_w: int) -> float:
    """Horizontal field of view (radians) from image width and focal length: 2*atan(W / (2*fx))."""
    half_angle = np.arctan(img_w / (2.0 * intr.fx))
    return float(2.0 * half_angle)

def compute_overlap_pairs(
    cameras: List[str],
    INTR: Dict[str, CameraIntrinsics],
    T_EGO_SENSOR: Dict[str, SE3],
    IMG_INDEX: Dict[str, Dict[int, Path]],
    min_overlap_deg: float = 5.0
) -> List[Tuple[str, str, float]]:
    """
    List camera pairs whose horizontal fields of view overlap.

    A camera takes part only if it has at least one indexed image (opened to
    read the image width), intrinsics and extrinsics.

    Returns:
        ``(cam_a, cam_b, overlap_rad)`` tuples for pairs overlapping by at
        least ``min_overlap_deg`` degrees, largest overlap first.

    Raises:
        RuntimeError: If a computed overlap exceeds pi radians.
    """
    threshold_rad = math.radians(min_overlap_deg)

    # Image width per camera, read from one arbitrary indexed frame.
    widths = {}
    for cam in cameras:
        frames = IMG_INDEX.get(cam, {})
        if not frames:
            continue
        sample_path = next(iter(frames.values()))
        with Image.open(sample_path) as img:
            widths[cam] = img.size[0]

    # (yaw, hfov) for every camera with complete information.
    view = {}
    for cam in cameras:
        if cam in widths and cam in INTR and cam in T_EGO_SENSOR:
            view[cam] = (
                camera_yaw_center_in_ego(T_EGO_SENSOR[cam]),
                hfov_from_intrinsics(INTR[cam], widths[cam]),
            )

    ordered = list(view)
    found = []
    for idx, cam_a in enumerate(ordered):
        yaw_a, hfov_a = view[cam_a]
        for cam_b in ordered[idx + 1:]:
            yaw_b, hfov_b = view[cam_b]
            overlap = circular_overlap(yaw_a, hfov_a, yaw_b, hfov_b)
            if overlap > np.pi + 1e-6:
                raise RuntimeError(f"Impossible overlap > 180deg for {cam_a},{cam_b}: {overlap} rad")
            if overlap >= threshold_rad:
                found.append((cam_a, cam_b, overlap))

    return sorted(found, key=lambda item: item[2], reverse=True)

def yolo_line_from_box(box: Box2D, cls_id: int, img_w: int, img_h: int) -> str:
    """
    Format one label line: ``<cls_id> <cx> <cy> <w> <h>``.

    Centre and size are divided by the image width/height, clamped to [0, 1]
    and written with six decimals.
    """
    def to_unit(value) -> float:
        return min(max(float(value), 0.0), 1.0)

    center_x = to_unit(((box.xmin + box.xmax) / 2.0) / img_w)
    center_y = to_unit(((box.ymin + box.ymax) / 2.0) / img_h)
    width = to_unit((box.xmax - box.xmin) / img_w)
    height = to_unit((box.ymax - box.ymin) / img_h)
    return f"{cls_id} {center_x:.6f} {center_y:.6f} {width:.6f} {height:.6f}"


In [None]:
# ============================================================
# STEP 4 — Load metadata for ALL logs + build GLOBAL class map
# ============================================================

def load_initial_data(scene_path: Path):
    """
    Read the three feather tables of one log directory.

    Returns:
        (annotations, intrinsics, extrinsics) as read from
        annotations.feather, calibration/intrinsics.feather and
        calibration/egovehicle_SE3_sensor.feather.
    """
    calibration_dir = scene_path / "calibration"
    annotations = io_utils.read_feather(scene_path / "annotations.feather")
    intrinsics = io_utils.read_feather(calibration_dir / "intrinsics.feather")
    extrinsics = io_utils.read_feather(calibration_dir / "egovehicle_SE3_sensor.feather")
    return annotations, intrinsics, extrinsics

# Load each log’s ann_df (for categories + splits)
# Annotation table per log, reused later for the split and the dataset export.
ANN_BY_LOG: Dict[str, pd.DataFrame] = {}

# Name of the class column; the same column is required in every log.
CATEGORY_COL = "category"  # confirmed by your printout, keep fixed for all logs
all_categories: Set[str] = set()

for log_id in LOG_IDS:
    scene_path = AV2_ROOT / log_id
    # The calibration tables are returned too but are not needed in this cell.
    ann_df, intr_df, extr_df = load_initial_data(scene_path)
    ANN_BY_LOG[log_id] = ann_df

    if CATEGORY_COL not in ann_df.columns:
        raise ValueError(f"{log_id}: missing '{CATEGORY_COL}' in annotations.feather")

    all_categories.update(ann_df[CATEGORY_COL].dropna().astype(str).unique().tolist())

# GLOBAL classes across all logs (important!): ids follow the alphabetical
# order of the names, so every dataset built from these logs shares one mapping.
NAMES = sorted(all_categories)
CLASS_MAP = dict(zip(NAMES, range(len(NAMES))))

print("GLOBAL Num classes:", len(NAMES))
print("GLOBAL Example class map (first 15):", list(CLASS_MAP.items())[:15])


In [None]:
# ============================================================
# STEP 5 — Shared train/val split across ALL logs (timestamp keys)
# ============================================================

def make_train_val_split_by_log_timestamp(
    ann_by_log: Dict[str, pd.DataFrame],
    train_ratio: float = 0.8,
    seed: int = 7
) -> Tuple[Set[Tuple[str, int]], Set[Tuple[str, int]]]:
    """
    Split all (log_id, timestamp) keys into disjoint train and validation sets.

    Every distinct non-null ``timestamp_ns`` of every log yields one key. The
    keys are shuffled with ``np.random.RandomState(seed)`` and the first
    ``int(len(keys) * train_ratio)`` go to train, the rest to validation.

    Raises:
        ValueError: If a log's annotations have no 'timestamp_ns' column.
    """
    keys: List[Tuple[str, int]] = []
    for log_id, ann_df in ann_by_log.items():
        if "timestamp_ns" not in ann_df.columns:
            raise ValueError(f"{log_id}: annotations missing 'timestamp_ns'")
        stamps = ann_df["timestamp_ns"].dropna().astype(np.int64).unique()
        keys.extend((log_id, int(stamp)) for stamp in stamps)

    # In-place shuffle; the seed makes the split reproducible.
    np.random.RandomState(seed).shuffle(keys)

    cut = int(len(keys) * train_ratio)
    return set(keys[:cut]), set(keys[cut:])

# One split shared by every dataset built later (baseline and all pruned
# variants), so they all use the same train/val (log, timestamp) keys.
TRAIN_KEYS, VAL_KEYS = make_train_val_split_by_log_timestamp(ANN_BY_LOG, train_ratio=0.8, seed=SPLIT_SEED)
print(f"Total (log,timestamp) keys: {len(TRAIN_KEYS) + len(VAL_KEYS)}")
print(f"Train keys: {len(TRAIN_KEYS)} | Val keys: {len(VAL_KEYS)}")


In [None]:
# ============================================================
# STEP 6 — Build YOLO dataset for ALL logs (baseline + pruned taus)
#         Also compute deletion statistics.
# ============================================================

def reset_dir(p: Path):
    """
    Make ``p`` an empty directory, deleting whatever is currently there.

    An existing directory is removed recursively. A regular file or a symlink
    (including a dangling one) at ``p`` is unlinked instead, because
    ``shutil.rmtree`` refuses both and ``mkdir`` would then fail; a symlink's
    target is left untouched. Missing parents are created.
    """
    if p.is_symlink() or p.is_file():
        p.unlink()
    elif p.exists():
        shutil.rmtree(p)
    p.mkdir(parents=True, exist_ok=True)

def build_yolo_from_av2_logs(
    log_ids: List[str],
    av2_root: Path,
    ann_by_log: Dict[str, pd.DataFrame],
    out_root: Path,
    tau_bcs: float,
    train_keys: Set[Tuple[str, int]],
    val_keys: Set[Tuple[str, int]],
    class_map: Dict[str, int],
    names: List[str],
    max_ts_diff_ns: int = 50_000_000,
    drop_empty_images: bool = False,
) -> Dict[str, Any]:
    """
    Build a single YOLO dataset (images/, labels/, data.yaml) combining multiple logs.

    For every (log, timestamp) key in ``train_keys | val_keys`` each annotated
    cuboid is projected into every camera that has a frame within
    ``max_ts_diff_ns`` of the annotation timestamp. Each successful projection
    is one candidate label instance.

    Pruning rule:
      For each overlapping camera pair, if the same track at the same
      timestamp has a box in both cameras and |BCS_A - BCS_B| > tau_bcs, the
      lower-BCS label instance is dropped (ties keep camera A).

    Args:
        log_ids: Logs to export, as directory names under ``av2_root``.
        av2_root: Directory containing the log folders.
        ann_by_log: Annotation table per log id.
        out_root: Output dataset directory. It is DELETED and recreated.
        tau_bcs: BCS-difference threshold of the pruning rule.
        train_keys, val_keys: (log_id, timestamp) keys of the two splits.
        class_map: Category name -> class id; unknown categories are skipped.
        names: Class names written to data.yaml.
        max_ts_diff_ns: Max annotation/image timestamp mismatch.
        drop_empty_images: If True, skip images whose labels were all pruned.

    Returns:
        dict with
          - dataset_dir, data_yaml: output locations
          - total_candidate_labels_before_pruning
          - total_deleted_label_instances (unique (camera, track) drops per timestamp)
          - total_unique_3d_objects_seen (unique (log, timestamp, track_uuid)
            with at least one camera projection)

    Notes:
        Images are only written for cameras that have at least one candidate
        box at that timestamp. Output images are always named ``*.jpg``; the
        source file is copied unchanged whatever its extension.
    """
    dataset_dir = out_root
    reset_dir(dataset_dir)
    for split in ["train", "val"]:
        (dataset_dir / "images" / split).mkdir(parents=True, exist_ok=True)
        (dataset_dir / "labels" / split).mkdir(parents=True, exist_ok=True)

    total_candidate_labels = 0
    total_deleted_labels = 0
    total_unique_3d_objects = set()  # (log_id, ts, track_uuid) that got at least one projected box

    # The union does not depend on the log, so build it once.
    all_keys = train_keys | val_keys

    for log_id in log_ids:
        scene_path = av2_root / log_id
        ann_df = ann_by_log[log_id]

        # Only the calibration tables are needed here; the annotations are
        # already in ann_by_log, so they are not read from disk again.
        calib_dir = scene_path / "calibration"
        intr_df = io_utils.read_feather(calib_dir / "intrinsics.feather")
        extr_df = io_utils.read_feather(calib_dir / "egovehicle_SE3_sensor.feather")
        INTR = build_intrinsics_dict(intr_df)
        T_EGO_SENSOR = build_extrinsics_dict(extr_df)                      # sensor -> ego
        T_SENSOR_EGO = {k: v.inverse() for k, v in T_EGO_SENSOR.items()}   # ego -> sensor

        # Cameras and images for this log
        CAMERAS = intr_df["sensor_name"].astype(str).unique().tolist()
        CAM_ROOT = find_camera_root(scene_path)
        IMG_INDEX = index_images(CAM_ROOT, CAMERAS)

        # per-log overlap pairs
        OVERLAP_PAIRS = compute_overlap_pairs(CAMERAS, INTR, T_EGO_SENSOR, IMG_INDEX, min_overlap_deg=5.0)

        # Keep only cameras that have indexed images AND extrinsics. A camera
        # without extrinsics cannot be projected into (compute_overlap_pairs
        # skips such cameras as well).
        cameras = [c for c in INTR if IMG_INDEX.get(c) and c in T_SENSOR_EGO]
        if not cameras:
            print(f"[WARN] {log_id}: no cameras with images. Skipping.")
            continue

        # one image size per camera
        cam_size = {}
        for cam in cameras:
            any_path = next(iter(IMG_INDEX[cam].values()))
            with Image.open(any_path) as im:
                cam_size[cam] = im.size  # (W,H)

        cam_ts_sorted = {cam: sorted(IMG_INDEX[cam]) for cam in cameras}

        # process only timestamps in our global train/val split
        keys_for_log = sorted((k for k in all_keys if k[0] == log_id), key=lambda k: k[1])

        for (_, ts) in tqdm(keys_for_log, desc=f"Building log {log_id[:8]}... tau={tau_bcs}", leave=False):
            # Every key comes from train_keys | val_keys, so "not train" means "val".
            split = "train" if (log_id, ts) in train_keys else "val"

            ann_rows = ann_df[ann_df["timestamp_ns"] == ts]
            if ann_rows.empty:
                continue

            # The matching frame depends only on (camera, ts), so resolve it
            # once per timestamp instead of once per annotation row.
            # frame_for_cam[cam] = (ts_img, img_path)
            frame_for_cam: Dict[str, Tuple[int, Path]] = {}
            for cam in cameras:
                ts_img = nearest_timestamp(ts, cam_ts_sorted[cam], max_diff_ns=max_ts_diff_ns)
                if ts_img is not None:
                    frame_for_cam[cam] = (ts_img, IMG_INDEX[cam][ts_img])
            if not frame_for_cam:
                # No camera has a frame close enough: nothing to project or write.
                continue

            # per_cam_boxes[cam][track_uuid] = (Box2D, cls_id)
            per_cam_boxes: Dict[str, Dict[str, Tuple[Box2D, int]]] = {cam: {} for cam in frame_for_cam}

            # Build candidates
            for _, row in ann_rows.iterrows():
                track = str(row["track_uuid"])
                cat = row[CATEGORY_COL]
                if pd.isna(cat):
                    continue
                cat_str = str(cat)
                if cat_str not in class_map:
                    continue
                cls_id = int(class_map[cat_str])

                corners = cuboid_corners_ego(row)

                any_projection = False
                for cam in frame_for_cam:
                    W, H = cam_size[cam]
                    box = bbox_and_bcs_from_cuboid(
                        corners_ego=corners,
                        intr=INTR[cam],
                        T_sensor_ego=T_SENSOR_EGO[cam],  # ego -> camera
                        img_w=W,
                        img_h=H,
                    )
                    if box is None:
                        continue

                    per_cam_boxes[cam][track] = (box, cls_id)
                    any_projection = True

                if any_projection:
                    total_unique_3d_objects.add((log_id, int(ts), track))

            # Count candidate label instances before pruning
            # (one label instance == one track in one camera at that ts)
            total_candidate_labels += sum(len(boxes) for boxes in per_cam_boxes.values())

            # Prune: mark (cam, track) to drop. Each camera has exactly one
            # frame per timestamp, so the pair identifies the label instance.
            to_drop: Set[Tuple[str, str]] = set()

            for camA, camB, _ in OVERLAP_PAIRS:
                if camA not in per_cam_boxes or camB not in per_cam_boxes:
                    continue
                boxes_a, boxes_b = per_cam_boxes[camA], per_cam_boxes[camB]
                for track in boxes_a.keys() & boxes_b.keys():
                    boxA, clsA = boxes_a[track]
                    boxB, clsB = boxes_b[track]
                    if clsA != clsB:
                        continue

                    if abs(boxA.bcs - boxB.bcs) > tau_bcs:
                        # drop the lower-BCS instance
                        loser = camB if boxA.bcs >= boxB.bcs else camA
                        to_drop.add((loser, track))

            total_deleted_labels += len(to_drop)

            # Write images + labels
            # IMPORTANT: unique filenames across logs to avoid collisions
            for cam, boxes in per_cam_boxes.items():
                if not boxes:
                    continue

                ts_img, img_path = frame_for_cam[cam]
                W, H = cam_size[cam]
                stem = f"{log_id}_{cam}_{ts_img}"

                lines = [
                    yolo_line_from_box(box, cls_id, W, H)
                    for track, (box, cls_id) in boxes.items()
                    if (cam, track) not in to_drop
                ]

                if drop_empty_images and not lines:
                    continue

                shutil.copy(img_path, dataset_dir / "images" / split / f"{stem}.jpg")
                with open(dataset_dir / "labels" / split / f"{stem}.txt", "w") as f:
                    f.write("\n".join(lines))

    # data.yaml
    data_yaml = dataset_dir / "data.yaml"
    yaml_text = (
        f"path: {dataset_dir}\n"
        f"train: images/train\n"
        f"val: images/val\n"
        f"nc: {len(names)}\n"
        f"names: {json.dumps(names)}\n"
    )
    with open(data_yaml, "w") as f:
        f.write(yaml_text)

    return {
        "dataset_dir": dataset_dir,
        "data_yaml": data_yaml,
        "total_candidate_labels_before_pruning": int(total_candidate_labels),
        "total_deleted_label_instances": int(total_deleted_labels),
        "total_unique_3d_objects_seen": int(len(total_unique_3d_objects)),
    }


In [None]:
# ============================================================
# STEP 7 — Build UNPRUNED (baseline) dataset once (for evaluation)
#         Then build pruned datasets for tau in [0.1..0.6]
#         Report deletion statistics.
# ============================================================

# "unpruned" baseline: use a very large tau so abs(BCS_A-BCS_B) > tau never triggers
BASELINE_TAU = 1e9

# Arguments that are identical for the baseline and every pruned build.
_COMMON_BUILD_ARGS = dict(
    log_ids=LOG_IDS,
    av2_root=AV2_ROOT,
    ann_by_log=ANN_BY_LOG,
    train_keys=TRAIN_KEYS,
    val_keys=VAL_KEYS,
    class_map=CLASS_MAP,
    names=NAMES,
    max_ts_diff_ns=MAX_TS_DIFF_NS,
    drop_empty_images=False,
)

# (printed label, key in the build result) for the per-dataset statistics.
_STAT_LINES = (
    ("Dataset:", "dataset_dir"),
    ("Total candidate label instances (before pruning):", "total_candidate_labels_before_pruning"),
    ("Total deleted label instances:", "total_deleted_label_instances"),
    ("Total unique 3D objects seen:", "total_unique_3d_objects_seen"),
)

baseline_info = build_yolo_from_av2_logs(
    out_root=OUT_ROOT / "baseline_unpruned",
    tau_bcs=BASELINE_TAU,
    **_COMMON_BUILD_ARGS,
)

print("\n=== BASELINE (UNPRUNED) DATASET BUILT ===")
for label, key in _STAT_LINES:
    print(label, baseline_info[key])

# One pruned dataset per threshold, keyed by tau.
PRUNED_INFOS = {}

for tau in TAU_LIST:
    info = build_yolo_from_av2_logs(
        out_root=OUT_ROOT / f"pruned_tau{tau:.1f}",
        tau_bcs=float(tau),
        **_COMMON_BUILD_ARGS,
    )
    PRUNED_INFOS[tau] = info

    print(f"\n=== PRUNED DATASET tau={tau:.1f} BUILT ===")
    for label, key in _STAT_LINES:
        print(label, info[key])

print("\n=== SUMMARY (All taus) ===")
base_before = baseline_info["total_candidate_labels_before_pruning"]
print("Baseline candidate labels:", base_before)
for tau in TAU_LIST:
    stats = PRUNED_INFOS[tau]
    before  = stats["total_candidate_labels_before_pruning"]
    deleted = stats["total_deleted_label_instances"]
    uniq3d  = stats["total_unique_3d_objects_seen"]
    print(f"tau={tau:.1f}  before={before}  deleted={deleted}  remaining={before-deleted}  uniq3d={uniq3d}")


In [None]:
# ============================================================
# STEP 7.5 — Build UNPRUNED baseline once, then pruned datasets for all taus
# ============================================================

# This cell used to depend on BASELINE_TAU from Step 7 and then repeat all of
# Step 7's work (every dataset directory was deleted and rebuilt). It now
# (a) runs on its own and (b) reuses a build result that is already in memory
# and still on disk. Re-run Step 7 to force a full rebuild, e.g. after
# changing LOG_IDS, the split or MAX_TS_DIFF_NS.

# Same "never prune" threshold Step 7 uses, unless Step 7 already defined it.
BASELINE_TAU = globals().get("BASELINE_TAU", 1e9)

# Arguments shared by every build in this cell.
_STEP75_BUILD_ARGS = dict(
    log_ids=LOG_IDS,
    av2_root=AV2_ROOT,
    ann_by_log=ANN_BY_LOG,
    train_keys=TRAIN_KEYS,
    val_keys=VAL_KEYS,
    class_map=CLASS_MAP,
    names=NAMES,
    max_ts_diff_ns=MAX_TS_DIFF_NS,
    drop_empty_images=False,
)


def _is_reusable_build(info, expected_dir) -> bool:
    """True if `info` is a build result for `expected_dir` whose data.yaml still exists."""
    return (
        isinstance(info, dict)
        and "dataset_dir" in info
        and "data_yaml" in info
        and Path(info["dataset_dir"]) == Path(expected_dir)
        and Path(info["data_yaml"]).is_file()
    )


_baseline_dir = OUT_ROOT / "baseline_unpruned"
_previous_baseline = globals().get("baseline_info")
if _is_reusable_build(_previous_baseline, _baseline_dir):
    baseline_info = _previous_baseline
else:
    baseline_info = build_yolo_from_av2_logs(
        out_root=_baseline_dir,
        tau_bcs=BASELINE_TAU,
        **_STEP75_BUILD_ARGS,
    )

print("\n=== BASELINE (UNPRUNED) BUILT ===")
print(baseline_info)

_previous_pruned = globals().get("PRUNED_INFOS")
if not isinstance(_previous_pruned, dict):
    _previous_pruned = {}

PRUNED_INFOS = {}
for tau in TAU_LIST:
    _tau_dir = OUT_ROOT / f"pruned_tau{tau:.1f}"
    info = _previous_pruned.get(tau)
    if not _is_reusable_build(info, _tau_dir):
        info = build_yolo_from_av2_logs(
            out_root=_tau_dir,
            tau_bcs=float(tau),
            **_STEP75_BUILD_ARGS,
        )
    PRUNED_INFOS[tau] = info
    print(f"\n=== PRUNED tau={tau:.1f} BUILT ===")
    print(info)

print("\n=== DELETION SUMMARY ===")
for tau in TAU_LIST:
    before = PRUNED_INFOS[tau]["total_candidate_labels_before_pruning"]
    deleted = PRUNED_INFOS[tau]["total_deleted_label_instances"]
    print(f"tau={tau:.1f}  before={before}  deleted={deleted}  remaining={before-deleted}")


In [None]:
# ============================================================
# STEP 8 — Train on baseline/pruned datasets (ALL 8 LOGS),
#          evaluate ALL models on UNPRUNED validation
#          and write a single report table + CSV.
# ============================================================

from pathlib import Path
import pandas as pd
from ultralytics import YOLO

# ----------------------------
# CONFIG (edit if you want)
# ----------------------------
# Training hyper-parameters used for every model trained in this cell.
EPOCHS, IMGSZ, BATCH, SEED = 30, 640, 16, 7

# Requires the dataset-building step to have run first; this cell reads
# baseline_info, PRUNED_INFOS, TAU_LIST and OUT_ROOT.

# Unpruned dataset: the baseline trains on it, and every model is evaluated
# on its validation split.
BASELINE_DIR = Path(baseline_info["dataset_dir"])
EVAL_DIR = BASELINE_DIR

print("Baseline (unpruned) dataset:", BASELINE_DIR)
print("Eval dataset (unpruned):", EVAL_DIR)
print("Pruned dataset dirs:")
for tau in TAU_LIST:
    pruned_dir = Path(PRUNED_INFOS[tau]['dataset_dir'])
    print(f"  tau={tau:.1f} -> {pruned_dir}")

# ----------------------------
# TRAIN + EVAL helper
# ----------------------------
def train_and_eval_yolo(
    train_data_dir: Path,
    eval_data_dir: Path,
    run_name: str,
    epochs: int = 30,
    imgsz: int = 640,
    batch: int = 16,
    seed: int = 7,
    device=0,
    weights: str = "yolov8n.pt",
):
    """
    Train a YOLO model on one dataset and evaluate it on another.

    Args:
        train_data_dir: Dataset directory containing the data.yaml to train on.
            Training runs are written to ``<train_data_dir>/runs/<run_name>``.
        eval_data_dir: Dataset directory whose data.yaml is used for validation.
        run_name: Name of the training run.
        epochs, imgsz, batch, seed: Passed through to ``model.train``.
        device: Device passed to both training and validation. Defaults to
            0 (first GPU), as before; pass "cpu" on a runtime without a GPU.
        weights: Checkpoint to start from. Defaults to "yolov8n.pt", as before.

    Returns:
        dict with "Precision", "Recall", "F1-Score", "mAP50", "mAP50-95"
        (floats from the validation run) and "Box Loss" (last value of the
        train box-loss column in results.csv, or None if it cannot be read).
    """
    train_data_dir = Path(train_data_dir)
    eval_data_dir  = Path(eval_data_dir)

    # 1) train
    model = YOLO(weights)
    train_res = model.train(
        data=str(train_data_dir / "data.yaml"),
        epochs=epochs,
        imgsz=imgsz,
        batch=batch,
        device=device,
        cache=True,
        workers=4,
        name=run_name,
        project=str(train_data_dir / "runs"),
        verbose=True,
        seed=seed,
    )

    # 2) eval on UNPRUNED validation (same eval dir for all)
    metrics = model.val(
        data=str(eval_data_dir / "data.yaml"),
        device=device
    )

    precision = float(metrics.box.mp)
    recall    = float(metrics.box.mr)
    map50     = float(metrics.box.map50)
    map5095   = float(metrics.box.map)
    # Small epsilon avoids 0/0 when both precision and recall are zero.
    f1        = (2 * precision * recall / (precision + recall + 1e-12))

    # optional: read last train box loss from Ultralytics results.csv
    box_loss = None
    # Prefer the directory reported by the training result; fall back to the
    # trainer's directory if the result object does not carry one.
    save_dir = getattr(train_res, "save_dir", None)
    if save_dir is None:
        save_dir = getattr(getattr(model, "trainer", None), "save_dir", None)
    if save_dir is not None:
        results_csv = Path(save_dir) / "results.csv"
        if results_csv.exists():
            df = pd.read_csv(results_csv)
            # Substring match so padded column names are found too.
            cand_cols = [c for c in df.columns if "train/box_loss" in c.lower()]
            if cand_cols and not df.empty:
                box_loss = float(df[cand_cols[0]].iloc[-1])

    return {
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1,
        "mAP50": map50,
        "mAP50-95": map5095,
        "Box Loss": box_loss
    }

# ----------------------------
# RUN ALL MODELS
# ----------------------------
REPORT_COLUMNS = [
    "Model Strategy", "tau_BCS",
    "candidate_labels_before", "deleted_labels", "remaining_labels", "unique_3d_objects_seen",
    "Precision", "Recall", "F1-Score", "mAP50", "mAP50-95", "Box Loss"
]

# NOTE(review): every model below is trained unconditionally; nothing in this
# cell checks a "train or not" switch -- confirm that this is intended.
rows = []

# (A) Baseline first: train unpruned -> eval unpruned.
base_metrics = train_and_eval_yolo(
    train_data_dir=BASELINE_DIR,
    eval_data_dir=EVAL_DIR,
    run_name="baseline_train_unpruned__eval_unpruned",
    epochs=EPOCHS, imgsz=IMGSZ, batch=BATCH, seed=SEED
)
before = baseline_info["total_candidate_labels_before_pruning"]
deleted = baseline_info["total_deleted_label_instances"]
rows.append({
    "Model Strategy": "Baseline (train unpruned → eval unpruned)",
    "tau_BCS": "unpruned",
    "candidate_labels_before": before,
    "deleted_labels": deleted,
    "remaining_labels": before - deleted,
    "unique_3d_objects_seen": baseline_info["total_unique_3d_objects_seen"],
    **base_metrics
})

# (B) Then one model per threshold: train pruned(tau) -> eval unpruned.
for tau in TAU_LIST:
    pruned_info = PRUNED_INFOS[tau]
    train_dir = Path(pruned_info["dataset_dir"])

    pr_metrics = train_and_eval_yolo(
        train_data_dir=train_dir,
        eval_data_dir=EVAL_DIR,
        run_name=f"train_pruned_tau{tau:.1f}__eval_unpruned",
        epochs=EPOCHS, imgsz=IMGSZ, batch=BATCH, seed=SEED
    )

    before  = pruned_info["total_candidate_labels_before_pruning"]
    deleted = pruned_info["total_deleted_label_instances"]

    rows.append({
        "Model Strategy": f"Pruned (train tau={tau:.1f} → eval unpruned)",
        "tau_BCS": float(tau),
        "candidate_labels_before": before,
        "deleted_labels": deleted,
        "remaining_labels": before - deleted,
        "unique_3d_objects_seen": pruned_info["total_unique_3d_objects_seen"],
        **pr_metrics
    })

# One row per trained model, columns in a fixed order.
report_df = pd.DataFrame(rows, columns=REPORT_COLUMNS)

print("\n=== FINAL YOLO THRESHOLD REPORT (train varies, eval always unpruned) ===")
print(report_df.to_string(index=False))

# Save to Drive
report_path = Path(OUT_ROOT) / "yolo_threshold_report.csv"
report_df.to_csv(report_path, index=False)
print("\nSaved report to:", report_path)

# Also keep it in memory for later use
REPORT_DF = report_df
