In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
# Fetch the Cataract-101 dataset through kagglehub and keep whatever the call
# returns (presumably the local download directory — confirm against kagglehub docs).
# NOTE(review): this variable is not referenced by the later visible cells, which
# use a hard-coded /kaggle/input path for DATA_ROOT — confirm which one is intended.
import kagglehub
matinmo_cataract_101_path = kagglehub.dataset_download('matinmo/cataract-101')

print('Data source import complete.')

In [1]:
# If running on Kaggle/Colab, most deps are preinstalled.
# Uncomment to install anything missing.
# !pip -q install --upgrade opencv-python-headless==4.10.0.84 timm==1.0.7

import os
import sys
import math
import time
import json
import random
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from collections import defaultdict, Counter

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from torchvision.models.video import r3d_18, R3D_18_Weights

# Reproducibility: push one shared seed into every RNG this notebook draws from.
SEED = 1337
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Record the runtime so printed results can be tied to an environment.
for _label, _value in (
    ('Python', sys.version),
    ('PyTorch', torch.__version__),
    ('CUDA available:', torch.cuda.is_available()),
):
    print(_label, _value)


Python 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
PyTorch 2.6.0+cu124
CUDA available: True


In [3]:
# === Set your dataset paths ===
# Example (Kaggle): /kaggle/input/cataract-101/cataract-101
# DATA_ROOT is the directory the annotations cell reads annotations.csv, videos.csv,
# phases.csv and the videos/ folder from.
DATA_ROOT = Path('/kaggle/input/cataract-101/cataract-101')  # <- change if needed
ANN_PATH  = DATA_ROOT / 'annotations.csv'                    # <- change if needed

# Where to save checkpoints & artifacts (created here if it does not exist yet)
WORK_DIR = Path('/kaggle/working/phase2_clip_baseline')
WORK_DIR.mkdir(parents=True, exist_ok=True)

# Model/data config
@dataclass
class Config:
    """Tunable settings shared by the data, training and post-processing cells."""
    # Data
    target_fps: int = 12            # decode at this FPS for training/inference; also the timeline rate
    resize: int = 112               # r3d_18 was pretrained at 112x112; frames are resized to resize x resize
    clip_len_sec: float = 2.0       # ~2 seconds clips (clip length in frames = clip_len_sec * fps)
    stride_frac: float = 0.5        # 50% overlap during training (stride = clip length * stride_frac)
    subset_n: Optional[int] = 12    # limit number of videos to fit 20GB; None=all
    max_frames_per_video: Optional[int] = None  # e.g., 5000 to cap long videos (timeline is truncated)

    # Train
    epochs: int = 5
    batch_size: int = 4
    num_workers: int = 2            # DataLoader worker processes
    lr: float = 1e-4                # AdamW learning rate
    weight_decay: float = 1e-4      # AdamW weight decay
    label_smoothing: float = 0.0    # passed to the training cross-entropy
    amp: bool = True                # toggles autocast and the GradScaler
    grad_accum_steps: int = 1       # optimizer steps once every this many batches

    # Evaluation/postproc
    median_smooth: int = 15         # odd window size for timeline smoothing (skipped unless odd and > 1)

# Single shared configuration instance read by the later cells.
CFG = Config()
print(CFG)


Config(target_fps=12, resize=112, clip_len_sec=2.0, stride_frac=0.5, subset_n=12, max_frames_per_video=None, epochs=5, batch_size=4, num_workers=2, lr=0.0001, weight_decay=0.0001, label_smoothing=0.0, amp=True, grad_accum_steps=1, median_smooth=15)


In [4]:
# === Cell 3 — Robust reader for semicolon Cataract-101 annotations ===
# Outputs:
#   df  : FRAME schema → [video_id, frame_idx, phase_id]
#   videos : {video_id -> Path}
#   fps_by_vid, num_classes, phase_to_idx, idx_to_phase

import re, math
from pathlib import Path
import pandas as pd, numpy as np
import cv2

# Fail fast when the paths cell has not been executed in this kernel.
assert 'DATA_ROOT' in globals(), "Run Cell 2 first to set DATA_ROOT."
DATA_ROOT = Path(DATA_ROOT)

# Dataset layout expected under DATA_ROOT.
ANN_PATH, VIDEOS_CSV, PHASES_CSV, VIDEO_DIR = (
    DATA_ROOT / entry
    for entry in ('annotations.csv', 'videos.csv', 'phases.csv', 'videos')
)

# Only the annotations file is mandatory; the others are probed later.
if not ANN_PATH.exists():
    raise FileNotFoundError(f"annotations.csv not found at {ANN_PATH}")

# ---------- helpers ----------
def _norm(s: str) -> str:
    s = str(s).strip().lower().replace('-', '_').replace(' ', '_')
    return re.sub(r'[^a-z0-9_]', '', s)

def find_col(df_or_cols, aliases):
    """Return the first column matching any alias, or None.

    `df_or_cols` is either an object with a ``.columns`` attribute or a plain
    iterable of column names. Matching runs in three passes, each trying the
    aliases in the given order:
      1. exact match against the raw column names;
      2. match after normalising both sides with `_norm`;
      3. normalised alias contained in a normalised column name.
    The original (un-normalised) column name is returned.
    """
    source = df_or_cols.columns if hasattr(df_or_cols, "columns") else df_or_cols
    columns = list(source)
    if not columns:
        return None

    # Pass 1: exact, case-sensitive.
    for alias in aliases:
        if alias in columns:
            return alias

    # Normalised name -> original name (a later column wins on collisions).
    by_norm = {_norm(column): column for column in columns}

    # Pass 2: equality after normalisation.
    for alias in aliases:
        key = _norm(alias)
        if key in by_norm:
            return by_norm[key]

    # Pass 3: substring containment after normalisation.
    for alias in aliases:
        key = _norm(alias)
        for normalized, original in by_norm.items():
            if key in normalized:
                return original
    return None

# ---------- videos & fps ----------
# Recognised video container extensions (compared lower-cased).
VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv'}

# Map file stem -> path for every video found anywhere below VIDEO_DIR.
# When two files share a stem, the one visited last is kept.
videos = (
    {p.stem: p for p in VIDEO_DIR.rglob('*') if p.suffix.lower() in VIDEO_EXTS}
    if VIDEO_DIR.exists()
    else {}
)
print(f"[info] found {len(videos)} video files on disk")

def get_video_fps(path: Path) -> float:
    """Return the frame rate OpenCV reports for `path`.

    Falls back to 25.0 when the reported value is falsy or not above 1e-3.
    The capture handle is released before returning.
    """
    capture = cv2.VideoCapture(str(path))
    reported = capture.get(cv2.CAP_PROP_FPS)
    capture.release()
    if reported and reported > 1e-3:
        return float(reported)
    return 25.0

# Per-video source FPS as declared in the optional videos.csv metadata file.
# Keys are the file stems exactly as written in that CSV (extension removed).
fps_by_vid = {}
if VIDEOS_CSV.exists():
    # Sniff the delimiter, as the annotations reader does: a semicolon-separated
    # file read with the default comma separator collapses into one column, after
    # which no FPS value can be parsed.
    vdf = pd.read_csv(VIDEOS_CSV, sep=None, engine='python')
    vdf.columns = [_norm(c) for c in vdf.columns]
    vcol = find_col(vdf, ['video_id','video','vid','case_id','filename','file','name'])
    fcol = find_col(vdf, ['fps','frame_rate','framerate'])
    if vcol:
        # Drop a trailing ".ext" so the ids are comparable with file stems.
        vdf['__stem__'] = vdf[vcol].astype(str).str.replace(r'\.[^.]+$', '', regex=True)
        if fcol:
            # Coerce instead of try/except per row: unparsable cells become NaN
            # and are skipped explicitly rather than silently swallowed.
            fps_values = pd.to_numeric(vdf[fcol], errors='coerce')
            for stem, fps in zip(vdf['__stem__'], fps_values):
                if math.isfinite(fps) and fps > 0:
                    fps_by_vid[str(stem)] = float(fps)
        # The former per-stem rescan of VIDEO_DIR was removed. `videos` is already
        # keyed by the stem of every file with a recognised extension, so a stem
        # missing from `videos` has no matching file and the rescan could never
        # add an entry; it only repeated a full directory walk per missing stem.

# ---------- read annotations (force semicolon) ----------
# Goal: end up with `ann` holding separate video / frame / phase columns even when
# the file is ';'-separated.
# Try normal; if a single column or header contains ';', re-read with sep=';'
# (sep=None lets pandas' python engine sniff the delimiter on this first attempt.)
ann = pd.read_csv(ANN_PATH, engine='python', sep=None)
needs_semicolon = (ann.shape[1] == 1) or any(';' in str(c) for c in ann.columns)
if needs_semicolon:
    # Try with header; if that still yields 1 column, use header=None then split
    ann_try = pd.read_csv(ANN_PATH, sep=';', engine='python', on_bad_lines='warn')
    if ann_try.shape[1] == 1:
        # Fallback: read without a header into one named column.
        # NOTE(review): this read also uses sep=';', so it is unclear whether any ';'
        # is left inside the values for the split below — confirm this branch is reachable
        # without hitting the ValueError.
        ann = pd.read_csv(ANN_PATH, sep=';', engine='python', header=None, names=['__one__'])
        # Split the single column into 3 fields "video;frame;phase"
        split = ann['__one__'].astype(str).str.split(';', n=2, expand=True)
        if split.shape[1] != 3:
            raise ValueError("Expected 'video;frame;phase' format when splitting semicolon annotations.")
        ann = split
        ann.columns = ['video', 'frame', 'phase']
    else:
        ann = ann_try

# normalize column names (see _norm: lower-case, underscores, [a-z0-9_] only)
ann.columns = [_norm(c) for c in ann.columns]
cols = set(ann.columns)
# This is printed before the defensive split below, so it can still show a single column.
print("[info] annotations columns (normalized):", sorted(cols))

# If we *still* have 1 column, split it here defensively
if len(ann.columns) == 1:
    only = ann.columns[0]
    if ann[only].astype(str).str.contains(';').any():
        split = ann[only].astype(str).str.split(';', n=2, expand=True)
        # Only adopt the split when it produced exactly three fields; otherwise `ann`
        # is left untouched and the column detection that follows decides what to do.
        if split.shape[1] == 3:
            ann = split
            ann.columns = ['video','frame','phase']
            ann.columns = [_norm(c) for c in ann.columns]

# Resolve the video / frame / phase columns from their accepted aliases.
col_video, col_frame, col_phase = (
    find_col(ann, candidates)
    for candidates in (
        ['video','video_id','vid','case_id','filename','file','name'],
        ['frame','frame_idx','frame_id','frameindex','fid'],
        ['phase','label','phase_id','class','action','phase_name'],
    )
)

# All three are required; report what was actually found otherwise.
if not col_video or not col_frame or not col_phase:
    raise ValueError(f"Could not find video/frame/phase columns in annotations. Got: {list(ann.columns)}")

# Normalize video ids → stems (drop a trailing ".ext"); the 269 → case_269
# mapping is applied when the frame table is built.
ann[col_video] = ann[col_video].astype(str).str.replace(r'\.[^.]+$', '', regex=True)

def _to_case_stem(v):
    s = str(v).strip()
    if s.startswith('case_'): return s
    # if value is something like '269' or 'case269'
    m = re.search(r'(\d+)$', s)
    return f"case_{m.group(1)}" if m else s

# Build the FRAME schema df: one row per annotated (video, frame) with a phase.
tmp = ann[[col_video, col_frame, col_phase]].copy()
tmp.columns = ['video_id', 'frame_idx', 'phase_raw']
tmp['video_id'] = tmp['video_id'].map(_to_case_stem)

# Parse frame indices first; unparsable values become NaN.
tmp['frame_idx'] = pd.to_numeric(tmp['frame_idx'], errors='coerce')

# Drop rows without a usable frame index or phase. Previously a missing phase was
# encoded as -1 (or categorical code -1) and then counted as an extra class,
# inflating num_classes, while a missing frame index was collapsed onto frame 0.
invalid = tmp['frame_idx'].isna() | tmp['phase_raw'].isna()
if invalid.any():
    print(f"[warn] dropping {int(invalid.sum())} annotation rows with a missing/unparsable frame index or phase")
    tmp = tmp.loc[~invalid].copy()

# frame_idx as int
tmp['frame_idx'] = tmp['frame_idx'].astype(np.int64)

# phase mapping: numeric phases keep their value, anything else gets a categorical code
if pd.api.types.is_numeric_dtype(tmp['phase_raw']):
    tmp['phase_id_raw'] = pd.to_numeric(tmp['phase_raw'], errors='coerce').astype('Int64').astype(int)
else:
    cats = pd.Categorical(tmp['phase_raw'])
    tmp['phase_id_raw'] = cats.codes.astype(int)
    print("[info] phase name→id mapping (first 10):", {int(i): l for i,l in list(enumerate(cats.categories))[:10]})

# Re-index the raw ids onto contiguous 0..num_classes-1 labels (sorted by raw id).
uniq_raw = pd.Index(sorted(tmp['phase_id_raw'].dropna().unique()))
phase_to_idx = {int(p): i for i,p in enumerate(uniq_raw)}
idx_to_phase = {i:int(p) for i,p in enumerate(uniq_raw)}
num_classes = len(uniq_raw)
tmp['phase_id'] = tmp['phase_id_raw'].map(phase_to_idx).astype(int)

# final df
df = tmp[['video_id','frame_idx','phase_id']].sort_values(['video_id','frame_idx']).reset_index(drop=True)

# sanity
print(f"[info] Detected FRAME-LEVEL schema with columns: {df.columns.tolist()}")
print(f"[info] Num phases = {num_classes}")
print(df.head())


SyntaxError: invalid syntax (1233736004.py, line 138)

In [None]:
# Map annotation video_ids like "934" → "case_934".
# This cell edits `df` in place, so the annotations cell must have produced it first.
assert 'df' in globals(), "Run Cell 3 first."

def _to_case_stem(v):
    s = str(v).strip()
    # already good
    if s.startswith('case_'):
        return s
    # strip any extension just in case
    s = re.sub(r'\.[^.]+$', '', s)
    # keep only trailing digits
    m = re.search(r'(\d+)$', s)
    if m:
        return f"case_{m.group(1)}"
    return s  # fallback

df['video_id'] = df['video_id'].map(_to_case_stem)

# Annotated ids that still have no discovered video file after the mapping.
missing = sorted(set(df['video_id']).difference(videos))
print(f"[alias] after mapping, unmatched annotated videos: {len(missing)}")
print(missing[:20])


In [None]:
# Annotated video ids (already in case_{id} form) with no matching file on disk.
missing = sorted(vid for vid in set(df['video_id']) if vid not in videos)
print(f"[check] unmatched after mapping: {len(missing)}")
print(missing[:25])


In [None]:
# === Patched Cell — Robust timeline builder (no double phase mapping) ===
from typing import Dict, Optional
import numpy as np, math
from pathlib import Path

def _encode_phase(val, num_classes: Optional[int]=None, phase_to_idx: Optional[Dict[int,int]]=None) -> int:
    try:
        v = int(val)
    except Exception:
        return 0
    if num_classes is not None and 0 <= v < num_classes:
        return v
    if phase_to_idx and v in phase_to_idx:
        return int(phase_to_idx[v])
    if num_classes is not None and num_classes > 0:
        return int(v) % num_classes
    return int(max(v, 0))

def build_timelines(df_ann: pd.DataFrame,
                    videos: Dict[str, Path],
                    target_fps: int,
                    max_frames_per_video: Optional[int]=None,
                    num_classes: Optional[int]=None,
                    phase_to_idx: Optional[Dict[int,int]]=None,
                    fill_gaps: bool = True) -> Dict[str, Dict]:
    """Resample annotations into one label per frame at `target_fps`.

    Args:
        df_ann: SEGMENT schema (video_id, start_sec, end_sec, phase_id) or
            FRAME schema (video_id, frame_idx, phase_id).
        videos: video_id -> file path; used only to look up the source FPS
            through the module-level `get_video_fps`, when that exists.
        target_fps: frame rate of the returned timelines.
        max_frames_per_video: if set, each timeline is truncated to this length.
        num_classes: number of classes; inferred from ``phase_id.max() + 1``
            when omitted.
        phase_to_idx: optional raw-id -> class-index map given to `_encode_phase`.
        fill_gaps: FRAME schema only. When True (default), target frames that
            received no annotation inherit the label of the closest earlier
            annotated frame instead of staying at class 0. Frames before the
            first annotation keep 0. Pass False for the previous behaviour.

    Returns:
        dict[video_id] = {fps: int, n_frames: int, labels: np.ndarray[int]}
        Works without actual video files; assumes fps=25 if unknown.

    Raises:
        ValueError: if `df_ann` matches neither schema.
    """
    def default_fps_for(vid: str) -> float:
        # Use the file's own FPS when both the file and the helper are available.
        if isinstance(videos, dict) and vid in videos and callable(globals().get("get_video_fps", None)):
            try:
                return float(max(1e-3, globals()["get_video_fps"](videos[vid])))
            except Exception:
                pass
        return 25.0

    if num_classes is None and "phase_id" in df_ann.columns:
        try:
            num_classes = int(df_ann["phase_id"].max()) + 1
        except Exception:
            num_classes = None

    timelines: Dict[str, Dict] = {}

    # SEGMENT schema: paint each [start_sec, end_sec) interval onto the timeline.
    if set(["video_id","start_sec","end_sec","phase_id"]).issubset(df_ann.columns):
        by_vid = df_ann.groupby("video_id", sort=False)
        for vid, g in by_vid:
            total_sec = float(max(g["end_sec"].values) if len(g) else 0.0)
            n_tgt = int(max(1, math.ceil(total_sec * target_fps)))
            labels = np.full(n_tgt, fill_value=0, dtype=np.int64)
            for r in g.itertuples(index=False):
                p = _encode_phase(getattr(r, "phase_id"), num_classes, phase_to_idx)
                a = max(0, int(round(float(getattr(r, "start_sec")) * target_fps)))
                b = min(n_tgt, int(round(float(getattr(r, "end_sec"))   * target_fps)))
                # Guarantee every segment paints at least one frame.
                if b <= a:
                    b = min(n_tgt, a + 1)
                labels[a:b] = p
            if max_frames_per_video is not None:
                labels = labels[:max_frames_per_video]
            timelines[vid] = dict(fps=target_fps, n_frames=int(labels.shape[0]), labels=labels)

    # FRAME schema: map each annotated source frame to its nearest target frame.
    elif set(["video_id","frame_idx","phase_id"]).issubset(df_ann.columns):
        by_vid = df_ann.groupby("video_id", sort=False)
        for vid, g in by_vid:
            fps_src = default_fps_for(vid)
            max_src_idx = int(g["frame_idx"].max()) + 1 if len(g) else 1
            total_sec = max_src_idx / max(fps_src, 1e-6)
            n_tgt = int(max(1, math.ceil(total_sec * target_fps)))
            labels = np.full(n_tgt, fill_value=0, dtype=np.int64)
            annotated = np.zeros(n_tgt, dtype=bool)  # which target frames got a label
            for r in g.itertuples(index=False):
                p  = _encode_phase(getattr(r, "phase_id"), num_classes, phase_to_idx)
                fi = int(getattr(r, "frame_idx"))
                t  = int(round((fi / max(fps_src, 1e-6)) * target_fps))
                t  = int(np.clip(t, 0, n_tgt - 1))
                labels[t] = p
                annotated[t] = True
            if fill_gaps and annotated.any():
                # Forward-fill: for every frame, the index of the latest annotated
                # frame at or before it. Leading frames resolve to index 0, which
                # still holds the default 0 when frame 0 itself is unannotated.
                src = np.where(annotated, np.arange(n_tgt), 0)
                np.maximum.accumulate(src, out=src)
                labels = labels[src]
            if max_frames_per_video is not None:
                labels = labels[:max_frames_per_video]
            timelines[vid] = dict(fps=target_fps, n_frames=int(labels.shape[0]), labels=labels)
    else:
        raise ValueError("df must be SEGMENT (video_id,start_sec,end_sec,phase_id) or FRAME (video_id,frame_idx,phase_id) schema.")

    print('Built timelines for', len(timelines), 'videos')
    return timelines

# Build the per-video label timelines. Optional settings that may not exist in
# this kernel (config field, class count, phase map) fall back to None.
TIMELINES = build_timelines(
    df,
    videos,
    target_fps=CFG.target_fps,
    max_frames_per_video=getattr(CFG, "max_frames_per_video", None),
    num_classes=globals().get("num_classes"),
    phase_to_idx=globals().get("phase_to_idx"),
)


In [None]:
def build_clip_index(timelines: Dict[str, Dict], clip_len_sec: float, stride_frac: float) -> pd.DataFrame:
    """Enumerate fixed-length sliding-window clips over every timeline.

    For each video the clip length is ``round(clip_len_sec * fps)`` frames and
    consecutive clips start ``max(1, round(length * stride_frac))`` frames
    apart. Only clips that fit entirely inside the timeline are emitted, and
    each clip is labelled with the timeline value at its centre frame.

    Returns a frame with columns ``video_id, start, end, label`` where
    ``[start, end)`` is the clip's frame range.
    """
    records = []
    for video_id, entry in timelines.items():
        timeline = entry['labels']
        clip_len = int(round(clip_len_sec * entry['fps']))
        hop = max(1, int(round(clip_len * stride_frac)))
        if clip_len < 1:
            continue
        centre = clip_len // 2
        last_start = len(timeline) - clip_len
        records.extend(
            (video_id, start, start + clip_len, int(timeline[start + centre]))
            for start in range(0, last_start + 1, hop)
        )
    return pd.DataFrame(records, columns=['video_id','start','end','label'])

CLIP_INDEX = build_clip_index(TIMELINES, CFG.clip_len_sec, CFG.stride_frac)
print('Total clips:', len(CLIP_INDEX))

# Train/val split by video (no leakage)
# Keep only videos that have a file on disk: the dataset looks every clip's video
# up in `videos`, so an annotated video without a file would raise KeyError there.
all_vids = [v for v in TIMELINES if v in videos]
n_without_file = len(TIMELINES) - len(all_vids)
if n_without_file:
    print(f"[split] skipping {n_without_file} annotated videos with no file on disk")
random.shuffle(all_vids)

# Honour CFG.subset_n (None = use every video); it was declared in the config
# but never applied, so every video was used regardless of its value.
subset_n = getattr(CFG, 'subset_n', None)
if subset_n is not None:
    all_vids = all_vids[:max(0, int(subset_n))]
print(f"[split] using {len(all_vids)} videos")

# 80/20 split; with a single video everything goes to train.
split = int(0.8*len(all_vids)) if len(all_vids) > 1 else 1
train_vids, val_vids = set(all_vids[:split]), set(all_vids[split:])

def split_by_videos(index_df: pd.DataFrame, train_set: set, val_set: set):
    """Partition a clip index into (train, val) frames by video id.

    Rows are selected by membership of ``video_id`` in each set; both results
    get a fresh 0-based index.
    """
    def _rows_for(video_ids: set) -> pd.DataFrame:
        mask = index_df['video_id'].isin(video_ids)
        return index_df.loc[mask].reset_index(drop=True)

    return _rows_for(train_set), _rows_for(val_set)

CLIPS_TR, CLIPS_VA = split_by_videos(CLIP_INDEX, train_vids, val_vids)
print(CLIPS_TR.shape, CLIPS_VA.shape)

# Inverse-frequency class weights from the training clips (for imbalance).
# Empty classes are counted as 1 to avoid dividing by zero, and the weights are
# rescaled so that they sum to num_classes.
cnt = Counter(CLIPS_TR['label'].tolist())
class_counts = np.array([cnt[class_idx] for class_idx in range(num_classes)])
class_weights = 1.0 / np.maximum(class_counts, 1)
class_weights = (class_weights / class_weights.sum()) * num_classes
print('Class counts:', class_counts)
print('Class weights:', class_weights.round(3))


In [None]:
# Per-channel (R, G, B) normalisation statistics applied to frames after they are
# scaled to [0, 1].
# NOTE(review): torchvision documents Kinetics-400 statistics for the pretrained
# r3d_18 weights (mean 0.43216/0.394666/0.37645, std 0.22803/0.22145/0.216989) —
# confirm whether the ImageNet values below are intended for this backbone.
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32)

class ClipDataset(Dataset):
    """Dataset of fixed-length video clips decoded on the fly with OpenCV.

    Each item is ``(x, y, video_id, start, end)`` where ``x`` is a float32
    tensor of shape [3, T, H, W] (normalised RGB), ``y`` the clip label and
    ``[start, end)`` the clip's frame range on the `target_fps` timeline.

    Args:
        clips_df: clip index with columns video_id, start, end, label.
        timelines: per-video timelines (stored for reference; not read here).
        video_paths: video_id -> file path.
        resize: output frame side in pixels (frames are resized to a square).
        augment: enable the random horizontal flip.
        target_fps: frame rate that clip indices refer to.
        mean, std: optional per-channel normalisation statistics; default to
            the module-level IMAGENET_MEAN / IMAGENET_STD.
    """

    def __init__(self, clips_df: pd.DataFrame, timelines: Dict[str, Dict], video_paths: Dict[str, Path],
                 resize: int, augment: bool, target_fps: int,
                 mean: Optional[np.ndarray] = None, std: Optional[np.ndarray] = None):
        self.df = clips_df
        self.timelines = timelines
        self.videos = video_paths
        self.size = resize
        self.augment = augment
        self.fps = target_fps
        # Resolved here (not in the signature) so the module constants are looked
        # up when an instance is created rather than when the class is defined.
        self.mean = np.asarray(IMAGENET_MEAN if mean is None else mean, dtype=np.float32)
        self.std = np.asarray(IMAGENET_STD if std is None else std, dtype=np.float32)

    def __len__(self):
        return len(self.df)

    def _read_window(self, path: Path, a: int, b: int) -> np.ndarray:
        """Decode target-timeline frames [a, b) from `path` as uint8 RGB [T,H,W,3].

        Each target index is mapped to the nearest source frame. A frame that
        cannot be read repeats the last good frame, or is black if none exists.
        """
        cap = cv2.VideoCapture(str(path))
        try:
            # Same fallback rule as get_video_fps: 0, NaN or tiny values -> 25.0.
            fps_src = cap.get(cv2.CAP_PROP_FPS)
            if not fps_src or not fps_src > 1e-3:
                fps_src = 25.0
            frames = []
            last_frame = None
            for t in range(a, b):
                src_idx = int(round((t / self.fps) * fps_src))
                cap.set(cv2.CAP_PROP_POS_FRAMES, src_idx)
                ok, frame = cap.read()
                if not ok:
                    frame = last_frame if last_frame is not None else np.zeros((self.size, self.size, 3), np.uint8)
                else:
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    frame = cv2.resize(frame, (self.size, self.size), interpolation=cv2.INTER_AREA)
                    last_frame = frame
                frames.append(frame)
        finally:
            # Release the decoder even if colour conversion or resizing raises.
            cap.release()
        return np.stack(frames, axis=0)  # [T,H,W,3]

    def _augment(self, x: np.ndarray) -> np.ndarray:
        """Random horizontal flip (p=0.5) when augmentation is enabled."""
        if not self.augment:
            return x
        if random.random() < 0.5:
            # Copy so no negative-stride view can reach torch.from_numpy.
            x = np.ascontiguousarray(x[:, :, ::-1, :])
        return x

    def _normalize(self, x: np.ndarray) -> np.ndarray:
        """Scale uint8 pixels to [0, 1], then standardise each channel."""
        x = x.astype(np.float32) / 255.0
        x = (x - self.mean) / self.std
        return x

    def __getitem__(self, idx):
        r = self.df.iloc[idx]
        vid, a, b, lab = r.video_id, int(r.start), int(r.end), int(r.label)
        path = self.videos[vid]
        x = self._read_window(path, a, b)   # [T,H,W,3]
        x = self._augment(x)
        x = self._normalize(x)
        x = torch.from_numpy(x).permute(3,0,1,2)  # [3,T,H,W]
        y = torch.tensor(lab, dtype=torch.long)
        return x, y, vid, a, b


In [None]:
# Both splits decode frames the same way; only the training split is augmented.
train_ds, val_ds = (
    ClipDataset(clips, TIMELINES, videos, resize=CFG.resize, augment=is_train, target_fps=CFG.target_fps)
    for clips, is_train in ((CLIPS_TR, True), (CLIPS_VA, False))
)

# Loader settings shared by both splits; only the training loader shuffles.
loader_kwargs = dict(batch_size=CFG.batch_size, num_workers=CFG.num_workers, pin_memory=True)
train_dl = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_dl   = DataLoader(val_ds, shuffle=False, **loader_kwargs)

print('Train clips:', len(train_ds), ' Val clips:', len(val_ds))


In [None]:
# Choose the device first so everything below is created on, or moved to, it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained backbone with a fresh classification head sized for our phases.
weights = R3D_18_Weights.DEFAULT
net = r3d_18(weights=weights)
net.fc = nn.Linear(net.fc.in_features, num_classes)
net.to(device)

# Optional: freeze early layers for speed on small data
# for name, p in net.named_parameters():
#     if not name.startswith('layer4') and 'fc' not in name:
#         p.requires_grad = False

# Class-weighted loss. The weight tensor must live on the same device as the
# logits; it was previously left on the CPU while the model moved to the GPU.
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor(class_weights, dtype=torch.float32, device=device),
    label_smoothing=CFG.label_smoothing,
)
optimizer = torch.optim.AdamW(net.parameters(), lr=CFG.lr, weight_decay=CFG.weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max(CFG.epochs,1))

# Gradient scaling only applies to CUDA mixed precision. torch.amp.GradScaler
# replaces the deprecated torch.cuda.amp.GradScaler.
scaler = torch.amp.GradScaler('cuda', enabled=bool(CFG.amp) and device.type == 'cuda')
print('Device:', device)


In [None]:
def accuracy(logits, target):
    """Return the fraction of rows whose arg-max over dim 1 equals `target`, as a Python float."""
    predicted = logits.argmax(1)
    hits = (predicted == target).float()
    return hits.mean().item()

# Start below any reachable accuracy so the first epoch always writes a checkpoint
# (with 0.0, a run whose validation accuracy stayed at 0 never saved one).
best_acc = -1.0
BEST_PATH = WORK_DIR / 'best_clip_model.pt'

# Class-balancing weights computed from the training clips, on the model's device.
# They were computed earlier but never passed to the training loss.
loss_weight = torch.as_tensor(class_weights, dtype=torch.float32, device=device)
# Mixed precision is only used on CUDA.
use_amp = bool(CFG.amp) and device.type == 'cuda'
accum_steps = max(1, int(CFG.grad_accum_steps))
n_train_batches = len(train_dl)

for epoch in range(1, CFG.epochs+1):
    net.train()
    t0 = time.time()
    tr_loss = 0.0
    tr_acc = 0.0
    tr_seen = 0
    optimizer.zero_grad(set_to_none=True)

    for step, (x,y,_,_,_) in enumerate(train_dl, 1):
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
        with torch.autocast(device_type=device.type, enabled=use_amp):
            logits = net(x)
            loss = F.cross_entropy(logits, y, weight=loss_weight,
                                   label_smoothing=CFG.label_smoothing, reduction='mean')
        scaler.scale(loss/accum_steps).backward()
        # Also step on the last batch so a partial accumulation group is not
        # thrown away by the zero_grad at the start of the next epoch.
        if step % accum_steps == 0 or step == n_train_batches:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
        # Weight running metrics by batch size so a short final batch does not
        # skew the epoch average.
        bs = y.size(0)
        tr_loss += loss.item() * bs
        tr_acc  += accuracy(logits.detach(), y) * bs
        tr_seen += bs

    scheduler.step()

    # Validation (plain, unweighted cross-entropy without label smoothing)
    net.eval()
    va_loss = 0.0
    va_acc = 0.0
    va_seen = 0
    with torch.no_grad():
        for x,y,_,_,_ in val_dl:
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            with torch.autocast(device_type=device.type, enabled=use_amp):
                logits = net(x)
                loss = F.cross_entropy(logits, y, label_smoothing=0.0, reduction='mean')
            bs = y.size(0)
            va_loss += loss.item() * bs
            va_acc  += accuracy(logits, y) * bs
            va_seen += bs

    ntr = max(1, tr_seen); nva = max(1, va_seen)
    tr_loss/=ntr; tr_acc/=ntr; va_loss/=nva; va_acc/=nva
    dt = time.time()-t0

    print(f"Epoch {epoch:02d} | {dt:5.1f}s | "
          f"train loss {tr_loss:.3f} acc {tr_acc:.3f} | val loss {va_loss:.3f} acc {va_acc:.3f}")

    # Keep the checkpoint with the best validation accuracy so far.
    if va_acc > best_acc:
        best_acc = va_acc
        torch.save({'model': net.state_dict(),
                    'num_classes': num_classes,
                    'phase_to_idx': phase_to_idx}, BEST_PATH)
        print('  -> saved best to', BEST_PATH)


In [None]:
# Predict with the best checkpoint rather than whatever weights the last epoch
# left in memory: the training loop saves the best model but never reloads it.
best_ckpt_path = WORK_DIR / 'best_clip_model.pt'
if best_ckpt_path.exists():
    net.load_state_dict(torch.load(best_ckpt_path, map_location=device)['model'])

net.eval()
all_rows = []
with torch.no_grad():
    for x,y,vid,a,b in val_dl:
        x = x.to(device)
        logits = net(x)
        probs = torch.softmax(logits, dim=1).cpu().numpy()
        pred = probs.argmax(axis=1)
        # One row per clip: video id, [start, end) frame range, predicted class.
        all_rows.extend(
            (v, int(s), int(e), int(p)) for v, s, e, p in zip(vid, a, b, pred)
        )

VAL_PREDS = pd.DataFrame(all_rows, columns=['video_id','start','end','pred'])
print(VAL_PREDS.head())


In [None]:
from scipy.signal import medfilt

def stitch_predictions(pred_df: pd.DataFrame, timelines: Dict[str, Dict], num_classes: int,
                       smooth: int = 15) -> Dict[str, np.ndarray]:
    """Merge overlapping clip predictions into one label per frame, per video.

    Every clip adds one vote for its predicted class to each frame in
    ``[start, end)``; a frame takes the class with the most votes (ties go to
    the lowest class index). Frames covered by no clip, typically the tail
    after the last full clip, inherit the label of the closest earlier covered
    frame (or of the first covered frame if they precede it) instead of
    silently becoming class 0. If `smooth` is an odd number above 1, a median
    filter of that width is applied last.

    Args:
        pred_df: columns video_id, start, end, pred.
        timelines: video_id -> dict providing 'n_frames'; other videos are skipped.
        num_classes: number of classes (rows of the vote matrix).
        smooth: median-filter window; any other value disables smoothing.

    Returns:
        dict[video_id] -> 1-D array of per-frame class indices.
    """
    pred_timelines = {}
    for vid, g in pred_df.groupby('video_id'):
        if vid not in timelines:
            continue
        n = timelines[vid]['n_frames']
        votes = np.zeros((num_classes, n), dtype=np.int32)
        for r in g.itertuples(index=False):
            votes[r.pred, r.start:r.end] += 1
        out = votes.argmax(axis=0)

        covered = votes.any(axis=0)
        if covered.any() and not covered.all():
            # For every frame, the index of the latest covered frame at or before
            # it; frames ahead of the first covered frame point at that frame.
            src = np.where(covered, np.arange(n), -1)
            np.maximum.accumulate(src, out=src)
            src[src < 0] = int(np.argmax(covered))
            out = out[src]

        if smooth and smooth % 2 == 1 and smooth > 1:
            out = medfilt(out, kernel_size=smooth)
        pred_timelines[vid] = out
    return pred_timelines

# Stitch the validation clip predictions into per-frame timelines, smoothed with
# the configured median window.
PRED_TIMELINES = stitch_predictions(VAL_PREDS, TIMELINES, num_classes, smooth=CFG.median_smooth)
print('Pred timelines for', len(PRED_TIMELINES), 'videos')


In [None]:
from sklearn.metrics import f1_score

def eval_frame_metrics(pred_tl: Dict[str, np.ndarray], gt: Dict[str, Dict]) -> Dict[str, float]:
    """Compute frame accuracy and macro-F1 pooled over all predicted videos.

    Videos missing from `gt` are ignored. For each remaining video the
    prediction and the reference ``gt[vid]['labels']`` are compared over their
    common prefix. When no video can be compared, both metrics are NaN.
    """
    pairs = []
    for vid, pred in pred_tl.items():
        if vid not in gt:
            continue
        ref = gt[vid]['labels']
        common = min(len(ref), len(pred))
        pairs.append((ref[:common], pred[:common]))

    if not pairs:
        return dict(frame_acc=float('nan'), macro_f1=float('nan'))

    y_true = np.concatenate([ref for ref, _ in pairs])
    y_pred = np.concatenate([pred for _, pred in pairs])
    acc = (y_true == y_pred).mean().item()
    return {'frame_acc': acc, 'macro_f1': f1_score(y_true, y_pred, average='macro')}

# Frame-level accuracy and macro-F1 of the stitched validation timelines against
# the reference timelines.
metrics = eval_frame_metrics(PRED_TIMELINES, TIMELINES)
print(metrics)


In [None]:
# Plot ground truth vs. predicted for one validation video.
# Take the first validation video, in clip-index order, that has a predicted
# timeline. Picking from a set depended on string hash order, so the choice could
# change between kernel restarts and could land on a video without predictions.
val_example_vid = next(
    (v for v in CLIPS_VA.video_id.unique() if v in PRED_TIMELINES), None
) if len(CLIPS_VA) else None
if val_example_vid is not None:
    y_gt = TIMELINES[val_example_vid]['labels']
    y_pr = PRED_TIMELINES[val_example_vid]
    T = min(len(y_gt), len(y_pr))
    t = np.arange(T) / CFG.target_fps  # frame index -> seconds

    plt.figure(figsize=(14,3))
    plt.plot(t, y_gt[:T], lw=2, label='GT', alpha=0.9)
    plt.plot(t, y_pr[:T], lw=1, label='Pred', alpha=0.9)
    plt.xlabel('Seconds')
    plt.ylabel('Phase idx')
    plt.title(f'Video: {val_example_vid} — GT vs Predicted (smoothed={CFG.median_smooth})')
    plt.legend()
    plt.tight_layout()
    plt.show()
else:
    print('No validation example available to visualize.')


In [None]:
# Reload the best checkpoint from disk. The training loop only saves this file;
# it does not load it back into `net`.
ckpt = torch.load(WORK_DIR / 'best_clip_model.pt', map_location='cpu')
net.load_state_dict(ckpt['model'])
net.eval()
print('Loaded best model with classes:', ckpt['num_classes'])

# Export per-frame predictions for all VAL videos (CSV). PRED_TIMELINES is used
# as it already exists; it is not recomputed after the load above.
rows = [
    (vid, i, p)
    for vid, arr in PRED_TIMELINES.items()
    for i, p in enumerate(arr)
]

df_out = pd.DataFrame(rows, columns=['video_id','t_idx','pred_label_idx'])
CSV_OUT = WORK_DIR / 'val_frame_predictions.csv'
df_out.to_csv(CSV_OUT, index=False)
print('Saved:', CSV_OUT)


In [None]:
# Enforce a minimum duration per segment to reduce rapid label jitter.
def apply_min_duration(arr: np.ndarray, min_len_frames: int) -> np.ndarray:
    """Return a copy of `arr` with runs shorter than `min_len_frames` overwritten.

    Runs of equal values are scanned left to right. A run that is too short
    takes the value immediately to its left (which may itself have just been
    overwritten). A too-short run at the very start takes the value to its
    right instead, and a single run spanning the whole array is left as it is.
    The input array is not modified.
    """
    result = arr.copy()
    total = len(result)
    run_start = 0
    while run_start < total:
        # Advance to the first index whose value differs from the run's value.
        run_end = run_start
        while run_end < total and result[run_end] == result[run_start]:
            run_end += 1

        if run_end - run_start < min_len_frames:
            if run_start > 0:
                replacement = result[run_start - 1]
            elif run_end < total:
                replacement = result[run_end]
            else:
                replacement = result[run_start]
            result[run_start:run_end] = replacement
        run_start = run_end
    return result

# Example: enforce a one-second minimum segment length on the first predicted video.
val_example_vid = next(iter(PRED_TIMELINES), None)
if val_example_vid is None:
    print('No predictions to post-process.')
else:
    pred0 = PRED_TIMELINES[val_example_vid]
    min_frames = int(1.0*CFG.target_fps)  # 1s minimum
    pred1 = apply_min_duration(pred0, min_len_frames=min_frames)
    print('Changed frames:', int(np.count_nonzero(pred0 != pred1)))
