In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# SINGLE_MODEL_DIR in the CONFIG cell points below /content/drive/MyDrive,
# so this cell has to run before the models are loaded.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install Ultralytics, which provides the `YOLO` class imported further down.
# NOTE(review): the version is not pinned, so a fresh runtime may pull a
# different release than the one this notebook was last run with.
!pip install ultralytics

Collecting opencv-python>=4.6.0 (from ultralytics)
  Using cached opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Using cached opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (67.0 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.12.0.88


In [None]:
# Remove any pre-installed copies of these packages (including the
# opencv-python wheel pulled in by the ultralytics install above);
# `|| true` keeps the cell going when a package is not installed.
# NOTE(review): presumably done to avoid version conflicts - confirm.
!pip -q uninstall -y peft bitsandbytes albumentations albucore opencv-python opencv-python-headless || true


# Re-install pinned versions; OpenCV comes back as the headless build.
!pip -q install "transformers==4.41.2" "decord==0.6.0" "torchmetrics==1.4.0" "scikit-learn==1.5.1" \
                albumentations==1.4.8 albucore==0.0.14 opencv-python-headless==4.10.0.84 --no-cache-dir

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.8/156.8 kB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.9/49.9 MB[0m [31m195.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# CONFIG
# Every tunable of the pipeline lives in this cell; later cells only read these names.

from pathlib import Path

# Input / Output
VIDEO_PATH           = "/content/cows.mp4"
OUT_DIR              = Path("/content/cow_logs1")   # CSV logs, debug JPEGs and annotated.mp4 are written here
WRITE_ANNOTATED_MP4  = True

# YOLO / ByteTrack
YOLO_WEIGHTS         = "/content/best.pt"
YOLO_IMGSZ           = 960
YOLO_CONF            = 0.60
YOLO_IOU             = 0.50
TRACKER_YAML         = "bytetrack.yaml"

# TimeSformer
SINGLE_MODEL_DIR     = "/content/drive/MyDrive/Models/timesformer-cows2"
CLF_RES              = 224               # side length (px) of the square frames fed to the classifier; 112 for speed
WINDOW_FRAMES        = 12                # T: number of sampled frames per classified clip
SAMPLE_FPS           = 1.6               # 16 frames per 10 s
WINDOW_OVERLAP       = 0.5               # 0.5 with T=12 -> new decision every 6 samples (~3.75 s at 1.6 fps)
SMOOTH_K             = 3                 # majority over last K labels
MIN_WARMUP_FRAMES    = 8                 # samples required before a new track gets its first (padded) classification

# Segmentation / tracking timeouts
INACTIVITY_TIMEOUT_S = 10                # a track unseen for this long has its open segment closed
MIN_SEGMENT_S        = 2                 # segments shorter than this are dropped from the log

# Class map
# Fallback only: the model-loading cell replaces this with the checkpoint's
# id2label whenever the checkpoint config provides one.
SINGLE_ID2LABEL = {
    0: "Standing",
    1: "Lying",
    2: "Drinking",
    3: "Feeding & Standing",
    4: "Feeding & Lying",
    5: "Ruminating & Standing",
    6: "Ruminating & Lying",
}

# (height, width) of the black canvas each bbox crop is centered on before
# being resized to CLF_RES.
PAD_TARGET_HW = (640, 640)

OUT_DIR.mkdir(parents=True, exist_ok=True)
print("Configured.")

Configured.


In [None]:
# Imports & helpers

import os, time, math, json, gc
import cv2, torch, numpy as np, pandas as pd
from collections import defaultdict, deque
from tqdm import tqdm
from ultralytics import YOLO
from transformers import TimesformerForVideoClassification, AutoConfig

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
# Kept as a plain string because later cells compare it against "cuda".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

def sec_to_hms(t):
    """Format a duration in seconds as zero-padded ``HH:MM:SS``.

    Fractional seconds are truncated; hours are not wrapped at 24.
    """
    hours, remainder = divmod(t, 3600)
    minutes = remainder // 60
    seconds = t % 60
    return "{:02d}:{:02d}:{:02d}".format(int(hours), int(minutes), int(seconds))

# Scale to fit center padding
def center_pad_bbox_crop(img_rgb, xyxy, pad_target=(640, 640), out_size=224):
    """
    Crop a bounding box, center it on a black canvas and resize to a square.

    Steps: clamp the box to the image, crop it, scale the crop down (keeping
    aspect ratio) if it is larger than the canvas, center-pad it with zeros to
    ``pad_target`` and finally resize the canvas to ``out_size`` x ``out_size``.

    Args:
        img_rgb: image array of shape (H, W, C).
        xyxy: iterable ``(x1, y1, x2, y2)`` in pixel coordinates; values are
            truncated to int.
        pad_target: ``(height, width)`` of the padding canvas.
        out_size: side length of the returned square; a falsy value returns the
            padded canvas unchanged.

    Returns:
        The padded (and possibly resized) crop with the dtype of ``img_rgb``,
        or ``None`` when the clamped box has no area.
    """
    H, W = img_rgb.shape[:2]
    x1, y1, x2, y2 = map(int, xyxy)

    # Clamp both corners to [0, W] / [0, H]. Slice ends are exclusive, so the
    # upper bound is W / H (not W-1 / H-1, which would drop the last column /
    # row). Clamping x2 / y2 from below also stops a negative value from being
    # interpreted as a "from the end" slice index.
    x1 = min(max(0, x1), W); x2 = min(max(0, x2), W)
    y1 = min(max(0, y1), H); y2 = min(max(0, y2), H)
    if x2 <= x1 or y2 <= y1:
        return None

    crop = img_rgb[y1:y2, x1:x2]
    if crop.size == 0:
        return None

    th, tw = pad_target
    h, w = crop.shape[:2]
    if h > th or w > tw:
        # Shrink (never enlarge) so the crop fits inside the canvas.
        scale = min(th / h, tw / w)
        nh, nw = max(1, int(round(h * scale))), max(1, int(round(w * scale)))
        crop = cv2.resize(crop, (nw, nh), interpolation=cv2.INTER_LINEAR)
        h, w = nh, nw

    # Split the leftover space evenly; any odd pixel goes to bottom / right.
    top = (th - h) // 2; bottom = th - h - top
    left = (tw - w) // 2; right = tw - w - left
    padded = cv2.copyMakeBorder(crop, top, bottom, left, right,
                                borderType=cv2.BORDER_CONSTANT, value=(0, 0, 0))

    # Compare against BOTH canvas dimensions so a non-square canvas whose
    # height happens to equal out_size is still resized to a square.
    if out_size and (th, tw) != (out_size, out_size):
        padded = cv2.resize(padded, (out_size, out_size), interpolation=cv2.INTER_LINEAR)
    return padded

# Per-channel mean / std used to normalise frames (ImageNet statistics).
IMNET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMNET_STD  = np.array([0.229, 0.224, 0.225], dtype=np.float32)
def preprocess_clip(frames_hw3):  # list of T RGB (H,W,3) uint8
    """Turn a list of T equally sized (H, W, 3) frames into a model input.

    Pixel values are scaled to [0, 1], normalised per channel with
    IMNET_MEAN / IMNET_STD and rearranged to a float32 tensor of shape
    (1, T, 3, H, W).
    """
    clip = np.stack(frames_hw3, axis=0).astype(np.float32)  # T,H,W,C
    clip /= 255.0
    clip -= IMNET_MEAN
    clip /= IMNET_STD
    clip_tchw = np.transpose(clip, (0, 3, 1, 2))            # T,C,H,W
    batch = torch.from_numpy(clip_tchw)
    return batch.unsqueeze(0)                               # 1,T,C,H,W

def majority(lst):
    """Return the most frequent item of ``lst`` (``None`` for an empty list).

    Ties are broken deterministically in favour of the item that occurs
    latest in ``lst`` (i.e. the most recent label when ``lst`` is a label
    history). The previous ``max(set(lst), key=lst.count)`` picked an
    arbitrary winner on ties because set iteration order is not stable across
    runs for strings.
    """
    if not lst:
        return None

    # Single counting pass instead of calling lst.count() per distinct item.
    counts = {}
    for item in lst:
        counts[item] = counts.get(item, 0) + 1

    best = max(counts.values())
    # Walk from the newest entry backwards so the latest tied item wins.
    for item in reversed(lst):
        if counts[item] == best:
            return item



Device: cuda


In [None]:
# Load models
#
# Produces the module-level names used by the pipeline cell:
#   yolo            - Ultralytics detector / tracker
#   cfg_single      - TimeSformer config read from SINGLE_MODEL_DIR
#   SINGLE_ID2LABEL - class-index -> label map (checkpoint map wins over CONFIG)
#   clf_single      - TimeSformer classifier in eval mode on `device`

yolo = YOLO(YOLO_WEIGHTS)
try:
    # Best-effort inference optimisation; a failure here is not fatal.
    yolo.fuse()
except Exception as e:
    print("YOLO optimize hint:", e)

cfg_single = AutoConfig.from_pretrained(SINGLE_MODEL_DIR)
try:
    if getattr(cfg_single, "id2label", None):
        # Config keys may be strings after a JSON round-trip; normalise to int.
        SINGLE_ID2LABEL = {int(k): v for k, v in cfg_single.id2label.items()}
        print("Loaded id2label from checkpoint:", SINGLE_ID2LABEL)
    else:
        print("No id2label in config; using SINGLE_ID2LABEL from CONFIG.")
except Exception as e:
    print("id2label read error; using CONFIG map. Detail:", e)

# ignore_mismatched_sizes=True lets loading succeed when a tensor shape in the
# checkpoint differs from the instantiated model, but every such tensor is
# then freshly (randomly) initialised. Ask for the loading report so that
# this cannot go unnoticed.
clf_single, _clf_load_info = TimesformerForVideoClassification.from_pretrained(
    SINGLE_MODEL_DIR,
    config=cfg_single,
    ignore_mismatched_sizes=True,
    output_loading_info=True,
)
clf_single = clf_single.to(device).eval()

_clf_mismatched = _clf_load_info.get("mismatched_keys", [])
if _clf_mismatched:
    print("WARNING: the following classifier weights were NOT loaded from the "
          "checkpoint (shape mismatch) and are randomly initialised; "
          "predictions may be unreliable:")
    for _entry in _clf_mismatched:
        print("  -", _entry)

print("Models loaded.")


YOLO11s summary (fused): 100 layers, 9,413,187 parameters, 0 gradients, 21.3 GFLOPs
Loaded id2label from checkpoint: {0: 'Drinking', 1: 'Feeding & Lying', 2: 'Feeding & Standing', 3: 'Lying', 4: 'Ruminating & Lying', 5: 'Ruminating & Standing', 6: 'Standing'}


Some weights of TimesformerForVideoClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Models/timesformer-cows2 and are newly initialized because the shapes did not match:
- timesformer.embeddings.time_embeddings: found shape torch.Size([1, 8, 768]) in the checkpoint and torch.Size([1, 12, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Models loaded.


In [None]:
# Main pipeline
#
# Streams the video through YOLO + the configured tracker, keeps a rolling
# buffer of padded crops per track, classifies each buffer with TimeSformer,
# smooths the labels by majority vote and turns label changes into
# (cow_id, activity, start_frame, end_frame) segments collected in `events`.


# Video props
cap = cv2.VideoCapture(VIDEO_PATH)
assert cap.isOpened(), f"Cannot open video: {VIDEO_PATH}"
video_fps   = cap.get(cv2.CAP_PROP_FPS) or 25.0   # 25 fps fallback when the reported fps is 0
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
video_w     = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
video_h     = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()

# Sampling & windowing
sample_stride   = max(1, int(round(video_fps / max(0.1, float(SAMPLE_FPS)))))               # raw frames between samples
step_frames     = max(1, int(round(WINDOW_FRAMES * (1.0 - float(WINDOW_OVERLAP)))))  # in SAMPLES
timeout_frames  = int(round(INACTIVITY_TIMEOUT_S * video_fps))                              # raw frames

print(f"fps={video_fps:.2f} frames={frame_count} size=({video_w}x{video_h})")
print(f"Sample every {sample_stride} frames (~{SAMPLE_FPS} fps); window={WINDOW_FRAMES}, step={step_frames}, clf_res={CLF_RES}")

# Per-track state
buffers              = defaultdict(lambda: deque(maxlen=WINDOW_FRAMES))  # cow_id -> deque of padded crops
last_sample_frame    = defaultdict(lambda: -10**9)                        # cow_id -> last raw frame idx sampled
last_classify_frame  = defaultdict(lambda: -10**9)                        # cow_id -> last raw frame idx classified
last_seen_frame      = dict()                                            # cow_id -> last frame seen
pred_hist_single     = defaultdict(lambda: deque(maxlen=SMOOTH_K))       # cow_id -> last K labels

active_event = dict()  # cow_id -> {"label": str, "start_frame": int}
events       = []      # finalized segments


def _finalize_segment(cow_id, label, start_frame, end_frame):
    """Append one finished segment to `events` unless it is shorter than MIN_SEGMENT_S."""
    end_frame = max(start_frame, end_frame)
    if (end_frame - start_frame) / video_fps >= MIN_SEGMENT_S:
        events.append({
            "cow_id": int(cow_id),
            "activity": label,
            "start_frame": start_frame,
            "end_frame": end_frame
        })


# Annotated video writer
writer = None
if WRITE_ANNOTATED_MP4:
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(str(OUT_DIR / "annotated.mp4"), fourcc, video_fps, (video_w, video_h))

start_time = time.time()
frame_idx  = -1

# Tracker stream (one result per video frame)
# NOTE(review): persist=True is presumably only needed when track() is called
# frame by frame; confirm whether re-running this cell carries tracker state
# (and track ids) over from the previous run.
gen = yolo.track(
    source=VIDEO_PATH, stream=True, imgsz=YOLO_IMGSZ,
    conf=YOLO_CONF, iou=YOLO_IOU, tracker=TRACKER_YAML,
    device=0 if device=="cuda" else 'cpu', verbose=False, persist=True
)

# Ensure fast tracker dependency is present
# (best effort: a missing or broken `lap` install is deliberately ignored here)
try:
    import lap
except Exception:
    pass

# Number of classified clips to dump as contact-sheet JPEGs into OUT_DIR
# (set to 0 to disable the dumps).
DEBUG_DUMPS = 3
_dbg_dumped = 0

try:
    for res in tqdm(gen, desc="Processing", total=frame_count or None):
        frame_idx += 1
        img_bgr = res.orig_img
        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

        boxes   = res.boxes
        xyxy    = np.empty((0, 4), dtype=np.float32)
        ids_arr = None   # stays None when the tracker assigned no ids on this frame
        if boxes is not None and boxes.xyxy is not None:
            xyxy = boxes.xyxy.cpu().numpy()
            if boxes.id is not None:
                ids_arr = boxes.id.cpu().numpy().astype(int)

        # Only detections with a real track id feed the classifier. Inventing
        # ids (0..n-1) for untracked detections would collide with genuine
        # track ids and mix crops of different animals in one buffer.
        if ids_arr is not None:
            for bb, tid in zip(xyxy, ids_arr):
                tid = int(tid)
                last_seen_frame[tid] = frame_idx

                # sample into clip buffer at SAMPLE_FPS
                if frame_idx - last_sample_frame[tid] >= sample_stride:
                    crop = center_pad_bbox_crop(img_rgb, bb, pad_target=PAD_TARGET_HW, out_size=CLF_RES)
                    if crop is not None:
                        buffers[tid].append(crop)
                        last_sample_frame[tid] = frame_idx

                # Readiness, spaced by SAMPLES since last classify
                n_samples  = len(buffers[tid])
                ready_full = (n_samples >= WINDOW_FRAMES)
                ready_warm = (tid not in active_event) and (n_samples >= MIN_WARMUP_FRAMES)
                step_ok    = (frame_idx - last_classify_frame[tid]) >= (sample_stride * step_frames)
                if not ((ready_full or ready_warm) and step_ok):
                    continue

                clip = list(buffers[tid])
                if n_samples < WINDOW_FRAMES:
                    clip = clip + [clip[-1]] * (WINDOW_FRAMES - n_samples)  # pad to T

                pixel_values = preprocess_clip(clip).to(device)
                with torch.no_grad():
                    logits   = clf_single(pixel_values=pixel_values).logits
                    pred_idx = int(torch.argmax(logits, dim=-1).item())
                    label    = SINGLE_ID2LABEL.get(pred_idx, str(pred_idx))
                pred_hist_single[tid].append(label)
                smooth_label = majority(list(pred_hist_single[tid]))
                last_classify_frame[tid] = frame_idx

                # Debug dump: tile the clip into a 4-column contact sheet
                if _dbg_dumped < DEBUG_DUMPS:
                    T = len(clip); cols=4; rows=math.ceil(T/cols)
                    padf = np.zeros_like(clip[0], dtype=np.uint8)
                    tiles = clip + [padf]*(rows*cols - T)
                    grid = np.concatenate([np.concatenate(tiles[r*cols:(r+1)*cols], 1) for r in range(rows)], 0)
                    out_path = OUT_DIR / f"dbg_cow{tid}_f{frame_idx}_raw-{label.replace(' ','_')}_sm-{smooth_label.replace(' ','_')}.jpg"
                    cv2.imwrite(str(out_path), cv2.cvtColor(grid, cv2.COLOR_RGB2BGR))
                    _dbg_dumped += 1

                # Segmenting around window center.
                # The buffer holds SAMPLES taken ~sample_stride raw frames
                # apart, so half of its span has to be converted to raw frames
                # before it is subtracted from the raw frame index.
                half_span_frames    = ((n_samples - 1) * sample_stride) // 2
                window_center_frame = max(0, frame_idx - half_span_frames)

                current = active_event.get(tid)
                if current is None:
                    active_event[tid] = {"label": smooth_label, "start_frame": window_center_frame}
                elif smooth_label != current["label"]:
                    _finalize_segment(tid, current["label"], current["start_frame"], window_center_frame)
                    active_event[tid] = {"label": smooth_label, "start_frame": window_center_frame}

        # close stale tracks
        stale = [tid for tid, lastf in last_seen_frame.items()
                 if frame_idx - lastf >= timeout_frames]
        for tid in stale:
            ev = active_event.pop(tid, None)
            if ev is not None:
                _finalize_segment(tid, ev["label"], ev["start_frame"], last_seen_frame[tid])
            # Drop every piece of per-track state so the dicts do not keep
            # growing over long videos.
            for state in (buffers, pred_hist_single, last_sample_frame,
                          last_classify_frame, last_seen_frame):
                state.pop(tid, None)

        # overlay; the frame is written even when nothing was detected so the
        # annotated video keeps the length / timing of the source
        if writer is not None:
            for i, bb in enumerate(xyxy):
                if ids_arr is None:
                    caption = "untracked"
                else:
                    tid = int(ids_arr[i])
                    lab = active_event.get(tid, {}).get("label")
                    if lab is None:
                        # plain ASCII: the Hershey fonts cannot draw U+2026
                        lab = "estimating..."
                    caption = f"{tid}: {lab}"
                x1,y1,x2,y2 = map(int, bb)
                cv2.rectangle(img_bgr, (x1,y1), (x2,y2), (0,255,0), 2)
                cv2.putText(img_bgr, caption, (x1, max(0,y1-6)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,255,0), 2, cv2.LINE_AA)
            writer.write(img_bgr)
finally:
    # Release the writer even if the loop raised, so the MP4 is finalised.
    if writer is not None:
        writer.release()

# flush at end: close still-open segments at the last frame each cow was seen
# (same end-point rule as the stale-track branch above)
for tid, ev in list(active_event.items()):
    _finalize_segment(tid, ev["label"], ev["start_frame"], last_seen_frame.get(tid, frame_idx))

elapsed = time.time() - start_time
print(f"Processed {frame_idx+1} frames in {elapsed:.1f}s "
      f"(~{(frame_idx+1)/max(1.0,elapsed):.1f} FPS incl. det+track+class).")
if CLF_RES == 112:
    print("Note: 112×112 mode chosen (faster, slight accuracy drop).")




fps=30.00 frames=5399 size=(1920x1080)
Sample every 19 frames (~1.6 fps); window=12, step=6, clf_res=224


Processing: 5399it [03:58, 22.63it/s]

Processed 5399 frames in 238.8s (~22.6 FPS incl. det+track+class).





In [None]:
# Logs (CSV) + summary

def save_and_summarize(events, out_dir: Path, fps: float):
    """Write the segment log and per-cow totals as CSV and print a short summary.

    Args:
        events: list of dicts with keys ``cow_id``, ``activity``,
            ``start_frame`` and ``end_frame``.
        out_dir: directory that receives ``cow_activity_events.csv`` and
            ``cow_activity_totals.csv``.
        fps: frames per second used to convert frame indices to seconds.

    Returns:
        ``(events_csv_path, totals_csv_path)``, or ``(None, None)`` when
        ``events`` is empty.
    """
    if not events:
        print("No segments produced — check model paths / labels / thresholds.")
        return None, None

    # Derived time columns, added in the same order as the CSV columns.
    segments = pd.DataFrame(events).assign(
        start_sec=lambda d: d["start_frame"] / fps,
        end_sec=lambda d: d["end_frame"] / fps,
        duration_s=lambda d: d["end_sec"] - d["start_sec"],
        start_hms=lambda d: d["start_sec"].map(sec_to_hms),
        end_hms=lambda d: d["end_sec"].map(sec_to_hms),
        duration_min=lambda d: d["duration_s"] / 60.0,
    )

    events_path = out_dir / "cow_activity_events.csv"
    segments.sort_values(["cow_id", "start_frame"]).to_csv(events_path, index=False)
    print(f"Saved event log: {events_path}")

    # Total time per (cow, activity), longest activity first within each cow.
    totals = (
        segments.groupby(["cow_id", "activity"])["duration_s"].sum().reset_index()
        .assign(duration_min=lambda d: d["duration_s"] / 60.0)
        .sort_values(["cow_id", "duration_s"], ascending=[True, False])
    )
    totals_path = out_dir / "cow_activity_totals.csv"
    totals.to_csv(totals_path, index=False)
    print(f"Saved totals:    {totals_path}")

    print("\nSample summary:")
    # At most the five longest activities per cow.
    top5 = totals.groupby("cow_id").head(5)
    for rec in top5.itertuples(index=False):
        print(f"cow {int(rec.cow_id)}: {rec.activity} for {rec.duration_min:.1f} min")
    return events_path, totals_path

# Write both CSVs for the segments collected by the pipeline cell; both names
# are None when no segment was produced.
csv_events, csv_totals = save_and_summarize(events, OUT_DIR, video_fps)



Saved event log: /content/cow_logs1/cow_activity_events.csv
Saved totals:    /content/cow_logs1/cow_activity_totals.csv

Sample summary:
cow 1: Lying for 2.9 min
cow 2: Feeding & Standing for 2.6 min
cow 2: Standing for 0.2 min
cow 2: Ruminating & Standing for 0.1 min
cow 3: Ruminating & Standing for 0.9 min
cow 3: Drinking for 0.7 min
cow 3: Standing for 0.7 min
cow 3: Feeding & Standing for 0.4 min
cow 3: Feeding & Lying for 0.2 min
cow 4: Drinking for 1.0 min
cow 4: Feeding & Standing for 0.3 min
cow 4: Ruminating & Standing for 0.1 min
cow 4: Standing for 0.1 min
cow 5: Lying for 0.5 min
cow 5: Ruminating & Lying for 0.2 min
cow 6: Feeding & Standing for 1.5 min
cow 6: Standing for 0.8 min
cow 6: Lying for 0.6 min
cow 7: Feeding & Standing for 2.5 min
cow 7: Standing for 0.3 min
cow 7: Ruminating & Lying for 0.2 min
cow 8: Standing for 1.7 min
cow 8: Drinking for 0.6 min
cow 8: Feeding & Standing for 0.6 min
cow 10: Lying for 0.4 min
cow 10: Ruminating & Lying for 0.1 min
cow 11: D