## Imports and Setup 


In [None]:
# ============================================
# 0. Imports & Global Settings
# ============================================
import os
from pathlib import Path
from collections import defaultdict

import cv2
import numpy as np
import pandas as pd

from scipy.optimize import linear_sum_assignment

# YOLOv8
from ultralytics import YOLO
import torch

# DeepSORT
from deep_sort_realtime.deepsort_tracker import DeepSort

# Plotting / debug (optional)
import matplotlib.pyplot as plt


In [None]:
# ============================================
# 0.1 Paths & Constants
# ============================================
# Root folder that holds the Task1 / Task2 data.
BASE_DIR = Path("Object_Tracking")

# Task1: image sequence plus MOT-style ground truth.
TASK1_IMAGES_DIR = BASE_DIR / "Task1" / "images"
TASK1_GT_PATH    = BASE_DIR / "Task1" / "gt" / "gt.txt"

# Task2: image sequence only.
TASK2_IMAGES_DIR = BASE_DIR / "Task2" / "images"

# Output paths (relative to the current working directory)
TASK1_INPUT_VIDEO  = Path("task1_input.mp4")
TASK1_OUTPUT_VIDEO = Path("task1.mp4")
TASK2_OUTPUT_VIDEO = Path("task2.mp4")
TASK2_COUNTS_CSV   = Path("task2_count.csv")

# Frame rate used when writing the videos.
FPS_TASK1 = 14
FPS_TASK2 = 14

# YOLO weights
YOLO_WEIGHTS = "yolov8x.pt"
YOLO_IMGSZ = 1920       # using full-res
YOLO_CONF = 0.25          # confidence threshold
# Pick CUDA if available, otherwise CPU.
# (Previously hard-coded to "cuda", which fails on CPU-only machines.)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


## 1. Data Preparation (Task 1 – images → video @ 14 FPS)

In [None]:
# ============================================
# 1. Convert Task1 images to video (task1_input.mp4)
# ============================================
def images_to_video(image_dir: Path, output_path: Path, fps: int = 14):
    """
    Convert all images in image_dir to a video at the given fps.

    Assumes images are named so that lexicographic sort is correct frame order
    (e.g., 000001.jpg, 000002.jpg, ...). The frame size is taken from the
    first image.

    Args:
        image_dir: Directory containing .jpg / .jpeg / .png frames.
        output_path: Destination video file (written with the "mp4v" fourcc).
        fps: Frame rate of the output video.

    Raises:
        FileNotFoundError: If image_dir contains no supported image files.
        RuntimeError: If the first image cannot be decoded or the video
            writer cannot be opened.
    """
    image_files = sorted(
        p for p in image_dir.iterdir()
        if p.suffix.lower() in {".jpg", ".jpeg", ".png"}
    )
    # Explicit exceptions instead of assert: asserts are stripped under -O.
    if not image_files:
        raise FileNotFoundError(f"No images found in {image_dir}")

    # Read first image to get frame size
    first_frame = cv2.imread(str(image_files[0]))
    if first_frame is None:
        raise RuntimeError(f"Could not read first image {image_files[0]}")

    height, width = first_frame.shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

    frames_written = 0
    try:
        if not out.isOpened():
            raise RuntimeError(f"Could not open video writer for {output_path}")

        for img_path in image_files:
            frame = cv2.imread(str(img_path))
            if frame is None:
                print(f"Warning: could not read {img_path}, skipping.")
                continue
            out.write(frame)
            frames_written += 1
    finally:
        # Always finalize the container, even if a frame fails mid-way.
        out.release()

    # Report the frames actually written, not the number of files found.
    print(f"Saved video: {output_path} ({frames_written} frames at {fps} FPS)")

# Run for Task1
# Builds TASK1_INPUT_VIDEO from the Task1 image sequence; this is the video
# that run_tracking() reads later in the notebook.
images_to_video(TASK1_IMAGES_DIR, TASK1_INPUT_VIDEO, fps=FPS_TASK1)


## 2. YOLOv8 + DeepSORT Tracking (Task 2 – Task1 video)

### 2.1 Initialize YOLO and DeepSORT

We use YOLOv8 for pedestrian detection and DeepSORT to maintain consistent IDs across frames. Each frame is processed, detections are filtered to "person", and DeepSORT assigns a track ID. The output is both an annotated video and a MOT-format tracking file.


In [None]:
# ============================================
# 2.1 Initialize YOLOv8 and DeepSORT
# ============================================
def init_yolo(weights_path: str = YOLO_WEIGHTS, device: str = DEVICE):
    """
    Initialize YOLOv8 model on CPU or CUDA if available.

    Args:
        weights_path: Weights file passed to the YOLO constructor.
        device: Target device string. When a CUDA device is requested but
            torch reports CUDA as unavailable, the model is placed on the CPU
            instead (same fallback as load_frcnn_detector).

    Returns:
        The YOLO model, moved to the selected device.
    """
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        print("CUDA requested but not available, falling back to CPU.")
        device = "cpu"

    model = YOLO(weights_path)
    model.to(device)
    return model


def init_deepsort():
    """
    Build the DeepSort tracker (deep_sort_realtime) used for the Task1 video.

    All tracker settings are fixed here; the function takes no arguments and
    returns a new DeepSort instance on every call.
    """
    tracker_settings = {
        "max_age": 45,
        "n_init": 3,
        "nn_budget": 100,
        "max_iou_distance": 0.7,
        "max_cosine_distance": 0.2,
        "embedder": "mobilenet",
        "half": False,
        "bgr": True,
        "embedder_gpu": True,
    }
    return DeepSort(**tracker_settings)


# Notebook-level detector and tracker instances; these are the objects passed
# to run_tracking() for the Task1 run.
yolo_model = init_yolo()
deepsort_tracker = init_deepsort()


### Optional gamma adjustment and sharpening

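Gamma correction on the HSV brightness (V) channel and unsharp-mask sharpening help improve contrast and edge clarity in frames. With the `GAMMA = 1.3` used below (gamma > 1), the frame is darkened rather than brightened. This makes small or low-contrast pedestrians slightly easier for the detector to recognize.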


In [None]:
# Preprocessing hyperparameters. preprocess_frame() reads these module-level
# names at call time, so rebinding them changes subsequent calls.
GAMMA = 1.3         # exponent applied to the normalized V channel
SHARP_SIGMA = 0.9   # Gaussian sigma used to build the blurred copy
SHARP_AMOUNT = 1.8   # unsharp-mask strength; values <= 0 disable sharpening


def preprocess_frame(frame_bgr):
    """
    Prepare a BGR frame for detection/tracking.

    Steps:
      1. Gamma-adjust the V channel in HSV space using GAMMA
         (gamma < 1 brightens, gamma > 1 darkens).
      2. If SHARP_AMOUNT > 0, sharpen with an unsharp mask built from a
         Gaussian blur of sigma SHARP_SIGMA.

    Returns the processed BGR image; the input array is not modified.
    """
    # --- 1) Gamma on the brightness channel ---
    hue, sat, val = cv2.split(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV))

    val_norm = val.astype(np.float32) / 255.0
    val_adjusted = np.clip(np.power(val_norm, GAMMA) * 255.0, 0, 255).astype(np.uint8)

    adjusted_bgr = cv2.cvtColor(cv2.merge([hue, sat, val_adjusted]), cv2.COLOR_HSV2BGR)

    # --- 2) Optional unsharp mask ---
    if not SHARP_AMOUNT > 0:
        # sharpening disabled
        return adjusted_bgr

    softened = cv2.GaussianBlur(adjusted_bgr, (0, 0), SHARP_SIGMA)
    # result = (1 + amount) * image - amount * blurred
    return cv2.addWeighted(
        adjusted_bgr, 1.0 + SHARP_AMOUNT, softened, -SHARP_AMOUNT, 0
    )


### 2.2 Helper: Run tracker on a video & save results

In [None]:
# ============================================
# 2.2 Run YOLOv8x + DeepSORT on Task1 video
#     with darken+sharpen preprocessing
# ============================================
def run_tracking(
    input_video_path: Path,
    output_video_path: Path,
    tracker_txt_out: Path,
    yolo_model,
    deepsort_tracker,
    fps: int,
    conf: float = YOLO_CONF,
    imgsz: int = YOLO_IMGSZ,
):
    """
    Run YOLOv8 + DeepSORT tracking on a video.

    Each frame is passed through preprocess_frame() (gamma + sharpen); the
    preprocessed frame is used for detection, for the tracker update, and as
    the canvas for the annotated output video.

    Args:
        input_video_path: Video to read.
        output_video_path: Annotated video to write ("mp4v" fourcc).
        tracker_txt_out: Text file receiving one line per reported track box.
        yolo_model: Callable YOLO model returning results with a .boxes field.
        deepsort_tracker: Tracker exposing update_tracks(detections, frame=...).
        fps: Frame rate of the output video.
        conf: Detector confidence threshold.
        imgsz: Detector inference size.

    Outputs:
      - Annotated video with tracking boxes & IDs
      - Text file with tracking results (frame index is 1-based):
        <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>

    Raises:
        RuntimeError: If the input video cannot be opened or its first frame
            cannot be read.
    """
    cap = cv2.VideoCapture(str(input_video_path))
    out = None
    all_tracks = []  # (frame_idx, track_id, bb_left, bb_top, bb_width, bb_height)

    try:
        # Explicit exceptions instead of assert: asserts are stripped under -O.
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open {input_video_path}")

        # Get frame size from first frame
        ret, first_frame = cap.read()
        if not ret:
            raise RuntimeError("Could not read first frame")
        height, width = first_frame.shape[:2]
        cap.set(cv2.CAP_PROP_POS_FRAMES, 0)  # reset to start

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(str(output_video_path), fourcc, fps, (width, height))

        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame_idx += 1

            # --- Preprocess for detector (gamma + sharpen) ---
            frame_proc = preprocess_frame(frame)

            height, width = frame_proc.shape[:2]

            # YOLO inference on preprocessed frame
            results = yolo_model(frame_proc, imgsz=imgsz, conf=conf, verbose=False)[0]
            boxes = results.boxes

            detections = []
            if boxes is not None and len(boxes) > 0:
                xyxy = boxes.xyxy.cpu().numpy()
                confs = boxes.conf.cpu().numpy()
                clss = boxes.cls.cpu().numpy()

                for bbox, score, cls in zip(xyxy, confs, clss):
                    # COCO class 0 = 'person'
                    if int(cls) != 0:
                        continue
                    x1, y1, x2, y2 = bbox

                    # Clamp to image
                    x1 = max(0.0, min(x1, width - 1.0))
                    x2 = max(0.0, min(x2, width - 1.0))
                    y1 = max(0.0, min(y1, height - 1.0))
                    y2 = max(0.0, min(y2, height - 1.0))
                    if x2 <= x1 or y2 <= y1:
                        # Box collapsed after clamping; nothing to track.
                        continue

                    # DeepSORT expects [x, y, w, h]
                    w = x2 - x1
                    h = y2 - y1

                    detections.append(([x1, y1, w, h], float(score), "person"))

            # Update DeepSORT using preprocessed frame
            tracks = deepsort_tracker.update_tracks(detections, frame=frame_proc)

            for track in tracks:
                if not track.is_confirmed():
                    continue

                # Report only tracks with time_since_update == 0; tracks that
                # are coasting without a fresh update are neither drawn nor
                # written to the results file.
                if track.time_since_update > 0:
                    continue

                track_id = track.track_id

                # Get left, top, right, bottom from tracker
                l, t, r, b = track.to_ltrb()
                l = max(0, min(int(l), width - 1))
                r = max(0, min(int(r), width - 1))
                t = max(0, min(int(t), height - 1))
                b = max(0, min(int(b), height - 1))

                bb_left = float(l)
                bb_top = float(t)
                bb_width = float(r - l)
                bb_height = float(b - t)

                all_tracks.append(
                    (frame_idx, int(track_id), bb_left, bb_top, bb_width, bb_height)
                )

                # Draw on preprocessed frame
                cv2.rectangle(frame_proc, (l, t), (r, b), (0, 255, 0), 2)
                cv2.putText(
                    frame_proc,
                    f"ID {track_id}",
                    (l, t - 5),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6,
                    (0, 255, 0),
                    2,
                    cv2.LINE_AA,
                )

            out.write(frame_proc)
    finally:
        # Release the capture and writer even if detection/tracking fails,
        # so the handles are not leaked and the output file is finalized.
        cap.release()
        if out is not None:
            out.release()

    # Save tracking results to txt
    tracker_txt_out = Path(tracker_txt_out)
    with tracker_txt_out.open("w") as f:
        f.writelines(
            f"{frame_no},{track_id},{bb_left:.2f},{bb_top:.2f},"
            f"{bb_width:.2f},{bb_height:.2f}\n"
            for (frame_no, track_id, bb_left, bb_top, bb_width, bb_height) in all_tracks
        )

    print(f"Tracking done. Saved video to {output_video_path}")
    print(f"Tracking results saved to {tracker_txt_out}")


# Task1 run: track the Task1 input video with the notebook-level model and
# tracker, writing the annotated video and the per-frame track file.
TASK1_TRACKS_TXT = Path("task1_tracks.txt")
run_tracking(
    input_video_path=TASK1_INPUT_VIDEO,
    output_video_path=TASK1_OUTPUT_VIDEO,
    tracker_txt_out=TASK1_TRACKS_TXT,
    yolo_model=yolo_model,
    deepsort_tracker=deepsort_tracker,
    fps=FPS_TASK1,
    conf=YOLO_CONF,
)


## 3. Model Evaluation: MOTA (Task 3)

### 3.1 Load ground truth

In [None]:
# ============================================
# 3.1 Load ground truth annotations (Task1/gt/gt.txt)
# Using only the first 6 columns:
# <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>, ...
# ============================================
def load_gt(gt_path: Path):
    """
    Parse a comma-separated MOT-style gt.txt into per-frame box lists.

    Expected columns (1-based):
      1 frame, 2 id, 3 bb_left, 4 bb_top, 5 bb_width, 6 bb_height,
      7 conf (optional), 8 class (optional, 1 = pedestrian), 9+ ignored.

    Filtering:
      - blank lines and lines with fewer than 6 columns are skipped
      - when column 7 is present, rows with conf <= 0 are skipped
      - when column 8 is present, only rows with class == 1 are kept

    Returns:
        defaultdict(list) mapping frame number -> list of
        {"id": int, "bbox": [left, top, width, height]} dicts.
    """
    boxes_per_frame = defaultdict(list)

    with open(gt_path, "r") as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(",")
            # A blank line splits into a single empty field, so this one
            # check covers both blank and truncated rows.
            if len(fields) < 6:
                continue

            frame_s, id_s, left_s, top_s, width_s, height_s, *extras = fields

            frame_no = int(frame_s)
            record = {
                "id": int(id_s),
                "bbox": [
                    float(left_s),
                    float(top_s),
                    float(width_s),
                    float(height_s),
                ],
            }

            # Optional 7th column: MOT convention, conf <= 0 means "ignore".
            if extras and float(extras[0]) <= 0:
                continue

            # Optional 8th column: keep pedestrians (class 1) only.
            if len(extras) >= 2 and int(extras[1]) != 1:
                continue

            boxes_per_frame[frame_no].append(record)

    return boxes_per_frame

# Load the Task1 ground truth once and print a quick sanity summary
# (number of frames that have GT boxes, and total GT boxes kept).
gt_by_frame = load_gt(TASK1_GT_PATH)
print("Loaded GT frames:", len(gt_by_frame))
print("Total GT boxes:", sum(len(v) for v in gt_by_frame.values()))


### 3.2 Load predictions (tracker output)

In [None]:
# ============================================
# 3.2 Load tracking results from our tracker output txt
# Format: <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>
# ============================================
def load_predictions(pred_path: Path):
    """
    Load tracker output into per-frame lists.

    Each non-blank line is expected to be comma-separated:
      <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>
    Whitespace around values is tolerated. Blank lines and lines with fewer
    than 6 columns are skipped, matching load_gt().

    Args:
        pred_path: Path (or path string) of the tracker output file.

    Returns:
        defaultdict(list) mapping frame number -> list of
        {"id": int, "bbox": np.ndarray([x, y, w, h], dtype=float)} dicts.
    """
    pred_by_frame = defaultdict(list)
    # Path(...) lets callers pass a plain string, as load_gt() already allows.
    with Path(pred_path).open("r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) < 6:
                # Blank or truncated row: skip instead of raising IndexError.
                continue

            frame = int(parts[0])
            track_id = int(parts[1])
            bbox = np.array([float(value) for value in parts[2:6]], dtype=float)

            pred_by_frame[frame].append({"id": track_id, "bbox": bbox})
    return pred_by_frame

# Load the Task1 tracker output written by run_tracking() and report how many
# frames contain at least one predicted box.
pred_by_frame = load_predictions(TASK1_TRACKS_TXT)
print(f"Loaded predictions for {len(pred_by_frame)} frames")


### 3.3 IoU, Hungarian matching

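We compute IoU between GT and predicted boxes and use the Hungarian algorithm to match pairs with IoU ≥ 0.5. Unmatched predictions count as false positives (FP), unmatched GT boxes count as false negatives (FN), and a GT object that is matched to a different track ID than in its previous match counts as an identity switch (IDSW).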

In [None]:
# ============================================
# 3.3 IoU & matching utilities
# ============================================
def xywh_to_xyxy(box_xywh):
    """Return [x1, y1, x2, y2] (float array) for a box given as [x, y, w, h]."""
    left, top, box_w, box_h = box_xywh
    corners = [left, top, left + box_w, top + box_h]
    return np.array(corners, dtype=float)


def compute_iou_matrix(gt_boxes_xywh, pred_boxes_xywh):
    """
    Pairwise IoU between ground-truth and predicted boxes.

    Args:
        gt_boxes_xywh: sequence of N boxes, each [x, y, w, h].
        pred_boxes_xywh: sequence of M boxes, each [x, y, w, h].

    Returns:
        Float array of shape (N, M); entry (i, j) is the IoU of GT box i and
        predicted box j. Pairs whose union area is not positive get IoU 0.
        If either input is empty, an all-zero (N, M) array is returned.
    """
    n_gt, n_pred = len(gt_boxes_xywh), len(pred_boxes_xywh)
    if n_gt == 0 or n_pred == 0:
        return np.zeros((n_gt, n_pred), dtype=float)

    # Corner form, reshaped so GT varies along axis 0 and predictions along
    # axis 1 when the two are broadcast against each other.
    gt_c = np.array([xywh_to_xyxy(b) for b in gt_boxes_xywh], dtype=float)[:, None, :]
    pr_c = np.array([xywh_to_xyxy(b) for b in pred_boxes_xywh], dtype=float)[None, :, :]

    # Overlap extents, floored at zero for disjoint boxes.
    overlap_w = np.maximum(
        np.minimum(gt_c[..., 2], pr_c[..., 2]) - np.maximum(gt_c[..., 0], pr_c[..., 0]),
        0,
    )
    overlap_h = np.maximum(
        np.minimum(gt_c[..., 3], pr_c[..., 3]) - np.maximum(gt_c[..., 1], pr_c[..., 1]),
        0,
    )
    intersection = overlap_w * overlap_h                                   # (N, M)

    area_gt = (gt_c[..., 2] - gt_c[..., 0]) * (gt_c[..., 3] - gt_c[..., 1])  # (N, 1)
    area_pr = (pr_c[..., 2] - pr_c[..., 0]) * (pr_c[..., 3] - pr_c[..., 1])  # (1, M)
    union = area_gt + area_pr - intersection                               # (N, M)

    # Divide only where the union is positive; everything else stays 0.
    result = np.zeros_like(intersection)
    valid = union > 0
    result[valid] = intersection[valid] / union[valid]
    return result

### 3.4 Compute MOTA, FP, FN, IDSW, GT

In [None]:
# ============================================
# 3.4 Compute MOTA, FP, FN, IDSW, GT
# ============================================
def compute_mota(gt_by_frame, pred_by_frame, iou_threshold=0.5):
    """
    Accumulate FP, FN, IDSW and GT counts over frames and derive MOTA.

    Only frames that appear as keys of gt_by_frame are evaluated; predictions
    on other frames are not counted. Within a frame, GT and predicted boxes
    are paired with the Hungarian algorithm on (1 - IoU), and a pair counts
    as a match only if its IoU is >= iou_threshold.

    Args:
        gt_by_frame: mapping frame -> list of {"id", "bbox"} GT entries.
        pred_by_frame: mapping frame -> list of {"id", "bbox"} predictions.
        iou_threshold: minimum IoU for a GT/prediction pair to match.

    Returns:
        (mota, total_FP, total_FN, total_IDSW, total_GT), where
        mota = 1 - (FN + FP + IDSW) / GT, or 0.0 when there is no GT at all.
    """
    fp_sum = 0
    fn_sum = 0
    idsw_sum = 0
    gt_sum = 0

    # gt_id -> prediction id it was most recently matched to
    last_pred_for_gt = {}

    for frame_no in sorted(gt_by_frame.keys()):
        gt_items = gt_by_frame.get(frame_no, [])
        pred_items = pred_by_frame.get(frame_no, [])

        gt_boxes = [item["bbox"] for item in gt_items]
        gt_ids = [item["id"] for item in gt_items]
        pred_boxes = [item["bbox"] for item in pred_items]
        pred_ids = [item["id"] for item in pred_items]

        n_gt = len(gt_boxes)
        n_pred = len(pred_boxes)
        gt_sum += n_gt

        if n_gt == 0 and n_pred == 0:
            # empty frame: nothing to score
            continue

        iou_mat = compute_iou_matrix(gt_boxes, pred_boxes)

        # Number of accepted GT/prediction pairs in this frame. It stays 0
        # when either side is empty, which makes every GT a miss and every
        # prediction a false positive below.
        n_matched = 0

        if n_gt > 0 and n_pred > 0:
            # Minimizing (1 - IoU) maximizes IoU; pairs under the threshold
            # get a prohibitive cost so they are chosen only as a last resort.
            cost = 1.0 - iou_mat
            cost[iou_mat < iou_threshold] = 1e6

            gt_idx, pred_idx = linear_sum_assignment(cost)

            for gi, pi in zip(gt_idx, pred_idx):
                if not iou_mat[gi, pi] >= iou_threshold:
                    continue
                n_matched += 1

                gt_id = gt_ids[gi]
                pred_id = pred_ids[pi]

                # Identity switch: same GT object, different track id than
                # the one it was last matched to.
                if gt_id in last_pred_for_gt and last_pred_for_gt[gt_id] != pred_id:
                    idsw_sum += 1
                last_pred_for_gt[gt_id] = pred_id

        fn_sum += n_gt - n_matched      # GT boxes left unmatched
        fp_sum += n_pred - n_matched    # predictions left unmatched

    if gt_sum == 0:
        mota = 0.0
    else:
        mota = 1.0 - (fn_sum + fp_sum + idsw_sum) / gt_sum

    return mota, fp_sum, fn_sum, idsw_sum, gt_sum


# Evaluate the Task1 tracker output against the ground truth at IoU >= 0.5
# and print the MOTA score together with its error components.
mota, total_FP, total_FN, total_IDSW, total_GT = compute_mota(
    gt_by_frame, pred_by_frame, iou_threshold=0.5
)

print(f"MOTA: {mota:.4f}")
print(f"Total GT:   {total_GT}")
print(f"Total FP:   {total_FP}")
print(f"Total FN:   {total_FN}")
print(f"Total IDSW: {total_IDSW}")


In [None]:
# debug: how many GT vs predictions per frame?
# Only frames that have GT are considered, mirroring compute_mota().
all_gt_frames = sorted(gt_by_frame.keys())
all_pred_frames = sorted(pred_by_frame.keys())

total_GT = sum(len(gt_by_frame[f]) for f in all_gt_frames)
total_pred = sum(len(pred_by_frame.get(f, [])) for f in all_gt_frames)

n_gt_frames = len(all_gt_frames)
print("Frames with GT:", n_gt_frames)
if n_gt_frames:
    print("Total GT boxes:", total_GT, "-> avg per frame:", total_GT / n_gt_frames)
    print("Total pred boxes on those frames:", total_pred, "-> avg per frame:", total_pred / n_gt_frames)
else:
    # Avoid ZeroDivisionError when the GT file yielded no usable frames.
    print("No GT frames loaded; skipping per-frame averages.")


## MOTA Score Explanation

Our tracker achieved **MOTA = 0.47**. This is mainly due to:

- **High FN (missed people):** Many pedestrians are far away, small, or occluded, so YOLOv8 fails to detect them.
- **High FP (extra detections):** Background textures and shadows sometimes trigger false positives.
- **Identity switches:** Pedestrians overlap and look very similar, causing DeepSORT to change IDs when tracks are lost or occluded.
- **Low FPS (14):** Larger motion between frames makes tracking harder and increases ID switches.

All these errors contribute directly to lowering the MOTA score.


### DEBUG sweep sharpening brightness and confidence values


In [None]:
from pathlib import Path
import itertools
import pandas as pd

# ============================================
# Hyperparameter sweep: conf × gamma × sigma × amount
# ============================================
# For every combination this cell re-runs Task1 tracking, scores it with
# compute_mota(), and collects one row per combination in sweep_df.
#
# NOTE(review): the sweep works by rebinding the notebook-level GAMMA /
# SHARP_SIGMA / SHARP_AMOUNT that the Task1 preprocess_frame() reads. The
# Task2 cell later defines another preprocess_frame() with fixed values; if
# that cell has already been executed, these assignments have no effect.

# Confidence values to try
CONF_SWEEP = [0.29, 0.28, 0.27, 0.26, 0.25]

# Preprocessing hyperparam grids
GAMMA_SWEEP = [1.3]
SIGMA_SWEEP = [0.9]
AMOUNT_SWEEP = [1.8, 1.9]

sweep_results = []

# Load GT once
gt_by_frame = load_gt(TASK1_GT_PATH)

for conf in CONF_SWEEP:
    print("\n" + "=" * 70)
    print(f"### Sweeping hyperparams for conf = {conf:.2f}")
    print("=" * 70)

    # Output filenames are keyed by conf only, so every (gamma, sigma, amount)
    # combination for this conf overwrites the same video / tracks files.
    # round() rather than int(): 0.29 * 100 evaluates to 28.999999999999996,
    # which int() would truncate to 28 and collide with the conf=0.28 files.
    tag = round(conf * 100)
    task1_video_out  = Path(f"task1_conf{tag:03d}.mp4")
    task1_tracks_out = Path(f"task1_tracks_conf{tag:03d}.txt")

    for gamma, sigma, amount in itertools.product(GAMMA_SWEEP, SIGMA_SWEEP, AMOUNT_SWEEP):
        # Set global preprocess hyperparameters
        GAMMA = gamma
        SHARP_SIGMA = sigma
        SHARP_AMOUNT = amount

        print(
            f"\n--- conf={conf:.2f}, gamma={gamma:.2f}, "
            f"sigma={sigma:.2f}, amount={amount:.2f} ---"
        )

        # Fresh model + tracker for each combo, so no track state carries
        # over from the previous run.
        yolo_model_t1 = init_yolo()
        deepsort_t1   = init_deepsort()

        # Run tracking with these hyperparams
        run_tracking(
            TASK1_INPUT_VIDEO,
            task1_video_out,
            task1_tracks_out,
            yolo_model_t1,
            deepsort_t1,
            fps=FPS_TASK1,
            conf=conf,
            imgsz=YOLO_IMGSZ,
        )

        # Compute MOTA for this combo
        pred_by_frame = load_predictions(task1_tracks_out)

        mota, total_FP, total_FN, total_IDSW, total_GT = compute_mota(
            gt_by_frame, pred_by_frame, iou_threshold=0.5
        )

        print(
            f"MOTA={mota:.4f} | FP={total_FP} | FN={total_FN} | "
            f"IDSW={total_IDSW} | GT={total_GT}"
        )

        sweep_results.append(
            {
                "conf": conf,
                "gamma": gamma,
                "sigma": sigma,
                "amount": amount,
                "MOTA": mota,
                "FP": total_FP,
                "FN": total_FN,
                "IDSW": total_IDSW,
                "GT": total_GT,
            }
        )

# Turn results into a DataFrame
sweep_df = pd.DataFrame(sweep_results)
sweep_df


## 4. Prediction & Kaggle Competition

### 4.1 Convert Task2 images to a video and track

## Task 2 Counting Method

For each frame, we use Faster R-CNN to detect pedestrians and count all detection boxes above a chosen confidence threshold. These per-frame counts are saved in a CSV file for Kaggle evaluation.

Although we implemented DeepSORT for Task 2 as well, we found during testing that **using only the detector (no tracker) and tuning the confidence threshold produced a better Kaggle score**. The tracker's identity persistence is not needed for counting and sometimes increases false positives.

However, **the assignment requires an annotated tracking video**, so our submitted `task2.mp4` uses the tracker, while the Kaggle CSV uses the detector-only counts.

## Reason for Using Faster R-CNN

We chose Faster R-CNN because it performs more reliably on medium-sized pedestrians and produces more stable scores across frames compared to YOLOv8. This stability makes confidence-based counting more consistent, especially when not using a tracker for the Kaggle submission.


In [None]:
# ============================================
# 4.1 Task2: Faster R-CNN + DeepSORT on Task2 images
#     - Preprocess frames
#     - Build input video (task2_input.mp4)
#     - Build tracked video (task2.mp4)
#     - Save tracking results (task2_tracks.txt)
#     - Return per-frame counts for CSV
# ============================================
import torch
from pathlib import Path
from PIL import Image
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2

# cv2, np, DeepSort imported earlier in the notebook.

# Task2-specific settings 
# These are the defaults of process_task2_frames() / load_frcnn_detector().
DEVICE_T2       = "cuda"   # requested device; load_frcnn_detector() falls back to CPU if CUDA is unavailable
FPS_TASK2       = 14.0     # rebinds FPS_TASK2 (14 in the Paths & Constants cell) as a float
SCORE_THRESH_T2 = 0.475    # detector score threshold

# Paths for Task2
TASK2_INPUT_VIDEO  = Path("task2_input.mp4")   # raw frames re-encoded as a video
TASK2_TRACKS_TXT   = Path("task2_tracks.txt")  # per-frame track boxes


def load_frcnn_detector(device: str = DEVICE_T2):
    """
    Create a Faster R-CNN ResNet50 FPN v2 detector (weights="DEFAULT") in
    eval mode on the requested device.

    If device is "cuda" but torch reports no CUDA support, a notice is
    printed and the CPU is used instead.

    Returns:
        (model, device) where device is the device string actually used.
    """
    cuda_unavailable = device == "cuda" and not torch.cuda.is_available()
    if cuda_unavailable:
        print("CUDA requested but not available, falling back to CPU.")
    target_device = "cpu" if cuda_unavailable else device

    detector = fasterrcnn_resnet50_fpn_v2(weights="DEFAULT")
    detector.to(target_device)
    detector.eval()
    return detector, target_device


def init_deepsort_task2():
    """
    Build the DeepSort tracker (deep_sort_realtime) used for the Task2 frames.

    Settings are fixed here; each call returns a new DeepSort instance.
    """
    tracker_settings = {
        "max_age": 30,
        "n_init": 3,
        "max_iou_distance": 0.7,
        "nms_max_overlap": 1.0,
        "max_cosine_distance": 0.2,
        "embedder": "mobilenet",
        "half": False,
        "bgr": True,
        "embedder_gpu": True,
    }
    return DeepSort(**tracker_settings)


def preprocess_frame(frame_bgr):
    """
    Darken and sharpen a BGR frame with fixed settings
    (gamma 1.4 on the HSV V channel, unsharp mask with sigma 1.2, amount 1.8).

    Returns the processed BGR image; the input array is not modified.

    NOTE(review): this reuses the name of the earlier, GAMMA / SHARP_SIGMA /
    SHARP_AMOUNT-driven preprocess_frame defined for Task1. Once this cell
    runs, later calls such as those in run_tracking() resolve to this
    fixed-parameter version.
    """
    gamma, sigma, amount = 1.4, 1.2, 1.8

    # 1) Gamma > 1 on the normalized brightness channel darkens highlights.
    hue, sat, val = cv2.split(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV))
    val_scaled = np.power(val.astype(np.float32) / 255.0, gamma) * 255.0
    val_darkened = np.clip(val_scaled, 0, 255).astype(np.uint8)
    darkened_bgr = cv2.cvtColor(
        cv2.merge([hue, sat, val_darkened]), cv2.COLOR_HSV2BGR
    )

    # 2) Unsharp mask: (1 + amount) * image - amount * blurred.
    softened = cv2.GaussianBlur(darkened_bgr, (0, 0), sigma)
    return cv2.addWeighted(darkened_bgr, 1.0 + amount, softened, -amount, 0)


def process_task2_frames(
    frames_dir: Path,
    input_video_path: Path,
    tracked_video_path: Path,
    tracks_txt_path: Path,
    fps: float = FPS_TASK2,
    device: str = DEVICE_T2,
    score_thresh: float = SCORE_THRESH_T2,
):
    """
    Detect, track and count people on the Task2 frames.

    - Reads the *.jpg frames in frames_dir in sorted filename order
    - Builds a raw input video (input_video_path)
    - Runs Faster R-CNN + DeepSORT on each frame
    - Builds a tracked video (tracked_video_path) with boxes and IDs
    - Saves MOT-style tracks to tracks_txt_path

    Args:
        frames_dir: Directory containing the .jpg frames.
        input_video_path: Output path for the un-annotated video.
        tracked_video_path: Output path for the annotated video.
        tracks_txt_path: Output path for the track lines
            (<frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>).
        fps: Frame rate of both videos.
        device: Requested torch device for the detector.
        score_thresh: Minimum detector score for a person box to be kept.

    Returns:
        dict[frame_idx] -> person_count, where frame_idx is the 1-based
        position of the image in the sorted list and person_count is the
        number of detector boxes kept for that frame (not unique track IDs).
        Frames whose image cannot be read have no entry.

    Raises:
        FileNotFoundError: If frames_dir contains no .jpg files.
        RuntimeError: If the first frame cannot be read.
    """
    # Validate the inputs before loading the (expensive) detector.
    image_paths = sorted(frames_dir.glob("*.jpg"))
    if not image_paths:
        raise FileNotFoundError(f"No .jpg files found in {frames_dir}")

    # Read first frame to get size
    first_frame = cv2.imread(str(image_paths[0]))
    if first_frame is None:
        raise RuntimeError(f"Could not read first frame: {image_paths[0]}")
    height, width = first_frame.shape[:2]

    # Load detector + tracker
    model, device = load_frcnn_detector(device)
    tracker = init_deepsort_task2()

    # Outputs
    track_lines = []      # for MOT-style txt
    frame_counts = {}     # frame_idx -> person_count

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    input_writer = None
    tracked_writer = None

    try:
        # Video writers
        input_writer = cv2.VideoWriter(
            str(input_video_path), fourcc, fps, (width, height)
        )
        tracked_writer = cv2.VideoWriter(
            str(tracked_video_path), fourcc, fps, (width, height)
        )

        with torch.no_grad():
            # frame_idx advances for unreadable images too, so indices keep
            # following the file order.
            for frame_idx, img_path in enumerate(image_paths, start=1):
                frame_bgr = cv2.imread(str(img_path))
                if frame_bgr is None:
                    print(f"WARNING: could not read {img_path}, skipping.")
                    continue

                # Write the raw frame to the input video before anything
                # modifies or annotates it.
                input_writer.write(frame_bgr)

                # Optional preprocessing for the detector (currently disabled):
                # frame_bgr = preprocess_frame(frame_bgr)

                # Prepare image for detector (RGB float tensor on `device`)
                rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
                pil_img = Image.fromarray(rgb)
                tensor = F.to_tensor(pil_img).to(device)

                outputs = model([tensor])[0]
                boxes = outputs["boxes"].cpu()
                labels = outputs["labels"].cpu()
                scores = outputs["scores"].cpu()

                detections = []
                person_count = 0

                # Build detections for DeepSort
                for box, label, score in zip(boxes, labels, scores):
                    if label.item() != 1:  # COCO: 1 = person
                        continue
                    if score.item() < score_thresh:
                        continue

                    x1, y1, x2, y2 = box.tolist()

                    # Clamp to frame
                    x1 = max(0.0, min(x1, width - 1.0))
                    x2 = max(0.0, min(x2, width - 1.0))
                    y1 = max(0.0, min(y1, height - 1.0))
                    y2 = max(0.0, min(y2, height - 1.0))

                    if x2 <= x1 or y2 <= y1:
                        # Box collapsed after clamping.
                        continue

                    w = x2 - x1
                    h = y2 - y1

                    # DeepSORT input format: ([x, y, w, h], score, class name)
                    detections.append(([x1, y1, w, h], float(score.item()), "person"))
                    person_count += 1

                # Update tracker
                tracks = tracker.update_tracks(detections, frame=frame_bgr)

                # Draw tracks and build MOT lines
                for trk in tracks:
                    # Only confirmed tracks with time_since_update == 0 are
                    # drawn and written out.
                    if not trk.is_confirmed() or trk.time_since_update > 0:
                        continue

                    x1, y1, x2, y2 = map(int, trk.to_ltrb())

                    # Clamp
                    x1 = max(0, min(x1, width - 1))
                    x2 = max(0, min(x2, width - 1))
                    y1 = max(0, min(y1, height - 1))
                    y2 = max(0, min(y2, height - 1))

                    bb_left = x1
                    bb_top = y1
                    bb_width = max(0, x2 - x1)
                    bb_height = max(0, y2 - y1)
                    track_id = trk.track_id

                    # Draw rectangle + ID
                    cv2.rectangle(
                        frame_bgr,
                        (bb_left, bb_top),
                        (bb_left + bb_width, bb_top + bb_height),
                        (0, 255, 0),
                        2,
                    )
                    cv2.putText(
                        frame_bgr,
                        f"ID {track_id}",
                        (bb_left, max(0, bb_top - 5)),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        (0, 255, 0),
                        1,
                        cv2.LINE_AA,
                    )

                    # MOT-style line: <frame>, <id>, <bb_left>, <bb_top>, <bb_width>, <bb_height>
                    track_lines.append(
                        f"{frame_idx}, {track_id}, {bb_left}, {bb_top}, {bb_width}, {bb_height}\n"
                    )

                # Write tracked frame (detector input frame + annotations)
                tracked_writer.write(frame_bgr)

                # Save count for this frame (detector-based, not unique IDs)
                frame_counts[frame_idx] = person_count

                print(
                    f"Frame {frame_idx:4d}: {person_count:2d} people,"
                    f" {len(detections)} detections, {len(tracks)} tracks"
                )
    finally:
        # Release videos even if detection/tracking raises, so the writers
        # are not leaked and the files are finalized.
        for writer in (input_writer, tracked_writer):
            if writer is not None:
                writer.release()

    # Write tracks txt
    tracks_txt_path.parent.mkdir(parents=True, exist_ok=True)
    with tracks_txt_path.open("w") as f:
        f.writelines(track_lines)

    print(f"\nSaved input video to   {input_video_path}")
    print(f"Saved tracked video to {tracked_video_path}")
    print(f"Saved tracks to        {tracks_txt_path}")

    return frame_counts


# ---- Run Task2 processing and get per-frame counts ----
# Writes the raw input video, the annotated tracking video and the tracks
# file, and keeps the per-frame detector counts for the CSV export.
task2_frame_counts = process_task2_frames(
    frames_dir=TASK2_IMAGES_DIR,
    input_video_path=TASK2_INPUT_VIDEO,
    tracked_video_path=TASK2_OUTPUT_VIDEO,
    tracks_txt_path=TASK2_TRACKS_TXT,
    fps=FPS_TASK2,
    device=DEVICE_T2,
    score_thresh=SCORE_THRESH_T2,
)


### 4.2 Save frame counts

In [None]:
# ============================================
# 4.2 Save Task2 frame counts to CSV for Kaggle
#     Format:
#       Number,Count
#       1,12
#       2,15
#       ...
# ============================================
def save_counts_to_csv(frame_counts: dict, csv_path: Path):
    """
    Write per-frame counts to a CSV with the columns Number,Count.

    Rows are written in ascending frame order. The header row is always
    written, even when frame_counts is empty.

    Args:
        frame_counts: dict[frame_idx] -> count
        csv_path: Destination CSV file.
    """
    rows = [
        {"Number": frame_idx, "Count": frame_counts[frame_idx]}
        for frame_idx in sorted(frame_counts)
    ]

    # Explicit columns keep the Number,Count header when there are no rows.
    df = pd.DataFrame(rows, columns=["Number", "Count"])
    df.to_csv(csv_path, index=False)
    print(f"Saved counts to {csv_path}")


# Export the detector-based per-frame counts from the Task2 run.
save_counts_to_csv(task2_frame_counts, TASK2_COUNTS_CSV)


## Kaggle Score Explanation

Our Kaggle RMSE score was **2.38**. Errors mainly come from:

- **Overcounting:** Duplicate detections on the same person in crowded frames.
- **Undercounting:** Small or distant pedestrians are often missed.
- **Threshold sensitivity:** The detector score threshold was not fully optimized.

These factors cause the predicted counts to differ from the ground truth by ~2–3 people on average.


## Final Summary

We built videos, tracked pedestrians, evaluated tracking with MOTA, and generated per-frame counts for Task 2. The results depend heavily on detection quality, occlusions, and threshold choices.