In [1]:
import time
from pathlib import Path

import albumentations as A
import cv2 as cv
import torch
from dotenv import find_dotenv

from cvnets.yolo.v2 import utils
from cvnets.yolo.v2.net import YOLOv2
from cvnets.yolo.voc import VOCSplit, load_voc_dataset

# Project root: the absolute directory that contains the discovered .env file
# (find_dotenv raises if no such file can be found).
root = Path(find_dotenv(raise_error_if_not_found=True)).parent.absolute()

# Directory holding the YOLOv2 anchors and trained weights.
checkpoints = root / "src" / "cvnets" / "yolo" / "v2" / "checkpoints"

# Fail early when the checkpoint directory is missing.
assert checkpoints.exists()

In [2]:
# Side length (in pixels) of the square image the frames are resized to.
resize = 416

# Anchor boxes stored next to the checkpoints; their count is the size of
# the first dimension.
anchors_path = checkpoints / "anchors" / "anchors_k5.npy"
anchors = utils.load_anchor_bboxes(anchors_path)
num_anchors = anchors.size(0)

# The loader returns the dataset together with its class names; the names
# are indexed by position to map predicted label ids back to text.
dataset, classes = load_voc_dataset("~/Documents/Datasets/VOC/", split=VOCSplit.TRAINVAL)
idx_to_class = dict(enumerate(classes))
num_classes = len(classes)

# Build the network, restore the saved weights on the CPU and switch the
# model to evaluation mode.
weights_path = checkpoints / "run-2025-01-22-16-03-23" / "yolov2-voc.pt"
model = YOLOv2(num_anchors, num_classes)
model.load_state_dict(torch.load(weights_path, map_location="cpu", weights_only=True))
model = model.eval()

In [None]:
# Live webcam demo: run the detector on every captured frame, draw the
# detections on the frame and show it until the stream ends or "q" is
# pressed in the preview window.
preprocess = A.Compose([A.Resize(resize, resize), A.Normalize()])
cap = cv.VideoCapture(0)  # device index 0

try:
    # Without this check a camera that failed to open would make the cell
    # finish silently, because the first read() simply reports failure.
    if not cap.isOpened():
        raise RuntimeError("Could not open video capture device 0")

    while True:
        success, frame = cap.read()
        if not success:
            break

        # OpenCV delivers BGR frames; convert to RGB before preprocessing,
        # then reorder HWC -> CHW for the model.
        normalized = preprocess(image=cv.cvtColor(frame, cv.COLOR_BGR2RGB))["image"]
        normalized = torch.from_numpy(normalized).permute(2, 0, 1)

        # Time only the forward pass (reported in milliseconds).
        t0 = time.perf_counter()
        with torch.inference_mode():
            predictions = model(normalized.unsqueeze(0))
        inference_time = (time.perf_counter() - t0) * 1000
        print(f"Inference Time: {inference_time:.1f}ms")

        # NOTE(review): S=13 presumably corresponds to resize=416 with a 32x
        # downsampling network -- confirm before changing `resize`.
        # imgsz is passed as (width, height) of the captured frame.
        results = utils.postprocess_predictions(
            predictions=predictions,
            anchors=anchors,
            S=13,
            num_classes=num_classes,
            imgsz=(frame.shape[1], frame.shape[0]),
        )

        # A single image was fed to the model, so only the first result is used.
        r = results[0]
        for xyxy, conf, label in zip(r.xyxys.numpy(), r.confs.numpy(), r.labels.numpy()):
            # Plain Python ints are accepted by every OpenCV drawing call.
            x1, y1, x2, y2 = xyxy.astype("int32").tolist()
            conf = conf.item()
            label = label.item()
            cv.putText(
                img=frame,
                text=f"{idx_to_class[label].title()}: {conf:.2f}",
                # Keep the caption inside the frame for boxes touching the top edge.
                org=(x1, max(y1 - 10, 15)),
                fontFace=cv.FONT_HERSHEY_SIMPLEX,
                fontScale=0.55,
                color=(0, 255, 0),
                thickness=1,
                lineType=cv.LINE_AA,
            )
            cv.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2, cv.LINE_AA)

        cv.imshow("frame", frame)

        # Poll the keyboard for 1 ms; "q" stops the demo.
        if cv.waitKey(1) == ord("q"):
            break
finally:
    # Always free the camera and close the preview window, even when the
    # loop is left through an exception or a kernel interrupt.
    cap.release()
    cv.destroyAllWindows()