In [22]:
from ultralytics import YOLO
from data_utils import extract_rectangles_from_xml
from eval_utils import mAP
import cv2
from tqdm import tqdm
import numpy as np

In [23]:
# Checkpoint evaluated by the rest of the notebook. The pretrained YOLOv8
# weights are the default; to evaluate the fine-tuned model instead, point
# this at "results/strategy_A/weights/best.pt".
weights_path = "yolov8n.pt"
model = YOLO(weights_path)

In [24]:
# Run the detector on the last 75% of the video and keep the raw per-frame
# boxes. `width`, `height`, `fps`, `n_frames` and `starting_frame` are reused
# by later cells.
cap = cv2.VideoCapture('data/AICity_data/train/S03/c010/vdo.avi')
if not cap.isOpened():
    # Fail loudly: an unopened capture reports 0 frames, which would otherwise
    # produce an empty detection list without any error.
    raise IOError("Could not open video: data/AICity_data/train/S03/c010/vdo.avi")

width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# We only do inference on the last 75% of the video
starting_frame = int(n_frames // 4)
cap.set(cv2.CAP_PROP_POS_FRAMES, starting_frame)

frame_detections = []
try:
    for _ in tqdm(range(starting_frame, n_frames)):
        ret, frame = cap.read()
        if not ret:
            # The reported frame count may exceed what can actually be
            # decoded; stop instead of passing frame=None to the model.
            break

        # Run model on current frame
        results = model.predict(frame, verbose=False)

        # Save frame detections (one entry per successfully decoded frame)
        frame_detections.append(results[0].boxes)
finally:
    # Always free the decoder handle, even if inference raises.
    cap.release()

100%|██████████| 1606/1606 [00:33<00:00, 48.23it/s]


In [25]:
# Load the ground-truth rectangles and convert every box to integer coordinates.
annotation = extract_rectangles_from_xml('data/ai_challenge_s03_c010-full_annotation.xml')

# NOTE(review): slicing by position assumes `annotation` holds one entry per
# video frame, ordered from frame 0 - confirm against extract_rectangles_from_xml.
per_frame_rects = list(annotation.values())
gt_bbox = [
    [list(np.array(box).astype(int)) for box in frame_rects]
    for frame_rects in per_frame_rects[starting_frame:]
]

In [29]:
# Keep only car detections (class 2), stored as plain [x1, y1, x2, y2] lists,
# with one list of boxes per processed frame.
filtered_predictions = []
for frame_boxes in frame_detections:
    car_boxes = [box.xyxy[0].tolist() for box in frame_boxes if box.cls == 2]
    filtered_predictions.append(car_boxes)

In [33]:
# Create video with detected bounding boxes + GT boxes
# (GT boxes are drawn with colour (0, 255, 0), predictions with (0, 0, 255)).
# NOTE(review): the output is named 'finetuned.mp4', but the model cell loads
# the pretrained weights unless the fine-tuned line is enabled - confirm the
# file name matches the model actually used.
cap = cv2.VideoCapture('data/AICity_data/train/S03/c010/vdo.avi')
cap.set(cv2.CAP_PROP_POS_FRAMES, starting_frame)

# Use an explicit codec. Passing -1 as fourcc is backend-dependent and can
# open an interactive codec picker, which blocks an unattended "Run All".
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter('finetuned.mp4', fourcc, fps, (width, height), True)
try:
    for i in tqdm(range(n_frames - starting_frame)):
        ret, frame = cap.read()
        if not ret:
            # No more decodable frames; drawing on frame=None would raise.
            break

        for gt in gt_bbox[i]:
            frame = cv2.rectangle(frame, (gt[0], gt[1]), (gt[2], gt[3]), (0, 255, 0), 3)

        for pred in filtered_predictions[i]:
            frame = cv2.rectangle(frame, (int(pred[0]), int(pred[1])), (int(pred[2]), int(pred[3])), (0, 0, 255), 2)
        video.write(frame)
finally:
    # Finalise the output file and free the decoder even if drawing fails.
    video.release()
    cap.release()

100%|██████████| 1606/1606 [00:16<00:00, 98.89it/s] 


In [32]:
# Score the car detections against the ground truth.
# The result is bound to `mean_ap` rather than `map` so the builtin `map` is
# not shadowed for the rest of the kernel session.
# NOTE(review): N=10 is passed straight through to eval_utils.mAP; its meaning
# is defined there - confirm before changing it.
mean_ap, mIoU = mAP(filtered_predictions, gt_bbox, N=10)
print(f"mAP: {mean_ap} | mIoU: {mIoU}")

mAP: 0.7837943181843569 | mIoU: 0.8857377985469398
