In [3]:
from ultralytics import YOLO
from PIL import Image
from data_utils import extract_rectangles_from_xml
from eval_utils import mAP
import cv2
from tqdm import tqdm
import numpy as np

In [4]:
# Build the detector from the pretrained "yolov8n.pt" checkpoint.
# (The captured output of this cell shows the weights being downloaded.)
weights_path = "yolov8n.pt"
model = YOLO(weights_path)

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.23M/6.23M [00:00<00:00, 9.14MB/s]


In [8]:
# Open the input video and read its basic properties. `width`, `height`,
# `n_frames`, `fps` and `starting_frame` are reused by later cells.
cap = cv2.VideoCapture('data/S03/c010/vdo.avi')
if not cap.isOpened():
    # Fail loudly: otherwise every property below reads as 0 and the loop
    # silently produces an empty detection list.
    raise IOError("Could not open video 'data/S03/c010/vdo.avi'")

width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# We only do inference on the last 75% of the video
starting_frame = n_frames // 4
cap.set(cv2.CAP_PROP_POS_FRAMES, starting_frame)

# One entry per processed frame: the `boxes` of the first result returned
# by the model for that frame.
frame_detections = []
try:
    for _ in tqdm(range(starting_frame, n_frames)):
        ret, frame = cap.read()
        if not ret:
            # The read failed (e.g. the stream ended before the reported
            # frame count); `frame` is not a valid image, so stop instead
            # of feeding it to the model.
            break

        # Run model on current frame
        results = model(frame, verbose=False)

        # Save frame detections
        frame_detections.append(results[0].boxes)
finally:
    # Always free the capture handle, even if inference raises.
    cap.release()

100%|██████████| 100/100 [00:35<00:00,  2.82it/s]


In [4]:
# Parse the ground-truth annotation file. The result is used below as a
# mapping: it is indexed with key 0 and iterated through `.values()`.
annotation = extract_rectangles_from_xml('data/ai_challenge_s03_c010-full_annotation.xml')

# Rectangles stored under key 0; presumably the cars already parked in the
# first frame -- TODO confirm against `extract_rectangles_from_xml`.
parked_cars = annotation[0]

# Build the ground-truth list for the evaluated part of the video: skip the
# first `starting_frame` entries and cast every rectangle to integers.
# NOTE(review): assumes `annotation.values()` is ordered by frame -- confirm.
gt_bbox = []
for frame_rects in list(annotation.values())[starting_frame:]:
    int_rects = []
    for box in frame_rects:
        # Parked-car filtering is currently disabled
        # (was: `if r not in parked_cars`).
        int_rects.append(list(np.array(box).astype(int)))
    gt_bbox.append(int_rects)

In [5]:
# First-pass filter: for every frame keep the `xyxy[0]` coordinates of the
# detections whose class id is 2 (the class the notebook's comments call "Car").
# Note: the following cell rebuilds `filtered_predictions` from scratch.
filtered_predictions = []
for frame_boxes in frame_detections:
    car_boxes = [det.xyxy[0] for det in frame_boxes if det.cls == 2]
    filtered_predictions.append(car_boxes)

In [6]:
# Class id that this notebook treats as "Car" when filtering detections.
CAR_CLASS_ID = 2

# For every processed frame, keep only the "Car" detections and store each
# box as a plain Python list taken from `xyxy[0]`.
#
# Disabled experiment, kept for reference (previously embedded as a string
# literal inside the loop): drop predictions that lie close to a parked car.
#
#   dist_thr = 25
#   distances = np.array(
#       [
#           np.mean([
#               np.linalg.norm((xyxy[0]-gt[0], xyxy[1]-gt[1])),
#               np.linalg.norm((xyxy[2]-gt[2], xyxy[3]-gt[3]))])
#           for gt in parked_cars
#       ])
#   # If the mean distance of the corners of the predicted bbox to the corners
#   # of a parked car is above the threshold and the class of the prediction
#   # is a "Car", it is a "good" prediction
#   # if not np.any(distances < dist_thr) and bbox.cls == 2:
filtered_predictions = []
for frame_boxes in frame_detections:
    filtered = [
        bbox.xyxy[0].tolist()
        for bbox in frame_boxes
        if bbox.cls == CAR_CLASS_ID
    ]
    filtered_predictions.append(filtered)

In [7]:
# Create video with detected bounding boxes + GT boxes
# NOTE(review): this path differs from the one used in the inference cell
# ('data/S03/c010/vdo.avi') -- confirm both point to the same video.
cap = cv2.VideoCapture('data/AICity_data/train/S03/c010/vdo.avi')

# Use an explicit codec instead of fourcc=-1, which relies on a
# platform-specific codec selection and is not reproducible.
video = cv2.VideoWriter('test.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height), True)

try:
    if not cap.isOpened():
        raise IOError("Could not open video 'data/AICity_data/train/S03/c010/vdo.avi'")
    if not video.isOpened():
        raise IOError("Could not open 'test.mp4' for writing")

    cap.set(cv2.CAP_PROP_POS_FRAMES, starting_frame)

    # Never index past the end of either per-frame list.
    n_output_frames = min(n_frames - starting_frame, len(gt_bbox), len(filtered_predictions))

    for i in tqdm(range(n_output_frames)):
        ret, frame = cap.read()
        if not ret:
            # No valid frame was read; nothing left to draw on.
            break

        # Ground-truth boxes in green (BGR), thickness 3.
        for gt in gt_bbox[i]:
            frame = cv2.rectangle(
                frame, (int(gt[0]), int(gt[1])), (int(gt[2]), int(gt[3])), (0, 255, 0), 3
            )

        # Predicted boxes in red (BGR), thickness 2.
        for pred in filtered_predictions[i]:
            frame = cv2.rectangle(
                frame, (int(pred[0]), int(pred[1])), (int(pred[2]), int(pred[3])), (0, 0, 255), 2
            )

        video.write(frame)
finally:
    # Release both handles even when an error interrupts the loop, so the
    # output file is finalized and the input is closed.
    cap.release()
    video.release()

100%|██████████| 1606/1606 [00:37<00:00, 42.82it/s]


In [8]:
# Evaluate the filtered predictions against the ground truth.
# The scores are stored under names that do not shadow the imported `mAP`
# function; rebinding `mAP` to a number made this cell fail when re-run.
map_score, miou_score = mAP(filtered_predictions, gt_bbox, N=200)
print(f"mAP: {map_score} | mIoU: {miou_score}")

mAP: 0.46754475553156843 | mIoU: 0.6035712791765286
