In [31]:
import cv2
import supervision as sv
import numpy as np
from tqdm import tqdm
from inference.models.yolo_world.yolo_world import YOLOWorld
import pickle
from tqdm import tqdm
from evaluation import mAP
from data_utils import extract_rectangles_from_xml

In [32]:
# Build the pretrained YOLO-World detector ("yolo_world/l" checkpoint) and
# limit it to the class names listed in `classes`.
classes = ["car"]
model = YOLOWorld(model_id="yolo_world/l")
model.set_classes(classes)

In [33]:
# Load video and get some variables
video_path = 'data/AICity_data/train/S03/c010/vdo.avi'
cap = cv2.VideoCapture(video_path)

# Fail loudly on a bad path or unsupported codec; without this check the
# notebook would carry on with meaningless metadata and an empty frame loop.
if not cap.isOpened():
    raise IOError(f"Could not open video file: {video_path}")

width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# We only use the last 75% of the video: skip the first quarter of the frames
# and position the reader on the first frame that will be processed.
starting_frame = n_frames // 4
cap.set(cv2.CAP_PROP_POS_FRAMES, starting_frame)

True

In [34]:
# Run the detector on every frame from `starting_frame` to the end of the video.
#   frame_detections: one entry per frame, the `xyxy` box array of its detections
#   processed_frames: one entry per frame, the frame with boxes/labels drawn on it
# NOTE: processed_frames keeps every annotated frame in memory until the
# video-writing cell consumes it.
processed_frames, frame_detections = [], []

# The annotator is constructed with fixed arguments, so it is loop-invariant:
# build it once instead of once per frame.
box_annotator = sv.BoxAnnotator(thickness=2, text_thickness=1, text_scale=0.5)

try:
    for _ in tqdm(range(starting_frame, n_frames)):
        if not cap.isOpened():
            break
        ret, frame = cap.read()
        if not ret:
            # The read failed (e.g. the stream ended before the reported frame
            # count); stop here instead of passing an invalid frame to the model.
            break
        results = model.infer(frame)
        detections = sv.Detections.from_inference(results)

        # Save frame detections
        frame_detections.append(detections.xyxy)

        # Add bounding boxes/confidences to frames. Read the fields by name so
        # the labels do not depend on how many items iterating a Detections
        # object yields.
        labels = [
            f"{model.class_names[class_id]} {confidence:0.2f}"
            for confidence, class_id in zip(detections.confidence, detections.class_id)
        ]
        processed_frames.append(box_annotator.annotate(scene=frame, detections=detections, labels=labels))
finally:
    # Always free the capture handle, even if inference raises part-way through.
    cap.release()

100%|██████████| 1606/1606 [12:56<00:00,  2.07it/s]


In [35]:
# Create video with detected bounding boxes
# Use an explicit codec: a fourcc of -1 is platform-dependent (on Windows it
# opens an interactive codec-selection dialog), which makes the cell
# non-reproducible under "Run All".
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter('output/output_yolo.mp4', fourcc, fps, (width, height), True)
try:
    # A writer that failed to open would otherwise drop every frame silently.
    if not video.isOpened():
        raise IOError("Could not open 'output/output_yolo.mp4' for writing")
    for frame in processed_frames:
        video.write(frame)
finally:
    # Finalize the file even if writing fails part-way through.
    video.release()

In [36]:
# Persist the per-frame detection boxes to disk so that later cells can
# reload them without running inference again.
with open("output/yolo_detections.pkl", "wb") as pkl_file:
    pickle.dump(frame_detections, pkl_file)

In [37]:
# Read the ground-truth rectangles from the annotation XML.
# NOTE(review): the code indexes the result with key 0 and slices its values
# by frame position, so it presumably maps frame index -> list of boxes in
# frame order with no gaps; confirm against extract_rectangles_from_xml.
annotation = extract_rectangles_from_xml('data/ai_challenge_s03_c010-full_annotation.xml')

# Boxes present in the first frame are treated as the parked cars.
parked_cars = annotation[0]

# For every evaluated frame (from `starting_frame` on), keep only the boxes
# that do not exactly match a parked-car box, cast to integer coordinates.
gt_bbox = []
for frame_rects in list(annotation.values())[starting_frame:]:
    moving_rects = []
    for box in frame_rects:
        if box in parked_cars:
            continue
        moving_rects.append(list(np.array(box).astype(int)))
    gt_bbox.append(moving_rects)

In [38]:
# Reload the detections saved by the earlier cell.
with open("output/yolo_detections.pkl", "rb") as pkl_file:
    frame_detections = pickle.load(pkl_file)

# Discard predictions that sit on top of a parked car. A prediction counts as
# "parked" when the mean of its top-left and bottom-right corner distances to
# any parked-car box is below `dist_thr`.
dist_thr = 30
filtered_predictions = []
for detections in frame_detections:
    kept = []
    for bbox in detections:
        corner_dists = np.array([
            np.mean([
                np.linalg.norm((bbox[0]-gt[0], bbox[1]-gt[1])),
                np.linalg.norm((bbox[2]-gt[2], bbox[3]-gt[3])),
            ])
            for gt in parked_cars
        ])
        if np.any(corner_dists < dist_thr):
            # Close enough to a parked car: drop this prediction.
            continue
        kept.append(bbox.tolist())
    filtered_predictions.append(kept)

In [39]:
# Compute mAP of the filtered predictions against the ground-truth boxes that
# remain after removing parked cars. Both lists hold one entry per evaluated
# frame, starting at `starting_frame`.
# NOTE(review): N=200 is passed straight to the project's `mAP` helper; its
# meaning is not visible in this notebook - confirm in evaluation.py.
mAP(filtered_predictions, gt_bbox, N=200)

0.5649592128140454

In [40]:
# Create video with detected bounding boxes + GT boxes
cap = cv2.VideoCapture('data/AICity_data/train/S03/c010/vdo.avi')
cap.set(cv2.CAP_PROP_POS_FRAMES, starting_frame)
# Use an explicit codec: a fourcc of -1 is platform-dependent (on Windows it
# opens an interactive codec-selection dialog), which makes the cell
# non-reproducible under "Run All".
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter('output/output_yolo_and_GT.mp4', fourcc, fps, (width, height), True)
try:
    # A writer that failed to open would otherwise drop every frame silently.
    if not video.isOpened():
        raise IOError("Could not open 'output/output_yolo_and_GT.mp4' for writing")
    for i in tqdm(range(n_frames-starting_frame)):
        if not cap.isOpened():
            break
        ret, frame = cap.read()
        if not ret:
            # The read failed (e.g. the stream ended early); stop instead of
            # drawing on an invalid frame.
            break

        # Ground-truth boxes: green (BGR), thicker line.
        for gt in gt_bbox[i]:
            frame = cv2.rectangle(frame, (int(gt[0]), int(gt[1])), (int(gt[2]), int(gt[3])), (0, 255, 0), 3)

        # Filtered predictions: red (BGR), thinner line.
        for pred in filtered_predictions[i]:
            frame = cv2.rectangle(frame, (int(pred[0]), int(pred[1])), (int(pred[2]), int(pred[3])), (0, 0, 255), 2)
        video.write(frame)
finally:
    # Release both handles even on error; the original never released `cap`.
    cap.release()
    video.release()

100%|██████████| 1606/1606 [00:26<00:00, 60.10it/s]
