In [1]:
import numpy as np
import cv2
from pathlib import Path
import os
from ultralytics import YOLO

## Inputs

In [2]:
model_path = Path("../yolo11s.onnx")
vid_dir_path = Path("../input_videos")
out_dir_path = Path("./output_videos")

model_input_size = np.array((640, 640))
label_l = [
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
]

imshow = True
display_color = (0, 255, 0)

## Init

In [3]:
# Model
onnx_model = YOLO(model_path)

# Input
vid_path_l = list(vid_dir_path.glob("*.mp4"))
assert len(vid_path_l) > 0, "Videos not found"

# Output
out_dir_path.mkdir(exist_ok=True)
assert out_dir_path.is_dir(), f"Error in creating out dir {out_dir_path}"



In [4]:
vid_path_l = [Path("../input_videos/12057881_3840_2160_24fps.mp4")]
for vid_path in vid_path_l:
    cap = cv2.VideoCapture(vid_path)
    if not cap.isOpened():
        print(f"Error in reading {vid_path} video")
        continue
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break  # End of video

        # Run inference on the frame
        results = onnx_model(frame)

        # Draw bounding boxes and labels
        for result in results:
            for box in result.boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])  # Bounding box coordinates
                conf = box.conf[0].item()  # Confidence score
                cls = int(box.cls[0].item())  # Class ID
                label = f"{onnx_model.names[cls]} ({conf:.2f})"

                # Draw bounding box
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(
                    frame,
                    label,
                    (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 255, 0),
                    2,
                )
                    
            if imshow:
                cv2.imshow(vid_path.name, frame) 
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break

        cv2.destroyAllWindows()

Loading ../yolo11s.onnx for ONNX Runtime inference...
Using ONNX Runtime CPUExecutionProvider

0: 640x640 11 cars, 1 bus, 1 truck, 215.5ms
Speed: 10.8ms preprocess, 215.5ms inference, 18.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 9 cars, 1 bus, 137.0ms
Speed: 2.3ms preprocess, 137.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 11 cars, 1 bus, 1 truck, 1 traffic light, 118.5ms
Speed: 2.2ms preprocess, 118.5ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 11 cars, 1 bus, 2 trucks, 104.3ms
Speed: 2.3ms preprocess, 104.3ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 11 cars, 1 bus, 1 truck, 164.0ms
Speed: 2.2ms preprocess, 164.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 cars, 1 bus, 2 trucks, 1 traffic light, 160.4ms
Speed: 3.2ms preprocess, 160.4ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 11 cars,

2025-02-08 10:41:11.730 Python[7617:129750] +[IMKClient subclass]: chose IMKClient_Modern
2025-02-08 10:41:11.730 Python[7617:129750] +[IMKInputSession subclass]: chose IMKInputSession_Modern



0: 640x640 12 cars, 1 truck, 1 traffic light, 118.4ms
Speed: 1.7ms preprocess, 118.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 10 cars, 1 bus, 2 trucks, 1 traffic light, 109.8ms
Speed: 2.5ms preprocess, 109.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 cars, 1 bus, 2 trucks, 116.7ms
Speed: 1.8ms preprocess, 116.7ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 10 cars, 1 bus, 2 trucks, 1 traffic light, 162.9ms
Speed: 1.9ms preprocess, 162.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 cars, 1 bus, 2 trucks, 1 traffic light, 186.4ms
Speed: 2.3ms preprocess, 186.4ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 cars, 1 bus, 1 truck, 1 traffic light, 135.1ms
Speed: 2.0ms preprocess, 135.1ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 12 cars, 1 truck, 123.0ms
Speed: 2.3ms preproces

In [5]:
cv2.destroyAllWindows()