In [15]:
import torch
import onnxruntime as ort
import numpy as np
import cv2


In [16]:
def preprocess(image: np.ndarray, input_shape: tuple) -> np.ndarray:
    """
    Preprocess the input image for YOLOv10 model.

    The image is resized to the model's spatial size, divided by 255,
    reordered from HWC to CHW and given a leading batch dimension.

    Args:
        image (np.ndarray): The input image in HWC layout (e.g. as returned by ``cv2.imread``).
        input_shape (tuple): The shape of the input tensor in NCHW order,
            i.e. ``(batch, channels, height, width)``.

    Returns:
        np.ndarray: The preprocessed image as a float32 array of shape
            ``(1, C, height, width)``.

    Raises:
        ValueError: If ``image`` is None (``cv2.imread`` returns None when it
            cannot read the file).
    """
    if image is None:
        raise ValueError("image is None; check that the image path exists and is readable")

    # cv2.resize expects dsize as (width, height), whereas an NCHW shape stores
    # height at index 2 and width at index 3, so the two must be swapped here.
    target_h, target_w = input_shape[2], input_shape[3]
    image_resized = cv2.resize(image, (target_w, target_h))

    # NOTE(review): the channel order is passed through unchanged (cv2.imread
    # yields BGR) -- confirm this matches what the exported model expects.
    image_normalized = image_resized / 255.0
    image_transposed = np.transpose(image_normalized, (2, 0, 1))  # HWC -> CHW
    image_expanded = np.expand_dims(image_transposed, axis=0)  # add batch axis
    return image_expanded.astype(np.float32)


def postprocess(preds: np.ndarray, max_det: int, nc: int = 80, conf_thres: float = 0.25) -> np.ndarray:
    """
    Post-processes the predictions obtained from a YOLOv10 model.

    Args:
        preds (np.ndarray): The predictions obtained from the model, with shape
            (batch_size, num_boxes, 4 + 1 + nc): four box values (cx, cy, w, h),
            one leading score column that is discarded, then ``nc`` class scores.
        max_det (int): The maximum number of detections to keep.
        nc (int, optional): The number of classes. Defaults to 80.
        conf_thres (float, optional): Detections whose score is not strictly
            greater than this value are dropped. Defaults to 0.25.

    Returns:
        (np.ndarray): The post-processed predictions with shape (batch_size, k, 6),
            where each row is (x1, y1, x2, y2, score, cls) sorted by descending
            score and ``k <= max_det`` is the largest number of detections above
            ``conf_thres`` in any image of the batch. Images with fewer than ``k``
            surviving detections have their remaining rows filled with zeros.

    Raises:
        ValueError: If the number of class-score columns in ``preds`` differs
            from ``nc`` (labels could not be decoded correctly otherwise).
    """
    boxes, scores = np.split(preds, [4], axis=-1)
    scores = scores[:, :, 1:]  # Ignore background score

    if scores.shape[-1] != nc:
        raise ValueError(
            f"preds carries {scores.shape[-1]} class scores per box but nc={nc}"
        )

    # Stage 1: keep the max_det boxes whose best class score is highest.
    best_per_box = scores.max(axis=-1)
    kept_boxes = np.argsort(-best_per_box, axis=-1)[:, :max_det, None]
    boxes = np.take_along_axis(boxes, kept_boxes, axis=1)
    scores = np.take_along_axis(scores, kept_boxes, axis=1)

    # Stage 2: rank every (box, class) pair of the kept boxes and keep the top
    # max_det pairs. A flat index encodes box * nc + class.
    scores_flat = scores.reshape(scores.shape[0], -1)
    topk_indices = np.argsort(-scores_flat, axis=-1)[:, :max_det]
    topk_scores = np.take_along_axis(scores_flat, topk_indices, axis=-1)
    labels = topk_indices % nc
    box_indices = topk_indices // nc
    selected_boxes = np.take_along_axis(boxes, box_indices[..., None], axis=1)

    # Transform bbox coordinates to x1y1x2y2 format. selected_boxes is a fresh
    # array produced by take_along_axis, so the caller's preds is not modified.
    selected_boxes[:, :, :2] -= selected_boxes[:, :, 2:] / 2
    selected_boxes[:, :, 2:] += selected_boxes[:, :, :2]

    detections = np.concatenate(
        [selected_boxes, topk_scores[..., None], labels[..., None].astype(selected_boxes.dtype)],
        axis=-1
    )

    # Filter out detections with low confidence. Scores are sorted in
    # descending order per image, so the survivors of each image form a prefix;
    # trimming to the longest prefix keeps the result rectangular for any batch
    # size, and shorter images are zero-padded.
    keep = topk_scores > conf_thres
    n_keep = int(keep.sum(axis=-1).max()) if keep.size else 0
    detections = detections[:, :n_keep]
    detections[~keep[:, :n_keep]] = 0
    return detections


def parse_yolo_outputs(outputs, strides, anchors):
    """Decode each detection head and join the results along axis 1.

    Args:
        outputs: Iterable of raw head outputs, one array per head.
        strides: Iterable of strides, paired positionally with ``outputs``.
        anchors: Iterable of per-head anchor specs (or None entries), paired
            positionally with ``outputs``.

    Returns:
        The decoded heads concatenated along axis 1, or None when the zipped
        inputs are empty. Iteration stops at the shortest of the three inputs.
    """
    decoded_heads = [
        parse_yolo_output(head, head_stride, head_anchors)
        for head, head_stride, head_anchors in zip(outputs, strides, anchors)
    ]

    if not decoded_heads:
        return None
    if len(decoded_heads) == 1:
        # A single head is returned as-is, without an extra concatenation.
        return decoded_heads[0]
    return np.concatenate(decoded_heads, axis=1)

def parse_yolo_output(x, stride, anchors = None):
    """Decode one raw detection head into (cx, cy, w, h, ...) rows in pixels.

    For every grid cell the first two channels are subtracted from the cell
    centre (cell index + 0.5) to get the top-left corner and the next two
    channels are added to it to get the bottom-right corner; the corners are
    then converted to centre/size and multiplied by ``stride``. All remaining
    channels are passed through unchanged.

    Args:
        x: Array of shape (bs, na * C, ny, nx).
        stride: Scale factor applied to the decoded box values.
        anchors: Optional sequence; only its length is used (as ``na``).
            None means a single prediction per cell.
            NOTE(review): anchor values themselves are never read -- confirm
            that anchor-based heads are not expected here.

    Returns:
        A new array of shape (bs, na * ny * nx, C). The input ``x`` is left
        untouched.
    """
    na = 1 if anchors is None else len(anchors)
    bs, _, ny, nx = x.shape

    # make_grid_numpy(first, second, na) yields (i, j) index pairs over a
    # first-by-second lattice. Passing nx first therefore produces
    # (x_index, y_index) pairs laid out (nx, ny), which is exactly the spatial
    # layout created by the transpose below.
    grid = make_grid_numpy(nx, ny, na)

    # reshape/transpose return views of the caller's array; copy() so the
    # in-place writes below cannot corrupt the caller's data.
    x = x.reshape(bs, na, -1, ny, nx).transpose((0, 1, 4, 3, 2)).copy()

    x1y1 = grid - x[..., 0:2] + 0.5
    x2y2 = grid + x[..., 2:4] + 0.5

    c_xy = (x1y1 + x2y2) / 2
    wh = x2y2 - x1y1
    x[..., 0:2] = c_xy * stride
    x[..., 2:4] = wh * stride

    # Flatten anchors and grid cells into one box axis; channels stay last.
    x = x.reshape(bs, na * ny * nx, -1)
    return x

def make_grid_numpy(ny, nx, na):
    """Build a lattice of integer index pairs.

    Args:
        ny: Size of the first spatial axis.
        nx: Size of the second spatial axis.
        na: Number of times the lattice is repeated along axis 1.

    Returns:
        Integer array of shape (1, na, ny, nx, 2) where entry [0, a, i, j] is
        (i, j), i.e. the index along the first axis followed by the index
        along the second axis.
    """
    y, x = np.arange(ny), np.arange(nx)
    yv, xv = np.meshgrid(y, x, indexing='ij')
    grid = np.stack((yv, xv), 2).reshape(1, 1, ny, nx, 2)
    # Every anchor shares the same lattice, so repeat it along the anchor axis.
    return np.repeat(grid, na, axis=1)

# Inference function
def infer(image: np.ndarray, max_det: int = 10, nc: int = 80, session: ort.InferenceSession = None, input_name: str = None, output_names: list = None, strides: tuple = (8, 16, 32)) -> np.ndarray:
    """
    Perform inference on the input image using the YOLOv10 ONNX model.

    Args:
        image (np.ndarray): The input image.
        max_det (int, optional): The maximum number of detections to keep. Defaults to 10.
        nc (int, optional): The number of classes. Defaults to 80.
        session (ort.InferenceSession): The ONNX Runtime session to run. Required.
        input_name (str, optional): Name of the model input. Defaults to the
            name of the session's first input.
        output_names (list, optional): Names of the outputs to fetch, one per
            detection head. None is forwarded to ``session.run`` unchanged.
        strides (tuple, optional): Stride of each detection head, in the same
            order as the fetched outputs. Defaults to (8, 16, 32).

    Returns:
        np.ndarray: The post-processed predictions as returned by ``postprocess``.

    Raises:
        ValueError: If ``session`` is None, or if the number of fetched
            outputs does not match the number of strides.
    """
    if session is None:
        raise ValueError("session is required: pass an onnxruntime.InferenceSession")

    model_input = session.get_inputs()[0]
    if input_name is None:
        input_name = model_input.name

    input_shape = model_input.shape
    print("Input shape:", input_shape)
    image_preprocessed = preprocess(image, input_shape)

    # Run inference
    outputs = session.run(output_names, {input_name: image_preprocessed})

    # The heads are paired with strides positionally; a silent truncation by
    # zip() would drop detections, so a mismatch is reported instead.
    if len(outputs) != len(strides):
        raise ValueError(
            f"model returned {len(outputs)} outputs but {len(strides)} strides were given"
        )

    # Decode every head (no anchors) and merge them into one prediction array.
    output = parse_yolo_outputs(outputs, strides, [None] * len(strides))

    print(output.shape)

    # Post-process predictions
    result = postprocess(output, max_det, nc)
    print(result.shape)

    return result

def visualize(image: np.ndarray, results: np.ndarray, class_names: list):
    """
    Visualize the detection results on the image.

    Boxes and labels are drawn directly onto ``image`` (it is modified in
    place) and the same array is returned.

    Args:
        image (np.ndarray): The image to draw on.
        results (np.ndarray): The detection results with shape (batch, k, 6);
            only the first batch entry is drawn. Each row is
            (x1, y1, x2, y2, score, cls).
        class_names (list): List of class names, indexed by class id. A class
            id outside this list is labelled with its number instead.

    Returns:
        np.ndarray: ``image`` with the detections drawn on it.
    """
    if len(results) == 0:
        return image

    # np.asarray lets array-likes other than ndarray be handled uniformly.
    for det in np.asarray(results[0]):
        print(det)
        box = det[:4].astype(int)
        score = det[4]
        cls_id = int(det[5])

        # Draw bounding box
        print(box)
        print(score)
        # Plain Python ints are the safest coordinate type to hand to OpenCV.
        x1, y1, x2, y2 = (int(v) for v in box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Draw label and score
        if 0 <= cls_id < len(class_names):
            name = class_names[cls_id]
        else:
            name = str(cls_id)
        label = f"{name}: {score:.2f}"
        # Keep the text baseline inside the image for boxes near the top edge.
        label_y = max(y1 - 10, 15)
        cv2.putText(image, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    return image

In [17]:
# Load the ONNX model
model_path = 'tmp/yolov10n.onnx'
session = ort.InferenceSession(model_path)

model_input = session.get_inputs()[0]
print(model_input.name, model_input.shape)

# Define the input and output names
input_name = model_input.name
model_outputs = session.get_outputs()
# The first three outputs are fetched, one per stride used by infer().
output_names = [model_outputs[i].name for i in range(3)]


# Load an image
image_path = 'tmp/image.jpg'
class_names = [
    "person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "sofa", "pottedplant", "bed", "diningtable", "toilet", "tvmonitor", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator",
    "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"
]

image = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Perform inference
results = infer(image, session=session, input_name=input_name, output_names=output_names)

# Visualize results
# Boxes come back in the pixel space of the model input, so the image is
# resized to that same size (NCHW shape -> cv2 wants (width, height)).
_, _, input_height, input_width = model_input.shape
image = cv2.resize(image, (input_width, input_height))
image_with_detections = visualize(image, results, class_names)

# Display the image
# cv2.imshow('Detections', image_with_detections)
# cv2.waitKey(0)
# cv2.destroyAllWindows()

# Save the image with detections
output_image_path = 'tmp/image_with_detections.jpg'
cv2.imwrite(output_image_path, image_with_detections)

images [1, 3, 640, 640]
Input shape: [1, 3, 640, 640]
(1, 8400, 85)
(1, 2, 6)
[179.37454    47.628754  407.9586    245.2543      0.8831582  14.       ]
[179  47 407 245]
0.8831582
[  5.6466675 143.55789   640.15515   615.3313      0.8165297   1.       ]
[  5 143 640 615]
0.8165297


True