In [1]:
import torch
import numpy as np
from PIL import Image
import os
import sys
import pandas as pd
import onnxruntime as rt
import torchvision.transforms as transforms
import cv2

In [2]:
# Prefer CUDA when PyTorch reports a usable GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [3]:
def letterbox(img, new_shape=(960, 960), color=(114, 114, 114)):
    """Resize a PIL image to fit inside ``new_shape`` and pad it to that size.

    The aspect ratio is preserved: the image is scaled by the largest factor
    that still fits the target, then pasted centred on a canvas filled with
    ``color``.

    Args:
        img: PIL image; its ``size`` is read as ``(width, height)``.
        new_shape: Target canvas as ``(height, width)``.
        color: RGB fill colour for the padding.

    Returns:
        Tuple ``(padded_image, resized_w, resized_h, pad_left, pad_top, scale)``
        where ``pad_left``/``pad_top`` are the integer offsets at which the
        resized image was pasted and ``scale`` is the resize factor.
    """
    target_h, target_w = new_shape
    orig_w, orig_h = img.size
    r = min(target_h / orig_h, target_w / orig_w)

    # Truncation can yield 0 for extreme aspect ratios, which resize() rejects.
    new_unpad = max(1, int(orig_w * r)), max(1, int(orig_h * r))
    dw = (target_w - new_unpad[0]) / 2
    dh = (target_h - new_unpad[1]) / 2

    # resize
    img_resized = img.resize(new_unpad, Image.BILINEAR)
    # create padded image; PIL sizes are (width, height), so swap the
    # (height, width) target instead of passing new_shape through unchanged.
    new_img = Image.new("RGB", (target_w, target_h), color)
    new_img.paste(img_resized, (int(dw), int(dh)))
    return new_img, new_unpad[0], new_unpad[1], int(dw), int(dh), r


In [35]:
def nms(boxes, scores, iou_threshold=0.5):
    """Greedy non-maximum suppression over axis-aligned boxes.

    Boxes are visited in descending score order; each visited box is kept and
    every remaining box whose IoU with it exceeds ``iou_threshold`` is dropped.

    Args:
        boxes: Array-like of shape (N, 4) in ``[x1, y1, x2, y2]`` order.
        scores: Array-like of shape (N,) with one score per box.
        iou_threshold: Boxes with IoU strictly greater than this are suppressed.

    Returns:
        List of integer indices into ``boxes`` for the kept boxes, highest
        score first. Empty list when there are no boxes.
    """
    boxes = np.asarray(boxes)
    scores = np.asarray(scores)
    if boxes.size == 0:
        return []

    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]
    keep = []

    while order.size > 0:
        i = order[0]
        rest = order[1:]
        keep.append(int(i))

        # Intersection rectangle between the current box and all remaining ones.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])

        inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)
        union = areas[i] + areas[rest] - inter

        # Zero-area boxes give a zero union; treat their IoU as 0 instead of
        # computing 0/0 = NaN, which would warn and wrongly suppress them.
        iou = np.zeros(inter.shape, dtype=np.float64)
        np.divide(inter, union, out=iou, where=union > 0)

        order = rest[iou <= iou_threshold]
    return keep

def post_process(output, pad_x, pad_y, orig_w, orig_h, scale, conf_var=0.5, iou_threshold = 0.5):
    """Turn raw detector output into NMS-filtered boxes in original-image pixels.

    Args:
        output: Sequence whose first element is indexed ``[0]`` and transposed
            to one row per candidate; columns 0-4 are read as
            ``cx, cy, w, h, conf``.
            NOTE(review): presumably a (1, C, N) single-class detector output;
            confirm against the model.
        pad_x: Horizontal padding that was added before inference.
        pad_y: Vertical padding that was added before inference.
        orig_w: Width used as the upper clipping bound for x coordinates.
        orig_h: Height used as the upper clipping bound for y coordinates.
        scale: Resize factor that was applied before inference.
        conf_var: Candidates are kept only if ``conf > conf_var``.
        iou_threshold: IoU threshold forwarded to ``nms``.

    Returns:
        Array of shape (K, 5) with rows ``[x1, y1, x2, y2, conf]``.
    """
    # One row per candidate.
    candidates = output[0][0].transpose(1, 0)

    # Confidence filter first, so the geometry below only touches survivors.
    confident = candidates[candidates[:, 4] > conf_var]
    centre_x = confident[:, 0]
    centre_y = confident[:, 1]
    half_w = confident[:, 2] / 2
    half_h = confident[:, 3] / 2

    # Centre/size -> corner representation, confidence carried as column 4.
    boxes = np.stack(
        [
            centre_x - half_w,
            centre_y - half_h,
            centre_x + half_w,
            centre_y + half_h,
            confident[:, 4],
        ],
        axis=1,
    )

    # Reverse the padding offset and the resize factor, then clamp to the image.
    x_cols = [0, 2]
    y_cols = [1, 3]
    boxes[:, x_cols] = (boxes[:, x_cols] - pad_x) / scale
    boxes[:, y_cols] = (boxes[:, y_cols] - pad_y) / scale
    boxes[:, x_cols] = np.clip(boxes[:, x_cols], 0, orig_w)
    boxes[:, y_cols] = np.clip(boxes[:, y_cols], 0, orig_h)

    survivors = nms(boxes[:, :4], boxes[:, 4], iou_threshold=iou_threshold)
    return boxes[survivors]

def draw_boxes_on_image(image_path, boxes, color=(0, 255, 0), thickness=2):
    """Draw detection rectangles and their confidence labels onto an image.

    Args:
        image_path: Path of the image file; it is loaded with ``cv2.imread``,
            so the returned array is in OpenCV's BGR channel order.
        boxes: Iterable of rows ``[x1, y1, x2, y2, conf]`` (e.g. an (N, 5)
            numpy array). Coordinates are truncated to integers for drawing.
        color: Rectangle colour passed to ``cv2.rectangle``.
        thickness: Rectangle line thickness in pixels.

    Returns:
        The loaded image array with the boxes and labels drawn on it.

    Raises:
        ValueError: If ``cv2.imread`` could not load ``image_path``.
    """

    img = cv2.imread(image_path)
    if img is None:
        raise ValueError("Image failed to load")

    for box in boxes:
        # Only the coordinates become ints; casting the whole row would also
        # truncate the confidence and make every label read "0.00".
        x1, y1, x2, y2 = (int(v) for v in box[:4])
        conf = float(box[4])

        # draw rectangle
        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)

        # label
        label = f"{conf:.2f}"
        cv2.putText(img, label, (x1, y1 - 4),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (255,0,0), 1)
    return img

In [42]:
# Open the line-detection ONNX model, offering onnxruntime every execution
# provider it reports as available.
detection_sess = rt.InferenceSession(
    "models/LineDetectionv1.onnx",
    providers=rt.get_available_providers(),
)
# Names of the graph's first input and first output tensors, needed to feed
# and fetch data when the session is run.
input_name = detection_sess.get_inputs()[0].name
output_name = detection_sess.get_outputs()[0].name

In [44]:
# Single source of truth for the page being processed: the file is read once
# for inference and once more for drawing, and both must be the same image.
IMAGE_PATH = "images/0_page.jpeg"

img = Image.open(IMAGE_PATH).convert("RGB")
orig_w, orig_h = img.size
padded, new_w, new_h, pad_x, pad_y, scale = letterbox(img)

# HWC uint8 -> 1xCxHxW float32 scaled to [0, 1]
image = np.array(padded).astype(np.float32) / 255.0
image = image.transpose(2, 0, 1)[None, ...]

output = detection_sess.run([output_name], {input_name: image})
detection_boxes = post_process(
    output, pad_x, pad_y, orig_w, orig_h, scale,
    conf_var=0.5, iou_threshold=0.4,
)

out_img = draw_boxes_on_image(IMAGE_PATH, detection_boxes)
cv2.imwrite("detections_output.jpg", out_img)

True