In [9]:
import fitz 
from PIL import Image
import onnxruntime as ort

import cv2
import warnings
import numpy as np
from PIL import Image

warnings.filterwarnings("ignore")

def pdf_to_images(pdf_path, image_folder):
    """Render every page of a PDF to a PNG file.

    Pages are written to ``<image_folder>/page_<n>.png`` where ``n`` starts
    at 1.

    Args:
        pdf_path: Path of the PDF document to open with ``fitz``.
        image_folder: Directory that receives the page images. It is created
            if it does not exist yet.
    """
    # Imported locally because the module only imports ``os`` inside its
    # ``__main__`` guard, so the name is not available at module scope.
    import os

    if image_folder:
        os.makedirs(image_folder, exist_ok=True)

    pdf_document = fitz.open(pdf_path)
    try:
        for page_number in range(pdf_document.page_count):
            page = pdf_document[page_number]
            pixmap = page.get_pixmap()
            # NOTE(review): "RGB" matches the pixmap only while get_pixmap()
            # is called with its defaults (no alpha channel) - confirm if
            # arguments are ever added.
            pil_image = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
            image_path = f"{image_folder}/page_{page_number + 1}.png"
            pil_image.save(image_path, "PNG")
    finally:
        # Release the document handle even when rendering or saving fails.
        pdf_document.close()

    
def xywh2xyxy(x):
    """Convert boxes from centre format to corner format.

    The last axis of ``x`` is read as ``(center_x, center_y, width, height)``
    and the returned copy holds ``(x1, y1, x2, y2)`` in the same positions.
    The input array is not modified, and the result keeps the input's dtype
    (so integer inputs have fractional halves truncated on assignment).
    """
    center_x, center_y, width, height = (x[..., column] for column in range(4))
    half_width = width / 2
    half_height = height / 2

    corners = np.copy(x)
    corners[..., 0] = center_x - half_width
    corners[..., 1] = center_y - half_height
    corners[..., 2] = center_x + half_width
    corners[..., 3] = center_y + half_height
    return corners

    
def nms(boxes, scores, iou_threshold):
    """Greedy non-maximum suppression.

    Repeatedly selects the highest-scoring remaining box and discards every
    other remaining box whose IoU with it (as computed by ``compute_iou``)
    is not below ``iou_threshold``.

    Args:
        boxes: 2-D array with one box per row, in the layout expected by
            ``compute_iou``.
        scores: One score per box.
        iou_threshold: Boxes with an IoU below this value survive a round.

    Returns:
        List of the selected row indices, highest score first.
    """
    # Candidate indices ordered from highest to lowest score.
    remaining = np.argsort(scores)[::-1]

    selected = []
    while remaining.size:
        best, rest = remaining[0], remaining[1:]
        selected.append(best)

        overlaps = compute_iou(boxes[best, :], boxes[rest, :])
        # Only candidates that overlap the chosen box weakly stay in play.
        remaining = rest[overlaps < iou_threshold]

    return selected

def compute_iou(box, boxes):
    """Intersection-over-union of one box against many.

    Args:
        box: Sequence ``(x1, y1, x2, y2)`` describing the reference box.
        boxes: 2-D array whose rows are ``(x1, y1, x2, y2)`` boxes.

    Returns:
        1-D array with one IoU value per row of ``boxes``. Pairs whose union
        area is not positive (e.g. two zero-area boxes) yield ``0`` instead
        of NaN from a division by zero.
    """
    xmin = np.maximum(box[0], boxes[:, 0])
    ymin = np.maximum(box[1], boxes[:, 1])
    xmax = np.minimum(box[2], boxes[:, 2])
    ymax = np.minimum(box[3], boxes[:, 3])

    # Clamp at zero so that disjoint boxes contribute no overlap.
    intersection_area = np.maximum(0, xmax - xmin) * np.maximum(0, ymax - ymin)

    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union_area = box_area + boxes_area - intersection_area

    # Divide only where the union is positive; substitute a harmless
    # denominator elsewhere and overwrite those entries with 0 afterwards.
    has_area = union_area > 0
    safe_union = np.where(has_area, union_area, 1)
    return np.where(has_area, intersection_area / safe_union, 0.0)
    

def yolo_infer(ort_session, image_path, conf_threshold=0.65, iou_threshold=0.3):
    """Detect text regions on one page image and save crops plus a plot.

    The image is resized to the model's input size, run through the ONNX
    session, and the surviving detections are written to
    ``crops_sample/<page>-<i>.jpg`` (one crop per detection) and drawn onto
    ``plots/<page>.jpg``. Both output folders must already exist.

    Args:
        ort_session: ONNX Runtime inference session for the detector.
        image_path: Path of the page image; the part of the file name before
            the first ``.`` is used to name the outputs.
        conf_threshold: Minimum class score for a prediction to be kept.
        iou_threshold: IoU threshold passed to ``nms``.

    Returns:
        Tuple ``(bbox_values, score_values, label_values)`` with one entry
        per kept detection: ``[x1, y1, x2, y2]`` pixel boxes clipped to the
        image, float scores, and class names.

    Raises:
        OSError: If the image cannot be read by OpenCV.
    """
    model_inputs = ort_session.get_inputs()
    input_name = model_inputs[0].name
    # NOTE(review): assumes the first input has a fixed 4-D shape whose last
    # two entries are integer height and width - dynamic axes are not handled.
    input_height, input_width = model_inputs[0].shape[2:]
    output_names = [output.name for output in ort_session.get_outputs()]

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image: {image_path}")
    image_height, image_width = image.shape[:2]

    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(image_rgb, (input_width, input_height))
    # Scale to [0, 1], move channels first and add the batch axis.
    input_tensor = (
        (resized / 255.0).transpose(2, 0, 1)[np.newaxis, :, :, :].astype(np.float32)
    )

    outputs = ort_session.run(output_names, {input_name: input_tensor})[0]
    # NOTE(review): presumably a YOLOv8-style output where, after the
    # transpose, each row is (cx, cy, w, h, class scores...) - confirm
    # against the exported model.
    predictions = np.squeeze(outputs).T
    scores = np.max(predictions[:, 4:], axis=1)
    confident = scores > conf_threshold
    predictions = predictions[confident, :]
    scores = scores[confident]
    class_ids = np.argmax(predictions[:, 4:], axis=1)

    # Rescale boxes from model-input pixels to original-image pixels.
    model_size = np.array([input_width, input_height, input_width, input_height])
    image_size = np.array([image_width, image_height, image_width, image_height])
    boxes = np.divide(predictions[:, :4], model_size, dtype=np.float32)
    boxes *= image_size

    # compute_iou (used by nms) reads boxes as corners, so convert from
    # centre format BEFORE suppression; running nms on (cx, cy, w, h) rows
    # produces meaningless overlaps.
    boxes = xywh2xyxy(boxes)
    keep = np.asarray(nms(boxes, scores, iou_threshold), dtype=np.intp)

    # Clip to the image so negative coordinates cannot wrap around when
    # slicing, then round once at the very end to avoid truncation drift.
    boxes = boxes[keep]
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, image_width)
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, image_height)
    boxes = boxes.round().astype(np.int32)

    CLASSES = [
        "text"
    ]

    bbox_values = []
    score_values = []
    label_values = []

    annotated = image.copy()
    value = image_path.split("/")[-1].split(".")[0]
    detections = zip(boxes.tolist(), scores[keep], class_ids[keep])
    for i, (bbox, score, label) in enumerate(detections):
        cls = CLASSES[int(label)]
        bbox_values.append(bbox)
        score_values.append(float(score))
        label_values.append(cls)
        if cls == "text":
            x1, y1, x2, y2 = bbox
            # Crop from the untouched image so rectangles drawn for earlier
            # detections never leak into a crop.
            crop = image[y1:y2, x1:x2]
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (255, 0, 0), 2)
            if crop.size:
                # cv2.imwrite fails on empty images, so skip degenerate boxes.
                cv2.imwrite(f"crops_sample/{value}-{i}" + ".jpg", crop)
    cv2.imwrite(f"plots/{value}.jpg", annotated)

    return bbox_values, score_values, label_values

    
if __name__ == "__main__":
    # Script entry point: render the PDF to page images, then run the text
    # detector over every rendered page.
    import os

    pdf_file_path = 'Black Clover Vol. 1.pdf'
    output_folder = 'output_images'

    os.makedirs(output_folder, exist_ok=True)

    pdf_to_images(pdf_file_path, output_folder)
    print("Conversion completed.")

    model_path = "open-mantra-dataset/best.onnx"
    ort_session = ort.InferenceSession(model_path)
    os.makedirs("crops_sample", exist_ok=True)
    os.makedirs("plots", exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for reproducible
    # runs and reuse output_folder instead of repeating the literal path.
    for page_file in sorted(os.listdir(output_folder)):
        if not page_file.lower().endswith(".png"):
            # Skip stray files (e.g. notebook checkpoints) that are not
            # rendered pages and would fail to load as images.
            continue
        print(f"Running object detection inference over page {page_file}.")
        # Joined with "/" because yolo_infer derives the page name by
        # splitting the path on "/".
        yolo_infer(ort_session, f"{output_folder}/{page_file}")

Conversion completed.
Running object detection inference over page page_1.png.
Running object detection inference over page page_10.png.
Running object detection inference over page page_100.png.
Running object detection inference over page page_101.png.
Running object detection inference over page page_102.png.
Running object detection inference over page page_103.png.
Running object detection inference over page page_104.png.
Running object detection inference over page page_105.png.
Running object detection inference over page page_106.png.
Running object detection inference over page page_107.png.
Running object detection inference over page page_108.png.
Running object detection inference over page page_109.png.
Running object detection inference over page page_11.png.
Running object detection inference over page page_110.png.
Running object detection inference over page page_111.png.
Running object detection inference over page page_112.png.
Running object detection inference ove

Running object detection inference over page page_63.png.
Running object detection inference over page page_64.png.
Running object detection inference over page page_65.png.
Running object detection inference over page page_66.png.
Running object detection inference over page page_67.png.
Running object detection inference over page page_68.png.
Running object detection inference over page page_69.png.
Running object detection inference over page page_7.png.
Running object detection inference over page page_70.png.
Running object detection inference over page page_71.png.
Running object detection inference over page page_72.png.
Running object detection inference over page page_73.png.
Running object detection inference over page page_74.png.
Running object detection inference over page page_75.png.
Running object detection inference over page page_76.png.
Running object detection inference over page page_77.png.
Running object detection inference over page page_78.png.
Running object 