In [3]:
import os
import cv2
import numpy as np
from paddleocr import PPStructure,save_structure_res, PaddleOCR, draw_structure_result
from paddle.utils import try_import
from PIL import Image
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx

# Input PDF, output directory, and the font used when drawing result overlays.
pdf_path = '../data/basic_image.pdf'
save_folder = './dev_v1'
font_path = '../fonts/simfang.ttf'  # font package shipped with PaddleOCR

# PP-StructureV2 engine, created with recovery enabled.
ocr_engine = PPStructure(recovery=True, structure_version='PP-StructureV2')


fitz = try_import("fitz")

# Render every page of the PDF into a BGR NumPy array and collect them in `imgs`.
imgs = []
zoom_2x = fitz.Matrix(2, 2)  # scale the page 2x for a sharper, higher-resolution render
zoom_1x = fitz.Matrix(1, 1)  # unscaled render, used for pages that would get too large
with fitz.open(pdf_path) as pdf:
    for page in pdf:
        pixmap = page.get_pixmap(matrix=zoom_2x, alpha=False)

        # If the enlarged render is wider or taller than 2000 px, re-render unscaled.
        if max(pixmap.width, pixmap.height) > 2000:
            pixmap = page.get_pixmap(matrix=zoom_1x, alpha=False)

        # Pixmap bytes -> PIL image -> NumPy array, then RGB -> BGR so OpenCV can use it.
        rgb_image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)
        img = cv2.cvtColor(np.array(rgb_image), cv2.COLOR_RGB2BGR)
        imgs.append(img)



# Analyse the first page only: save its structure results, sort its layout
# boxes, and hand the sorted layout to convert_info_docx.
first_page = imgs[0]
doc_name = os.path.basename(pdf_path).split('.')[0]  # file name up to its first dot

result = ocr_engine(first_page)
save_structure_res(result, save_folder, doc_name, 0)

# Take the width from the page that was actually analysed. Reading it from
# `img` would use the last page rendered by the loop above, which is a
# different image whenever the PDF has more than one page.
h, w, _ = first_page.shape
res = sorted_layout_boxes(result, w)
convert_info_docx(first_page, res, save_folder, doc_name)

[2024/06/05 12:37:59] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/home/liangzhu/.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/home/liangzhu/.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec

In [2]:
# Analyse every rendered page, save per-page structure results and a
# visualisation image, then build ONE recovered document from all pages.
doc_name = os.path.basename(pdf_path).split('.')[0]  # file name up to its first dot
all_pages_res = []  # sorted layout regions of every page, in page order

for index, img in enumerate(imgs):
    # Pass the page index to the engine so it matches the index given to
    # save_structure_res below.
    # NOTE(review): assumes the installed paddleocr's PPStructure.__call__
    # accepts `img_idx` (its own PDF code path uses it) — confirm for older versions.
    result = ocr_engine(img, img_idx=index)
    save_structure_res(result, save_folder, doc_name, index)

    h, w, _ = img.shape
    res = sorted_layout_boxes(result, w)
    all_pages_res += res

    # Save this page's structure visualisation as an image.
    page_rgb = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    im_show = draw_structure_result(page_rgb, result, font_path=font_path)
    im_show = Image.fromarray(im_show)
    im_show.save(os.path.join(save_folder, f'result_page_{index + 1}.jpg'))

# convert_info_docx receives the same output name on every call, so calling it
# once per page would leave only the last page in the document. Convert the
# accumulated layout of all pages in a single call instead.
if all_pages_res:
    convert_info_docx(img, all_pages_res, save_folder, doc_name)

[2024/06/05 12:35:23] ppocr DEBUG: dt_boxes num : 15, elapsed : 0.03990745544433594
[2024/06/05 12:35:23] ppocr DEBUG: rec_res num  : 15, elapsed : 0.08130955696105957
[2024/06/05 12:35:23] ppocr DEBUG: dt_boxes num : 8, elapsed : 0.043082475662231445
[2024/06/05 12:35:23] ppocr DEBUG: rec_res num  : 8, elapsed : 0.03798413276672363
[2024/06/05 12:35:23] ppocr DEBUG: dt_boxes num : 5, elapsed : 0.03484773635864258
[2024/06/05 12:35:23] ppocr DEBUG: rec_res num  : 5, elapsed : 0.02341437339782715
[2024/06/05 12:35:23] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.03373241424560547
[2024/06/05 12:35:23] ppocr DEBUG: rec_res num  : 3, elapsed : 0.01651167869567871
[2024/06/05 12:35:23] ppocr DEBUG: dt_boxes num : 2, elapsed : 0.033467769622802734
[2024/06/05 12:35:23] ppocr DEBUG: rec_res num  : 2, elapsed : 0.015752553939819336
[2024/06/05 12:35:23] ppocr DEBUG: dt_boxes num : 1, elapsed : 0.033954620361328125
[2024/06/05 12:35:23] ppocr DEBUG: rec_res num  : 1, elapsed : 0.0098216533660888