In [2]:
!pip install paddlepaddle paddleocr

Collecting paddlepaddle
  Downloading paddlepaddle-2.6.1-cp312-cp312-macosx_10_9_x86_64.whl.metadata (8.6 kB)
Collecting paddleocr
  Using cached paddleocr-2.7.3-py3-none-any.whl.metadata (26 kB)
Collecting astor (from paddlepaddle)
  Using cached astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (from paddlepaddle)
  Using cached opt_einsum-3.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting shapely (from paddleocr)
  Downloading shapely-2.0.4-cp312-cp312-macosx_10_9_x86_64.whl.metadata (7.0 kB)
Collecting scikit-image (from paddleocr)
  Downloading scikit_image-0.23.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (14 kB)
Collecting imgaug (from paddleocr)
  Using cached imgaug-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting pyclipper (from paddleocr)
  Downloading pyclipper-1.3.0.post5-cp312-cp312-macosx_10_9_x86_64.whl.metadata (9.0 kB)
Collecting lmdb (from paddleocr)
  Using cached lmdb-1.4.1.tar.gz (881 kB)
  Preparing metadata (setup.py) ... 

In [3]:
import os
import fitz  # pip install PyMuPDF
from tqdm import tqdm
import numpy as np
import base64
from paddleocr import PaddleOCR


def pdf_ocr_txt(filepath, dir_path="tmp_files"):
    """Run OCR over every page of a PDF and write the recognized text to a file.

    Each page is rendered to a pixmap, packed into the dict format expected by
    ``ocr`` (base64-encoded raw pixels plus height/width/channels), and the
    recognized text lines are appended to ``<pdf file name>.txt``.

    Args:
        filepath: Path of the PDF to process.
        dir_path: Name of the output directory, created next to the PDF if it
            does not exist yet.

    Returns:
        Path of the text file that was written.

    Raises:
        RuntimeError: If ``ocr`` reports an error string for a page.
    """
    full_dir_path = os.path.join(os.path.dirname(filepath), dir_path)
    os.makedirs(full_dir_path, exist_ok=True)
    txt_file_path = os.path.join(full_dir_path, "{}.txt".format(os.path.split(filepath)[-1]))
    img_name = os.path.join(full_dir_path, 'tmp.png')

    doc = fitz.open(filepath)
    try:
        with open(txt_file_path, 'w', encoding='utf-8') as fout:
            for page_index in tqdm(range(doc.page_count)):
                page = doc.load_page(page_index)
                pix = page.get_pixmap()  # render the PDF page to an image
                img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.h, pix.w, pix.n))

                img_data = {"img64": base64.b64encode(img).decode("utf-8"), "height": pix.h, "width": pix.w,
                            "channels": pix.n}
                result = ocr(img_data)
                # ocr() signals failure with a plain string; surface it instead of
                # letting the indexing below fail on individual characters.
                if isinstance(result, str):
                    raise RuntimeError("OCR failed on page {}: {}".format(page_index + 1, result))

                # Drop empty/None groups, then take the text of each detection.
                # Assumes each detection looks like [box, (text, score)] -- confirm
                # against the installed PaddleOCR version.
                detections = [group for group in (result or []) if group]
                page_lines = [item[1][0] for group in detections for item in group]

                # Terminate every line so the last line of one page is not glued
                # to the first line of the next page.
                fout.writelines(text + "\n" for text in page_lines)
    finally:
        # Release the PDF handle even if rendering or OCR raised.
        doc.close()

    # This function never writes tmp.png itself; the cleanup is kept so any
    # stale file of that name in the output directory is still removed.
    if os.path.exists(img_name):
        os.remove(img_name)
    return txt_file_path


_OCR_ENGINE = None  # PaddleOCR instance, built on first use and reused by later ocr() calls


def _get_ocr_engine():
    """Return the shared PaddleOCR engine, constructing it on the first call."""
    global _OCR_ENGINE
    if _OCR_ENGINE is None:
        # Same constructor arguments as before; only built once instead of per call.
        _OCR_ENGINE = PaddleOCR(use_angle_cls=True, lang="ch", use_gpu=True, show_log=False, type='structure')
    return _OCR_ENGINE


def ocr(img_data):
    """Recognize text in a raw image passed as a base64 payload.

    Args:
        img_data: Dict with keys ``img64`` (base64 string of the raw uint8
            pixel buffer), ``height``, ``width`` and ``channels``; the decoded
            buffer is reshaped to ``(height, width, channels)``.

    Returns:
        The value returned by ``PaddleOCR.ocr`` for the decoded image, or the
        string ``'error: No file was uploaded.'`` when ``img64`` is missing or
        empty.
    """
    img_file = img_data.get('img64')

    # No image payload: report the error before decoding anything or loading
    # the OCR models (previously this check came after the reshape).
    if not img_file:
        return 'error: No file was uploaded.'

    height = img_data['height']
    width = img_data['width']
    channels = img_data['channels']

    # Rebuild the pixel array from the base64-encoded raw buffer.
    binary_data = base64.b64decode(img_file)
    img_array = np.frombuffer(binary_data, dtype=np.uint8).reshape((height, width, channels))

    # Run recognition and hand the result back unchanged.
    return _get_ocr_engine().ocr(img_array)

In [4]:
# Run OCR over the sample PDF and print the path of the generated text file.
pdf_path = '../data/表格.pdf'
res = pdf_ocr_txt(pdf_path)
print(res)

100%|██████████| 2/2 [00:24<00:00, 12.35s/it]

../data/tmp_files/表格.pdf.txt



