In [1]:
import os
from uuid import uuid4
from typing import Tuple
from pathlib import Path
import openparse
from openparse.schemas import TableElement, TextElement, ImageElement, LineElement
import pdfplumber

In [2]:
# Input/output directories, given relative to the notebook's working directory.
PDF_PATH = "../data/pdf"
IMAGE_PATH = "../data/images"

# Path objects built from the string constants above.
pdf_path, image_path = Path(PDF_PATH), Path(IMAGE_PATH)

In [9]:
# Test PDF file (https://arxiv.org/pdf/2410.10315v1)
# NOTE(review): that URL looks like the EasyRAG paper rather than the LightRAG
# file selected below -- confirm and update the link if so.
# Alternative input:
# filepath = pdf_path / "EasyRAG: Efficient Retrieval-Augmented Generation Framework for Automated Network Operations.pdf"
filepath = pdf_path.joinpath("LightRAG: Simple and Fast Retrieval-Augmented Generation.pdf")

In [10]:
# Options handed to openparse's table extraction: which parsing algorithm to
# use and the minimum table confidence setting.
table_args = dict(
    parsing_algorithm="unitable",
    min_table_confidence=0.8,
)

# Build the parser with those table options, then parse the selected PDF.
parser = openparse.DocumentParser(table_args=table_args)
parsed_basic_doc = parser.parse(filepath)

In [11]:
# Element classes that are collected; matching is on the exact type
# (subclasses are not matched).
allowed_elements = [TextElement, LineElement, TableElement, ImageElement]

# Flatten every node's elements into one list. An entry that is itself an
# allowed element is taken as-is; any other entry is iterated one level deep
# and only its allowed children are kept.
all_elements = []

nodes = parsed_basic_doc.nodes
for node in nodes:
    for entry in node.elements:
        if type(entry) in allowed_elements:
            all_elements.append(entry)
        else:
            all_elements.extend(
                child for child in entry if type(child) in allowed_elements
            )

# Subset containing only the table elements.
table_elements = list(filter(lambda item: isinstance(item, TableElement), all_elements))

In [12]:
def extract_pdf_image(pdf_path, element, save_path, resolution=500):
    """Crop the region covered by ``element`` out of a PDF page and save it as an image.

    Args:
        pdf_path: Path of the PDF to open with ``pdfplumber``.
        element: Object exposing ``bbox`` with ``x0``, ``y0``, ``x1``, ``y1``,
            ``page_height`` and ``page`` (used as an index into the PDF's pages).
        save_path: Destination passed to the rendered image's ``save``.
        resolution: Rendering resolution forwarded to ``to_image``.

    The PDF is opened in a ``with`` block so its file handle is released after
    every call, including when cropping, rendering, or saving raises.
    """

    def _get_bbox(element) -> Tuple[float, float, float, float]:
        # Flip the vertical axis using the page height; presumably the element's
        # bbox has a bottom-left origin while pdfplumber crops from the top-left
        # -- TODO confirm against the producer of `element`.
        box = element.bbox
        return (
            box.x0,
            box.page_height - box.y1,
            box.x1,
            box.page_height - box.y0,
        )

    with pdfplumber.open(pdf_path) as pdf_obj:
        page = pdf_obj.pages[element.bbox.page]
        cropped_page = page.crop(_get_bbox(element))
        image = cropped_page.to_image(resolution=resolution)
        # Save while the PDF is still open, then let the context manager close it.
        image.save(save_path)

In [15]:
# Make sure the output directory exists; otherwise the first save fails when
# ../data/images has not been created yet.
image_path.mkdir(parents=True, exist_ok=True)

# Export one cropped image per table ("t-" prefix) or image ("i-" prefix)
# element; every other element type is skipped. Names use a fresh uuid4, so
# re-running this cell adds new files instead of overwriting earlier ones.
# NOTE(review): the files are named .jpg, but pdfplumber's image save may
# default to PNG encoding -- confirm the extension matches the written format.
for element in all_elements:
    if isinstance(element, TableElement):
        filename = f"t-{uuid4()}.jpg"
    elif isinstance(element, ImageElement):
        filename = f"i-{uuid4()}.jpg"
    else:
        continue

    save_path = image_path / filename
    extract_pdf_image(filepath, element, save_path)
    