In [10]:
import os
import ipykernel
import pytesseract as py
import unstructured as un
from pathlib import Path
import pytesseract
# Tell pytesseract where the Tesseract executable is installed on this machine.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Put the Poppler binaries on PATH (presumably required by the PDF
# partitioning below -- confirm against your environment).
# Append only when the directory is not already listed, so re-running this
# cell does not keep growing PATH with duplicate entries.
POPPLER_BIN_DIR = r"C:\Program Files\poppler\poppler-24.08.0\Library\bin"
_current_path = os.environ.get("PATH", "")
if POPPLER_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ["PATH"] = _current_path + os.pathsep + POPPLER_BIN_DIR

In [36]:
def read_pdf_files(corpus_path="corpus"):
    """Collect the paths of all PDF files found under a directory tree.

    The directory is walked recursively and a file counts as a PDF when its
    name ends with ".pdf", compared case-insensitively.

    Args:
        corpus_path: Root directory to search. Defaults to "corpus", which is
            the location the notebook originally hard-coded.

    Returns:
        A sorted list of path strings (root joined with the file name). The
        list is empty when no PDF is found or when the directory does not
        exist, because os.walk yields nothing in that case.
    """
    pdf_files = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(corpus_path)
        for name in names
        if name.lower().endswith(".pdf")
    ]
    # Sort so that the processing order does not depend on the order in which
    # the filesystem happens to enumerate directory entries.
    return sorted(pdf_files)

In [37]:
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Text
import pytesseract

def extract_text_with_metadata(pdf_path):
    """Extract the text of a PDF, grouped by page.

    The file is partitioned with the "hi_res" strategy. Every element that is
    a ``Text`` instance with non-blank text contributes its stripped text to
    the page given by the "page_number" entry of its metadata; elements
    without that entry are collected under the key "unknown".

    Args:
        pdf_path: Path of the PDF file (anything ``Path`` accepts).

    Returns:
        A dict mapping page number (or "unknown") to that page's text
        chunks joined with newlines, in the order the elements were returned.
    """
    source = Path(pdf_path)
    elements = partition_pdf(
        filename=str(source),
        strategy="hi_res",
        extract_images_in_pdf=False,
    )

    chunks_by_page = {}
    for element in elements:
        # Skip anything that is not text or is only whitespace.
        if not isinstance(element, Text):
            continue
        stripped = element.text.strip()
        if not stripped:
            continue

        info = element.metadata.to_dict() if element.metadata else {}
        page_key = info.get("page_number", "unknown")
        chunks_by_page.setdefault(page_key, []).append(stripped)

    # Collapse each page's list of chunks into one newline-separated string.
    return {page: "\n".join(chunks) for page, chunks in chunks_by_page.items()}

In [38]:
from pathlib import Path
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Image

def extract_images_with_metadata(pdf_path):
    """Extract the images of a PDF together with basic metadata.

    The file is partitioned with the "hi_res" strategy, asking for "Image",
    "Figure" and "Table" blocks to be delivered in the element payload.
    Only elements that are ``Image`` instances are reported.

    NOTE(review): "Table" blocks are requested above, but table elements may
    not be ``Image`` instances and would then be dropped here -- confirm
    whether that is intended.

    Args:
        pdf_path: Path of the PDF file (anything ``Path`` accepts).

    Returns:
        A list with one dict per image element, holding "pdf_name" (file
        name of the PDF), "page_number" and "image_base64"; the latter two
        are taken from the element metadata and are None when absent.
    """
    source = Path(pdf_path)
    elements = partition_pdf(
        filename=str(source),
        strategy="hi_res",
        infer_table_structure=True,
        extract_image_block_types=["Image", "Figure", "Table"],
        extract_image_block_to_payload=True,
        chunking_strategy=None,
    )

    def _describe(element):
        # Build the record for a single image element.
        info = element.metadata.to_dict() if element.metadata else {}
        return {
            "pdf_name": source.name,
            "page_number": info.get("page_number"),
            "image_base64": info.get("image_base64"),
        }

    return [_describe(element) for element in elements if isinstance(element, Image)]

In [None]:
def process_all_pdfs():
    """Run text and image extraction over every PDF returned by read_pdf_files.

    For each PDF a progress line is printed, followed by a preview of at most
    500 characters of every page's text and the number of extracted images.

    NOTE(review): the text and image helpers are called separately for each
    file, so each PDF is likely partitioned twice -- confirm whether that
    cost matters for your corpus.

    Returns:
        A dict keyed by PDF path; each value is a dict with "text" (the
        result of extract_text_with_metadata) and "images" (the result of
        extract_images_with_metadata).
    """
    results_by_pdf = {}

    for path in read_pdf_files():
        print(f"\nProcessing: {path}")

        pages = extract_text_with_metadata(path)
        images = extract_images_with_metadata(path)
        results_by_pdf[path] = {"text": pages, "images": images}

        # Show a short preview of each page; mark truncated pages with "...".
        for page_no, page_text in pages.items():
            print(f"\n--- {path} | Page {page_no} ---")
            suffix = "..." if len(page_text) > 500 else ""
            print(page_text[:500], suffix)

        print(f"\nExtracted {len(images)} images from {path}\n")

    return results_by_pdf
