In [None]:
# NOTE: Textractor needs an S3 upload path whenever the input file is not already in S3.

import os
from textractor import Textractor
from textractor.data.constants import TextractFeatures

def _collect_page_boxes(page):
    """Adapt one Textractor page's entities to the shape ``annotate_page`` reads.

    ``annotate_page`` expects objects exposing
    ``bounding_box.left/top/right/bottom``. Textractor entities expose
    ``bbox`` with ``x``, ``y``, ``width`` and ``height`` instead, so each
    entity is wrapped in a tiny namespace carrying the converted corners.
    """
    from types import SimpleNamespace

    boxes = []
    # Lines, tables and key-value pairs correspond to the text plus the
    # TABLES/FORMS features requested from Textract.
    for entity in (*page.lines, *page.tables, *page.key_values):
        bbox = entity.bbox
        if bbox is None:
            continue
        boxes.append(
            SimpleNamespace(
                bounding_box=SimpleNamespace(
                    left=bbox.x,
                    top=bbox.y,
                    right=bbox.x + bbox.width,
                    bottom=bbox.y + bbox.height,
                )
            )
        )
    return boxes


def process_multi_page_pdf(pdf_file, output_dir, page_limit=5, s3_upload_path=None):
    """Run Textract table/form analysis on a PDF and annotate its first pages.

    Args:
        pdf_file: Local path or ``s3://`` URI of the PDF to analyse.
        output_dir: Directory that receives the annotated page images.
        page_limit: Maximum number of leading pages to annotate.
        s3_upload_path: ``s3://bucket/prefix`` that Textractor uploads a local
            file to before starting the asynchronous job. When omitted, the
            ``TEXTRACTOR_S3_UPLOAD_PATH`` environment variable is consulted
            (a convention of this script, not of Textractor).

    Raises:
        InputError: Raised by Textractor when ``pdf_file`` is not in S3 and no
            upload path could be determined.
    """
    if s3_upload_path is None:
        s3_upload_path = os.environ.get("TEXTRACTOR_S3_UPLOAD_PATH", "")

    # Initialize Textractor
    extractor = Textractor(profile_name='default')

    # Start document analysis. The returned object is a lazily-resolved
    # document: the first attribute access blocks until the job has finished,
    # so no separate "fetch results" call is needed.
    document = extractor.start_document_analysis(
        file_source=pdf_file,
        features=[TextractFeatures.TABLES, TextractFeatures.FORMS],
        s3_upload_path=s3_upload_path,
    )

    # Process and annotate each page. Pages are numbered from 1 in reading
    # order; slicing naturally copes with PDFs shorter than page_limit.
    for page_number, page in enumerate(document.pages[:max(page_limit, 0)], start=1):
        page_data = _collect_page_boxes(page)

        # Annotate and save each page that actually has something to draw
        if page_data:
            annotate_page(pdf_file, page_number, page_data, output_dir)

def annotate_page(pdf_file, page_number, page_data, output_dir):
    """Render one PDF page to PNG with a red box around every element.

    Args:
        pdf_file: Path of the PDF to render.
        page_number: 1-based number of the page to render.
        page_data: Iterable of elements exposing
            ``bounding_box.left/top/right/bottom``; the coordinates are
            multiplied by the image size, i.e. treated as page fractions.
        output_dir: Directory that receives ``annotated_page_<n>.png``;
            created if it does not exist yet.

    Raises:
        IndexError: If ``page_number`` is outside the PDF's page range.
    """
    import fitz  # PyMuPDF
    from PIL import Image, ImageDraw

    # Load the PDF with PyMuPDF; the context manager releases the file handle
    # even when rendering fails.
    with fitz.open(pdf_file) as pdf_document:
        # Reject 0/negative numbers explicitly: they would otherwise index
        # from the end of the document and annotate the wrong page.
        if not 1 <= page_number <= len(pdf_document):
            raise IndexError(
                f"page {page_number} out of range for {pdf_file} "
                f"({len(pdf_document)} pages)"
            )
        page = pdf_document[page_number - 1]

        # Render page to an image; frombytes copies the pixel buffer, so the
        # image remains valid after the document is closed.
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    # Draw bounding boxes using PIL
    draw = ImageDraw.Draw(img)
    for element in page_data:
        bbox = element.bounding_box
        draw.rectangle(
            [
                (bbox.left * img.width, bbox.top * img.height),
                (bbox.right * img.width, bbox.bottom * img.height),
            ],
            outline="red",
            width=2,
        )

    # Save annotated page
    os.makedirs(output_dir, exist_ok=True)
    annotated_image_path = os.path.join(output_dir, f"annotated_page_{page_number}.png")
    img.save(annotated_image_path)
    print(f"Annotated page saved at {annotated_image_path}")

def process_pdf_directory(input_dir, output_dir, page_limit=5):
    """Annotate every PDF found directly inside ``input_dir``.

    Each PDF gets its own sub-directory of ``output_dir`` (named after the
    file without its extension). The page images are all called
    ``annotated_page_<n>.png``, so writing them into one shared directory
    would let each PDF overwrite the previous one's pages.

    Args:
        input_dir: Directory scanned (non-recursively) for ``*.pdf`` files.
        output_dir: Root directory for the annotated images; created if
            missing.
        page_limit: Maximum number of leading pages annotated per PDF.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so runs are reproducible; os.listdir order is arbitrary.
    for pdf_file in sorted(os.listdir(input_dir)):
        if not pdf_file.lower().endswith('.pdf'):
            continue

        full_pdf_path = os.path.join(input_dir, pdf_file)
        if not os.path.isfile(full_pdf_path):
            # A directory that merely happens to be named "*.pdf"
            continue

        pdf_output_dir = os.path.join(output_dir, os.path.splitext(pdf_file)[0])
        os.makedirs(pdf_output_dir, exist_ok=True)

        print(f"Processing: {full_pdf_path}")
        process_multi_page_pdf(full_pdf_path, pdf_output_dir, page_limit)

if __name__ == "__main__":
    # Script settings: source folder of PDFs, destination folder for the
    # annotated images, and how many leading pages of each PDF to annotate.
    input_pdf_dir, output_pdf_dir = "../tst/saved_pdfs", "../tst/annotated_pdfs"
    page_limit = 5

    process_pdf_directory(
        input_dir=input_pdf_dir,
        output_dir=output_pdf_dir,
        page_limit=page_limit,
    )


Processing: ../tst/saved_pdfs/_-1.pdf


InputError: For files not in S3, an S3 upload path must be provided