## Set Up the Environment

In [None]:
import fitz  # PyMuPDF
import os
from PIL import Image, ImageDraw
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient



In [None]:
# Azure Document Intelligence configuration.
# The values are read from environment variables so the notebook stays
# runnable and credentials are not hard-coded; set both variables (or replace
# the lookups with your own values) before running the cells below.
azure_endpoint = os.environ.get("AZURE_FORM_RECOGNIZER_ENDPOINT", "")  # add your endpoint
azure_key = os.environ.get("AZURE_FORM_RECOGNIZER_KEY", "")  # add your key

In [None]:
# Build the Azure Document Analysis client used for layout extraction
def initialize_azure_client(endpoint, key):
    """Return a ``DocumentAnalysisClient`` for *endpoint* authenticated with the API key *key*."""
    credential = AzureKeyCredential(key)
    client = DocumentAnalysisClient(endpoint=endpoint, credential=credential)
    return client

# Helper: outline one 4-vertex polygon, scaled from layout units to image pixels
def _draw_quad(draw, polygon, scale_x, scale_y, color):
    """Draw *polygon* on *draw* in *color* if it has exactly four vertices.

    Anything else (``None``, empty, or a different vertex count) is skipped.
    """
    if not polygon or len(polygon) != 4:
        return
    corners = [(point.x * scale_x, point.y * scale_y) for point in polygon]
    draw.polygon(corners, outline=color, width=2)


# Function to draw bounding boxes on an image
def annotate_page(page, layout_data):
    """Render *page* to an image and outline the layout elements found on it.

    Args:
        page: A PyMuPDF page; it is rasterized with ``page.get_pixmap()``.
        layout_data: The layout result for the same page. It must expose
            ``lines`` and ``selection_marks``; ``width``, ``height`` and
            ``tables`` are used when present.

    Returns:
        A PIL ``Image`` of the page with lines outlined in blue, selection
        marks in green (state ``"selected"``) or red (any other state), and
        table cells in purple.
    """
    pix = page.get_pixmap()
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    draw = ImageDraw.Draw(img)

    # Map layout coordinates onto the rendered image. Polygon coordinates are
    # assumed to share the unit of layout_data.width/height and to use a
    # top-left origin like the image, so no Y-flip is applied. When the page
    # dimensions are unavailable, fall back to 72 pixels per unit (the
    # original inches-to-points conversion).
    layout_width = getattr(layout_data, "width", None)
    layout_height = getattr(layout_data, "height", None)
    scale_x = pix.width / layout_width if layout_width else 72
    scale_y = pix.height / layout_height if layout_height else 72

    # Text lines
    for line in layout_data.lines or []:
        _draw_quad(draw, line.polygon, scale_x, scale_y, "blue")

    # Selection marks (check boxes etc.), colored by their state
    for mark in layout_data.selection_marks or []:
        color = "green" if mark.state == "selected" else "red"
        _draw_quad(draw, mark.polygon, scale_x, scale_y, color)

    # Table cells, only when the page object carries them.
    # NOTE(review): in azure-ai-formrecognizer, tables appear to be exposed on
    # the overall analysis result rather than on each page, so this branch may
    # never run for real results - confirm against the SDK version in use.
    for table in getattr(layout_data, "tables", None) or []:
        for cell in table.cells:
            _draw_quad(draw, getattr(cell, "polygon", None), scale_x, scale_y, "purple")

    return img

# Function to process and save a single PDF with annotations
def process_pdf_with_annotations(pdf_file, output_dir, azure_client, page_limit=5):
    """Extract the leading pages of a PDF, analyze them, and save an annotated copy.

    The first ``page_limit`` pages of *pdf_file* are written to
    ``<output_dir>/saved_pdfs/<name>``, sent to the ``prebuilt-layout`` model
    through *azure_client*, and each analyzed page is rendered with bounding
    boxes into ``<output_dir>/annotated_pdfs/annotated_<name>``.

    Args:
        pdf_file: Path of the source PDF.
        output_dir: Directory under which both output folders are created.
        azure_client: Client providing ``begin_analyze_document``.
        page_limit: Maximum number of leading pages to process.

    Returns:
        None. Progress is reported with ``print``.
    """
    extracted_pdf_dir = os.path.join(output_dir, "saved_pdfs")
    annotated_pdf_dir = os.path.join(output_dir, "annotated_pdfs")

    os.makedirs(extracted_pdf_dir, exist_ok=True)
    os.makedirs(annotated_pdf_dir, exist_ok=True)

    pdf_name = os.path.basename(pdf_file)
    output_pdf_file = os.path.join(extracted_pdf_dir, pdf_name)

    with fitz.open(pdf_file) as pdf:
        page_count = min(page_limit, len(pdf))
        if page_count <= 0:
            # An empty document cannot be saved, so there is nothing to analyze.
            print(f"Skipping {pdf_file}: no pages to process")
            return
        # The scratch document is closed by the context manager once saved.
        with fitz.open() as pdf_extract:
            pdf_extract.insert_pdf(pdf, from_page=0, to_page=page_count - 1)
            pdf_extract.save(output_pdf_file)

    # Analyze PDF layout with Azure Document Intelligence
    with open(output_pdf_file, "rb") as f:
        poller = azure_client.begin_analyze_document("prebuilt-layout", document=f)
        result = poller.result()

    # Annotate each page and save as a new PDF
    annotated_pdf_path = os.path.join(annotated_pdf_dir, f"annotated_{pdf_name}")
    with fitz.open(output_pdf_file) as pdf:
        # zip() pairs each rendered page with its layout result and stops at
        # the shorter sequence, so a result with fewer pages cannot raise
        # IndexError. The extracted file already holds at most page_limit pages.
        images = [
            annotate_page(page, layout_data)
            for page, layout_data in zip(pdf.pages(), result.pages)
        ]

    if not images:
        print(f"No annotated pages were produced for {pdf_file}")
        return

    images[0].save(
        annotated_pdf_path, "PDF", save_all=True, append_images=images[1:]
    )
    print(f"Annotated PDF has been saved at {annotated_pdf_path}")

# Function to process all PDFs in a given directory
def process_pdf_directory(input_dir, output_dir, azure_client, page_limit=5):
    """Run ``process_pdf_with_annotations`` on every PDF file directly inside *input_dir*.

    Args:
        input_dir: Directory that is listed (not recursively) for PDF files.
        output_dir: Passed through as the output directory for each file.
        azure_client: Passed through to the per-file processing function.
        page_limit: Maximum number of leading pages to process per file.

    Returns:
        None.
    """
    # Match the extension case-insensitively (".PDF" is common), ignore
    # directories whose name happens to end in ".pdf", and sort so the
    # processing order does not depend on the file system's listing order.
    pdf_files = sorted(
        name
        for name in os.listdir(input_dir)
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(input_dir, name))
    )
    for pdf_file in pdf_files:
        full_pdf_path = os.path.join(input_dir, pdf_file)
        print(f"Processing {full_pdf_path}...")
        process_pdf_with_annotations(full_pdf_path, output_dir, azure_client, page_limit)

# Client used for every layout-analysis request below
azure_client = initialize_azure_client(azure_endpoint, azure_key)

# Where the sample PDFs are read from and where the results are written
input_pdf_dir = "../tst/sample_pdfs"
output_pdf_dir = "../tst"

# Annotate up to the first five pages of each PDF found in the input directory
process_pdf_directory(
    input_dir=input_pdf_dir,
    output_dir=output_pdf_dir,
    azure_client=azure_client,
    page_limit=5,
)


Processing ../tst/sample_pdfs/_-1.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated__-1.pdf
Processing ../tst/sample_pdfs/1715882152079-fun-4.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated_1715882152079-fun-4.pdf
Processing ../tst/sample_pdfs/1715887458018-uop-14.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated_1715887458018-uop-14.pdf
Processing ../tst/sample_pdfs/1715883408139-sbr-1.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated_1715883408139-sbr-1.pdf
Processing ../tst/sample_pdfs/1715883096213-sio-1.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated_1715883096213-sio-1.pdf
Processing ../tst/sample_pdfs/1715885123422-kyl-1.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated_1715885123422-kyl-1.pdf
Processing ../tst/sample_pdfs/1715886035061-jjp-2.pdf...
Annotated PDF has been saved at ../tst/annotated_pdfs/annotated_1715886035061-jjp-2.pdf
Processing ../t