In [63]:
from docling.document_converter import DocumentConverter
import os
import json
import fitz
from PIL import Image


In [None]:
# Define colors for each recognized label
label_colors = {
    "caption": (1, 0, 0),             # Red
    "footnote": (0, 0, 1),            # Blue
    "formula": (0, 1, 0),             # Green
    "list_item": (1, 1, 0),           # Yellow
    "page_footer": (1, 0, 1),         # Magenta
    "page_header": (0, 1, 1),         # Cyan
    "picture": (0.5, 0.5, 0.5),       # Gray
    "section_header": (1, 0.5, 0),    # Orange
    "table": (0.6, 0.4, 0.2),         # Brown
    "text": (0, 0, 0),                # Black
    "title": (0.3, 0.7, 0.3),         # Olive Green
    "document_index": (0.5, 0.2, 0.7),# Purple
    "code": (0.9, 0.6, 0.1),          # Gold
    "checkbox_selected": (0.2, 0.7, 0.8), # Teal
    "checkbox_unselected": (0.7, 0.2, 0.6), # Pink
    "form": (0.4, 0.4, 0.8),          # Lavender
    "key_value_region": (0.6, 0.3, 0.9), # Violet
    "paragraph": (0.4, 0.5, 0.3),     # Dark Olive
    "reference": (0.8, 0.5, 0.2)      # Amber
}


def fetch_bbox_and_page_num(item):
    """
    Return the bounding box and page number of the item if label is recognized.
    """
    recognized_labels = {
        "caption", "footnote", "formula", "list_item", "page_footer",
        "page_header", "picture", "section_header", "table", "text",
        "title", "document_index", "code", "checkbox_selected",
        "checkbox_unselected", "form", "key_value_region", "paragraph",
        "reference"
    }
    
    if item.label in recognized_labels:
        return (item.label, item.prov[0].bbox), item.prov[0].page_no
    else:
        return f"{item.label} not found"

In [79]:

def annotate_pdf_with_bboxes(pdf_file, output_file, annotation_dictionary):
    """
    Annotates the given PDF with bounding boxes from annotation_dictionary
    and saves the annotated PDF. Skips annotations for pages that don't exist.
    """
    with fitz.open(pdf_file) as pdf:
        print(annotation_dictionary)
        for page_num in annotation_dictionary:
            # if page_num >= len(pdf):  # Skip if page does not exist in the PDF
            #     continue
            if page_num not in annotation_dictionary:
                continue # Skip if page does not have any annotations
            page = pdf[page_num - 1]  # Access each annotated page
            page_height = page.rect.height  # Get the height of the page for coordinate adjustment
            
            for (item, bbox) in annotation_dictionary[page_num]:
                # Extract l, t, r, b
                l, t, r, b = bbox.l, bbox.t, bbox.r, bbox.b
                
                # Adjust coordinates if needed, assuming bottom-left origin
                adjusted_rect = fitz.Rect(l, page_height - t, r, page_height - b)
                
                # Draw a rectangle on the page for each bounding box
                page.draw_rect(adjusted_rect, color=label_colors[item], width=1)  # Red outline for visibility

        # Save annotated PDF
        pdf.save(output_file)
 
 # Function to process and save a single PDF
def process_pdf(pdf_file, output_dir, aryn_api_key, page_limit=5):
    # Ensure output directories exist
    extracted_pdf_dir = os.path.join(output_dir, "saved_pdfs")
    annotated_pdf_dir = os.path.join(output_dir, "annotated_pdfs")
    
    os.makedirs(extracted_pdf_dir, exist_ok=True)
    os.makedirs(annotated_pdf_dir, exist_ok=True)
    
    # Extract specific pages (e.g., pages 1 to page_limit) and save to a temporary PDF
    output_pdf_file = os.path.join(extracted_pdf_dir, os.path.basename(pdf_file))
    with fitz.open(pdf_file) as pdf:
        pdf_extract = fitz.open()  # Create a new PDF
        for page_num in range(0, min(page_limit, len(pdf))):  # Pages 1 to page_limit
            pdf_extract.insert_pdf(pdf, from_page=page_num, to_page=page_num)
        pdf_extract.save(output_pdf_file)  # Save the extracted pages

    # Process the extracted PDF with docling
    docling_doc = DocumentConverter().convert(output_pdf_file).document
    # Fetch the bounding boxes and page numbers of the annotations
    annotation_dictionary = {}
    for item, level in docling_doc.iterate_items():
        item_and_bbox, page_num = fetch_bbox_and_page_num(item)
        # print(f"Bounding Box: {bbox}")
        if page_num not in annotation_dictionary:
            annotation_dictionary[page_num] = [item_and_bbox]
        else:
            annotation_dictionary[page_num].append(item_and_bbox)
    # Use the annotation dictionary to draw bounding boxes on the extracted pages and save as a new PDF with annotations in the annotated PDF directory
    # Annotate the extracted PDF and save
    annotated_pdf_file = os.path.join(annotated_pdf_dir, os.path.basename(pdf_file))
    annotate_pdf_with_bboxes(output_pdf_file, annotated_pdf_file, annotation_dictionary)
    



# Function to process all PDFs in a given directory
def process_pdf_directory(input_dir, output_dir, page_limit=5):
    # Get all PDF files in the input directory
    pdf_files = [f for f in os.listdir(input_dir) if f.endswith('.pdf')]
    
    # Process each PDF
    for pdf_file in pdf_files:
        full_pdf_path = os.path.join(input_dir, pdf_file)
        print(f"Processing {full_pdf_path}...")
        process_pdf(full_pdf_path, output_dir, page_limit)


# Define the input and output directories
input_pdf_dir = "../tst/sample_pdfs"  # Directory containing the PDFs to process
output_pdf_dir = "../tst"  # Directory to store the extracted and annotated PDFs


# Process all PDFs in the input directory
process_pdf_directory(input_pdf_dir, output_pdf_dir, page_limit=5)



Processing ../tst/sample_pdfs/_-1.pdf...


Fetching 9 files: 100%|██████████| 9/9 [00:00<00:00, 155344.59it/s]


{1: [BoundingBox(l=14.004631996154785, t=780.1151123046875, r=604.6612548828125, b=8.1182861328125, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>)], 2: [BoundingBox(l=13.104262351989746, t=781.578857421875, r=603.934326171875, b=8.20477294921875, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>)], 3: [BoundingBox(l=14.17208480834961, t=780.7562866210938, r=603.1598510742188, b=10.181396484375, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>)], 4: [BoundingBox(l=14.732715606689453, t=782.398193359375, r=603.611572265625, b=11.0948486328125, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>)], 5: [BoundingBox(l=14.709278106689453, t=781.0558471679688, r=603.7754516601562, b=8.38555908203125, coord_origin=<CoordOrigin.BOTTOMLEFT: 'BOTTOMLEFT'>)]}


ValueError: too many values to unpack (expected 2)