## Alternative open-source document parsing script that does not use Google Document AI (draft)

In [51]:
import layoutparser as lp
import pytesseract
from pdf2image import convert_from_bytes
from PIL import Image
import cv2
import json
import os
from io import BytesIO
import ipywidgets as widgets
from IPython.display import display
from pathlib import Path
import numpy as np

In [52]:
# File-picker widget: accepts several files per upload, limited to PDFs and
# PNG/JPEG images by the `accept` filter.
uploader = widgets.FileUpload(
    multiple=True,
    accept='.pdf,.png,.jpg,.jpeg',
)
display(uploader)

FileUpload(value=(), accept='.pdf,.png,.jpg,.jpeg', description='Upload', multiple=True)

In [53]:
def correct_orientation(image):
    """Rotate ``image`` upright using Tesseract's orientation detection (OSD).

    Orientation correction is best effort: if OSD fails, or its report does
    not contain a parseable ``Rotate`` value, a message is printed and the
    image is returned unchanged instead of raising.

    Args:
        image: Image to inspect. It is passed to ``pytesseract.image_to_osd``
            and must provide ``rotate(angle, expand=True)`` (callers in this
            file pass PIL images).

    Returns:
        The rotated image when OSD reports a non-zero rotation, otherwise
        the input image.
    """
    # Keep the try body limited to the call that raises TesseractError.
    try:
        osd = pytesseract.image_to_osd(image)
    except pytesseract.TesseractError as e:
        print("OSD detection failed:", e)
        return image

    # The OSD report is a series of "Key: value" lines; pick the "Rotate" one.
    # Falling back to '' makes a missing line surface as IndexError below.
    rotate_line = next(
        (line for line in osd.splitlines() if 'Rotate' in line), ''
    )
    try:
        rotate_angle = int(rotate_line.split(':')[1])
    except (IndexError, ValueError):
        print("OSD detection failed: could not parse rotation from OSD output")
        return image

    if rotate_angle != 0:
        print(f"Rotating image by {rotate_angle} degrees")
        # NOTE(review): PIL's rotate() is documented as counter-clockwise, so
        # 360 - angle presumably applies the correction in the direction
        # Tesseract reports - confirm against the Pillow/Tesseract docs.
        image = image.rotate(360 - rotate_angle, expand=True)
    return image

In [54]:
def process_image(image, filename, page_num=None, output_dir=None):
    """Run OCR on one image and save the result as a JSON file.

    The image is first passed through ``correct_orientation`` and then
    through ``pytesseract.image_to_string``. The result is written to
    ``<output_dir>/<display_name>_ocr.json``, where ``display_name`` is
    ``filename`` optionally suffixed with ``_page<page_num>``.

    Args:
        image: Image to OCR (callers in this file pass PIL images).
        filename: Base name (without extension) used for the output file.
        page_num: Optional page number appended to the output name, used for
            multi-page PDFs.
        output_dir: Directory that receives the JSON file. Defaults to the
            original hard-coded location so existing callers are unaffected.
            The directory is created if it does not exist.

    Returns:
        None. The OCR result is written to disk and progress is printed.
    """
    if output_dir is None:
        output_dir = r"C:\Users\nehaj\OneDrive\Desktop\AI_agent\test_json"

    display_name = f"{filename}_page{page_num}" if page_num is not None else filename
    print(f"Processing image: {display_name}")

    # Create the target folder up front so open() below cannot fail with
    # FileNotFoundError on a machine where it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)
    output_loc = os.path.join(output_dir, display_name + "_ocr.json")

    # Correct orientation
    image = correct_orientation(image)

    # OCR using Tesseract
    ocr_text = pytesseract.image_to_string(image)

    result = {
        # Note: this field stores the path of the JSON file being written.
        "filename": output_loc,
        "ocr_text": ocr_text,
        # Always written as an empty list; nothing in this function fills it.
        "text_blocks": []
    }

    # Save output as JSON
    with open(output_loc, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"Saved OCR output to {output_loc}")

In [55]:
def process_uploaded_files(files):
    """OCR every uploaded file, dispatching on its file extension.

    PDFs are converted to one image per page with ``convert_from_bytes`` and
    each page is passed to ``process_image`` with a 1-based page number.
    PNG/JPEG files are opened with PIL, converted to RGB and passed to
    ``process_image`` directly. Any other extension is reported and skipped.
    A failure in one file is printed and does not stop the remaining files.

    Args:
        files: Iterable of mappings, each with a ``'name'`` entry (the file
            name) and a ``'content'`` entry (the raw file bytes).

    Returns:
        None. Results are written by ``process_image``; progress is printed.
    """
    for file in files:
        file_name = file['name']
        file_content = file['content']

        print(f"\nProcessing uploaded file: {file_name}")

        # Computed once: extension checks are case-insensitive and the
        # extension-less name is used for the output files.
        lowered_name = file_name.lower()
        base_name = os.path.splitext(file_name)[0]

        # Handle PDFs
        if lowered_name.endswith('.pdf'):
            try:
                images = convert_from_bytes(file_content)
                for page_num, image in enumerate(images, start=1):
                    process_image(image, base_name, page_num)
            except Exception as e:
                # Per-file boundary: report and continue with the next file.
                print(f"Failed to process PDF {file_name}: {e}")

        # Handle image files
        elif lowered_name.endswith(('.png', '.jpg', '.jpeg')):
            try:
                # The file imports `BytesIO` directly (`from io import
                # BytesIO`); the name `io` is never bound, so `io.BytesIO`
                # raised NameError for every image upload.
                image = Image.open(BytesIO(file_content)).convert("RGB")
                process_image(image, base_name)
            except Exception as e:
                # Per-file boundary: report and continue with the next file.
                print(f"Failed to process image {file_name}: {e}")

        else:
            print(f"Unsupported file type: {file_name}")


In [56]:
# Process whatever is currently held by the upload widget; an empty value
# (nothing uploaded yet) is skipped.
pending_uploads = uploader.value
if pending_uploads:
    process_uploaded_files(pending_uploads)


Processing uploaded file: truck consignment, exit certificate, delivery note.pdf
Processing image: truck consignment, exit certificate, delivery note_page1
Rotating image by 90 degrees
Saved OCR output to C:\Users\nehaj\OneDrive\Desktop\AI_agent\test_json\truck consignment, exit certificate, delivery note_page1_ocr.json
Processing image: truck consignment, exit certificate, delivery note_page2
Saved OCR output to C:\Users\nehaj\OneDrive\Desktop\AI_agent\test_json\truck consignment, exit certificate, delivery note_page2_ocr.json
Processing image: truck consignment, exit certificate, delivery note_page3
Saved OCR output to C:\Users\nehaj\OneDrive\Desktop\AI_agent\test_json\truck consignment, exit certificate, delivery note_page3_ocr.json
Processing image: truck consignment, exit certificate, delivery note_page4
Saved OCR output to C:\Users\nehaj\OneDrive\Desktop\AI_agent\test_json\truck consignment, exit certificate, delivery note_page4_ocr.json
