## 📋 Table of Contents

This notebook guides you through the following sections:

1. [**NER and Summarization of Labeled Documents (`Invoice`) with GPT-4o Multimodality + Pydantic**](#optical-character-recognition-ocr-with-gpt-4o-multipack): We are using GPT-4o multimodality and the `instructor` library along with Pydantic to extract necessary data, provide summaries, and run validation for classified invoices.

In [1]:
import os

# Define the target directory
target_directory = r"C:\Users\pablosal\Desktop\gbb-ai-smart-document-processing"  # change your directory here

# Check if the directory exists
if os.path.exists(target_directory):
    # Change the current working directory
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")
else:
    print(f"Directory {target_directory} does not exist.")

Directory changed to C:\Users\pablosal\Desktop\gbb-ai-smart-document-processing


In [2]:
import instructor
from openai import AzureOpenAI
import os

# Initialize the AzureOpenAI client
client = instructor.from_openai(AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("DEPLOYMENT_VERSION"),
    api_key=os.getenv("OPENAI_API_KEY"),
))

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import base64
import os
from pydantic import BaseModel, Field, ValidationError, root_validator
from typing import Optional, List, Dict

# Define the Pydantic models for items and invoices
class Item(BaseModel):
    name: str
    price: float
    quantity: int

class Invoice(BaseModel):
    items: List[Item]
    total: float
    reference_number: str
    signature_on_document: str
    origin_address: str
    destination_address: str
    summary: Optional[str] = None 

    @root_validator(pre=True)
    def check_total_and_reference_number(cls, values):
        items = values.get('items', [])
        total = values.get('total')
        reference_number = values.get('reference_number')
        
        # Check if the total matches the sum of item prices
        calculated_total = sum(item['price'] * item['quantity'] for item in items)
        if calculated_total != total:
            raise ValueError(
                f"Total {total} does not match the sum of item prices {calculated_total}"
            )
        
        # Check if the reference number is present
        if not reference_number:
            raise ValueError("Reference number is missing")
        
        return values

def extract_receipt(file_path: str) -> Invoice:
    with open(file_path, "rb") as image_file:
        image_bytes = image_file.read()

    # Encode the image in base64
    encoded_image = base64.b64encode(image_bytes).decode("utf-8")

    # Create the user message with the encoded image
    user_message = {
        "role": "user",
        "content": [
            {
         "type": "image_url",
"image_url": {
    "url": f"data:image/jpeg;base64,{encoded_image}",
},
},
{
    "type": "text",
    "text": (
        "You will be analyzing an image of an invoice and extracting specific details from it. "
        "When I provide an image, all further input from the 'Instructor:' will be related to extracting information from that image.\n\n"
        "## Details to Extract:\n"
        "1. **Items**: For each item listed on the invoice, extract the following:\n"
        "   - **Name**: The name or description of the item.\n"
        "   - **Price**: The price of the item.\n"
        "   - **Quantity**: The quantity of the item.\n\n"
        "2. **Total Amount**: Extract the total amount listed on the invoice.\n\n"
        "3. **Signature**: Extract any signature present on the invoice. If no signature is present, indicate 'No signature present'.\n\n"
        "4. **Origin Address**: Extract the origin address from the invoice.\n\n"
        "5. **Destination Address**: Extract the destination address from the invoice.\n\n"
        "6. **Reference Number**: Extract the reference number from the invoice. Ensure the reference number is correctly identified and extracted.\n\n"
        "7. **Summary**: Provide a summary of the document, describing in detail what the document is about and some of the key details.\n\n"
        "## Formatting Guidelines:\n"
        "Ensure the extracted information is clearly formatted as follows:\n\n"
        "### Items:\n"
        "- **Item 1**: Name, Price, Quantity\n"
        "- **Item 2**: Name, Price, Quantity\n"
        "- (and so on for each item)\n\n"
        "### Total Amount:\n"
        "- **Total Amount**: [Total Amount]\n\n"
        "### Signature:\n"
        "- **Signature**: [Signature or 'No signature present']\n\n"
        "### Origin Address:\n"
        "- **Origin Address**: [Origin Address]\n\n"
        "### Destination Address:\n"
        "- **Destination Address**: [Destination Address]\n\n"
        "### Reference Number:\n"
        "- **Reference Number**: [Reference Number]\n\n"
        "### Summary:\n"
        "- **Summary**: [Summary]\n\n"
        "## Accuracy:\n"
        "Ensure all details are accurate and clearly labeled. Double-check the extracted information to ensure it matches the details in the invoice image. "
        "Pay special attention to the reference number to ensure it is correctly identified and extracted."
    ),
 },
 ],
 }

    result = client.chat.completions.create(
        model=os.getenv("DEPLOYMENT_ID"),
        max_tokens=4000,
        response_model=Invoice,
        temperature=0,
        messages=[user_message],
    )

    # Parse the result and return as Invoice
    if result:
        try:
            receipt_data = result.model_dump()
            receipt = Invoice(**receipt_data)
            return receipt
        except ValidationError as e:
            print(f"Error in parsing receipt details: {e}")
            raise
    else:
        raise ValueError("No response from the Azure OpenAI service")


C:\Users\pablosal\AppData\Local\Temp\ipykernel_31432\2633541655.py:21: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.8/migration/
  @root_validator(pre=True)


In [4]:
# Example usage
file_path = r"C:\Users\pablosal\Desktop\gbb-ai-smart-document-processing\utils\data\scanned\test\invoice\invoice_4.png"
try:
    invoice_data = extract_receipt(file_path)
    print("Extracted Items:", invoice_data.items)
    print("Extracted Total Amount:", invoice_data.total)
    print("Extracted Reference Number:", invoice_data.reference_number)
except Exception as e:
    print(f"An error occurred: {e}")

Extracted Items: [Item(name='HP Folie DIN A4, 50 St. mit Schutzhülle für HP PaintJet', price=125.28, quantity=2), Item(name='HP Einzelblätter DIN A4, 200 Blatt f. HP DeskJet 500C u. HP PaintJet', price=43.83, quantity=5)]
Extracted Total Amount: 469.71
Extracted Reference Number: 93980


In [6]:
import json
from typing import Any


def invoice_to_json(invoice: Invoice) -> str:
    invoice_dict = invoice.dict()

    json_object = {
        "id": invoice_dict.get("reference_number"),
        "content": "",  
        "content_vector": [],
        "total": invoice_dict.get("total"),
        "reference_number": invoice_dict.get("reference_number"),
        "signature_on_document": invoice_dict.get("signature_on_document"),
        "origin_address": invoice_dict.get("origin_address"),
        "destination_address": invoice_dict.get("destination_address"),
        "items_purchased": [
            {"list_item": f"{item.name}, {item.price}, {item.quantity}"}
            for item in invoice.items
        ],
    }

    return json.dumps(json_object, indent=4)

json_output = invoice_to_json(invoice_data)
print(json_output)

{
    "id": "93980",
    "content": "",
    "content_vector": [],
    "total": 469.71,
    "reference_number": "93980",
    "signature_on_document": "No signature present",
    "origin_address": "OHLBERG GmbH, Unterrather Str. 42, W-4000 D\u00fcsseldorf 30",
    "destination_address": "INBIFO Institut f. biologische Forschung GmbH, Fuggerstr. 3, 5000 K\u00f6ln 90",
    "items_purchased": [
        {
            "list_item": "HP Folie DIN A4, 50 St. mit Schutzh\u00fclle f\u00fcr HP PaintJet, 125.28, 2"
        },
        {
            "list_item": "HP Einzelbl\u00e4tter DIN A4, 200 Blatt f. HP DeskJet 500C u. HP PaintJet, 43.83, 5"
        }
    ]
}
