In [None]:
# The new Document Intelligence SDK is only supported in some Azure regions; use westeurope for now
%pip install --upgrade azure-ai-documentintelligence 
%pip install python-dotenv pandas

In [None]:
# %%writefile -a ./.env

# AZURE_DI_KEY=
# AZURE_DI_ENDPOINT=
# AZURE_DI_MODEL_NAME=

Appending to ./.env


# Use prebuilt receipt model 

In [None]:

import os
from dotenv import load_dotenv
load_dotenv()

# Read the Azure Document Intelligence endpoint and key from the environment
# (either value is None when its variable is not set).
endpoint = os.environ.get("AZURE_DI_ENDPOINT")
key = os.environ.get("AZURE_DI_KEY")


from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat, AnalyzeResult

def format_price(price_dict):
    """Concatenate the parts of a currency value into one display string.

    Args:
        price_dict: Mapping holding the parts of a currency value. The parts
            are joined in the mapping's own order with no separator.
            May be None or empty.

    Returns:
        The joined string, or an empty string when ``price_dict`` is None
        or empty.
    """
    if not price_dict:
        return ""
    # Mappings expose their values through .values(); the previous
    # .valueStrings() call does not exist and raised AttributeError.
    return "".join(f"{part}" for part in price_dict.values())

# Create the client and analyze a public sample receipt with the prebuilt receipt model.
document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))
url = "https://raw.githubusercontent.com/Azure/azure-sdk-for-python/main/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_forms/receipt/contoso-receipt.png"
poller = document_intelligence_client.begin_analyze_document(
    "prebuilt-receipt", AnalyzeDocumentRequest(url_source=url)
)
receipts: AnalyzeResult = poller.result()

# Extract and print receipt information.
# Each field keeps its typed value under a type-specific key:
# valueDate, valueCurrency, valueString, valueArray or valueObject.
for receipt in receipts.documents or []:
    receipt_fields = receipt.fields or {}
    transaction_date = receipt_fields.get('TransactionDate')
    total = receipt_fields.get('Total')
    merchant_name = receipt_fields.get('MerchantName')

    print("Receipt Properties:")
    # TransactionDate is a date field, so it is read through 'valueDate'
    # (reading it through 'valueString' printed None).
    print(f"Transaction date: {transaction_date.get('valueDate') if transaction_date else 'Not found'}")
    print(f"Total: {format_price(total.get('valueCurrency')) if total else 'Not found'}")
    print(f"Merchant Name: {merchant_name.get('valueString') if merchant_name else 'Not found'}")

    print("\nItems:")
    items = receipt_fields.get("Items")
    if items:
        # 'or' guards keep a receipt with an empty/missing item list from crashing the loop.
        for item in items.get('valueArray') or []:
            item_fields = item.get('valueObject') or {}
            item_description = item_fields.get("Description")
            item_total = item_fields.get("TotalPrice")
            print(f"Description: {item_description.get('valueString') if item_description else 'Not found'}, "
                  f"Total Price: {format_price(item_total.get('valueCurrency')) if item_total else 'Not found'}")


Receipt Properties:
Transaction date: None
Total: $1203.39USD
Merchant Name: Contoso

Items:
Description: Surface Pro 6, Total Price: $999.0USD
Description: SurfacePen, Total Price: $99.99USD


# Train Custom Layout Model

- Create an Azure Document Intelligence resource (Azure Portal)
- Create a DI project (Azure Document Intelligence Studio)
- Select Custom Layout Model (Azure Document Intelligence Studio)
- Upload the training data (see AzDI_sample_dataset): upload files doc1-doc5 (Azure Document Intelligence Studio)
- Create fields and annotate the data (Azure Document Intelligence Studio)
- Train the model (Azure Document Intelligence Studio)
- Test the model (Azure Document Intelligence Studio)
- Enter the key, endpoint and model ID into the .env file as AZURE_DI_KEY, AZURE_DI_ENDPOINT and AZURE_DI_MODEL_NAME (Azure Portal)
- Run the code below to process all the files and collect the results

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

import glob
import pandas as pd
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential

# Initialize the Document Intelligence Client from the settings loaded out of .env
endpoint = os.environ.get("AZURE_DI_ENDPOINT")
key = os.environ.get("AZURE_DI_KEY")
model_name = os.environ.get("AZURE_DI_MODEL_NAME")

# Fail early with a readable message instead of an opaque SDK error when a setting is missing.
missing_settings = [
    name
    for name, value in (
        ("AZURE_DI_ENDPOINT", endpoint),
        ("AZURE_DI_KEY", key),
        ("AZURE_DI_MODEL_NAME", model_name),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(f"Missing required settings in .env: {', '.join(missing_settings)}")

document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Output column -> key under which that field's typed value is stored in the result.
# The insertion order here is the column order of the final DataFrame.
FIELD_VALUE_KEYS = {
    'FirstName': 'valueString',
    'LastName': 'valueString',
    'MobileNr': 'valueString',
    'Email': 'valueString',
    'AlreadySubscribed': 'valueSelectionMark',
    'OptDT': 'valueSelectionMark',
    'ReferFirstName': 'valueString',
    'ReferLastName': 'valueString',
    'ReferEmail': 'valueString',
    'ReferMobileNr': 'valueString',
}

# Get all PDF files in the folder; sorted because glob returns them in arbitrary order
pdf_files = sorted(glob.glob("AzDI_sample_dataset/*.pdf"))

# One dict per analyzed document
results = []

# Process each PDF file
for pdf_path in pdf_files:
    with open(pdf_path, "rb") as f:
        # The model id and the file are passed positionally: the keyword name of the
        # second parameter differs between SDK releases (`analyze_request` in the
        # 1.0.0 betas, `body` in later releases), while the position is stable.
        poller = document_intelligence_client.begin_analyze_document(
            model_name, f, locale="en-US", content_type="application/octet-stream"
        )
        result = poller.result()

    # Extract fields from the result; `documents` and `fields` may be missing or None
    for document in result.documents or []:
        fields = getattr(document, 'fields', None) or {}

        # Build the row for this document; absent fields become empty strings
        doc_result = {'Filename': os.path.basename(pdf_path)}
        for field_name, value_key in FIELD_VALUE_KEYS.items():
            doc_result[field_name] = (fields.get(field_name) or {}).get(value_key, '')
        results.append(doc_result)

# Create DataFrame from results
df = pd.DataFrame(results)
df

    Filename            FirstName   LastName        MobileNr  \
0   doc1.pdf               Olivia    Berette   (555)217-3489   
1  doc10.pdf          SophiaLewis             (555) 948-6057   
2   doc2.pdf           LiamCarter  Livington  (555) 914-6621   
3   doc3.pdf                 Noah       Gray  (555)-872 1345   
4   doc4.pdf                 Emma    Johnson   555) 468-7329   
5   doc5.pdf  LucasMartinezWalter     Schmid                   
6   doc6.pdf             Isabelle      Davis  (516) 212-7712   
7   doc7.pdf                JAMES      LOPEZ  (555) 792-5483   
8   doc8.pdf              William     Harris   555) 392-1584   
9   doc9.pdf                Henry       Snow  (555) 529-3748   

                            Email AlreadySubscribed       OptDT  \
0  olivia. berette @ example. com          selected  unselected   
1        sophia.lewis@example.net        unselected  unselected   
2        liam_carter @example.net        unselected    selected   
3             ngh112@exampl