In [None]:
# The new Document Intelligence SDK is only supported in some Azure regions;
# create the Document Intelligence resource in westeurope for now.
# Install (or upgrade to) the newest Document Intelligence SDK.
%pip install --upgrade azure-ai-documentintelligence 
# python-dotenv loads the settings from .env; pandas holds the results table.
%pip install python-dotenv pandas

In [None]:
# %%writefile -a ./.env

# AZURE_DI_KEY=
# AZURE_DI_ENDPOINT=
# AZURE_DI_MODEL_NAME=

Appending to ./.env


# Train Custom Layout Model

- Create Azure Document Intelligence resource (Azure Portal)
- Create a DI project (Azure DocumentIntelligence Studio)
- Select Custom Layout Model (Azure DocumentIntelligence Studio)
- Upload the training data: files doc1-doc5 from the AzDI_sample_dataset folder (Azure DocumentIntelligence Studio)
- Create Fields and annotate the data (Azure DocumentIntelligence Studio)
- Train the model (Azure DocumentIntelligence Studio)
- Test the model (Azure DocumentIntelligence Studio)
- Enter the key and endpoint (Azure Portal) and the model ID of the trained model into the .env file as AZURE_DI_KEY, AZURE_DI_ENDPOINT and AZURE_DI_MODEL_NAME
- Run the code below to analyze every PDF in AzDI_sample_dataset with the trained model and collect the extracted fields in a table

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()

import glob
import pandas as pd
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.core.credentials import AzureKeyCredential

# Read the connection settings that load_dotenv() pulled in from .env
endpoint = os.environ.get("AZURE_DI_ENDPOINT")
key = os.environ.get("AZURE_DI_KEY")
model_name = os.environ.get("AZURE_DI_MODEL_NAME")

# Fail early with a readable message: a missing value would otherwise only
# surface later as an opaque error from the credential or the client.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("AZURE_DI_ENDPOINT", endpoint),
        ("AZURE_DI_KEY", key),
        ("AZURE_DI_MODEL_NAME", model_name),
    )
    if not setting_value
]
if missing_settings:
    raise RuntimeError(
        "Missing setting(s) in .env / environment: " + ", ".join(missing_settings)
    )

# Initialize the Document Intelligence Client
document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Field name in the trained model -> key under which its value is read.
# Text fields are read from 'valueString', checkboxes from 'valueSelectionMark'.
FIELD_VALUE_KEYS = {
    'FirstName': 'valueString',
    'LastName': 'valueString',
    'MobileNr': 'valueString',
    'Email': 'valueString',
    'AlreadySubscribed': 'valueSelectionMark',
    'OptDT': 'valueSelectionMark',
    'ReferFirstName': 'valueString',
    'ReferLastName': 'valueString',
    'ReferEmail': 'valueString',
    'ReferMobileNr': 'valueString',
}

# Get all PDF files in the folder; sorted so the row order is the same on every run
pdf_files = sorted(glob.glob("AzDI_sample_dataset/*.pdf"))

# One dict per analyzed document; turned into a DataFrame in a single step below
results = []

# Process each PDF file
for pdf_path in pdf_files:
    with open(pdf_path, "rb") as f:
        # The document stream is passed positionally because the keyword name of
        # this parameter differs between SDK releases (`analyze_request` in the
        # 1.0.0 betas, `body` from 1.0.0 on); the position is the same in both.
        poller = document_intelligence_client.begin_analyze_document(
            model_name, f, locale="en-US", content_type="application/octet-stream"
        )
        result = poller.result()

    # Extract fields from the result; `documents` may be None when nothing was recognized
    for document in result.documents or []:
        # `fields` may be absent or None; treat both as "no fields extracted"
        fields = getattr(document, 'fields', None) or {}

        # Create a dictionary for this document; missing fields/values become ''
        doc_result = {'Filename': os.path.basename(pdf_path)}
        for field_name, value_key in FIELD_VALUE_KEYS.items():
            field = fields.get(field_name) or {}
            doc_result[field_name] = field.get(value_key, '')
        results.append(doc_result)

# Create DataFrame from results; explicit columns keep the table shape even when
# no document was analyzed
df = pd.DataFrame(results, columns=['Filename', *FIELD_VALUE_KEYS])
df

Unnamed: 0,Filename,FirstName,LastName,MobileNr,Email,AlreadySubscribed,OptDT,ReferFirstName,ReferLastName,ReferEmail,ReferMobileNr
0,doc1.pdf,Olivia,Berette,(555)217-3489,olivia. berette @ example. com,selected,unselected,Joseph,Liam,,(555) 321-5734
