# Azure Document Intelligence: Pilot Pipeline for Key Metric Extraction

In [22]:
import configparser

In [23]:
# Load credentials from a local config file so that no secret is hardcoded in the notebook.
# interpolation=None makes configparser take '%' characters in values literally.
config = configparser.ConfigParser(interpolation = None)

# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so fail here with a clear message instead of with an
# opaque KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "Could not read 'config.ini'; it must provide the [docintel] and [storage] sections."
    )

# Document Intelligence endpoint and key
doc_endpoint = config['docintel']['endpoint']
doc_key = config['docintel']['key']

# Storage account connection string and key
connection_str = config['storage']['connection_string']
storage_key = config['storage']['key']

### 1) Uploading PDFs to a container in Azure Blob Storage

In [24]:
# Libraries
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
import os

In [19]:
# Local directory that holds the quarterly Citibank test documents
file_path = 'test_docs/'

In [20]:
# Imported in this cell so the cell brings its own dependency into scope
from azure.core.exceptions import ResourceExistsError

# Create the BlobServiceClient object from the storage connection string
blob_service_client = BlobServiceClient.from_connection_string(connection_str)

# Create the container, or reuse it when it already exists, so that re-running
# this cell always leaves `container_client` bound to a usable client.
# Any other error (authentication, network, ...) is left to propagate so the
# cell fails visibly instead of printing a message and continuing.
try:
    container_client = blob_service_client.create_container("test-container")
    print("Container 'test-container' created successfully.")
except ResourceExistsError:
    container_client = blob_service_client.get_container_client("test-container")
    print("Container 'test-container' already exists; reusing it.")

Container 'test-container' created successfully.


In [21]:
# Upload every PDF file in the directory.
# Entries are visited in sorted order for a stable log; sub-directories and
# non-PDF files are skipped so open() is only called on actual PDF files.
for filename in sorted(os.listdir(file_path)):
    file_dir = os.path.join(file_path, filename)
    if not (os.path.isfile(file_dir) and filename.lower().endswith(".pdf")):
        continue

    # Create a blob client using the local file name as the name for the blob
    blob_client = blob_service_client.get_blob_client(container="test-container", blob=filename)
    print("Uploading to Azure Storage as blob: " + filename)

    # overwrite=True keeps this cell re-runnable: without it upload_blob raises
    # ResourceExistsError when a blob with the same name is already in the container.
    with open(file=file_dir, mode="rb") as data:
        blob_client.upload_blob(data, overwrite=True)
    

Uploading to Azure Storage as blob: 1Q18.pdf
Uploading to Azure Storage as blob: 1Q19.pdf
Uploading to Azure Storage as blob: 1Q20.pdf
Uploading to Azure Storage as blob: 1Q21.pdf
Uploading to Azure Storage as blob: 1Q22.pdf
Uploading to Azure Storage as blob: 1Q23.pdf
Uploading to Azure Storage as blob: 2Q18.pdf
Uploading to Azure Storage as blob: 2Q19.pdf
Uploading to Azure Storage as blob: 2Q20.pdf
Uploading to Azure Storage as blob: 2Q21.pdf
Uploading to Azure Storage as blob: 2Q22.pdf
Uploading to Azure Storage as blob: 2Q23.pdf
Uploading to Azure Storage as blob: 3Q18.pdf
Uploading to Azure Storage as blob: 3Q19.pdf
Uploading to Azure Storage as blob: 3Q20.pdf
Uploading to Azure Storage as blob: 3Q21.pdf
Uploading to Azure Storage as blob: 3Q22.pdf
Uploading to Azure Storage as blob: 3Q23.pdf
Uploading to Azure Storage as blob: 4Q18.pdf
Uploading to Azure Storage as blob: 4Q19.pdf
Uploading to Azure Storage as blob: 4Q20.pdf
Uploading to Azure Storage as blob: 4Q21.pdf
Uploading 

### 2) Running Custom Document Intelligence Model on New PDFs in Blob

In [36]:
# Libraries
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient, AnalyzeResult
from azure.core.serialization import AzureJSONEncoder
import json

In [27]:
# Inspect what is currently stored in 'test-container'
container_client = blob_service_client.get_container_client('test-container')

print("\nListing blobs...")

# Print the name of every blob in the container, one per line
blob_list = container_client.list_blobs()
for blob in blob_list:
    print(f" {blob.name}")


Listing blobs...
 1Q18.pdf
 1Q18.pdf.labels.json
 1Q18.pdf.ocr.json
 1Q19.pdf
 1Q19.pdf.labels.json
 1Q19.pdf.ocr.json
 1Q20.pdf
 1Q20.pdf.labels.json
 1Q20.pdf.ocr.json
 1Q21.pdf
 1Q21.pdf.labels.json
 1Q21.pdf.ocr.json
 1Q22.pdf
 1Q22.pdf.labels.json
 1Q22.pdf.ocr.json
 1Q23.pdf
 2Q18.pdf
 2Q19.pdf
 2Q20.pdf
 2Q21.pdf
 2Q22.pdf
 2Q23.pdf
 3Q18.pdf
 3Q19.pdf
 3Q20.pdf
 3Q21.pdf
 3Q22.pdf
 3Q23.pdf
 4Q18.pdf
 4Q19.pdf
 4Q20.pdf
 4Q21.pdf
 4Q22.pdf
 fields.json


In [28]:
# Fetch one blob from the container to try the custom model on
blob = '3Q23.pdf'

# Client bound to that specific blob inside 'test-container'
blob_client = blob_service_client.get_blob_client(container='test-container', blob=blob)

# Write the blob's content to a local file that has the same name as the blob
with open(blob, "wb") as download_file:
    download_file.write(blob_client.download_blob().readall())


In [39]:
# Set custom model and target form
model_id = "citi_test_model"

# Read the PDF into memory inside a `with` block so the file handle is closed
# immediately. Passing bytes (instead of an open stream) to the analysis call
# also means the analysis cell can be re-run without the stream being at EOF.
with open('3Q23.pdf', 'rb') as pdf_file:
    form = pdf_file.read()

# Client for the Document Intelligence resource configured in config.ini
document_analysis_client = DocumentAnalysisClient(
    endpoint=doc_endpoint, credential=AzureKeyCredential(doc_key)
)

In [40]:
# Submit the form to the custom model; the call returns a poller for the
# long-running analysis operation
poller = document_analysis_client.begin_analyze_document(
    model_id,
    form,
)

# Wait for the operation to finish and keep its result
result = poller.result()


In [41]:
# Convert the analysis result to a plain dictionary
result_dict = result.to_dict()

# Name the JSON file after the analyzed document (`blob`, set in the download
# cell) rather than after whatever value a loop variable from another cell
# happens to hold in the kernel.
# AzureJSONEncoder is passed as the encoder class for values in the result
# that the default JSON encoder does not serialize.
with open(f'{blob}.json', 'w') as f:
    json.dump(result_dict, f, cls = AzureJSONEncoder)

In [48]:
# Print a summary of every document in the analysis result, followed by
# the type, value and confidence of each extracted field.
for idx, document in enumerate(result.documents):
    print(f"--------Analyzing document #{idx + 1}--------")
    print(f"Document has type {document.doc_type}")
    print(f"Document has document type confidence {document.confidence}")
    print(f"Document was analyzed with model with ID {result.model_id}")
    for field in document.fields.values():
        # Fall back to the raw content only when there is no parsed value at all;
        # a plain truthiness test would also discard legitimate falsy values
        # such as 0, False or an empty list.
        field_value = field.value if field.value is not None else field.content
        print(
            f"......found field of type '{field.value_type}' with value '{field_value}' and with confidence {field.confidence}"
        )

--------Analyzing document #1--------
Document has type citi_test_model
Document has document type confidence 0.987
Document was analyzed with model with ID citi_test_model
......found field of type 'list' with value '[DocumentField(value_type=dictionary, value={'period': DocumentField(value_type=string, value='4Q 2021', content=4Q 2021, bounding_regions=[BoundingRegion(page_number=2, polygon=[Point(x=3.2465, y=0.5205), Point(x=3.4041, y=0.5205), Point(x=3.4041, y=0.6781), Point(x=3.2465, y=0.6781)])], spans=[DocumentSpan(offset=861, length=7)], confidence=None), 'total_revenues': DocumentField(value_type=string, value='17,017', content=17,017, bounding_regions=[BoundingRegion(page_number=2, polygon=[Point(x=3.2895, y=0.7545), Point(x=3.4948, y=0.7545), Point(x=3.4948, y=0.8309), Point(x=3.2895, y=0.8309)])], spans=[DocumentSpan(offset=1064, length=6)], confidence=None), 'cet1_ratio': DocumentField(value_type=string, value='12.25%', content=12.25%, bounding_regions=[BoundingRegion(page