# Azure Document Intelligence 
## Extracting Data from a PDF Using the General Document Model

References:
- https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?view=doc-intel-4.0.0&preserve-view=true&pivots=programming-language-python#general-document-model
- https://learn.microsoft.com/en-us/python/api/overview/azure/ai-formrecognizer-readme?view=azure-python
- https://www.geeksforgeeks.org/how-to-convert-python-dictionary-to-json/
- https://www.geeksforgeeks.org/json-dump-in-python/

In [6]:
# import libraries
import os
import openai
from dotenv import load_dotenv, find_dotenv

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Locate a .env file and load it; the return value is deliberately discarded.
_ = load_dotenv(find_dotenv())

# Settings are read from environment variables rather than hard-coded in the
# notebook. A missing variable raises KeyError here.
# DI_ENDPOINT / DI_KEY: endpoint and key passed to DocumentAnalysisClient below.
endpoint  = os.environ['DI_ENDPOINT']
key  = os.environ['DI_KEY']
# NOTE(review): no cell visible in this section uses openai — confirm it is needed.
openai.api_key  = os.environ['OPENAI_API_KEY']


In [7]:
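# Debug only: uncomment to check that the settings were loaded.
# Clear the cell output before saving or sharing — it would contain secrets.
# print(endpoint)
# print(key)
# print(openai.api_key )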

## Utility Functions

In [8]:
def format_bounding_region(bounding_regions):
    """Render bounding regions as one comma-separated string.

    Each region becomes ``Page #<page_number>: <polygon>``, where the polygon
    text is produced by ``format_polygon``. A falsy argument (``None`` or an
    empty sequence) yields ``"N/A"``.
    """
    if bounding_regions:
        described = [
            "Page #{}: {}".format(area.page_number, format_polygon(area.polygon))
            for area in bounding_regions
        ]
        return ", ".join(described)
    return "N/A"

def format_polygon(polygon):
    """Render a polygon's points as ``[x, y]`` pairs joined by ``", "``.

    Every element of ``polygon`` must expose ``x`` and ``y`` attributes.
    A falsy argument (``None`` or an empty sequence) yields ``"N/A"``.
    """
    if polygon:
        corners = ("[{}, {}]".format(point.x, point.y) for point in polygon)
        return ", ".join(corners)
    return "N/A"

In [9]:
import json

def _print_handwritten_content(result):
    """Print the text covered by every style flagged as handwritten."""
    for style in result.styles:
        if style.is_handwritten:
            print("Document contains handwritten content: ")
            # Each span is an (offset, length) window into the full document text.
            print(",".join([result.content[span.offset:span.offset + span.length] for span in style.spans]))


def _print_key_value_pairs(result):
    """Print each detected key and value together with its bounding regions."""
    print("----Key-value pairs found in document----")
    for kv_pair in result.key_value_pairs:
        # A pair may carry a key without a value (or vice versa), so each
        # side is checked independently.
        if kv_pair.key:
            print(
                "Key '{}' found within '{}' bounding regions".format(
                    kv_pair.key.content,
                    format_bounding_region(kv_pair.key.bounding_regions),
                )
            )
        if kv_pair.value:
            print(
                "Value '{}' found within '{}' bounding regions\n".format(
                    kv_pair.value.content,
                    format_bounding_region(kv_pair.value.bounding_regions),
                )
            )


def _print_pages(result):
    """Print page dimensions, then the lines, words and selection marks of each page."""
    for page in result.pages:
        print("----Analyzing document from page #{}----".format(page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            print(
                "...Line # {} has text content '{}' within bounding box '{}'".format(
                    line_idx,
                    line.content,
                    format_polygon(line.polygon),
                )
            )

        for word in page.words:
            print(
                "...Word '{}' has a confidence of {}".format(
                    word.content, word.confidence
                )
            )

        for selection_mark in page.selection_marks:
            print(
                "...Selection mark is '{}' within bounding box '{}' and has a confidence of {}".format(
                    selection_mark.state,
                    format_polygon(selection_mark.polygon),
                    selection_mark.confidence,
                )
            )


def _print_tables(result):
    """Print the size, location and cell contents of every detected table."""
    for table_idx, table in enumerate(result.tables):
        print(
            "Table # {} has {} rows and {} columns".format(
                table_idx, table.row_count, table.column_count
            )
        )
        for region in table.bounding_regions:
            print(
                "Table # {} location on page: {} is {}".format(
                    table_idx,
                    region.page_number,
                    format_polygon(region.polygon),
                )
            )
        for cell in table.cells:
            print(
                "...Cell[{}][{}] has content '{}'".format(
                    cell.row_index,
                    cell.column_index,
                    cell.content,
                )
            )
            for region in cell.bounding_regions:
                print(
                    "...content on page {} is within bounding box '{}'\n".format(
                        region.page_number,
                        format_polygon(region.polygon),
                    )
                )


def analyze_general_documents(docUrl, model_id="prebuilt-document", output_path="output.json"):
    """Analyze a document by URL, save the raw result as JSON, and print a report.

    The Document Intelligence client is built from the notebook-level
    ``endpoint`` and ``key`` variables.

    Args:
        docUrl: URL of the document to analyze.
        model_id: Model passed to ``begin_analyze_document_from_url``.
            Defaults to the general document model, ``"prebuilt-document"``.
        output_path: File that receives the result as indented JSON.
            Pass ``None`` to skip writing the file.

    Returns:
        None. The report is printed: handwritten content, key-value pairs,
        per-page lines/words/selection marks, and tables.
    """
    # Using the client as a context manager closes its HTTP session when the
    # analysis finishes; the result must be fetched while the client is open.
    with DocumentAnalysisClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as document_analysis_client:
        poller = document_analysis_client.begin_analyze_document_from_url(model_id, docUrl)
        result = poller.result()
    print("*** result is ", type(result))

    # Convert the result to a dict first, so that a conversion failure does not
    # leave an empty file behind, then write it out as JSON.
    if output_path is not None:
        result_dict = result.to_dict()
        with open(output_path, "w", encoding="utf-8") as outfile:
            json.dump(result_dict, outfile, indent=4)

    # Print the report, one section at a time.
    _print_handwritten_content(result)
    _print_key_value_pairs(result)
    _print_pages(result)
    _print_tables(result)
    print("----------------------------------------")

In [10]:
# Sample document: URL of a PDF hosted in Azure Blob Storage.
pdf = "https://storageaiforfnsurance.blob.core.windows.net/insurance/GHI_Quote.pdf"
# Run the analysis: prints the extracted content below and writes output.json.
analyze_general_documents(pdf)

*** result is  <class 'azure.ai.formrecognizer._models.AnalyzeResult'>
Document contains handwritten content: 
1
Document contains handwritten content: 
2
Document contains handwritten content: 
)
----Key-value pairs found in document----
Key 'Name of the Proposer' found within 'Page #1: [0.5788, 1.3271], [1.9766, 1.3451], [1.9745, 1.5135], [0.5766, 1.4955]' bounding regions
Value 'ELIXIR ENTERPRISES AND HOTELS PRIVATE LIMITED' found within 'Page #1: [2.3299, 1.3298], [3.8477, 1.3298], [3.8477, 1.8983], [2.3299, 1.8983]' bounding regions

Key 'Location of Proposer' found within 'Page #1: [0.5939, 1.9846], [1.9035, 1.9902], [1.9028, 2.1547], [0.5932, 2.1491]' bounding regions
Value 'BANGALORE' found within 'Page #1: [2.335, 1.9897], [3.1827, 1.9948], [3.1827, 2.1216], [2.3401, 2.1267]' bounding regions

Key 'Industry Type' found within 'Page #1: [0.5939, 2.2263], [1.4379, 2.2333], [1.4365, 2.3977], [0.5925, 2.3907]' bounding regions
Value 'Wholesale/Retail Trading' found within 'Page #1