In [None]:
import os
from typing import Optional

import pandas as pd
import pydantic
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
    AnalyzeResult,
    DocumentAnalysisFeature,
    AnalyzeOutputOption,
    DocumentContentFormat,
)
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential, AzureCliCredential
from IPython.display import Markdown, display
from markitdown import MarkItDown


In [2]:

def _in_span(word, spans):
    for span in spans:
        if word.span.offset >= span.offset and (word.span.offset + word.span.length) <= (span.offset + span.length):
            return True
    return False

def _format_polygon(polygon):
    if not polygon:
        return "N/A"
    return ", ".join([f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon), 2)])


In [3]:
# Build the Document Intelligence client used by every cell below.
# The endpoint comes from the DOC_INTELLIGENCE_API environment variable; check it
# up front so a missing variable is reported here with a clear message rather
# than as an obscure failure inside the SDK.
_doc_intelligence_endpoint = os.getenv("DOC_INTELLIGENCE_API")
if not _doc_intelligence_endpoint:
    raise RuntimeError(
        "Environment variable DOC_INTELLIGENCE_API is not set; "
        "set it to the Document Intelligence resource endpoint before creating the client."
    )

# `di` is kept as a short alias bound to the same client object.
document_intelligence_client = di = DocumentIntelligenceClient(
    endpoint=_doc_intelligence_endpoint,
    credential=DefaultAzureCredential(),
)



In [None]:
# document_intelligence_client._config.api_version = "2024-11-30"  # Set the API version to the latest one

In [4]:
# Print the version string exported by the installed azure.ai.documentintelligence package.
from azure.ai.documentintelligence import VERSION as doc_intelligence_version
print(f"Document Intelligence SDK version: {doc_intelligence_version}")


Document Intelligence SDK version: 1.0.2


In [5]:
# Show the API version the client is configured with.
# NOTE(review): `_config` is a private attribute of the client and may change between SDK releases.
document_intelligence_client._config.api_version

'2024-11-30'

In [6]:
# Path of the document opened by the analysis cell below.
# The commented-out line is an alternative sample; uncomment it to analyze that file instead.
# path_to_sample_documents = "data/Northwind_Standard_Benefits_Details.pdf"
path_to_sample_documents = "data/Small business startup checklis1.docx"

In [1]:

# Analyze the sample document with the "prebuilt-read" model.
# Requires `path_to_sample_documents` and `document_intelligence_client` from the
# cells above; running this cell first on a fresh kernel raises NameError.
with open(path_to_sample_documents, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read",
        body=f,
        # pages="3",
        # output_content_format=DocumentContentFormat.MARKDOWN,
    )
# Fetch the outcome of the analysis operation; later cells read it from `result`.
result: AnalyzeResult = poller.result()

NameError: name 'path_to_sample_documents' is not defined

In [None]:
# Render the extracted text of the most recent `result` as Markdown in the notebook.
display(Markdown(result.content))

In [None]:
# Analyze pages 4-8 of the Northwind PDF with the "prebuilt-layout" model,
# requesting Markdown-formatted content.
# NOTE: this rebinds `path_to_sample_documents`, `poller` and `result`, so every
# later cell sees the layout result rather than the earlier "prebuilt-read" one.
path_to_sample_documents = "data/Northwind_Standard_Benefits_Details.pdf"
with open(path_to_sample_documents, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        body=f,
        pages="4-8",
        output_content_format=DocumentContentFormat.MARKDOWN,
    )
result: AnalyzeResult = poller.result()
print(f"Document has {len(result.pages)} pages")

In [None]:
# `json` is only used by the commented-out full dump at the bottom of this cell.
import json

# Show the Python type of the analysis result.
print(type(result))
# print(json.dumps(result.as_dict(), indent=2))



In [None]:
# List the top-level keys of the result once converted to a plain dict.
result.as_dict().keys()

In [None]:
# Preview the first rows of the page and paragraph records as flattened DataFrames.
# The result is converted to a plain dict once and reused for both frames. A section
# that is missing or empty falls back to an empty list, so json_normalize produces an
# empty frame instead of the cell failing with KeyError.
# (A bare `result.get("pages")` used to sit at the top of this cell; it was not the
# last expression, so it displayed nothing and has been dropped.)
analysis_dict = result.as_dict()

display(pd.json_normalize(analysis_dict.get("pages") or [], sep="_").head(5))
display(pd.json_normalize(analysis_dict.get("paragraphs") or [], sep="_").head(5))


In [None]:

# Pydantic models for the subset of the Document Intelligence result inspected below.
# They are populated from `result.as_dict()`, whose keys use the service's JSON
# (camelCase) names. Fields whose Python name differs from the JSON key therefore
# declare an alias; without it the key is ignored and the data silently dropped.
class Span(pydantic.BaseModel):
    """Offset and length of a text span."""

    offset: int
    length: int


class DocumentWord(pydantic.BaseModel):
    """A single recognized word with its confidence and span."""

    content: str
    polygon: Optional[list[float]] = None  # may be absent, hence Optional
    confidence: float
    span: Span


class BoundingRegion(pydantic.BaseModel):
    """A polygon on a given page."""

    page_number: int = pydantic.Field(alias="pageNumber")
    polygon: Optional[list[float]] = None


class Paragraph(pydantic.BaseModel):
    """A paragraph with optional role and bounding regions."""

    content: str
    role: Optional[str] = None
    spans: list[Span]
    bounding_regions: Optional[list[BoundingRegion]] = pydantic.Field(default=None, alias="boundingRegions")


# Convert the result once instead of re-serializing it for every lookup.
result_dict = result.as_dict()

# Words are nested under each page rather than stored at the top level of the
# result, so collect them page by page. Missing sections fall back to empty lists.
words = [
    DocumentWord(**word)
    for page in result_dict.get("pages") or []
    for word in page.get("words") or []
]
paragraphs = [Paragraph(**para) for para in result_dict.get("paragraphs") or []]

# Display some examples
print(f"Total words: {len(words)}")
if words:
    print(f"Sample word: {words[0].content}, confidence: {words[0].confidence}")

print(f"Total paragraphs: {len(paragraphs)}")
if paragraphs:
    print(f"Sample paragraph: {paragraphs[0].content[:50]}...")

In [None]:
# Print the content format reported on the current `result`.
print(result.content_format)

# print(result.content)

In [None]:
# display(Markdown(result.content))

In [None]:

# Report whether any detected style is flagged as handwritten.
# A result with no styles at all is reported as not handwritten.
if not result.styles or not any(style.is_handwritten for style in result.styles):
    print("Document does not contain handwritten content")
else:
    print("Document contains handwritten content")


In [None]:
# Print the page count, then each page's number, width, height and measurement unit.
print(f"Document has {len(result.pages)} pages")
for page in result.pages:
    print(f"----Analyzing layout from page #{page.page_number}----")
    print(f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}")

    # if page.lines:
    #     for line_idx, line in enumerate(page.lines):
    #         words = []
    #         if page.words:
    #             for word in page.words:
    #                 print(f"......Word '{word.content}' has a confidence of {word.confidence}")
    #                 if _in_span(word, line.spans):
    #                     words.append(word)
    #         print(
    #             f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
    #             f"within bounding polygon '{_format_polygon(line.polygon)}'"
    #         )

    # if page.selection_marks:
    #     for selection_mark in page.selection_marks:
    #         print(
    #             f"Selection mark is '{selection_mark.state}' within bounding polygon "
    #             f"'{_format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
    #         )


In [None]:

# Order the detected paragraphs by where they start in the content.
if result.paragraphs:
    print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
    # Sort all paragraphs by span's offset to read in the right order.
    # The key function has a side effect: it first sorts each paragraph's own spans
    # in place. `list.sort` returns None, so every key tuple starts with None and
    # the comparison is decided by the second element, the first span's offset.
    # NOTE(review): a paragraph with an empty `spans` list would raise IndexError here.
    result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
    # print("-----Print sorted paragraphs-----")
    # for paragraph in result.paragraphs:
    #     if not paragraph.bounding_regions:
    #         print(f"Found paragraph with role: '{paragraph.role}' within N/A bounding region")
    #     else:
    #         print(f"Found paragraph with role: '{paragraph.role}' within")
    #         print(
    #             ", ".join(
    #                 f" Page #{region.page_number}: {_format_polygon(region.polygon)} bounding region"
    #                 for region in paragraph.bounding_regions
    #             )
    #         )
    #     print(f"...with content: '{paragraph.content}'")
    #     print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")


In [None]:

# Dump every detected table: its dimensions, where it sits on the page, and each
# cell's text and location. Missing tables or bounding regions are treated as
# empty, so nothing is printed for them.
for table_idx, table in enumerate(result.tables or []):
    print(f"Table # {table_idx} has {table.row_count} rows and {table.column_count} columns")

    for region in table.bounding_regions or []:
        print(f"Table # {table_idx} location on page: {region.page_number} is {_format_polygon(region.polygon)}")

    for cell in table.cells:
        print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
        for region in cell.bounding_regions or []:
            print(
                f"...content on page {region.page_number} "
                f"is within bounding polygon '{_format_polygon(region.polygon)}'"
            )

print("----------------------------------------")

In [None]:
# Show the content format of the current `result` as the cell's output value.
result.content_format