In [127]:
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
    AnalyzeResult,
    DocumentAnalysisFeature,
    AnalyzeOutputOption,
    DocumentContentFormat,
)
from azure.identity import DefaultAzureCredential
from markitdown import MarkItDown
from IPython.display import Markdown, display
import os
import pandas as pd
import pydantic


In [128]:

def _in_span(word, spans):
    for span in spans:
        if word.span.offset >= span.offset and (word.span.offset + word.span.length) <= (span.offset + span.length):
            return True
    return False

def _format_polygon(polygon):
    if not polygon:
        return "N/A"
    return ", ".join([f"[{polygon[i]}, {polygon[i + 1]}]" for i in range(0, len(polygon), 2)])


In [129]:
document_intelligence_client = di = DocumentIntelligenceClient(
    endpoint=os.getenv("DOC_INTELLIGENCE_API"),
    credential=DefaultAzureCredential(),
)



In [130]:
document_intelligence_client._config.api_version

'2024-11-30'

In [131]:
# path_to_sample_documents = "data/Northwind_Standard_Benefits_Details.pdf"
path_to_sample_documents = "data/Small business startup checklis1.docx"

In [132]:

with open(path_to_sample_documents, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read",
        body=f,
        # pages="3",
        output_content_format=DocumentContentFormat.MARKDOWN,
    )
result: AnalyzeResult = poller.result()

In [133]:
display(Markdown(result.content))

Small business startup checklist
Startup checklist
Getting started
1.
Conduct a personal evaluation to determine why you want to start a business.
2.
Create a business plan:
a.
What do we do?
b.
How do we do it?
c.
Who do we serve?
3.
Conduct a SWOT analysis to identify your strengths, weaknesses, opportunities, and threats.
4.
Assess how much capital you have available to invest.
5.
Discuss your plans with family members to ensure they are behind you.
6.
Determine whether you want the business to be full-time or part-time.
Committing to your business
1.
Define the customer “problem” and how your business can provide a solution in a unique way.
2.
Determine viability: Is there a market for your service?
3.
Identify businesses that are having success today.
4.
Will your business be relevant as time passes? How will you adapt?
5.
Define your market:
a.
Who is your ideal customer?
b.
What’s your market size?
c.
How easy is it to acquire the customer?
6.
Validate your idea by talking to potential customers about it.
7.
Evaluate how customers solve this problem today, as well as what the competition offers.
8.
Create a quick financial plan, identifying costs and forecasted sales, to see if your capital gets you to a profit.
Setting up your business
1.
Select your business name. Perform a corporate name search to make sure your name is still available.
2.
Register a domain name and secure social media profiles for the company.
3.
Apply for an EIN with the IRS and local or state business licenses.
4.
Open a business bank account and apply for a business credit card.
5.
Find appropriate space to become your primary business location.
6.
Once location is secured, get services set up in the business name, including phone number and other necessary utilities.
7.
Decide on a legal structure or business structure and incorporate: Corporation, LLC, or Sole Proprietorship.
8.
Get your website up and running.
9.
Set up an accounting and record keeping system:
a.
hire an accountant,
b.
select an accounting system, and
c.
select a fiscal year.
10.
Evaluate and select needed insurance policies for your business: liability, workers’ compensation, or health insurance.
11.
Prepare and begin networking with pre-marketing materials:
a.
business cards,
b.
letterhead,
c.
invoice,
d.
brochures, or
e.
public relations.
12.
Introduce your business to the surrounding businesses to your location.
Ensuring Sufficient Funds are Available
1.
Estimate how long it will take for your business to acquire paying customers.
2.
Itemize your business expenses for the first year, at least, and assess where these will come from.
3.
Itemize how much capital is required to launch the business and project how long it will take to become profitable.
4.
From this, determine how many months of savings or investment you need to breakeven.
5.
If outside investment is required beyond the founders’ savings, complete a business plan.
Planning for your business plan
1.
Complete a Company Overview that includes basic information and a summary of the management team.
2.
Write a Business Description section describing your services and what problems they solve.
3.
Prepare a Market Analysis section that describes the total market and your target market, specific segment needs, competitive offerings available, and any trends that will affect the analysis.
4.
Describe an Operating Plan for the business, such as operating hours, number of employees, key vendors, or seasonal adjustments your business might need to adjust to.
5.
Create a Marketing and Sales Plan that includes a “Go To Market” or launch plan, pricing, how your business will generate leads, and close new business.
6.
Build a Financial Plan that shows a break-even analysis, projected profit and loss, and projected cash flows.
7.
Write an Executive Summary which gives a general overview of the above completed sections.
Setting Up to Operate
1.
Find a feasible space for your office that can provide you with the ideal space needed to conduct business.
2.
Set up your office with a comfy chair, desk, shelves, filing cabinets, and etc.
3.
Identify any staffing needs.
4.
Recruit, interview, hire, and train employees (if applicable).
5.
If hiring employees, get an employer ID from the IRS.
6.
Identify and set up any needed technology:
a.
laptop for business operations,
b.
printer(s),
c.
business software and applications,
d.
POS,
e.
business email accounts,
f.
phones,
g.
CRM
h.
billing, and
i.
payment systems.
7.
Ensure your technology systems are secure with your information and customer information.
8.
Install a security system, if applicable.
9.
Depending on the business type, identify and partner with the right suppliers/vendors.
Marketing and launching your business
1.
Develop and refine a brand for your company and its products or services.
2.
Create and fine tune an Elevator Pitch through conversations.
3.
Community outreach and networking. Attend various networking events to build relationships with community connectors.
4.
Begin distributing or displaying your marketing materials:
a.
web-based promotions,
b.
social media,
c.
direct mail,
d.
business cards,
e.
trade shows, or
f.
brochures.
5.
Establish an email marketing account and initiate digital marketing through blogs, emails, or SEO strategies to drive traffic to your website.
6.
Let the local or regional press know you are opening and when.
7.
Utilize Guerrilla Marketing tactics to generate traffic.
8.
Organize an opening event. Create a relationship that might allow cross-promotions.
Sustaining your business
1.
Keep track of strategies that worked and flopped to fine tune your marketing tactics.
2.
Ask for referrals and testimonials to build credibility.
3.
Maintain and nurture your repeat business. Remember, it costs 80% less to maintain a current relationship than to develop a new one.
4.
Recognize your top client base. They will be your best pied pipers. Ask for referrals.
1
5

In [134]:
path_to_sample_documents = "data/Northwind_Standard_Benefits_Details.pdf"
with open(path_to_sample_documents, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        body=f,
        pages="4-8",
        output_content_format=DocumentContentFormat.MARKDOWN,
    )
result: AnalyzeResult = poller.result()
print(f"Document has {len(result.pages)} pages")

Document has 2 pages


In [135]:
import json

print(type(result))
# print(json.dumps(result.as_dict(), indent=2))



<class 'azure.ai.documentintelligence.models._models.AnalyzeResult'>


In [136]:
result.as_dict().keys()

dict_keys(['apiVersion', 'modelId', 'stringIndexType', 'content', 'pages', 'paragraphs', 'contentFormat', 'sections'])

In [137]:
result.get("pages")



display(pd.json_normalize(result.as_dict()['pages'], sep="_").head(5))
display(pd.json_normalize(result.as_dict()['paragraphs'], sep="_").head(5))


Unnamed: 0,pageNumber,angle,width,height,unit,words,lines,spans
0,4,-0.0045,8.5,11,inch,"[{'content': 'Once', 'polygon': [1.2421, 1.012...",[{'content': 'Once you have reached this limit...,"[{'offset': 0, 'length': 2216}]"
1,5,-0.0076,8.5,11,inch,"[{'content': 'services.', 'polygon': [1.2376, ...",[{'content': 'services. This means that you ma...,"[{'offset': 2216, 'length': 2853}]"


Unnamed: 0,spans,boundingRegions,content,role
0,"[{'offset': 0, 'length': 106}]","[{'pageNumber': 4, 'polygon': [1.2323, 1.0127,...","Once you have reached this limit, the plan wil...",
1,"[{'offset': 109, 'length': 30}]","[{'pageNumber': 4, 'polygon': [1.2404, 1.5685,...",Tips for Managing Your Costs,sectionHeading
2,"[{'offset': 141, 'length': 187}]","[{'pageNumber': 4, 'polygon': [1.2353, 1.909, ...",There are several steps that you can take to h...,
3,"[{'offset': 330, 'length': 163}]","[{'pageNumber': 4, 'polygon': [1.2389, 2.6708,...",· Make sure to take advantage of preventive ca...,
4,"[{'offset': 495, 'length': 123}]","[{'pageNumber': 4, 'polygon': [1.2365, 3.2175,...",· Always make sure to visit in-network provide...,


In [138]:

# Create Pydantic models for Document Intelligence results
class Span(pydantic.BaseModel):
    offset: int
    length: int

class DocumentWord(pydantic.BaseModel):
    content: str
    polygon: list[float] = None
    confidence: float
    span: Span

class BoundingRegion(pydantic.BaseModel):
    page_number: int
    polygon: list[float] = None

class Paragraph(pydantic.BaseModel):
    content: str
    role: str = None
    spans: list[Span]
    bounding_regions: list[BoundingRegion] = None

# Parse words and paragraphs
words = [DocumentWord(**word) for word in result.as_dict()['words']] if 'words' in result.as_dict() else []
paragraphs = [Paragraph(**para) for para in result.as_dict()['paragraphs']] if 'paragraphs' in result.as_dict() else []

# Display some examples
print(f"Total words: {len(words)}")
if words:
    print(f"Sample word: {words[0].content}, confidence: {words[0].confidence}")

print(f"Total paragraphs: {len(paragraphs)}")
if paragraphs:
    print(f"Sample paragraph: {paragraphs[0].content[:50]}...")

Total words: 0
Total paragraphs: 27
Sample paragraph: Once you have reached this limit, the plan will co...


In [139]:
print(result.content_format)

# print(result.content)

DocumentContentFormat.MARKDOWN


In [140]:
# display(Markdown(result.content))

In [141]:

if result.styles and any([style.is_handwritten for style in result.styles]):
    print("Document contains handwritten content")
else:
    print("Document does not contain handwritten content")


Document does not contain handwritten content


In [142]:
print(f"Document has {len(result.pages)} pages")
for page in result.pages:
    print(f"----Analyzing layout from page #{page.page_number}----")
    print(f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}")

    # if page.lines:
    #     for line_idx, line in enumerate(page.lines):
    #         words = []
    #         if page.words:
    #             for word in page.words:
    #                 print(f"......Word '{word.content}' has a confidence of {word.confidence}")
    #                 if _in_span(word, line.spans):
    #                     words.append(word)
    #         print(
    #             f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
    #             f"within bounding polygon '{_format_polygon(line.polygon)}'"
    #         )

    # if page.selection_marks:
    #     for selection_mark in page.selection_marks:
    #         print(
    #             f"Selection mark is '{selection_mark.state}' within bounding polygon "
    #             f"'{_format_polygon(selection_mark.polygon)}' and has a confidence of {selection_mark.confidence}"
    #         )


Document has 2 pages
----Analyzing layout from page #4----
Page has width: 8.5 and height: 11.0, measured with unit: LengthUnit.INCH
----Analyzing layout from page #5----
Page has width: 8.5 and height: 11.0, measured with unit: LengthUnit.INCH


In [143]:

if result.paragraphs:
    print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
    # Sort all paragraphs by span's offset to read in the right order.
    result.paragraphs.sort(key=lambda p: (p.spans.sort(key=lambda s: s.offset), p.spans[0].offset))
    # print("-----Print sorted paragraphs-----")
    # for paragraph in result.paragraphs:
    #     if not paragraph.bounding_regions:
    #         print(f"Found paragraph with role: '{paragraph.role}' within N/A bounding region")
    #     else:
    #         print(f"Found paragraph with role: '{paragraph.role}' within")
    #         print(
    #             ", ".join(
    #                 f" Page #{region.page_number}: {_format_polygon(region.polygon)} bounding region"
    #                 for region in paragraph.bounding_regions
    #             )
    #         )
    #     print(f"...with content: '{paragraph.content}'")
    #     print(f"...with offset: {paragraph.spans[0].offset} and length: {paragraph.spans[0].length}")


----Detected #27 paragraphs in the document----


In [144]:

if result.tables:
    for table_idx, table in enumerate(result.tables):
        print(f"Table # {table_idx} has {table.row_count} rows and " f"{table.column_count} columns")
        if table.bounding_regions:
            for region in table.bounding_regions:
                print(
                    f"Table # {table_idx} location on page: {region.page_number} is {_format_polygon(region.polygon)}"
                )
        for cell in table.cells:
            print(f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'")
            if cell.bounding_regions:
                for region in cell.bounding_regions:
                    print(
                        f"...content on page {region.page_number} is within bounding polygon '{_format_polygon(region.polygon)}'"
                    )

print("----------------------------------------")

----------------------------------------


In [145]:
result.content_format

<DocumentContentFormat.MARKDOWN: 'markdown'>