### Cosmos DB - Azure Document Intelligence

In [1]:
from tenacity import retry, wait_random_exponential, stop_after_attempt
import os, html, time, uuid
from datetime import datetime
from openai import AzureOpenAI
from azure.cosmos import CosmosClient
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def generate_embeddings(openai_client, text):
    """
    Generates embeddings for a given text using the OpenAI API v1.x
    """

    return openai_client.embeddings.create(
        input=text,
        model=os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")
    ).data[0].embedding


def get_completion(openai_client, prompt: str) -> str:

    response = openai_client.chat.completions.create(
        model=os.getenv("AZURE_OPENAI_CHAT_MODEL"),
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content

In [3]:
# Step 1: Configure your Cosmos DB connection
COSMOS_DB_ENDPOINT = os.getenv('AZURE_COSMOSDB_NOSQL_ENDPOINT')
COSMOS_DB_KEY = os.getenv('AZURE_COSMOSDB_NOSQL_KEY')
DATABASE_NAME = os.getenv('AZURE_COSMOSDB_NOSQL_DATABASE')
CONTAINER_NAME = os.getenv('AZURE_COSMOSDB_NOSQL_CONTAINER')

In [4]:
# Step 2: Initialize Cosmos DB client
 
client = CosmosClient(COSMOS_DB_ENDPOINT, COSMOS_DB_KEY)
database = client.get_database_client(DATABASE_NAME)
container = database.get_container_client(CONTAINER_NAME)

In [5]:
# Step 3: Initialize Azure OpenAI client
openai_client = AzureOpenAI(
    api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
    api_version = os.getenv("AZURE_OPENAI_API_VERSION"),  
    azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)


In [6]:

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from urllib.parse import urlparse, unquote

def parse_file(filename, model="prebuilt-document", verbose=False):
    """
    Parses documents using Azure Document Intelligence SDK.

    Args:
        filename (str): The path to the document file to be parsed. Supported formats are PDF, PPTX, and DOCX.
        model (str, optional): The model to use for document analysis. Defaults to "prebuilt-document".
        verbose (bool, optional): If True, prints detailed logs during processing. Defaults to False.

    Returns:
        tuple: A tuple containing:
            - page_map (list): A list of tuples, each containing the page number, offset, and extracted text or HTML content.
            - elapsed_time (float): The time taken to process the document in seconds.

    Raises:
        ValueError: If the file format is not supported.
        Exception: If there is an error during document analysis.

    Example:
        >>> page_map, elapsed_time = parse_file("example.pdf", verbose=True)
        >>> print(f"Processed {len(page_map)} pages in {elapsed_time:.2f} seconds")
        >>> for page in pages:
                page_number = page[0]
                page_offset = page[1]
                page_content = page[2]

    """
    # Capture the start time
    start_time = time.time()
    offset = 0
    pages = []

    sas_token = os.getenv("AZURE_STORAGE_SAS_TOKEN")
    file = unquote(filename + sas_token)
    print(f"Ingesting {file}..")

    file_extension = os.path.splitext(filename)[1].lower()

    #----------------------------------------- Using Azure Document Intelligence -----------------------------------------

    if verbose:
        print(f"Extracting text using Azure Document Intelligence..")
    credential = AzureKeyCredential(
        os.environ["AZURE_DOCUMENT_INTELLIGENCE_KEY"])

    form_recognizer_client = DocumentAnalysisClient(
        endpoint=os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"], credential=credential)

    if file_extension == '.pdf':
        model = "prebuilt-document"

    if file_extension in (".pptx", ".docx", ".xlsx"):
        model = "prebuilt-read"

    poller = form_recognizer_client.begin_analyze_document_from_url(model, document_url=file)
    
    result = poller.result()
    # Whole document content
    # print ("Document contains content: ", result.content)

    for page_num, page in enumerate(result.pages):
        print(f"---- Reading from page #{page.page_number} ----")

        words_array = []
        content_length = 0  # Track the length of page content
        for word in page.words:
            if content_length + len(word.content) <= 8000:
                words_array.append(word.content)
                content_length += len(word.content)
            else:
                break

        page_content = ' '.join(words_array)
        print(page_content)
        pages.append((page_num+1, offset, page_content))
        offset += len(page_content)

    # Capture the end time and Calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time

    print(f"Done. Read {len(pages)} page(s) in {elapsed_time} seconds.")
    return pages

### Parse PDF file using Azure Document Intelligence

In [7]:
url_path = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf";
pages = parse_file(url_path, verbose=True)

Ingesting https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf?sv=2022-11-02&ss=b&srt=sco&sp=rwdlacyx&se=2025-03-10T19:22:51Z&st=2024-03-10T11:22:51Z&spr=https&sig=Xioy2WYsGhEytLQ/UV/WVUW1Uvr2FO3mtbWNBOPsdzI=..
Extracting text using Azure Document Intelligence..
---- Reading from page #1 ----
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-Q ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2020 OR ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From to Commission File Number 001-37845 MICROSOFT CORPORATION WASHINGTON (STATE OF INCORPORATION) 91-1144442 (I.R.S. ID) ONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor Securities registered pursuant to Section 12(b) of the Act: Title of each cla

### Create embeddings and store in Cosmos DB

In [8]:
print(pages)

[(1, 0, 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-Q ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2020 OR ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From to Commission File Number 001-37845 MICROSOFT CORPORATION WASHINGTON (STATE OF INCORPORATION) 91-1144442 (I.R.S. ID) ONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor Securities registered pursuant to Section 12(b) of the Act: Title of each class Trading Symbol Name of exchange on which registered Common stock, $0.00000625 par value per share MSFT NASDAQ 2.125% Notes due 2021 MSFT NASDAQ 3.125% Notes due 2028 MSFT NASDAQ 2.625% Notes due 2033 MSFT NASDAQ Securities registered pursuant to Section 12(g) of the Act: NONE Indicate by check mark whether the registrant (1) has filed all reports required t

In [10]:
# Add documents to Cosmos DB
for page in pages:
    page_number = page[0]
    page_offset = page[1]
    page_content = page[2]

    print(f"---- Adding Page #{page_number} ----")
    
    doc = {
        "id": str(uuid.uuid4()),
        "file_name": os.path.basename(url_path),
        "file_uri": url_path,
        "page": str(page_number),
        "doc_type": url_path.split('.')[-1],  # Extracts the file extension from the url_path
        "content": page_content,
        "content_vector": generate_embeddings(openai_client, page_content),
        "dt_updated": datetime.now().isoformat()
    }

    # Upsert document in Cosmos DB NoSQL
    try:
        container.upsert_item(doc)
        print(f"Successfully added Page #{page_number} to Cosmos DB.")
    except Exception as e:
        print(f"Failed to add Page #{page_number}. Error: {str(e)}")
    

---- Adding Page #1 ----
Successfully added Page #1 to Cosmos DB.


### Query

In [12]:
# vectorize user query
query = "How many outstanding shares as of April 2020?"
query_vector = generate_embeddings(openai_client, query)
print(query_vector)

[0.005692640319466591, -0.02660887874662876, 0.058041587471961975, 0.07178691774606705, 0.016935279592871666, -0.012798715382814407, -0.001188938389532268, 0.019126752391457558, -0.013602687045931816, -0.02518247626721859, 0.04616355150938034, 6.417795520974323e-05, 0.01546997670084238, -0.02671261690557003, 0.04494462534785271, 0.007637733593583107, -0.03420770913362503, -0.04043200984597206, -0.038590654730796814, -0.0019467143574729562, -0.03254789859056473, -0.007540478836745024, 0.03949836269021034, 0.033092524856328964, -0.0138231310993433, 0.03254789859056473, -0.07733691483736038, -0.07406915724277496, -0.0372939258813858, 0.014691939577460289, 0.02821682207286358, -0.03921308368444443, 0.0422993004322052, 0.005423569120466709, -0.04803084209561348, 0.0021136682480573654, 0.020203037187457085, -0.00036652854760177433, 0.013045093044638634, -0.04676004499197006, 0.011547371745109558, -0.05409953370690346, -0.006117318756878376, 0.03047313168644905, 0.013991705141961575, 0.012532

In [13]:
# retrieve similar documents
results = container.query_items(
    query=f"""
SELECT TOP 5
    c.file_name, 
    VectorDistance(c.content_vector, @query_vector) as similarity_score,
    c.page, 
    c.doc_type, 
    c.content
FROM c 
ORDER BY VectorDistance(c.content_vector, @query_vector)
""",
    parameters=[
        {'name': "@query_vector", "value": query_vector},
    ],
    enable_cross_partition_query=True
)

docs = []
for doc in results:
    docs.append(doc)
print(docs)

[{'file_name': 'sample-layout.pdf', 'similarity_score': 0.5273692419819315, 'page': '1', 'doc_type': 'pdf', 'content': 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 FORM 10-Q ☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2020 OR ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From to Commission File Number 001-37845 MICROSOFT CORPORATION WASHINGTON (STATE OF INCORPORATION) 91-1144442 (I.R.S. ID) ONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor Securities registered pursuant to Section 12(b) of the Act: Title of each class Trading Symbol Name of exchange on which registered Common stock, $0.00000625 par value per share MSFT NASDAQ 2.125% Notes due 2021 MSFT NASDAQ 3.125% Notes due 2028 MSFT NASDAQ 2.625% Notes due 2033 MSFT NASDAQ Securities registered pursuant to Se

In [14]:
# insert documents into context
# and use Azure OpenAI to answer query
prompt = f"""
## On your ability to answer question based on fetched documents (sources):
- Given extracted parts (contexts) from one or multiple documents, and a question, Answer the question thoroughly with citations/references. 
- If there are conflicting information or multiple definitions or explanations, detail them all in your answer.
- In your answer, **You MUST use** all relevant extracted parts that are relevant to the question.
- **YOU MUST** place inline citations directly after the sentence they support using this HTML format: `<sup><a href="[file_uri]" target="_blank">&nbsp;[citation_number]</a></sup>`.
- Each citation must have its own unique number, even if it points to the same source. Each citation must be numbered sequentially, starting at 1 and increasing by 1 with no duplicate citation numbers.
- You can **only** use the information in your CONTEXT to answer the question. Do not use any external knowledge.
- ALWAYS provide an answer with references.
- DO NOT remove the SAS token from CONTEXT
- DO NOT remove the SAS token from the file_uri
- Make sure that sources are mentioned in the answer
- Make sure the url includes target="_blank"

## About your output format:
- Answer the user's QUESTION in markdown format. 
- You have access to Markdown rendering elements to present information in a visually appealing way. For example:
  - You can use headings when the response is long and can be organized into sections.
  - You can use compact tables to display data or information in a structured manner.
  - You can use short lists to present multiple items or options concisely.
  - You can use code blocks to display formatted content such as poems, code snippets, lyrics, etc.
  
- **You must** respond in the same language as the question

- Take a step back and make sure that your answer follows all the rules described above

User: {query} 

CONTEXT: 
{str(docs)}

Assistant:

""" 
response = get_completion(openai_client, prompt)
print(response)

As of April 24, 2020, Microsoft Corporation had **7,583,440,247 shares** of common stock outstanding, with a par value of $0.00000625 per share<sup><a href="https://example.com/sample-layout.pdf" target="_blank">&nbsp;1</a></sup>.
