In [None]:
%pip install openai
%pip install azure-ai-formrecognizer==3.3.0
%pip install azure-search-documents==11.4.0b11


In [None]:
import re
import json
import openai
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient  


In [None]:
# Azure OpenAI resource and the name of the embeddings deployment to call.
OPENAI_ENDPOINT = "https://[youropenai].openai.azure.com/"
OPENAI_APIKEY = ""
OPENAI_EMBEDDINGS_DEPLOYMENT = "text-embedding-ada-002"

# Azure search service and the index that receives the document chunks.
SEARCH_ENDPOINT = "https://[yoursearch].search.windows.net"
# Backward-compatible alias for the original misspelled name, which other cells still reference.
SERACH_ENDPOINT = SEARCH_ENDPOINT
SEARCH_APIKEY = ""
SEARCH_INDEX = "document-comparison"

# Form Recognizer (Document Intelligence) resource used to analyze the PDFs.
FORMRECOGNIZER_ENDPOINT = "https://[yourcognitiveservice].cognitiveservices.azure.com/"
FORMRECOGNIZER_APIKEY = ""

We're using key credentials here, but feel free to use other authentication methods, such as the logged-in user's identity.

In [None]:
# Wrap the raw API keys in credential objects, one per Azure service.
search_credentials = AzureKeyCredential(SEARCH_APIKEY)
formrecognizer_credentials = AzureKeyCredential(FORMRECOGNIZER_APIKEY)

In [None]:
# Configure the module-level openai settings for the Azure OpenAI resource.
openai.api_type = "azure"
openai.api_base = OPENAI_ENDPOINT
openai.api_key = OPENAI_APIKEY
openai.api_version = "2023-07-01-preview"

# Client used to run layout analysis on the PDFs.
document_analysis_client = DocumentAnalysisClient(
    endpoint=FORMRECOGNIZER_ENDPOINT,
    credential=formrecognizer_credentials,
)

# Client bound to the target search index.
search_client = SearchClient(
    endpoint=SERACH_ENDPOINT,
    index_name=SEARCH_INDEX,
    credential=search_credentials,
)

In [None]:
def get_embeddings(text):
    """Return the embedding for ``text`` from the configured embeddings deployment.

    Calls ``openai.Embedding.create`` with ``OPENAI_EMBEDDINGS_DEPLOYMENT`` as
    the engine and returns the ``embedding`` entry of the first item in the
    response's ``data`` list.
    """
    first_item = openai.Embedding.create(
        engine=OPENAI_EMBEDDINGS_DEPLOYMENT,
        input=text,
    )['data'][0]
    return first_item['embedding']

def extract_blocks_from_analysis_result(paragraphs):
    """Group paragraphs into text blocks that each start at a numbered heading.

    A paragraph counts as a heading when its ``content`` begins with digits
    optionally separated by dots (``1``, ``2.3``, ``4.1.2``) followed by
    whitespace. Every other paragraph is body text and is appended to the
    block currently being built, followed by a single space.

    When a new heading is reached, the block built so far is kept only if it
    contains body text and is longer than 10 characters; otherwise it is
    discarded. Whatever is still pending after the last paragraph is always
    kept if it is non-empty.

    Args:
        paragraphs: Iterable of objects exposing a ``content`` string.

    Returns:
        List of block strings in document order.
    """
    heading_pattern = re.compile(r'^(\d+(\.\d+)*)\s')
    collected = []
    pending = ""
    saw_body = False

    for paragraph in paragraphs:
        text = paragraph.content

        if heading_pattern.match(text) is None:
            # Body text: extend the block under construction.
            pending += text + " "
            saw_body = True
            continue

        # Heading: flush the previous block if it carries enough body text,
        # then start a fresh block with the heading on its own line.
        if saw_body and len(pending) > 10:
            collected.append(pending)
        pending = text + "\n"
        saw_body = False

    # Keep whatever is left after the final paragraph.
    if pending:
        collected.append(pending)

    return collected

def index_pdf(name, documentId):
    """Analyze a PDF, split it into chapter blocks, embed them and upload them to the search index.

    The file is analyzed with the ``prebuilt-layout`` model, its paragraphs
    are grouped by ``extract_blocks_from_analysis_result``, and each block is
    uploaded as one search document with the fields ``key``, ``documentId``,
    ``chapter`` and ``vector``.

    Args:
        name: Path of the PDF file to open and analyze.
        documentId: Identifier stored on every uploaded block and used as the
            prefix of each block's key (``{documentId}-001``, ``{documentId}-002``, ...).

    Returns:
        None. Progress and any per-document indexing failures are printed.
    """
    with open(name, "rb") as f:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=f
        )
    result = poller.result()

    # Treat a missing paragraph list like an empty one instead of failing in the loop.
    blocks = extract_blocks_from_analysis_result(result.paragraphs or [])

    # Keys are 1-based and zero-padded to three digits, e.g. "doc01-001".
    documents = []
    for idx, item in enumerate(blocks, start=1):
        documents.append({
            "key": f"{documentId}-{idx:03}",
            "documentId": documentId,
            "chapter": item,
            "vector": get_embeddings(item),
        })

    if not documents:
        # Nothing to index; avoid sending an empty batch to the search service.
        print(f"No text blocks found in {name}; nothing to upload")
        return

    print("Uploading documents")
    index_result = search_client.upload_documents(documents)

    # upload_documents returns one result per document; individual documents
    # can be rejected, so report those instead of assuming success.
    failed = [r for r in index_result if not r.succeeded]
    for r in failed:
        print(f"Failed to index {r.key}: {r.error_message}")
    print(f"Uploaded {len(documents) - len(failed)} documents")


In [None]:
# Index both PDFs to compare, each under its own document id.
for pdf_path, document_id in (("document1.pdf", "doc01"), ("document2.pdf", "doc02")):
    index_pdf(pdf_path, document_id)