In [None]:
import sys

# Diagnostic output: sys.executable is the path of the Python interpreter
# running this kernel (it is printed under the label "Shell").
shell = sys.executable
print("Shell:", shell)

# Diagnostic output: sys.argv[0] is the first entry of the kernel process's
# argument list (it is printed under the label "Profile").
profile = sys.argv[0]
print("Profile:", profile)

# Define an IPython alias so that the bare command `python` runs `python3`.
# NOTE(review): IPython aliases apply to bare commands; the `!python` shell
# escape below presumably bypasses the alias and runs whatever `python` is on
# PATH — confirm this is the intended comparison.
%alias python python3

print("python xcode profile version")
!python --version

print("python kernel version")
!python3 --version

# Load the IPython extension named `dotenv`.
# NOTE(review): presumably provided by the python-dotenv package — TODO confirm
# it is installed before this cell runs.
print("loading dotenv extension...")
%load_ext dotenv


In [None]:
# Populate the process environment from a .env file; later cells read these
# values with os.getenv(...).
print("load environment variables from .env file")

# Reload the `dotenv` IPython extension.
%reload_ext dotenv

from dotenv import load_dotenv

# The path is relative to the kernel's working directory.
# NOTE(review): assumes the notebook is run from a folder two levels below the
# repository root — confirm.
loaded_env = load_dotenv("../../.env")
# The return value is printed so it is visible whether the file was picked up.
print("Loaded .env file:", loaded_env)

## Install packages
To run the code, install the following packages. Use the latest stable release of the Azure AI Search SDK by running `pip install azure-search-documents`. This sample was written against `azure-search-documents` version 11.4.0 and `openai` version 1.3.3.

In [None]:
! pip install azure-search-documents 
! pip install openai 
! pip install tenacity
! pip install pandas
! pip install azure-ai-documentintelligence
! pip install azure-ai-formrecognizer

## Import required libraries and environment variables

In [None]:
# Import required libraries  
import os  
import json  
import openai
import unicodedata
import pandas as pd
from openai import AzureOpenAI 
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient, SearchIndexingBufferedSender  
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.ai.formrecognizer import AnalyzeResult
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticField,  
    SearchField,  
    VectorSearch,  
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
) 
from openai.types.chat import (
    ChatCompletionAssistantMessageParam,
    ChatCompletionContentPartParam,
    ChatCompletionMessageParam,
    ChatCompletionSystemMessageParam,
    ChatCompletionUserMessageParam,
)

# Configure environment variables
# All service settings are read from the process environment (populated from
# the .env file by an earlier cell). os.getenv returns None for unset names.
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = "pdfindex" #os.getenv("AZURE_SEARCH_INDEX")
key = os.getenv("AZURE_SEARCH_API_KEY")
# NOTE(review): open_api_version is read here but the AzureOpenAI client in the
# embeddings cell is built with a literal API version — confirm which is intended.
open_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
open_api_key = os.getenv("AZURE_OPENAI_API_KEY")
open_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
form_recognizer_key = os.getenv("AZURE_FORMRECOGNIZER_API_KEY")
form_recognizer_endpoint = os.getenv("AZURE_FORMRECOGNIZER_ENDPOINT")

# Fail fast with an actionable message instead of letting a missing key surface
# as an opaque error from AzureKeyCredential or from the first search request.
if not key:
    raise ValueError(
        "AZURE_SEARCH_API_KEY is not set; add it to the .env file or the environment "
        "before running this cell."
    )

# Credential shared by every Azure AI Search client created in this notebook.
credential = AzureKeyCredential(key)

## Create embeddings

Read your data, generate OpenAI embeddings, and export them in a format that can be inserted into your Azure AI Search index:

In [None]:
# Names of the Azure OpenAI embedding deployment and model, taken from the environment.
deployment_name = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT")
model_name = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME")

print("Deployment Name:", deployment_name)
print("Model Name:", model_name)

# Azure OpenAI client used for both embeddings and chat completions.
# The API version comes from AZURE_OPENAI_API_VERSION (read into
# open_api_version by the configuration cell); the previously hard-coded
# "2023-05-15" is kept as the fallback when that variable is unset or empty.
client = AzureOpenAI(
    api_key=open_api_key,
    api_version=open_api_version or "2023-05-15",
    azure_endpoint=open_api_endpoint,
)

# Form Recognizer client used to extract text from the PDF.
document_analysis_client = DocumentAnalysisClient(
    endpoint=form_recognizer_endpoint, credential=AzureKeyCredential(form_recognizer_key)
)

# Source PDF that is analysed and indexed by this notebook.
shell_report_url = "https://reports.shell.com/sustainability-report/2022/_assets/downloads/shell-sustainability-report-2022.pdf"

print(f"Beginning analyze document for {shell_report_url}")
# Start the "prebuilt-read" analysis and block until the poller yields its result.
analyze_poller = document_analysis_client.begin_analyze_document_from_url("prebuilt-read", shell_report_url)
analyze_result: AnalyzeResult = analyze_poller.result()

print("Document Analyze Result completed")

# One record per page: the page number (as a string) plus the page's words,
# each followed by a single space.
analysed_pages = []
for page in analyze_result.pages:
    page_text = "".join(word.content + " " for word in page.words)
    analysed_pages.append({"pageNumber": f"{page.page_number}", "content": page_text})

# Function to generate embeddings for title and content fields, also used for query embeddings
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, model):
    """Return the embedding vector for ``text`` from the Azure OpenAI client.

    Args:
        text: The text to embed; it is sent as a single-element input list.
        model: Passed as ``model`` to ``client.embeddings.create`` (callers in
            this notebook pass the embedding deployment name).

    Returns:
        The ``embedding`` of the first item in the response data.

    Raises:
        tenacity.RetryError: When every one of the 6 attempts failed.
    """
    try:
        return client.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception as e:
        # Log the failure, then re-raise: swallowing the exception here made the
        # function return None, which both disabled the @retry decorator (it only
        # retries on exceptions) and let None vectors reach the index.
        print("Error:", e)
        raise

print("Generating embeddings for content")
# Attach an embedding of each page's text under the "contentVector" key.
for page in analysed_pages:
    page['contentVector'] = generate_embeddings(page['content'], deployment_name)

# Output embeddings to docVectors.json file
docvectors_path = "../output/docVectors.json"
# Create the output folder when it is missing so open(..., "w") cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(docvectors_path), exist_ok=True)
with open(docvectors_path, "w") as f:
    print("Writing docVectors.json")
    json.dump(analysed_pages, f)

## Create your search index
Create your search index schema and vector search configuration:

In [None]:
# Client for managing indexes on the search service.
index_client = SearchIndexClient(credential=credential, endpoint=service_endpoint)

# Index schema: the page number is the document key, the page text is
# searchable, and the embedding is a 1536-dimension vector field bound to the
# "myHnswProfile" vector search profile.
fields = [
    SimpleField(
        name="pageNumber",
        type=SearchFieldDataType.String,
        key=True,
    ),
    SearchableField(
        name="content",
        type=SearchFieldDataType.String,
    ),
    SearchField(
        name="contentVector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search configuration: two algorithm configurations (HNSW and
# exhaustive KNN, both using cosine distance) and one profile per algorithm.
hnsw_config = HnswAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_config = ExhaustiveKnnAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(metric=VectorSearchAlgorithmMetric.COSINE),
)

vector_search = VectorSearch(
    algorithms=[hnsw_config, exhaustive_knn_config],
    profiles=[
        VectorSearchProfile(name="myHnswProfile", algorithm_configuration_name="myHnsw"),
        VectorSearchProfile(name="myExhaustiveKnnProfile", algorithm_configuration_name="myExhaustiveKnn"),
    ],
)



# Semantic configuration whose only prioritized content field is "content".
content_priority = SemanticPrioritizedFields(
    content_fields=[SemanticField(field_name="content")]
)
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=content_priority,
)

# Wrap the configuration in the semantic settings object used by the index.
semantic_search = SemanticSearch(configurations=[semantic_config])

# Assemble the index definition from the schema, vector and semantic settings.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)

# Delete the index named index_name, then create it from the definition above.
index_client.delete_index(index_name)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [None]:
# Upload the exported page records (text + embeddings) to the index
with open('../output/docVectors.json', 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one IndexingResult per document; inspect them so a
# partially failed batch is reported instead of being counted as uploaded.
failed_uploads = [item for item in result if not item.succeeded]
for item in failed_uploads:
    print(f"Failed to upload document {item.key}: {item.error_message}")
print(f"Uploaded {len(documents) - len(failed_uploads)} documents")

## Perform a vector similarity search
This example shows a pure vector search. The query text is embedded with the same Azure OpenAI embedding deployment that was used for the documents, and the resulting vector is passed to Azure AI Search as a `VectorizedQuery`.

In [None]:
# Pure Vector Search
query = "operational process safety events in 2022?"

print("Asking question:", query)

search_client = SearchClient(service_endpoint, index_name, credential=credential)

# Embed the question with the same deployment used for the page contents, then
# ask for the 7 nearest neighbours on the "contentVector" field.
query_embedding = generate_embeddings(query, model=deployment_name)
vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=7,
    fields="contentVector",
)

print("Searching...")
# NOTE(review): the request uses QueryType.SEMANTIC with no search text and no
# semantic configuration name — confirm this is intended for a "pure vector" search.
results = search_client.search(
    search_text=None,
    query_type=QueryType.SEMANTIC,
    vector_queries=[vector_query],
    select=["pageNumber", "content"],
)

# Render every hit as a "Page / Content / Score" block; the combined text is
# reused later as the sources for the chat prompt.
result_blocks = []
for result in results:
    result_blocks.append(
        f"Page: {result['pageNumber']}\n"
        f"Content: {result['content']}\n"
        f"Score: {result['@search.score']}\n\n"
    )
search_result = "".join(result_blocks)

print(search_result)

## Create Chat Completion

Using the search result as user content, we can use the OpenAI API to generate a response.

In [None]:
# Name of the Azure OpenAI chat deployment, taken from the environment.
chat_model_name = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT")
print("Chat Model Name:", chat_model_name)

# System prompt for the assistant. Every fragment ends with sentence punctuation
# and a space so the concatenated prompt reads as separate sentences (the
# citation instruction previously ran straight into the next sentence).
# NOTE(review): the prompt ends with "Use below example to answer" but no
# example is appended in the visible code — confirm whether one is expected.
system_chat_template = (
    "You are an intelligent assistant helping Contoso Inc employees for their sustainability report questions. "
    "Use 'you' to refer to the individual asking the questions even if they ask with 'I'. "
    "Answer the following question using only the data provided in the sources below. "
    "For tabular information return it as an html table. Do not return markdown format. "
    "Include page source as citation. "
    "Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. "
    "If you cannot answer using the sources below, say you don't know. Use below example to answer"
)

def create_message(role: str, content: str) -> "ChatCompletionMessageParam":
    """Build a single chat message for the given role.

    The content is Unicode-normalized to NFC before it is wrapped in the
    message type that corresponds to ``role``.

    Args:
        role (str): The role of the message sender: "user", "system" or "assistant".
        content (str): The text of the message.

    Returns:
        The user, system or assistant message param for ``role``.

    Raises:
        ValueError: If ``role`` is not one of the supported roles, or if
            ``content`` is not a string.
    """
    if role not in ("user", "system", "assistant"):
        raise ValueError(f"Invalid role: {role}")
    # Validate the content for every role and report it as a content problem;
    # previously non-string content was reported as an invalid role (or, for
    # "user", failed inside unicodedata.normalize).
    if not isinstance(content, str):
        raise ValueError(
            f"Invalid content for role {role!r}: expected str, got {type(content).__name__}"
        )

    normalized = unicodedata.normalize("NFC", content)
    if role == "user":
        return ChatCompletionUserMessageParam(role="user", content=normalized)
    if role == "system":
        return ChatCompletionSystemMessageParam(role="system", content=normalized)
    return ChatCompletionAssistantMessageParam(role="assistant", content=normalized)


# Conversation sent to the chat model.
messages: list[ChatCompletionMessageParam] = []

# The user turn carries the question followed by the retrieved search results.
user_content = f"{query} \n Sources: \n {search_result}"

# The system instructions go first: the prompt refers to "the sources below",
# so the user message containing those sources has to follow it.
messages.append(create_message("system", system_chat_template))
messages.append(create_message("user", user_content))

print("Chat completion messages:", messages)

print("Creating chat completion...")
# Azure Open AI takes the deployment name as the model name
completion_options = {
    "model": chat_model_name,
    "messages": messages,
    "temperature": 0.3,
    "max_tokens": 1024,
    "n": 1,
}
chat_completion = client.chat.completions.create(**completion_options)


# Raw response object first, then only the text of the first choice.
print(chat_completion)

print("Chat completion completed")
answer_text = chat_completion.choices[0].message.content
print("Chat completion result:", answer_text)


