# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use Azure Cognitive Search with Azure OpenAI and the Azure Python SDK.
## Prerequisites
To run the code, install the following packages. 

In [None]:
! pip install azure-search-documents==11.4.0b6
! pip install python-dotenv
! pip install pandas
! pip install openai==0.28.1
! pip install tenacity

## Import required libraries and environment variables

In [2]:
# Import required libraries
import os
import json
import uuid
import openai
import random
import pandas as pd
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)

# Read configuration from the environment (populated via load_dotenv)
load_dotenv()

# Azure Cognitive Search connection settings
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)

# Azure OpenAI settings, applied as module-level attributes of the openai package
openai.api_type = "azure"
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")

# pandas display option used when the result tables are printed further down
pd.set_option('expand_frame_repr', False)

In [3]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    """Return the embedding of ``text`` produced by the configured embedding engine.

    The engine name is read from the AZURE_OPENAI_EMBEDDING_ENGINE environment
    variable on every call. Only the first item of the response's ``data`` list
    is used. The ``retry`` decorator above wraps the call with randomized
    exponential waits (min=1, max=20) and stops after 6 attempts.
    """
    engine_name = os.getenv("AZURE_OPENAI_EMBEDDING_ENGINE")
    response = openai.Embedding.create(input=text, engine=engine_name)
    first_item = response['data'][0]
    return first_item['embedding']

## Create your search index
Create your search index schema and vector search configuration:

In [4]:
# Client for managing index definitions on the search service
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Name shared by the vector field and the algorithm configuration it points to
vector_config_name = "my-vector-config"

# Index schema
fields = [
    # Document key
    SimpleField(
        name="id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    # Filterable so that queries can be restricted to a single session_id
    SimpleField(
        name="session_id",
        type=SearchFieldDataType.String,
        filterable=True,
        retrievable=True,
    ),
    SimpleField(
        name="content_order",
        type=SearchFieldDataType.Int32,
        retrievable=True,
    ),
    # Text field used by the keyword part of the searches
    SearchableField(name="content", type=SearchFieldDataType.String),
    # 1536-dimensional vector field bound to the vector configuration below
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_configuration=vector_config_name,
    ),
]

# HNSW parameters for the vector configuration
hnsw_settings = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 500,
    "metric": "cosine",
}
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name=vector_config_name,
            kind="hnsw",
            hnsw_parameters=hnsw_settings,
        )
    ]
)

# Create or update the index; only the fields and the vector search
# configuration are set (no semantic settings are attached here)
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 test-replica-issue created


## Insert text and embeddings into vector store multiple times 
Add texts and metadata from the JSON data to the vector store:

In [5]:
# Load the template documents once; every session below uploads its own copy of them.
with open('input_data.json', 'r', encoding='utf-8') as file:
    base_documents = json.load(file)

session_ids = []
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
for _ in range(100):
    session_id = str(uuid.uuid4())
    # Build fresh dicts for this session instead of aliasing base_documents, so the
    # loaded templates are never mutated and re-running the cell starts from the
    # same input. Each copy gets a new document key and the session's id.
    documents = [
        {**document, 'id': str(uuid.uuid4()), 'session_id': session_id}
        for document in base_documents
    ]

    # upload_documents returns one indexing result per document; a batch can come
    # back with individual failures, so check them before reporting success.
    result = search_client.upload_documents(documents)
    failed_keys = [item.key for item in result if not item.succeeded]
    if failed_keys:
        print(f"WARNING: {len(failed_keys)} of {len(documents)} documents failed to index for session {session_id}")
    else:
        print(f"Uploaded {len(documents)} documents for session {session_id}")
    session_ids.append(session_id)

Uploaded 108 documents for session 73bb1718-205b-42b2-bd7e-04dc497fe976
Uploaded 108 documents for session 18bd58fc-5905-4957-a606-b413965e63f8
Uploaded 108 documents for session 5b00cd96-32ef-483b-8589-863d79558101
Uploaded 108 documents for session 75a43d0c-9a73-46a6-a1fc-431604767d80
Uploaded 108 documents for session 2ca11267-8237-4681-98e9-5271a293bf76
Uploaded 108 documents for session 5acd1fe3-7f04-480d-aa1b-5e169bafbc99
Uploaded 108 documents for session adaa826c-e8c3-4909-9ce9-68479b5a960f
Uploaded 108 documents for session b40d0a10-bdea-48ae-a2a1-4bedc05240f9
Uploaded 108 documents for session 4397ab6d-c0b3-4f3e-aa6a-4a5904ccc52e
Uploaded 108 documents for session b331a4e7-9d02-4a35-adaf-f946865fc2d6
Uploaded 108 documents for session 493a1c90-8437-49ea-b176-a32aacedd752
Uploaded 108 documents for session 39b158c9-a16c-47fb-b40d-4212a6656957
Uploaded 108 documents for session 3a4669b9-7aa9-42c3-8d59-02e9e0aa95f8
Uploaded 108 documents for session 57196037-9330-4613-a1f4-4616e

### The cells below take one session_id from the upload step above and run each search 10 times to check whether we get consistent results

### Run all the blocks below multiple times if you do not see inconsistent results


In [11]:
# Query text shared by every search variant below; it is embedded once here and
# the resulting vector is reused by the vector and hybrid searches.
query = "scalable storage solution"
query_embeddings = generate_embeddings(query)
# The session id is pinned to a fixed value so all variants filter on the same
# documents; the commented-out alternative picks one at random from session_ids.
# NOTE(review): the searches filter on this id, so it must exist in the index -
# confirm before running against a freshly created index.
session_id = '57196037-9330-4613-a1f4-4616e9efad26' #random.choice(session_ids)

# Fixed value passed as the search `session_id` argument by the "sticky session" variants
search_sticky_session_string = 'st-1203293293' # fixed string used as the ACS search session_id

## Perform a Simple text Search

In [12]:
# Dedicated client for the keyword-only experiment
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical full-text query 10 times and keep the hits of each run.
# This variant passes no vector arguments, no scoring_statistics and no search session_id.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=query,
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

        0   1   2   3   4   5   6   7   8   9   10  11   12  13  14
run_1   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_2   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_3   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_4   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_5   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_6   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_7   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_8   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_9   10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
run_10  10  55  74   4  75  36  50  51  52  53  48  56  103   6   3
              0        1        2         3         4         5         6         7         8         9         10        11        12        13        14
run_1   4.815766  4.29206  3.47809  3.395077  3

## Perform a Simple Vector Search

In [13]:
# Dedicated client for the vector-only experiment
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical vector query (no search text) 10 times and keep the hits of each run.
# This variant passes neither scoring_statistics nor a search session_id.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=None,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

         0  1     2
run_1   50  4   NaN
run_2   50  4  52.0
run_3   50  4   NaN
run_4   50  4  52.0
run_5   50  4   NaN
run_6   50  4  52.0
run_7   50  4   NaN
run_8   50  4  52.0
run_9   50  4   NaN
run_10  50  4  52.0
               0         1         2
run_1   0.860246  0.859375       NaN
run_2   0.860246  0.859375  0.857549
run_3   0.860246  0.859375       NaN
run_4   0.860246  0.859375  0.857549
run_5   0.860246  0.859375       NaN
run_6   0.860246  0.859375  0.857549
run_7   0.860246  0.859375       NaN
run_8   0.860246  0.859375  0.857549
run_9   0.860246  0.859375       NaN
run_10  0.860246  0.859375  0.857549


## Perform a Simple Vector Search with sticky session

In [19]:
# Dedicated client for the vector-only experiment with a sticky session
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical vector query (no search text) 10 times, each time with the same
# search session_id, and keep the hits of each run. scoring_statistics is not passed.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=None,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
        session_id=search_sticky_session_string
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

         0  1
run_1   50  4
run_2   50  4
run_3   50  4
run_4   50  4
run_5   50  4
run_6   50  4
run_7   50  4
run_8   50  4
run_9   50  4
run_10  50  4
               0         1
run_1   0.860246  0.859375
run_2   0.860246  0.859375
run_3   0.860246  0.859375
run_4   0.860246  0.859375
run_5   0.860246  0.859375
run_6   0.860246  0.859375
run_7   0.860246  0.859375
run_8   0.860246  0.859375
run_9   0.860246  0.859375
run_10  0.860246  0.859375


## Perform a Hybrid Search without scoring_statistics & sticky session

In [14]:
# Hybrid search: keyword text and vector query in the same request
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical hybrid query 10 times and keep the hits of each run.
# This variant passes neither scoring_statistics nor a search session_id.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

        0   1   2   3   4   5   6   7   8   9   10  11   12  13  14
run_1    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_2    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_3    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_4    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_5    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_6    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_7    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_8    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_9    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_10   4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
              0         1         2         3         4         5         6         7         8         9         10        11        12        13        14
run_1   0.032266  0.031818  0.030835  0.01666

## Perform a Hybrid Search with scoring_statistics 

In [15]:
# Dedicated client for the hybrid experiment with scoring_statistics="global"
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical hybrid query 10 times with scoring_statistics="global" and keep
# the hits of each run. No search session_id is passed in this variant.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
        scoring_statistics="global",
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

        0   1   2   3   4   5   6   7   8   9   10  11   12  13  14
run_1    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_2    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_3    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_4    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_5    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_6    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_7    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_8    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_9    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_10   4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
              0         1         2         3         4         5         6         7         8         9         10        11        12        13        14
run_1   0.032266  0.031818  0.030835  0.01666

## Perform a Hybrid Search with sticky session

In [16]:
# Dedicated client for the hybrid experiment with a sticky session
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical hybrid query 10 times, each time with the same search session_id,
# and keep the hits of each run. scoring_statistics is not passed in this variant.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
        session_id=search_sticky_session_string
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

        0   1   2   3   4   5   6   7   8   9   10  11   12  13  14
run_1    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_2    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_3    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_4    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_5    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_6    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_7    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_8    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_9    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_10   4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
              0         1         2         3         4         5         6         7         8         9         10        11        12        13        14
run_1   0.032266  0.031818  0.030835  0.01666

## Perform a Hybrid Search with scoring_statistics & sticky session

In [17]:
# Dedicated client for the hybrid experiment with both scoring_statistics and a sticky session
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Restrict every request to the documents of the chosen session
session_filter = f"session_id eq '{session_id}'"

# Send the identical hybrid query 10 times with scoring_statistics="global" and the
# same search session_id on every request, and keep the hits of each run.
runs = []
for _ in range(10):
    hits = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=session_filter,
        top=15,
        scoring_statistics="global",
        session_id=search_sticky_session_string
    )
    runs.append(list(hits))

# One row per run: search scores, 10-character content previews, content_order values
all_call_score_results = [[hit['@search.score'] for hit in hits] for hits in runs]
all_call_content_results = [[hit['content'][:10] for hit in hits] for hits in runs]
all_call_content_orders_results = [[hit['content_order'] for hit in hits] for hits in runs]

print(f"session_id: {session_id}\n")

# all_call_content_results is collected for inspection but not printed
row_names = [f"run_{i+1}" for i, _ in enumerate(runs)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 57196037-9330-4613-a1f4-4616e9efad26

        0   1   2   3   4   5   6   7   8   9   10  11   12  13  14
run_1    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_2    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_3    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_4    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_5    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_6    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_7    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_8    4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
run_9    4  50  52  10  55  74  75  36  51  53  48  56  103   6   3
run_10   4  50  10  55  74  75  36  51  52  53  48  56  103   6   3
              0         1         2         3         4         5         6         7         8         9         10        11        12        13        14
run_1   0.032266  0.031818  0.030835  0.01666