# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This code demonstrates how to use Azure Cognitive Search with OpenAI and Azure Python SDK
## Prerequisites
To run the code, install the following packages. 

In [None]:
! pip install azure-search-documents==11.4.0b6
! pip install python-dotenv
! pip install pandas
! pip install openai==0.28.1
! pip install tenacity

## Import required libraries and environment variables

In [1]:
# Import required libraries
import os
import json
import uuid
import openai
import random
import pandas as pd
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)

# Configure environment variables
load_dotenv()
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)

openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")

In [2]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    response = openai.Embedding.create(input=text, engine=os.getenv("AZURE_OPENAI_EMBEDDING_ENGINE"))
    embeddings = response['data'][0]['embedding']
    return embeddings

## Create your search index
Create your search index schema and vector search configuration:

In [3]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    SimpleField(name="session_id", type=SearchFieldDataType.String,filterable=True, retrievable=True),
    SimpleField(name="content_order", type=SearchFieldDataType.Int32, retrievable=True),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]

vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 test-replica-issue created


## Insert text and embeddings into vector store multiple times 
Add texts and metadata from the JSON data to the vector store:

In [4]:
with open('input_data.json', 'r', encoding='utf-8') as file:
    base_documents = json.load(file)

session_ids = []
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
for i in range(100):
    session_id = str(uuid.uuid4())
    documents = base_documents
    for document in documents:
        document['id'] = str(uuid.uuid4())
        document['session_id'] = session_id

    result = search_client.upload_documents(documents)
    print(f"Uploaded {len(documents)} documents for session {session_id}")
    session_ids.append(session_id)

Uploaded 108 documents for session 2ff145bc-fc4a-4c88-ae2d-037ec1f06ea9
Uploaded 108 documents for session 81f33a5d-81ca-481c-911f-f570ec2b1343
Uploaded 108 documents for session 9272a889-f29a-415b-9791-e7a60884a145
Uploaded 108 documents for session f4d05a1e-feec-4ef8-9b9d-25edd797ac2c
Uploaded 108 documents for session b1676df3-754b-437d-8650-9e21af3f1df7
Uploaded 108 documents for session 53021d78-ed33-4e03-b0b0-6512a6a26ba0
Uploaded 108 documents for session 0c614a55-3673-43b8-8508-0e24a62d6638
Uploaded 108 documents for session 0d905bda-3a82-45e6-8ec5-a8ebd38f1337
Uploaded 108 documents for session 1edd0fab-552a-44ff-ab8f-ce288eb79ae7
Uploaded 108 documents for session 8fb27d9f-f863-4e05-93ae-31fe1d1a0459
Uploaded 108 documents for session 40bac361-8416-4ea1-82db-63bac73cb9a3
Uploaded 108 documents for session dbeadd4d-c301-4f0f-949c-2294ea9942ce
Uploaded 108 documents for session 5b8544ea-5fc0-45c7-bd82-9465573c5a82
Uploaded 108 documents for session 87ad5c58-14c5-4a20-8e75-3a0f5

### Below will pull one random session_id from above and try search 10 times to check is we getting consistent results

### run below all block mutiple times if you are not see inconsistent results


In [7]:
query = "scalable storage solution"
query_embeddings = generate_embeddings(query)
session_id = 'ba5f5f9c-6b9b-42b6-9df6-74d1444bbfb5' #random.choice(session_ids)

search_sticky_session_string = 'st-1203293293' # this is fix string for acs session_id

## Perform a Plan text Search

In [18]:
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        #vector=query_embeddings, top_k=15,
        #vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        #scoring_statistics="global",
        #session_id=search_sticky_session_string
    )

    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results,index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: ba5f5f9c-6b9b-42b6-9df6-74d1444bbfb5

        0   1   2   3   4   5   6   7   8   9   10  11  12   13  14
run_1   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_2   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_3   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_4   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_5   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_6   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_7   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_8   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_9   10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
run_10  10  55  74   4  36  50  75  53  52  51  48   6  56  103   3
              0         1         2         3         4         5         6   \
run_1   4.647244  4.181843  3.414228  3.295576  2.967874  2.805747  2.628322   
run_2   4.647244  4.181843  3.414228  3.29

## Perform a Plan Vector Search

In [20]:
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=None,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        #scoring_statistics="global",
        #session_id=search_sticky_session_string
    )

    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results,index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

TypeError: search() missing 1 required positional argument: 'search_text'

## Perform a Hybrid Search without scoring_statistics & sticky session

In [19]:
# Hybrid Search
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        #scoring_statistics="global",
        #session_id=search_sticky_session_string
    )

    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results,index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: ba5f5f9c-6b9b-42b6-9df6-74d1444bbfb5

        0   1   2   3   4   5   6   7   8   9   10  11  12   13  14
run_1    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_2    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_3    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_4    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_5    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_6    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_7    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_8    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_9    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_10   4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
              0         1         2         3         4         5         6   \
run_1   0.032266  0.032051  0.031498  0.030835  0.016667  0.016393  0.016129   
run_2   0.032266  0.032051  0.031754  0.01

## Perform a Hybrid Search with scoring_statistics 

In [15]:
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        scoring_statistics="global",
        #session_id=search_sticky_session_string
    )

    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results,index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: ba5f5f9c-6b9b-42b6-9df6-74d1444bbfb5

        0   1   2   3   4   5   6   7   8   9   10  11  12   13  14
run_1    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_2    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_3    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_4    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_5    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_6    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_7    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_8    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_9    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_10   4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
              0         1         2         3         4         5         6   \
run_1   0.032266  0.032051  0.031498  0.030835  0.016667  0.016393  0.016129   
run_2   0.032266  0.032051  0.031754  0.01

## Perform a Hybrid Search with sticky session

In [16]:
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        #scoring_statistics="global",
        session_id=search_sticky_session_string
    )

    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results,index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: ba5f5f9c-6b9b-42b6-9df6-74d1444bbfb5

        0   1   2   3   4   5   6   7   8   9   10  11  12   13  14
run_1    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_2    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_3    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_4    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_5    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_6    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_7    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_8    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_9    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_10   4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
              0         1         2         3         4         5         6   \
run_1   0.032266  0.032051  0.031754  0.016667  0.016393  0.016129  0.015152   
run_2   0.032266  0.032051  0.031498  0.03

## Perform a Hybrid Search with scoring_statistics & sticky session

In [17]:
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        scoring_statistics="global",
        session_id=search_sticky_session_string
    )

    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results,, index=row_names))
print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: ba5f5f9c-6b9b-42b6-9df6-74d1444bbfb5

        0   1   2   3   4   5   6   7   8   9   10  11  12   13  14
run_1    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_2    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_3    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_4    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_5    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_6    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_7    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_8    4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
run_9    4  50  36  10  55  74  75  53  52  51  48   6  56  103   3
run_10   4  50  36  52  10  55  74  75  53  51  48   6  56  103   3
              0         1         2         3         4         5         6   \
run_1   0.032266  0.032051  0.031754  0.016667  0.016393  0.016129  0.015152   
run_2   0.032266  0.032051  0.031498  0.03