# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This notebook demonstrates how to use Azure Cognitive Search with Azure OpenAI and the Azure Python SDK.
## Prerequisites
To run the code, install the following packages. 

In [None]:
! pip install azure-search-documents==11.4.0b6
! pip install python-dotenv
! pip install pandas
! pip install openai==0.28.1
! pip install tenacity

## Import required libraries and environment variables

In [12]:
# Import required libraries
import os
import json
import uuid
import openai
import random
import pandas as pd
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)

# Configure environment variables
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        EnvironmentError: if the variable is unset or empty, so a missing setting
            is reported by name here instead of surfacing later as an unrelated
            error from one of the service clients.
    """
    value = os.getenv(name)
    if not value:
        raise EnvironmentError(
            f"Required environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Azure Cognitive Search connection settings
service_endpoint = _require_env("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = _require_env("AZURE_SEARCH_INDEX_NAME")
key = _require_env("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)

# Azure OpenAI settings, applied to the module-level openai configuration
openai.api_type = "azure"
openai.api_key = _require_env("AZURE_OPENAI_API_KEY")
openai.api_base = _require_env("AZURE_OPENAI_ENDPOINT")
openai.api_version = _require_env("AZURE_OPENAI_API_VERSION")

In [13]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    """Return the embedding for `text` from the configured embedding engine.

    The engine name is read from the AZURE_OPENAI_EMBEDDING_ENGINE environment
    variable on every call. Failed calls are retried with a randomized
    exponential wait (between 1 and 20) and abandoned after 6 attempts.

    Args:
        text: input passed to `openai.Embedding.create`.

    Returns:
        The `embedding` entry of the first item in the response's `data` list.
    """
    engine_name = os.getenv("AZURE_OPENAI_EMBEDDING_ENGINE")
    api_response = openai.Embedding.create(input=text, engine=engine_name)
    first_item = api_response['data'][0]
    return first_item['embedding']

## Create your search index
Create your search index schema and vector search configuration:

In [14]:
# Create a search index
# This cell defines the field schema and the vector search configuration, then
# creates or updates the index named by `index_name` on the search service.
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    # Document key.
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
    # Filterable so the search cells can restrict results with `session_id eq '...'`.
    # NOTE(review): SimpleField appears to expose `hidden` rather than `retrievable`;
    # confirm that the `retrievable` keyword has any effect in this SDK version.
    SimpleField(name="session_id", type=SearchFieldDataType.String,filterable=True, retrievable=True),
    SimpleField(name="content_order", type=SearchFieldDataType.Int32, retrievable=True),
    # Text field used by the keyword half of the hybrid search.
    SearchableField(name="content", type=SearchFieldDataType.String),
    # Vector field bound to the "my-vector-config" algorithm configuration below.
    # assumes the embedding engine produces 1536-dimensional vectors - TODO confirm
    SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]

# The configuration name must match `vector_search_configuration` on the
# content_vector field above.
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Create or update the search index. Only the fields and the vector search
# configuration are passed; no semantic settings are configured here.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 test-replica-issue created


## Insert text and embeddings into vector store multiple times 
Add texts and metadata from the JSON data to the vector store:

In [15]:
# Load the base documents once; every session below uploads its own copy of them.
with open('input_data.json', 'r', encoding='utf-8') as file:
    base_documents = json.load(file)

session_ids = []
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
for i in range(100):
    session_id = str(uuid.uuid4())
    # Build fresh dicts rather than aliasing base_documents, so the loaded data is
    # never mutated and re-running this cell always starts from the same input.
    documents = [
        {**document, 'id': str(uuid.uuid4()), 'session_id': session_id}
        for document in base_documents
    ]

    # upload_documents returns one result per document; a document can be rejected
    # without the call raising, so count the successes instead of assuming them.
    result = search_client.upload_documents(documents)
    failed_keys = [item.key for item in result if not item.succeeded]
    if failed_keys:
        print(f"WARNING: {len(failed_keys)} documents failed to upload for session {session_id}: {failed_keys}")
    print(f"Uploaded {len(documents) - len(failed_keys)} documents for session {session_id}")
    session_ids.append(session_id)

Uploaded 108 documents for session 64f63550-e843-424a-8313-19fa4cf90a95
Uploaded 108 documents for session 076283e3-cdf3-4ce9-8e73-7c7594b34356
Uploaded 108 documents for session 7ce6d7a5-a404-4df8-a228-62483faac015
Uploaded 108 documents for session 4e0e0684-aa37-4826-b6dc-4f13c552a821
Uploaded 108 documents for session d0cbf3af-ecd4-449d-9448-17cbb39f2c4d
Uploaded 108 documents for session de419e2e-45a9-4f2b-b729-b8e429254e43
Uploaded 108 documents for session 86b98ae2-cdec-486d-8e3d-e0bb4e29cb45
Uploaded 108 documents for session 430605a6-2094-48f5-9f7f-de6f45face9f
Uploaded 108 documents for session fafdb69f-310c-437c-ab92-24d69ec64565
Uploaded 108 documents for session e4f31425-032b-45c3-aa5d-db1cc9ff0633
Uploaded 108 documents for session 9a4970c9-b88a-47ed-bc59-80194ddbadfb
Uploaded 108 documents for session 71d35159-a5c0-496f-94d6-3450789a5cf6
Uploaded 108 documents for session bb21a287-0135-4a53-8faf-97205709b508
Uploaded 108 documents for session b6f116d9-0783-4d6a-b39e-11748

### The cells below pick one random session_id from the uploads above and run the same search 10 times to check whether we get consistent results

### Run all of the blocks below multiple times if you do not see inconsistent results


In [18]:
# Query text shared by every search cell below; it is embedded once here.
query = "scalable storage solution"
query_embeddings = generate_embeddings(query)
# Pick one of the uploaded sessions at random. No seed is set, so the chosen
# session can differ each time this cell is run.
session_id = random.choice(session_ids)

search_sticky_session_string = 'st-1203293293' # fixed string passed as the search `session_id` argument in the sticky-session cells

## Perform a Hybrid Search without scoring_statistics & sticky session

In [20]:
# Hybrid Search (baseline): neither scoring_statistics nor a sticky session_id is sent
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Each accumulator receives one inner list per search call, so the 10 runs can be
# compared row by row.
all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    # Materialize the result iterator once so it can be traversed three times below.
    results = list(
        search_client.search(
            search_text=query,
            vector=query_embeddings,
            top_k=15,
            vector_fields="content_vector",
            select=["content", "content_order"],
            filter=f"session_id eq '{session_id}'",
            top=15,
            #scoring_statistics="global",
            #session_id=search_sticky_session_string
        )
    )

    all_call_score_results.append([hit['@search.score'] for hit in results])
    # Only the first 10 characters of the content are kept, as a compact preview.
    all_call_content_results.append([hit['content'][:10] for hit in results])
    all_call_content_orders_results.append([hit['content_order'] for hit in results])


print(f"session_id: {session_id}\n")

# One row per run (run_1 .. run_N); columns are the result positions.
num_rows = len(all_call_content_orders_results)
row_names = [f"run_{run_number}" for run_number in range(1, num_rows + 1)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results, index=row_names))
#print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 81ffe9fe-27f5-40b6-bc07-229ce854001a

        0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
run_1    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_2    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_3    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_4    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_5    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_6    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_7    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_8    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_9    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_10   4  50  36  52  53  51  48   6  69  49  95  71  97  54  10


## Perform a Hybrid Search with scoring_statistics 

In [21]:

# Hybrid search with scoring_statistics="global" and no sticky session_id.
# All three accumulators are reset here so this cell does not depend on, or keep
# appending to, the score list left behind by an earlier search cell.
all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        scoring_statistics="global",
        #session_id=search_sticky_session_string
    )

    # Materialize the result iterator once so it can be traversed three times below.
    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    # Only the first 10 characters of the content are kept, as a compact preview.
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

# One row per run (run_1 .. run_N); columns are the result positions.
num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results, index=row_names))
#print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 81ffe9fe-27f5-40b6-bc07-229ce854001a

        0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
run_1    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_2    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_3    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_4    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_5    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_6    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_7    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_8    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_9    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_10   4  50  36  52  53  51  48   6  69  49  95  71  97  54  10


## Perform a Hybrid Search with sticky session

In [22]:

# Hybrid search with a sticky session_id and no scoring_statistics.
# All three accumulators are reset here so this cell does not depend on, or keep
# appending to, the score list left behind by an earlier search cell.
all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        #scoring_statistics="global",
        session_id=search_sticky_session_string
    )

    # Materialize the result iterator once so it can be traversed three times below.
    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    # Only the first 10 characters of the content are kept, as a compact preview.
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

# One row per run (run_1 .. run_N); columns are the result positions.
num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results, index=row_names))
#print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 81ffe9fe-27f5-40b6-bc07-229ce854001a

        0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
run_1    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_2    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_3    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_4    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_5    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_6    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_7    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_8    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_9    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_10   4  50  36  52  53  51  48   6  69  49  95  71  97  54  10


## Perform a Hybrid Search with scoring_statistics & sticky session

In [23]:

# Hybrid search with both scoring_statistics="global" and a sticky session_id.
# All three accumulators are reset here so this cell does not depend on, or keep
# appending to, the score list left behind by an earlier search cell.
all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = search_client.search(
        search_text=query,
        vector=query_embeddings, top_k=15,
        vector_fields="content_vector",
        select=["content", "content_order"],
        filter=f"session_id eq '{session_id}'",
        top=15,
        scoring_statistics="global",
        session_id=search_sticky_session_string
    )

    # Materialize the result iterator once so it can be traversed three times below.
    results = list(results)
    all_call_score_results.append([result['@search.score'] for result in results])
    # Only the first 10 characters of the content are kept, as a compact preview.
    all_call_content_results.append([result['content'][:10] for result in results])
    all_call_content_orders_results.append([result['content_order'] for result in results])


print(f"session_id: {session_id}\n")

# One row per run (run_1 .. run_N); columns are the result positions.
num_rows = len(all_call_content_orders_results)
row_names = [f"run_{i+1}" for i in range(num_rows)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
#print(pd.DataFrame(all_call_content_results, index=row_names))
#print(pd.DataFrame(all_call_score_results, index=row_names))

session_id: 81ffe9fe-27f5-40b6-bc07-229ce854001a

        0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
run_1    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_2    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_3    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_4    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_5    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_6    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_7    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_8    4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
run_9    4  50  36  52  53  51  48   6  49  71  68  97  54  19  10
run_10   4  50  36  52  53  51  48   6  69  49  95  71  97  54  10
