# Azure Cognitive Search Vector Search Code Sample with Azure OpenAI
This notebook demonstrates how to use Azure Cognitive Search vector search with Azure OpenAI embeddings through the Azure SDK for Python.
## Prerequisites
To run the code, install the following packages. 

In [None]:
! pip install azure-search-documents==11.4.0b6
! pip install python-dotenv
! pip install pandas
! pip install openai==0.28.1
! pip install tenacity

## Import required libraries and environment variables

In [None]:
# Import required libraries
import os
import json
import uuid
import openai
import random
import pandas as pd
from dotenv import load_dotenv
from tenacity import retry, wait_random_exponential, stop_after_attempt
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    SearchIndex,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchField,
    SemanticSettings,
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
)

# Configure environment variables
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``, failing fast if unset.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The non-empty string value of the variable.

    Raises:
        ValueError: If the variable is missing or empty, so a misconfigured
            environment is reported here with the variable's name instead of
            surfacing later as an unrelated-looking SDK error.
    """
    value = os.getenv(name)
    if not value:
        raise ValueError(
            f"Required environment variable {name!r} is not set; "
            "define it in the environment or in your .env file"
        )
    return value


# Azure Cognitive Search connection settings
service_endpoint = _require_env("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = _require_env("AZURE_SEARCH_INDEX_NAME")
key = _require_env("AZURE_SEARCH_ADMIN_KEY")
credential = AzureKeyCredential(key)

# Azure OpenAI client settings (module-level configuration of the openai package)
openai.api_type = "azure"
openai.api_key = _require_env("AZURE_OPENAI_API_KEY")
openai.api_base = _require_env("AZURE_OPENAI_ENDPOINT")
openai.api_version = _require_env("AZURE_OPENAI_API_VERSION")

In [7]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text):
    """Return the embedding for ``text`` from the configured embedding engine.

    The engine name is read from the AZURE_OPENAI_EMBEDDING_ENGINE environment
    variable on every call. The call is wrapped by tenacity's ``retry`` with a
    random exponential wait (min=1, max=20) and stops after 6 attempts.

    Args:
        text: Input passed to ``openai.Embedding.create`` as ``input``.

    Returns:
        The ``'embedding'`` value of the first item in the response's ``'data'``.
    """
    engine_name = os.getenv("AZURE_OPENAI_EMBEDDING_ENGINE")
    api_response = openai.Embedding.create(input=text, engine=engine_name)
    first_item = api_response['data'][0]
    return first_item['embedding']

## Create your search index
Create your search index schema and vector search configuration:

In [42]:
# Build the index schema and its vector search configuration, then create
# (or update) the index on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# HNSW parameters for the algorithm configuration referenced by the vector field.
hnsw_settings = {
    "m": 4,
    "efConstruction": 400,
    "efSearch": 500,
    "metric": "cosine",
}
vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters=hnsw_settings,
        ),
    ]
)

# Document key.
id_field = SimpleField(
    name="id",
    type=SearchFieldDataType.String,
    key=True,
    sortable=True,
    filterable=True,
    facetable=True,
)
# Filterable so queries can be restricted to a single upload session.
session_id_field = SimpleField(
    name="session_id",
    type=SearchFieldDataType.String,
    filterable=True,
    retrievable=True,
)
content_order_field = SimpleField(
    name="content_order",
    type=SearchFieldDataType.Int32,
    retrievable=True,
)
# Full-text searchable content.
content_field = SearchableField(name="content", type=SearchFieldDataType.String)
# Vector field; its configuration name must match the algorithm configuration above.
content_vector_field = SearchField(
    name="content_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_configuration="my-vector-config",
)
fields = [
    id_field,
    session_id_field,
    content_order_field,
    content_field,
    content_vector_field,
]

# Create the search index with the fields and vector search configuration
# (no semantic settings are passed here).
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 test-replica-issue created


## Insert text and embeddings into vector store multiple times 
Add texts and metadata from the JSON data to the vector store:

In [43]:
# Number of times the base document set is uploaded, each time under a new session_id.
NUM_SESSIONS = 100

with open('input_data.json', 'r', encoding='utf-8') as file:
    base_documents = json.load(file)

session_ids = []
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
for _ in range(NUM_SESSIONS):
    session_id = str(uuid.uuid4())
    # Build fresh copies for this session rather than aliasing base_documents,
    # so the loaded base data is not mutated in place on every iteration.
    documents = [
        {**base_document, 'id': str(uuid.uuid4()), 'session_id': session_id}
        for base_document in base_documents
    ]

    result = search_client.upload_documents(documents)
    # upload_documents returns one indexing result per document; check them so
    # partial failures are reported instead of being silently counted as uploaded.
    failed_keys = [item.key for item in result if not item.succeeded]
    print(f"Uploaded {len(documents) - len(failed_keys)} documents for session {session_id}")
    if failed_keys:
        print(f"WARNING: {len(failed_keys)} documents failed to index for session {session_id}")
    session_ids.append(session_id)

Uploaded 108 documents for session e044179b-238b-41ea-a6e3-df4770802288
Uploaded 108 documents for session fa100f23-4a60-43cd-83cc-78bf3e4b54ae
Uploaded 108 documents for session d2222cc3-8652-4b1b-a8b7-fdac8d51c3e0
Uploaded 108 documents for session a3685650-4d42-4ae5-ae62-ae83d04c16a2
Uploaded 108 documents for session 72a7973e-ecf6-4c6e-a036-4e5b4b53ba64
Uploaded 108 documents for session 0260bdcb-679f-4385-a7ba-0d08595237df
Uploaded 108 documents for session 6d5fb11f-2b1a-41da-8f7b-d718797b2162
Uploaded 108 documents for session 8ba8e684-4e99-4ca8-b8ba-a16165baad2a
Uploaded 108 documents for session 3f08c267-0143-491c-9c47-e5f777cc9c82
Uploaded 108 documents for session 1e3b710a-f21a-4687-9852-c5598719debb
Uploaded 108 documents for session 9f1a30eb-0679-4406-a37f-b5dff333545c
Uploaded 108 documents for session eba94cad-3329-40fb-a5c2-0b4810d684d7
Uploaded 108 documents for session b6603e38-89d3-442c-8f14-d86f49052d37
Uploaded 108 documents for session e0890e3a-53b7-4e9c-9e7e-c2cf3

## Perform a Hybrid Search

This picks one random session_id from the sessions uploaded above and runs the same search 10 times to check whether we get consistent results.

Run the block below multiple times if you do not see inconsistent results.


In [46]:
# Hybrid Search: run the same text + vector query repeatedly against one
# randomly chosen session and tabulate what each run returned.
query = "scalable storage solution"
query_embeddings = generate_embeddings(query)
session_id = random.choice(session_ids)
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

# Identical arguments are sent on every run so the runs are directly comparable.
search_options = dict(
    search_text=query,
    vector=query_embeddings,
    top_k=15,
    vector_fields="content_vector",
    select=["content", "content_order"],
    filter=f"session_id eq '{session_id}'",
    top=15,
)

# One inner list per run, in the order the hits were returned.
all_call_score_results = []
all_call_content_results = []
all_call_content_orders_results = []
for i in range(10):
    results = list(search_client.search(**search_options))

    run_scores, run_previews, run_orders = [], [], []
    for hit in results:
        run_scores.append(hit['@search.score'])
        run_previews.append(hit['content'][:10])  # first 10 characters only
        run_orders.append(hit['content_order'])

    all_call_score_results.append(run_scores)
    all_call_content_results.append(run_previews)
    all_call_content_orders_results.append(run_orders)


print(f"session_id: {session_id}\n")

# Rows are runs (run_1 ...), columns are result positions.
num_rows = len(all_call_content_orders_results)
row_names = [f"run_{run_number}" for run_number in range(1, num_rows + 1)]
print(pd.DataFrame(all_call_content_orders_results, index=row_names))
# Alternative views (uncomment to inspect content previews or scores):
#print(pd.DataFrame(all_call_content_results, index=row_names))
#print(pd.DataFrame(all_call_score_results, index=row_names))


session_id: bd67dbe0-ea72-4958-8d18-29c49aeee6ad

        0   1   2   3   4   5   6   7   8   9   10  11  12  13   14
run_1   50  69  71  54  10  55  74   4  70  75  36  51  52  53   48
run_2   50  69  46  19  10  55  70  74   4  75  36  51  52  53   48
run_3   50  36  52   6  95  10  55  74   4  75  51  53  48  56  103
run_4   50  36  52  46  95  10  55  74   4  75  51  53  48   6   56
run_5   50  69  71  54  10  55  74   4  70  75  36  51  52  53   48
run_6   50  69  46  19  10  55  70  74   4  75  36  51  52  53   48
run_7   50  36  52   6  95  10  55  74   4  75  51  53  48  56  103
run_8   50  36  52  46  95  10  55  74   4  75  51  53  48   6   56
run_9   50  69  71  54  10  55  74   4  70  75  36  51  52  53   48
run_10  50  69  46  19  10  55  70  74   4  75  36  51  52  53   48
