# Azure Cognitive Search Vector Search Recall Measurement Code Sample

This code demonstrates how to measure recall for vectors in Azure Cognitive Search using Azure Python SDK

## Prerequisites

To run the code, install the following packages. Please use the latest pre-release version `pip install azure-search-documents --pre`.

# https://learn.microsoft.com/en-gb/azure/search/vector-search-overview
# https://learn.microsoft.com/en-us/python/api/overview/azure/?view=azure-python
# https://learn.microsoft.com/en-us/python/api/azure-search-documents/?view=azure-python
# https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/search/azure-search-documents
# https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/search/azure-search-documents/samples

In [None]:
## https://pypi.org/project/azure-search-documents/11.4.0b8/
##! pip install azure-search-documents --pre
# ! pip install azure-search-documents==11.4.0b8
# ! pip install python-dotenv
# ! pip install numpy
# ! pip install azure.storage.blob
# ! pip install scipy

## Import required libraries and environment variables

In [14]:
# Import required libraries  
import os  
import json
import numpy as np
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,
    SearchableField,  
    SearchIndex,  
    SearchIndexerDataContainer,  
    SearchIndexer,  
    SearchIndexerDataSourceConnection,  
    IndexingParameters,
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,  
)  
from azure.storage.blob import BlobServiceClient 
from scipy.spatial.distance import cdist



In [15]:
load_dotenv() 

True

In [18]:
# Configure environment variables  
 

service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
print(service_endpoint)

service_name = os.getenv("AZURE_SERVICE_NAME")
print(service_name)

index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  
print(index_name)

key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
print(key)

blob_connection_string = os.getenv("BLOB_CONNECTION_STRING")  
print(blob_connection_string)

container_name = os.getenv("BLOB_CONTAINER_NAME")
print(container_name)

index_blob_folder = os.getenv("INDEX_BLOB_FOLDER")
print(index_blob_folder)

query_blob_name = os.getenv("QUERY_BLOB_NAME")
print(query_blob_name)

admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
print(admin_key)

index_name=os.getenv("AZURE_SEARCH_INDEX_NAME")
print(index_name)

#credential = AzureKeyCredential(key)
search_field = "contentVector"

# Supports cosine and euclidean
metric = "cosine"
k = 10

https://cog-search-nonprod-eastus-001.search.windows.net
cog-search-nonprod-eastus-001
cogsrch-vector-index
K43abh5MhRQvlEErXNJX5zpu9e4btDtOHJ3qOAPigDAzSeC14WZRY
DefaultEndpointsProtocol=https;AccountName=stcogsvceastus001;AccountKey=bPR606bf28x4VK0mqbTnbOikOJFd5FjDdcU+kGEqSIfF+UOl/CmcIvtcCU/OzEQ7Acyr6hTt+q0Q+AStmMlyOg==;EndpointSuffix=core.windows.net
vector
indexblobfolder
YOUR-QUERYING-DATASET-BLOB-NAME
K43abh5MhRQvlEErXNJX5zpu9e4btDtOHJ3qOAPigDAzSeC14WZRY
cogsrch-vector-index


## Create the seach client

In [19]:
# Set the service endpoint and API key from the environment

# Create an SDK client
endpoint = f"https://{service_name}.search.windows.net/"

index_client = SearchIndexClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=AzureKeyCredential(admin_key))

search_client = SearchClient(endpoint=endpoint,
                      index_name=index_name,
                      credential=AzureKeyCredential(admin_key))

search_indexer_client = SearchIndexerClient(endpoint, AzureKeyCredential(admin_key))

## Create your search index

Create your search index schema and vector search configuration:

In [20]:
# Create a search index
# https://learn.microsoft.com/en-us/python/api/azure-search-documents/azure.search.documents.indexes.models.searchfield?view=azure-python

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String, searchable=True, retrievable=True),
    SearchableField(name="content", type=SearchFieldDataType.String, searchable=True, retrievable=True),
    SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
    SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, dimensions=1536, vector_search_configuration="my-vector-config"),
]

vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": metric
            }
        )
    ]
)

semantic_config = SemanticConfiguration(
    name="vector-semantic-config",
    prioritized_fields=PrioritizedFields(
        title_field=SemanticField(field_name="title"),
        prioritized_content_fields=[SemanticField(field_name="content")]
    )
)

# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
                    vector_search=vector_search, semantic_settings=semantic_settings)


try:
    result = index_client.create_or_update_index(index)
    print(f' {result.name} created')

except Exception as ex:
    print (ex)


dimensions is not a known attribute of class <class 'azure.search.documents.indexes.models._index.SearchField'> and will be ignored
dimensions is not a known attribute of class <class 'azure.search.documents.indexes.models._index.SearchField'> and will be ignored


Operation returned an invalid status 'Forbidden'


## Connect to Blob Storage

In [None]:
# Connect to Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)

blob_url = container_client.get_blob_client(query_blob_name).url
print(f"URL of the blob: {blob_url}")

## Connect your Blob Storage to a data source in Cognitive Search

In [None]:
# Create a data source 
ds_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
container = SearchIndexerDataContainer(name=container_name, query=index_blob_folder)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

## Create an indexer

Create or update an indexer to populate the search index

In [None]:
# Create an indexer  
indexer_name = f"{index_name}-indexer"  
parameters = IndexingParameters(configuration={"parsingMode": "jsonArray"})

indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to index SciFact dataset",  
    target_index_name=index_name,  
    data_source_name=data_source.name,
    parameters=parameters
)  
  
indexer_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)  
print(f' {indexer_name} created')

## Pre-processing indexing dataset from container
Fetch train_vectors, id and map ids to it's list indices for calculating ground truth values and recall.

In [None]:
def load_train_data_from_container(container_client, folder_name):
    blobs = container_client.list_blobs(name_starts_with=folder_name)

    json_array = []
    for blob in blobs:
        blob_client = container_client.get_blob_client(blob)
        blob_data = blob_client.download_blob().readall()
        json_array.append(blob_data.decode('utf-8'))

    return json_array

# Load train data
data = load_train_data_from_container(container_client, index_blob_folder)

# Fetch vectors and id from train dataset and map ids to indices for ground truth and recall calculation
train_data = []
for d in data:
    train_data.extend(json.loads(d))
print(f'Total no. of documents: {len(train_data)}')

train_vectors, train_ids = zip(*[(h["contentVector"], h["id"]) for h in train_data])

train_id_to_indices = {}
for i in range(len(train_ids)):
    train_id_to_indices[f'{train_ids[i]}'] = i

## Load querying dataset from container

In [None]:
# Fetch query vectors from query dataset
test_data_contents = blob_service_client.get_blob_client(container=container_name, blob=query_blob_name).download_blob().readall()
test_data = json.loads(test_data_contents)

query_vectors = [h[search_field] for h in test_data]
print(f'No. of query vectors: {len(query_vectors)}')

## Calculate Ground Truth Values

We calculate ground truth values for each query vector by finding the top 'k' neighbors in train_data based on their distance to the query vector.

In [None]:
# Calculate metric distances between train and test vectors
distances = cdist(query_vectors, train_vectors, metric=metric)

# Get the indices of k closest neighbors for each test vector
indices = np.argsort(distances, axis=1)[:, :k]

# Create a 2D array of k closest neighbors for each test vector
neighbors = np.take(train_ids, indices, axis=0)
print(f'Top {k} neighbors id for 1st query vector: {neighbors[0]}')

# Create a 2D array of the distances of k closest neighbors for each test vector
distances = distances[np.arange(len(query_vectors))[:, np.newaxis], indices]
print(f'Distance of top {k} neighbors for 1st query vector: {distances[0]}')

## Perform Vector search

Perform vector search for all the query vectors and store the response ids for recall measurement

In [None]:
# Perform vector search and fetch the ids from response for each query vector  
search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))  
results = []  
for query in test_data:  
    vector = Vector(value=query["contentVector"], k=k, fields=search_field)  
    result = search_client.search(  
        search_text=None,  
        vectors=[vector],  
        select=["id"]  
    )  
    result_ids = [int(h['id']) for h in result]  
    results.append(result_ids)  
  
print(f'Result ids for 1st query vector: {results[0]}')  

## Measure distance between query vector and its response.

In [None]:
def calculate_query_response_distance(vector1, vector2, metric):
    return float(cdist(vector1, vector2, metric=metric)[0])

query_response_distances = np.empty((len(results), k))
for i in range(len(results)):
    for j in range(k):
        if j >= len(results[i]):
            query_response_distances[i][j] = float('inf')
            continue
        query_response_distances[i][j] = calculate_query_response_distance([query_vectors[i]], [train_vectors[train_id_to_indices[f"{results[i][j]}"]]], metric)

print(f'Query Response distance for 1st query vector: {query_response_distances[0]}')

## Measure Recall

Calculate a threshold value for each query which would be approximately the distance of the k'th vector for the respective query vector in the ground truth value. 
We count all those response vectors as relevant result which are below these threshold values and get the recall.

In [None]:
def calculate_threshold(data, count, epsilon):
    return data[count - 1] + epsilon

recall = np.zeros(len(results))

for i in range(len(results)):
    threshold = calculate_threshold(distances[i], k, 1e-3)
    count = 0
    for d in query_response_distances[i]:
        if d <= threshold:
            count += 1
    recall[i] = count

overall_recall = np.mean(recall) / float(k)
print(f'Recall: {overall_recall}')