Install the required libraries

In [None]:
! pip install python-dotenv
! pip install numpy
! pip install azure-core
! pip install azure-search-documents
! pip install openai

Import the necessary libraries

In [None]:
import os
import json
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
import numpy as np
from openai import AzureOpenAI
from azure.core.exceptions import ResourceNotFoundError
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient, SearchIndexingBufferedSender 
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.indexes.models import (
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,
    SearchField,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SimpleField,
    SearchableField,
    VectorSearchAlgorithmMetric,
)

# Read settings from a .env file (if one is found) so that the os.environ.get(...)
# lookups in the following cells can pick them up.
load_dotenv()

Get the credentials for Azure AI Search from the environment variables

In [None]:
# Azure AI Search connection settings, read from the environment.
# Each value is None when the corresponding variable is not set.
key, endpoint, index_name = (
    os.getenv(variable)
    for variable in ('AI_SEARCH_KEY', 'AI_SEARCH_ENDPOINT', 'AI_SEARCH_INDEX_NAME')
)

In [None]:
# Show the index name that was read from AI_SEARCH_INDEX_NAME (None means it is not set)
index_name

Create a search index in Azure AI Search with Vector Search enabled

> _Note: If you want to change the fields in the AI Search index, update the field definitions in the cell below._

In [None]:
# Fail fast with a clear message instead of an opaque SDK error further down
if not endpoint or not index_name:
    raise ValueError(
        'AI_SEARCH_ENDPOINT and AI_SEARCH_INDEX_NAME must be set in the environment')

# Create the SearchIndexClient.
# With an API key use key-based auth; without one fall back to DefaultAzureCredential.
if key is None:
    credential = DefaultAzureCredential()
else:
    credential = AzureKeyCredential(key)
index_client: SearchIndexClient = SearchIndexClient(
    endpoint=endpoint, credential=credential)

# Check if the index exists. Only the lookup is guarded, so an error raised while
# creating the index is not reported as part of handling ResourceNotFoundError.
try:
    index_client.get_index(name=index_name)
    index_exists = True
except ResourceNotFoundError:
    index_exists = False

if index_exists:
    print(f'Index {index_name} already exists')
else:
    # AI Search fields configuration
    # NOTE: vector_search_dimensions (1536) has to equal the length of the vectors
    # returned by the embeddings deployment - verify this for the model you use.
    fields = [
        SimpleField(name='id', type=SearchFieldDataType.String,
                    key=True, filterable=True),
        SearchableField(name='title', type=SearchFieldDataType.String),
        SearchField(name='title_vector', type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=1536, vector_search_profile_name='defaultHnswProfile'),
        SearchableField(name='content', type=SearchFieldDataType.String),
        SearchField(name='content_vector', type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                    searchable=True, vector_search_dimensions=1536, vector_search_profile_name='defaultHnswProfile'),
        SearchableField(name='tag', type=SearchFieldDataType.String, filterable=True),
        SearchableField(name='metadata', type=SearchFieldDataType.String)
    ]
    # Vector search configuration: one HNSW algorithm (cosine metric) and one profile
    # that points at it; both vector fields above reference the profile by name.
    vector_search = VectorSearch(
        algorithms=[
            HnswAlgorithmConfiguration(
                name='defaultHnsw',
                kind=VectorSearchAlgorithmKind.HNSW,
                parameters=HnswParameters(
                    m=4,
                    ef_construction=400,
                    ef_search=500,
                    metric=VectorSearchAlgorithmMetric.COSINE
                )
            )
        ],
        profiles=[
            VectorSearchProfile(
                name="defaultHnswProfile",
                algorithm_configuration_name="defaultHnsw",
            )
        ]
    )

    # Create the search index with the vector search
    index = SearchIndex(name=index_name, fields=fields,
                        vector_search=vector_search)
    index_client.create_index(index)
    print(f'Successfully created index {index_name} with vector search')

Define helper functions to generate embeddings for the provided text and to compute the cosine similarity of two vectors

In [None]:
def generate_embeddings(text, client, model): # model = "deployment_name"
    """Return the embedding vector for ``text``.

    ``text`` is sent as a single-element input list to
    ``client.embeddings.create`` together with ``model`` (the deployment
    name). The ``embedding`` attribute of the first entry in the response's
    ``data`` is returned.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    Computed as ``dot(a, b) / (norm(a) * norm(b))``.

    If either vector has zero norm the quotient is undefined; in that case
    0.0 is returned instead of dividing by zero (which would yield ``nan``
    and a NumPy RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

Get the credentials for Azure OpenAI Service from the environment variables and initialize the client

In [None]:
# Azure OpenAI settings, taken from the environment (None when a variable is unset)
aoai_endpoint = os.getenv('AOAI_ENDPOINT')
aoai_key = os.getenv('AOAI_KEY')
aoai_deployment = os.getenv('AOAI_EMBEDDINGS_DEPLOYMENT')
aoai_api_version = os.getenv('AOAI_API_VERSION')

# Build the client that is used for the embeddings requests
aoai_client = AzureOpenAI(
    azure_endpoint=aoai_endpoint,
    api_key=aoai_key,
    api_version=aoai_api_version,
)

Get the product data from your API or load it from a file

In [None]:
# Location of the JSON file that holds the product data
file_path = '/path_to_the_file/file.json'

# Parse the file contents (the data is expected to be a list of dictionaries)
with open(file_path, mode='r', encoding='utf-8') as json_file:
    products_data = json.loads(json_file.read())


Create Search Client for Azure AI Search

In [None]:
# Apply the same authentication rule as the index client: use the API key when
# AI_SEARCH_KEY is set, otherwise DefaultAzureCredential. Passing a None key to
# AzureKeyCredential would raise instead of falling back.
search_credential = DefaultAzureCredential() if key is None else AzureKeyCredential(key)
search_client = SearchClient(endpoint=endpoint, index_name=index_name, credential=search_credential)

Generate embeddings of the title and content of the product data using the Azure OpenAI Service

In [None]:
# File that collects the documents together with their embedding vectors
VECTOR_FILE = 'products_vector.json'

# Start the file afresh with the opening bracket of a JSON array
with open(VECTOR_FILE, mode='w', encoding='utf-8') as vector_file:
    print('[', file=vector_file)

In [None]:
# Embed the title and content of every product and write the enriched documents to
# VECTOR_FILE as a single JSON array.
# The file is opened once in 'w' mode and the opening bracket is written here, so this
# cell can be re-run on its own without appending a second array to an existing file.
total_products = len(products_data)

with open(VECTOR_FILE, 'w', encoding='utf-8') as f:
    f.write('[\n')
    for i, product_data in enumerate(products_data):
        print(f'Processing {i+1} of {total_products}')
        title_vector = generate_embeddings(product_data['title'], aoai_client, model=aoai_deployment)
        content_vector = generate_embeddings(product_data['content'], aoai_client, model=aoai_deployment)
        # The vectors are stored on the product dictionary itself (products_data is updated in place)
        product_data['title_vector'] = title_vector
        product_data['content_vector'] = content_vector

        # Separate consecutive documents with a comma; nothing follows the last one
        if i > 0:
            f.write(',\n')
        json.dump(product_data, f, ensure_ascii=False)
        # Push each document to disk right away so progress is visible while the loop runs
        f.flush()
    f.write('\n]')

Map your content to the search index schema, then loop through it and upload the documents in batches.

_Note: A maximum of 1000 documents can be uploaded per `upload_documents` API call._

In [None]:
# Number of documents sent per upload_documents call
MAX_UPLOAD_BATCH_SIZE = 999

# Load the documents (including their vectors) written to VECTOR_FILE
with open(VECTOR_FILE, 'r', encoding='utf-8') as file:
    vectorized_data = json.load(file)

# Split vectorized_data into batches
batches = [vectorized_data[i:i+MAX_UPLOAD_BATCH_SIZE] for i in range(0, len(vectorized_data), MAX_UPLOAD_BATCH_SIZE)]

# Upload documents in batches
for i, batch in enumerate(batches):
    response = search_client.upload_documents(documents=batch)
    # Stop at the first batch in which any document was rejected and say which ones
    failed = [r for r in response if not r.succeeded]
    if failed:
        details = '; '.join(f'{r.key}: {r.error_message}' for r in failed)
        raise Exception(
            f'Batch {i+1}: {len(failed)} of {len(batch)} documents failed to upload ({details})')

    # Report the size of this batch, not of the whole data set
    print(f'Batch {i+1} with {len(batch)} documents uploaded successfully')

Perform a hybrid search (keyword + vector) and get top 5 search results

In [None]:
query = 'Product'

# Embed the query text with the same deployment that was used for the documents
query_embedding = generate_embeddings(query, aoai_client, model=aoai_deployment)
# A single vector query that targets both vector fields of the index
vector_query = VectorizedQuery(vector=query_embedding, k_nearest_neighbors=5, fields="title_vector, content_vector")

# Hybrid search: search_text drives the keyword part, vector_queries the vector part.
# select restricts the response to the fields printed below, so the title_vector and
# content_vector arrays are not sent back with every hit.
results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=['id', 'title', 'content'],
    top=5
)

for result in results:
    print(f"Id: {result['id']}")
    print(f"Title: {result['title']}")
    print(f"Content: {result['content']}\n")