# Azure AI Search integrated vectorization sample


In [1]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os


In [2]:

# Load settings from a local .env file; override=True lets the file win over
# variables already present in the process environment.
load_dotenv(override=True) # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# The admin key is optional: when it is unset OR empty, fall back to
# DefaultAzureCredential. os.getenv is used instead of os.environ[...] so a
# missing variable takes the fallback rather than raising KeyError.
search_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY", "")
credential = AzureKeyCredential(search_admin_key) if search_admin_key else DefaultAzureCredential()

index_name = os.environ["AZURE_SEARCH_INDEX"]
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = os.environ["BLOB_CONTAINER_NAME"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]

# Same optional treatment for the Azure OpenAI key: unset or empty becomes None.
azure_openai_key = os.getenv("AZURE_OPENAI_KEY") or None
azure_openai_embedding_deployment = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT"]

In [None]:
import re 
def get_page_number(chunk_id: str):
    """Return the page number encoded at the end of a chunk id.

    The id is expected to end with ``_pages_<digits>``. The digits are
    returned as a string; ``None`` is returned when the id does not end
    with that suffix.
    """
    suffix = re.search(r'_pages_(\d+)$', chunk_id)
    return suffix.group(1) if suffix else None

## Perform a vector similarity search

This example shows a pure vector search using a vectorizable text query: you only need to pass in text, and your vectorizer handles the query vectorization.

If you indexed the health plan PDF file, send queries that ask plan-related questions.

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Vector query with semantic ranking: the same text is sent as the keyword
# search text and as a vectorizable text query, and the request asks for
# semantic reranking with extractive answers and captions.
# For a pure vector search, pass search_text=None and drop the semantic options.
query = "eye test"
# "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

topk = 5

search_client = SearchClient(endpoint, index_name, credential=credential)

# Vector fields to query; one VectorizableTextQuery is built per field.
vector_fields = ["vector"]

vector_queries = [
    VectorizableTextQuery(
        text=query,
        k_nearest_neighbors=topk,
        fields=field,
        exhaustive=True,
    )
    for field in vector_fields
]

results = search_client.search(
    search_text=query,
    vector_queries=vector_queries,
    select=["parent_id", "chunk_id", "chunk", "title"],
    top=topk,
    include_total_count=True,
    query_type="semantic",
    semantic_configuration_name="my-semantic-config",
    query_answer="extractive",
    query_answer_count=5,
    query_caption="extractive",
    query_caption_highlight_enabled=True,
)

for result in results:
    print(f"title: {result['title']}")
    print(f"parent_id: {result['parent_id']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"Score: {result['@search.score']}")
    print(f"Page: {get_page_number(result['chunk_id'])}")

    # A result may come back without captions; guard before indexing so an
    # empty/None value does not abort the whole loop.
    captions = result['@search.captions']
    if captions:
        print(f"search captions: {captions[0].text}")
        print(f"search captions highlights: {captions[0].highlights}")


In [None]:
# `result` is whatever the most recently run loop left bound (its last item);
# list the keys that item carries.
print(result.keys())

In [None]:
# Keep the captions entry of the last result so the following cells can inspect it.
search_caption = result["@search.captions"]

In [None]:
# Print every caption object; `sc` stays bound to the last one after the loop
# and is read by the next cell.
for sc in search_caption: 
    print(sc)

In [None]:
# Inspect the last caption from the loop: its text, highlights and additional
# properties, then the keys of its dict form (bare expression, shown as the
# cell output).
print(sc.text)
print(sc.highlights)
print(sc.additional_properties)
sc.as_dict().keys()

## Perform a hybrid search

In [None]:
# Hybrid Search
query = "eye test" #"Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)

# NOTE(review): vector_queries is an empty list, so `vector_query` above is not
# sent and this request is text-only. Pass [vector_query] to make it hybrid.
results = search_client.search(
    search_text=query,
    vector_queries=[],  # [vector_query],
    select=["parent_id", "chunk_id", "chunk", "title"],
    # top=1
)

for result in results:
    # Documents with no parent_id get their chunk_id and score printed once
    # more ahead of the regular fields, which makes them easy to spot.
    if result["parent_id"] is None:
        print(f"chunk_id: {result['chunk_id']}")
        print(f"Score: {result['@search.score']}")

    print(f"title: {result['title']}")
    print(f"parent_id: {result['parent_id']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"Score: {result['@search.score']}")
    print(f"Page: {get_page_number(result['chunk_id'])}")

# Keys of the last result printed by the loop.
print(result.keys())

In [None]:
import re

# Same suffix pattern that get_page_number() matches; kept as a module-level
# name in case other cells still reference it.
page_re = r'_pages_(\d+)$'

# Text-only query restricted by an OData filter on parent_id.
results = search_client.search(
    search_text=query,
    vector_queries=[],  # [vector_query],
    select=["parent_id", "chunk_id", "chunk", "title"],
    # filter="parent_id ne null", # retrieves chunks only
    filter="parent_id eq null",  # retrieves parent documents only
    # top=1
)

for result in results:
    print(f"chunk_id: {result['chunk_id']}")
    print(f"Score: {result['@search.score']}")
    print(f"Title: {result['title']}")
    # Reuse the shared helper instead of repeating the regex here; it returns
    # None when the chunk id has no page suffix, in which case nothing is printed.
    page_number = get_page_number(result['chunk_id'])
    if page_number is not None:
        print(f"Page: {page_number}")

# Keys of the last result printed by the loop.
print(result.keys())

In [None]:
# `results` was already iterated to the end by the printing loop in the
# previous cell, and the search pager is a single-pass iterator, so looping
# over it again would yield nothing. Re-issue the same parent-documents query
# and collect the ids from a fresh pager.
parent_results = search_client.search(
    search_text=query,
    vector_queries=[],
    select=["parent_id"],
    filter="parent_id eq null",
)
parent_ids = [doc['parent_id'] for doc in parent_results]

In [None]:
# Number of parent ids collected in the previous cell (bare expression, shown
# as the cell output).
len(parent_ids)

In [None]:
# Dump the last bound result: first its keys, then the full object.
print(result.keys())
print(result)

## Perform a hybrid search + semantic reranking

In [None]:
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)
# Semantic Hybrid Search
# Sends the question both as keyword text and as a vectorizable text query,
# and asks for semantic ranking with extractive answers and captions.
query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="vector",
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=1,
)

# Answers are read from the pager before the documents are iterated.
# Nothing is printed when no answers came back.
semantic_answers = results.get_answers()
for answer in semantic_answers or []:
    # Prefer the highlighted form of the answer, falling back to plain text.
    if not answer.highlights:
        print(f"Semantic Answer: {answer.text}")
    else:
        print(f"Semantic Answer: {answer.highlights}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"parent_id: {result['parent_id']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    print(f"Content: {result['chunk']}")

    # Only the first caption is shown; skip results that have none.
    captions = result["@search.captions"]
    if not captions:
        continue
    caption = captions[0]
    if not caption.highlights:
        print(f"Caption: {caption.text}\n")
    else:
        print(f"Caption: {caption.highlights}\n")
