In [5]:
from IPython.display import Markdown, display


In [6]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path=".openaidev.env")

True

# Search

In [7]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os


In [8]:
load_dotenv(override=True) # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file
endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
credential = AzureKeyCredential(os.environ["AZURE_SEARCH_ADMIN_KEY"]) if len(os.environ["AZURE_SEARCH_ADMIN_KEY"]) > 0 else DefaultAzureCredential()
index_name = os.environ["AZURE_SEARCH_INDEX"]

In [9]:
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    HnswParameters,
    VectorSearchAlgorithmMetric,
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,

    VectorSearchProfile,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex
)

# Create a search index
# Fields of type Edm.String or Collection(Edm.String) are searchable by default
index_client = SearchIndexClient(endpoint=endpoint, credential=credential)
fields = [
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="file_name", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="base_name", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
    SearchField(name="content", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False, analyzer_name="en.microsoft"),
    # Vector fields must be searchable and retrievable, but they can't be filterable, facetable, or sortable.
    # Vector fields are indexed using algorithms specified in a vector search profile
    SearchField(
        name="content_vector",
        searchable=True,
        filterable=False,
        facetable=False,
        sortable=False,
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile"),
]

# Configure the vector search configuration
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
        ExhaustiveKnnAlgorithmConfiguration(
            name="myExhaustiveKnn",
            parameters=ExhaustiveKnnParameters(
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            # vectorizer="myOpenAI",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm_configuration_name="myExhaustiveKnn",
            # vectorizer="myOpenAI",
        ),
    ],
    # vectorizers=[
    #     AzureOpenAIVectorizer(
    #         name="myOpenAI",
    #         kind="azureOpenAI",
    #         azure_open_ai_parameters=AzureOpenAIParameters(
    #             resource_uri=azure_openai_endpoint,
    #             deployment_id=azure_openai_embedding_deployment,
    #             api_key=azure_openai_key,
    #         ),
    #     ),
    # ],
)

semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="base_name"),
        content_fields=[SemanticField(field_name="content")]
    ),
)

# Create the semantic search with the configuration -- not available for free tier
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the search index
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")


anlp created


## Indexing

https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/basic-vector-workflow/azure-search-vector-python-sample.ipynb


https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python

https://learn.microsoft.com/en-us/python/api/overview/azure/search-documents-readme?view=azure-python#adding-documents-to-your-index 

In [10]:
# DOCUMENT = {
#     "category": "Hotel",
#     "hotelId": "1000",
#     "rating": 4.0,
#     "rooms": [],
#     "hotelName": "Azure Inn",
# }

# # https://learn.microsoft.com/en-us/rest/api/searchservice/addupdate-or-delete-documents#request-body
# result = search_client.upload_documents(documents=[DOCUMENT])

# print("Upload of new document succeeded: {}".format(result[0].succeeded))

### OR

In [11]:
import json

def load_json_files(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                try:
                    # Load JSON content
                    document = json.load(file)
                    # Add document to the buffered sender
                    documents.append(document)
                    print(f"loaded document from {file_name}")
                except Exception as e:
                    print(f"Failed to index document from {file_name}: {e}")
    return documents

In [12]:
documents = load_json_files("./data/documents/search_json")
print(type(documents))

loaded document from Benefit_Options_0000.json
loaded document from Benefit_Options_0001.json
loaded document from Benefit_Options_0002.json
loaded document from Benefit_Options_0003.json
loaded document from Benefit_Options_0004.json
loaded document from dometic_fridge_stove_Sept 2025 20250918_114057_0000.json
loaded document from dometic_fridge_stove_Sept 2025 20250918_114057_0001.json
loaded document from dometic_fridge_stove_Sept 2025 20250918_114057_0002.json
loaded document from employee_handbook_0000.json
loaded document from employee_handbook_0001.json
loaded document from employee_handbook_0002.json
loaded document from employee_handbook_0003.json
loaded document from employee_handbook_0004.json
loaded document from employee_handbook_0005.json
loaded document from employee_handbook_0006.json
loaded document from employee_handbook_0007.json
loaded document from employee_handbook_0008.json
loaded document from employee_handbook_0009.json
loaded document from employee_handbook_00

In [13]:
print(len(documents))

# print(json.dumps(documents[0], indent=2))

print(documents[5])

356
{'chunk_id': 'dometic_fridge_stove_Sept_2025_20250918_114057_0000', 'file_name': './data/documents\\dometic_fridge_stove_Sept 2025 20250918_114057.pdf', 'base_name': 'dometic_fridge_stove_Sept 2025 20250918_114057.md', 'content': '<figure>  \nDOMETIC  \n</figure>', 'content_vector': [0.0030515885446220636, 0.00674192700535059, -0.04250023514032364, -0.0024732083547860384, -0.030504202470183372, 0.02212255634367466, -0.022870361804962158, 0.010967803187668324, 0.001612453954294324, -0.02481777034699917, 0.018866490572690964, -0.04605230689048767, 0.019583135843276978, -0.00475557055324316, 0.031049475073814392, 0.040319137275218964, -0.021997923031449318, 0.02607969008386135, -0.0027847937308251858, 0.07939193397760391, 0.030613256618380547, 0.005382636096328497, 0.0060875979252159595, -0.021873287856578827, -0.00931640062481165, 0.003879236988723278, -0.043746575713157654, 0.055244073271751404, 0.042593710124492645, -0.02782456763088703, -0.02017514780163765, -0.07951656728982925, 

In [14]:
from azure.search.documents import SearchIndexingBufferedSender
import time
from tqdm import tqdm

# use tqdm to show progress bar
with tqdm(total=len(documents), desc="Uploading documents", unit="document") as pbar:
    # https://learn.microsoft.com/en-us/rest/api/searchservice/addupdate-or-delete-documents#request-body
# Use SearchIndexingBufferedSender to upload the documents in batches optimized for indexing
    with SearchIndexingBufferedSender(
        endpoint=endpoint,
        index_name=index_name,
        credential=credential,
    ) as batch_client:
        # Add upload actions for all documents
        for document in documents:
            batch_client.merge_or_upload_documents(documents=[document])
            # insert a wait for 1 sec
            time.sleep(1)
            pbar.update(1)
    print(f"Uploaded {len(documents)} documents in total")



Uploading documents: 100%|██████████| 356/356 [05:57<00:00,  1.00s/document]

Uploaded 356 documents in total





In [15]:
from azure.search.documents.indexes import SearchIndexClient

client = SearchIndexClient(endpoint=endpoint, credential=credential)



In [16]:
index = client.get_index(index_name)

In [17]:
for field in index.fields:
    print(f"{field.name}: {field.type}")

chunk_id: Edm.String
file_name: Edm.String
base_name: Edm.String
content: Edm.String
content_vector: Collection(Edm.Single)


In [18]:
index_statistics = client.get_index_statistics(index_name)
print(f"Document count: {index_statistics}")

Document count: {'document_count': 356, 'storage_size': 7680619, 'vector_index_size': 6625676}
