# Azure AI Search: vector search, step by step using Push Method

In [1]:
import os

import dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

# Load variables from a local .env file (if one exists) into the process environment.
dotenv.load_dotenv()

# Required settings are read with os.environ[...] so that a missing variable raises a
# KeyError naming the variable, instead of silently building the endpoint
# "https://None.search.windows.net" or failing later with an unrelated error.
AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"]
AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"
AZURE_SEARCH_KEY = os.environ["AZURE_SEARCH_ADMIN_KEY"]

# The remaining settings keep os.getenv semantics: they are None when unset
# (blob_container_name falls back to "int-vec").
blob_container_name = os.getenv("BLOB_CONTAINER_NAME", "int-vec")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_model_name = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
# Required: int(None) would otherwise raise an opaque TypeError when the variable is unset.
azure_openai_model_dimensions = int(os.environ["AZURE_OPENAI_EMBEDDING_DIMENSIONS"])
azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
azure_ai_service_key = os.getenv("AI_SERVICE_KEY")

# Admin-key credential and index-management client shared by the cells below.
azure_credential = AzureKeyCredential(AZURE_SEARCH_KEY)
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=azure_credential)

## Create Index

In [17]:
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex,
    VectorSearchAlgorithmKind,
    HnswParameters
)

index_name = "index-name-push-method"

# Field definitions for the index. chunk_id is the key field; text_vector holds the
# embedding and is bound to the "myHnswProfile" vector search profile defined below.
# NOTE(review): content and title use index_analyzer_name="keyword" together with
# search_analyzer_name="standard". Azure documents the keyword analyzer as emitting the
# whole field as a single token, and lists the standard analyzer as "standard.lucene" --
# confirm that full-text queries on these fields behave as intended.
fields = [
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        retrievable=True,
        searchable=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
        retrievable=True,
        searchable=False,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchField(
        name="content",
        type=SearchFieldDataType.String,
        retrievable=True,
        searchable=True,
        index_analyzer_name="keyword",
        search_analyzer_name="standard",
    ),
    SearchField(
        name="title",
        type=SearchFieldDataType.String,
        retrievable=True,
        filterable=True,
        searchable=True,
        index_analyzer_name="keyword",
        search_analyzer_name="standard",
    ),
    SearchField(
        name="url",
        type=SearchFieldDataType.String,
        retrievable=True,
        searchable=False,
    ),
    SearchField(
        name="text_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=azure_openai_model_dimensions,
        vector_search_profile_name="myHnswProfile",
    ),
]

# Vector search setup: an HNSW algorithm with cosine metric, a profile that ties the
# algorithm to the Azure OpenAI vectorizer, and the vectorizer itself.
hnsw_algorithm = HnswAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(metric="cosine"),
)
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm_configuration_name="myHnsw",
    vectorizer="myOpenAI",
)
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=azure_openai_endpoint,
        model_name=azure_openai_model_name,
        # The same environment value is used for both the model and the deployment id.
        deployment_id=azure_openai_model_name,
        api_key=azure_openai_key,
    ),
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm],
    profiles=[hnsw_profile],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration that prioritizes the "content" field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create the index, or update it if an index with this name already exists.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

index-name-push-method created


## Data Preparation using Document Intelligence

In [2]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat, AnalyzeResult
import base64

# Remember to remove the key from your code when you're done, and never post it publicly.
# For production, use secure methods to store and access your credentials. For more
# information, see
# https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
#
# os.environ[...] (rather than os.getenv) makes a missing variable fail in this cell with
# a KeyError naming it, instead of passing None on to the client constructor later.
doc_int_endpoint = os.environ["DOC_INT_ENDPOINT"]
doc_int_key = os.environ["DOC_INT_API_KEY"]

In [4]:
def analyze_layout_local_file(file_path) -> "AnalyzeResult":
    """Analyze a local file with the "prebuilt-layout" model and return the result.

    The file is read in binary mode, base64-encoded and sent as the request's
    "base64Source"; the service is asked for Markdown output.

    Args:
        file_path: Path of the local file to analyze.

    Returns:
        The result object returned by the analysis poller.
    """
    with open(file_path, "rb") as f:
        base64_encoded_pdf = base64.b64encode(f.read()).decode("utf-8")

    analyze_request = {
        "base64Source": base64_encoded_pdf
    }

    # Use the client as a context manager so it is closed once the analysis has
    # finished, instead of leaving a new unclosed client behind on every call.
    with DocumentIntelligenceClient(
        endpoint=doc_int_endpoint, credential=AzureKeyCredential(doc_int_key)
    ) as document_intelligence_client:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout", analyze_request=analyze_request,
            output_content_format=ContentFormat.MARKDOWN,
        )
        # Wait for completion while the client is still open.
        return poller.result()

In [5]:
def save_to_markdown(content, output_file_path):
    """
    Saves the structured content to a Markdown file.

    Args:
        content: Either a single string, or an iterable of strings that are
            written back to back (no separators are added).
        output_file_path: Destination path; the file is created or overwritten
            and encoded as UTF-8.
    """
    with open(output_file_path, 'w', encoding='utf-8') as f:
        if isinstance(content, str):
            # A string would otherwise be written one character per write() call.
            f.write(content)
        else:
            f.writelines(content)

In [6]:
# Run layout analysis on the sample PDF; the relative path is resolved against the
# kernel's current working directory.
result = analyze_layout_local_file("Credit_Card_Benefits_Table.pdf")

In [10]:
# Show the extracted content of the analysis result as the cell output.
result.content

'Travel Credit Card\n===\n\n| Judul | Credit Card Types |||\n|| Gold | Premium | Standard |\n| - | - | - | - |\n| Benefits - | 1x point setiap transaksi Rp30.000 di merchant partner | 2x point setiap transaksi Rp50.000 di merchant partner | · Tidak mendapatkan point · Mendapatkan diskon 5% setiap belanja minimal Rp500.000 di merchant partner · Khusus transaksi menggunakan QRIS mendapatkan potongan 7% |\n| Syarat | · Minimal 21 tahun · Mnimal pendapatan Rp5juta | · Minimal 25 tahun · Pendapatan perbulan Rp20juta | · Minimal 17 tahun · Minimal pendapatan perbulan 3juta |\n| Annual Fee | Rp.300.000/tahun | Rp.1000.000/tahun | Rp.200.000/tahun atau gratis dengan transaksi minimal 3 kali sebulan |\n| Welcome Bonus | - | 1000 points | - |\n| More Benefits | Travel benefits Bebas akses airport lounge di seluruh bandara di Indonesia Travel Insurance Gratis perlindungan asuransi perjalanan sd Rp1Miliar. | Travel benefits Bebas akses airport lounge di seluruh bandara di Indonesia dan luar negeri

## Data Prep for Indexing

In [13]:
import uuid
from openai import AzureOpenAI
# Random UUID string; used below as the document's parent_id and as the prefix of its chunk_id.
generated_index = str(uuid.uuid4())

# Reuse the endpoint and key loaded in the configuration cell rather than reading the
# environment a second time, so the embedding client and the index vectorizer are
# configured from the same values.
client = AzureOpenAI(azure_endpoint=azure_openai_endpoint,
                     api_key=azure_openai_key,
                     api_version="2024-04-01-preview")

def embed(text):
    """Return the embedding of `text` produced by the module-level `client`.

    The request passes `azure_openai_model_name` as the model and returns the
    `embedding` of the first item in the response data.
    """
    response = client.embeddings.create(input=text, model=azure_openai_model_name)
    first_item = response.data[0]
    return first_item.embedding

In [14]:
# The single document to upload; its keys are the field names declared for the index.
# NOTE(review): the whole extracted content is embedded in one request -- confirm it
# stays within the embedding model's input limit for larger files.
value = dict(
    chunk_id=f"{generated_index}_page_1",
    parent_id=generated_index,
    content=result.content,
    title="Credit_Card_Benefits_Table",
    url="",
    text_vector=embed(result.content),
)

## Push to AI Search

In [18]:
from azure.search.documents import SearchClient

# Upload documents to the index
search_client = SearchClient(endpoint=AZURE_SEARCH_ENDPOINT,
                      index_name=index_name,
                      credential=azure_credential)
try:
    # upload_documents is documented to take a list of documents, so the single
    # document dict is wrapped in a list. The outcome is stored under its own name so
    # the `result` used to build `value` is not overwritten when this cell runs.
    upload_results = search_client.upload_documents(documents=[value])
    for item in upload_results:
        print("Upload of new document succeeded: {}".format(item.succeeded))
except Exception as ex:
    # Not every exception has a .message attribute; fall back to the exception itself
    # so the original error is reported instead of an AttributeError.
    print(getattr(ex, "message", ex))


Upload of new document succeeded: True
