# Azure AI Search: vector search, step by step using Pull Method

In [27]:
import os

import dotenv
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential

# Load settings from a local .env file (if present) into the process environment.
dotenv.load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of an opaque ``TypeError`` from ``int(None)``
            or an endpoint such as ``https://None.search.windows.net``.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Azure AI Search service: the endpoint URL is derived from the service name.
AZURE_SEARCH_SERVICE = _require_env("AZURE_SEARCH_SERVICE")
AZURE_SEARCH_ENDPOINT = f"https://{AZURE_SEARCH_SERVICE}.search.windows.net"
AZURE_SEARCH_KEY = _require_env("AZURE_SEARCH_ADMIN_KEY")

# Settings read here and consumed by the later cells (index, skillset, data source).
# These stay optional at this point; only the container name has a default.
blob_container_name = os.getenv("BLOB_CONTAINER_NAME", "int-vec")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_model_name = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
# Must be an integer: it is used as the vector field dimension and the embedding skill dimension.
azure_openai_model_dimensions = int(_require_env("AZURE_OPENAI_EMBEDDING_DIMENSIONS"))
azure_storage_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
azure_ai_service_key = os.getenv("AI_SERVICE_KEY")

# Key-based credential and index-management client shared by the cells below.
azure_credential = AzureKeyCredential(AZURE_SEARCH_KEY)
index_client = SearchIndexClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=azure_credential)

## Create Index
Configure fields (columns), vector search, and semantic search.

In [28]:
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIParameters,
    SemanticConfiguration,
    SemanticSearch,
    SemanticPrioritizedFields,
    SemanticField,
    SearchIndex,
    VectorSearchAlgorithmKind,
    HnswParameters
)

index_name = "index-name-pull-method"

# Index schema.
# - chunk_id is the document key and keeps the "keyword" analyzer so the whole id is one token.
# - content / keyphrase / title use the same analyzer ("standard.lucene") for indexing and
#   querying. Indexing them with "keyword" while querying with the standard analyzer would
#   store each field value as a single case-sensitive token that tokenized, lower-cased query
#   terms cannot match, so full-text (and the keyword half of hybrid) search would find nothing.
# NOTE: analyzers of an existing field cannot be changed in place; if this index was already
# created with different analyzers, delete it before re-running this cell.
fields = [
    SearchField(name="chunk_id", type=SearchFieldDataType.String, retrievable=True, searchable=True, sortable=True, filterable=True, facetable=True, key=True, analyzer_name='keyword'),
    SearchField(name="parent_id", type=SearchFieldDataType.String, retrievable=True, searchable=False, sortable=True, filterable=True, facetable=True),
    SearchField(name="content", type=SearchFieldDataType.String, retrievable=True, searchable=True, analyzer_name="standard.lucene"),
    SearchField(name="keyphrase", type=SearchFieldDataType.Collection(SearchFieldDataType.String), retrievable=True, searchable=True, analyzer_name="standard.lucene"),
    SearchField(name="title", type=SearchFieldDataType.String, retrievable=True, filterable=True, searchable=True, analyzer_name="standard.lucene"),
    SearchField(name="url", type=SearchFieldDataType.String, retrievable=True, searchable=False),
    # Embedding of the chunk; the dimension must match the embedding model's output size.
    SearchField(name="text_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=azure_openai_model_dimensions, vector_search_profile_name="myHnswProfile", searchable=True),
]

# Vector search: one HNSW algorithm (cosine metric), one profile that binds the algorithm
# to the Azure OpenAI vectorizer used to embed query text.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(name="myHnsw",
                                   kind=VectorSearchAlgorithmKind.HNSW,
                                   parameters=HnswParameters(metric="cosine")),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer="myOpenAI",
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            name="myOpenAI",
            kind="azureOpenAI",
            azure_open_ai_parameters=AzureOpenAIParameters(
                resource_uri=azure_openai_endpoint,
                # The same setting is used for both values, so the deployment is
                # assumed to be named after the model.
                model_name=azure_openai_model_name,
                deployment_id=azure_openai_model_name,
                api_key=azure_openai_key,
            ),
        ),
    ],
)

# Semantic configuration: "content" is the only prioritized content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")]
    ),
)

# Create the semantic search with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])

# Create (or update) the search index on the service
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

index-name-pull-method created


## Create Skillset
A skillset is an array of one or more skills that perform an enrichment, such as translating text or optical character recognition (OCR) on an image file. Skills can be the built-in skills from Microsoft, or custom skills for processing logic that you host externally. A skillset produces enriched documents that are either consumed during indexing or projected to a knowledge store.

In [29]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    OcrSkill,
    MergeSkill,
    KeyPhraseExtractionSkill
)

# Build the skillset: OCR -> merge -> split into chunks -> key phrases + embeddings,
# then project every chunk into the search index as its own document.
skillset_name = f"{index_name}-skillset"

# Enrichment-tree paths shared by several skills below.
image_context = "/document/normalized_images/*"
chunk_context = "/document/pages/*"

# 1) OCR: runs once per image under /document/normalized_images.
ocr_skill = OcrSkill(
    name="OCRSkill",
    description="Extract Text from Images",
    context=image_context,
    inputs=[InputFieldMappingEntry(name="image", source=image_context)],
    outputs=[OutputFieldMappingEntry(name="text", target_name="text")],
    default_language_code="en",
    should_detect_orientation=True,
)

# 2) Merge: combine the document's own text with the OCR text, using each image's offset.
merge_inputs = [
    InputFieldMappingEntry(name="text", source="/document/content"),
    InputFieldMappingEntry(name="itemsToInsert", source="/document/normalized_images/*/text"),
    InputFieldMappingEntry(name="offsets", source="/document/normalized_images/*/contentOffset"),
]
merge_skill = MergeSkill(
    name="MergeSkill",
    description="merge OCR and text",
    context="/document",
    inputs=merge_inputs,
    outputs=[OutputFieldMappingEntry(name="mergedText", target_name="mergedText")],
)

# 3) Split: cut the merged text into overlapping pages, written to /document/pages.
# NOTE(review): this skill uses language "id" while the OCR and key-phrase skills use "en" —
# confirm which language the source documents are actually in.
# Original author's note on the input: "missing or empty" — presumably an indexer warning
# seen when mergedText was absent; TODO confirm.
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    context="/document",
    text_split_mode="pages",
    maximum_page_length=2000,
    page_overlap_length=500,
    default_language_code="id",
    inputs=[InputFieldMappingEntry(name="text", source="/document/mergedText")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# 4) Key phrases: at most two phrases per chunk, stored next to the chunk as "keyphrases".
key_phrase_skill = KeyPhraseExtractionSkill(
    name="KeyPhraseExtractionSkill",
    description="extracting important keywords from chunks",
    context=chunk_context,
    inputs=[InputFieldMappingEntry(name="text", source=chunk_context)],
    outputs=[OutputFieldMappingEntry(name="keyPhrases", target_name="keyphrases")],
    default_language_code="en",
    max_key_phrase_count=2,
)

# 5) Embeddings: one vector per chunk from the Azure OpenAI deployment.
# Original author's note on the input: "cannot iterate non array" — presumably a warning
# seen when /document/pages was not produced; TODO confirm.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context=chunk_context,
    inputs=[InputFieldMappingEntry(name="text", source=chunk_context)],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
    resource_uri=azure_openai_endpoint,
    api_key=azure_openai_key,
    deployment_id=azure_openai_model_name,
    model_name=azure_openai_model_name,
    dimensions=azure_openai_model_dimensions,
)

# Index projections: every chunk becomes one index document, linked back to its source
# document through "parent_id"; parent documents themselves are not indexed.
chunk_field_mappings = [
    InputFieldMappingEntry(name="content", source=chunk_context),
    InputFieldMappingEntry(name="text_vector", source="/document/pages/*/vector"),
    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
    InputFieldMappingEntry(name="url", source="/document/metadata_storage_path"),
    InputFieldMappingEntry(name="keyphrase", source="/document/pages/*/keyphrases"),
]
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context=chunk_context,
            mappings=chunk_field_mappings,
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

# Key attached to the skillset as its cognitive services account.
cognitive_services_account = CognitiveServicesAccountKey(key=azure_ai_service_key)
skills = [ocr_skill, merge_skill, split_skill, key_phrase_skill, embedding_skill]

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    cognitive_services_account=cognitive_services_account,
    index_projections=index_projections,
)

# Push the skillset definition to the search service.
client = SearchIndexerClient(AZURE_SEARCH_ENDPOINT, azure_credential)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")


index-name-pull-method-skillset created


## Create DataSource
Connect the data source (an Azure Storage blob container) to Azure AI Search.

In [30]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)

# Register the blob container as a data source connection on the search service.
indexer_client = SearchIndexerClient(
    endpoint=AZURE_SEARCH_ENDPOINT,
    credential=azure_credential,
)

container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    container=container,
    connection_string=azure_storage_connection_string,
)

# The returned object is kept because the indexer cell reads data_source.name.
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'index-name-pull-method-blob' created or updated


## Create Indexer
Create the indexer, which runs the indexing process (data source → skillset → index), keeps logs of each run, and can be used to schedule indexing.

In [31]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping,
    IndexingParameters,
    IndexingParametersConfiguration,
    BlobIndexerImageAction
)

# Define the indexer that ties data source, skillset and target index together.
indexer_name = f"{index_name}-indexer"

# Images are extracted per page; the OCR skill reads them from /document/normalized_images/*.
# NOTE(review): query_timeout is explicitly cleared — presumably because the SDK's default
# value is not accepted for blob data sources; confirm before removing.
indexer_parameters = IndexingParameters(
    configuration=IndexingParametersConfiguration(
        image_action=BlobIndexerImageAction.GENERATE_NORMALIZED_IMAGE_PER_PAGE,
        query_timeout=None,
    )
)

# Map the blob's metadata_storage_name to the index's title field so the file name
# (e.g. the PDF title) can be shown in search results.
title_mapping = FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    data_source_name=data_source.name,
    skillset_name=skillset_name,
    target_index_name=index_name,
    field_mappings=[title_mapping],
    parameters=indexer_parameters,
)

# Create or update the indexer on the service, then trigger a run.
indexer_client = SearchIndexerClient(endpoint=AZURE_SEARCH_ENDPOINT, credential=azure_credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)

indexer_client.run_indexer(indexer_name)
print(f" {indexer_name} is created and running. If queries return no results, please wait a bit and try again.")


 index-name-pull-method-indexer is created and running. If queries return no results, please wait a bit and try again.
