## Mixed Content with Multimodal embeddings and Document Extraction Skill with Chunking


- For the knowledge store, Azure AI Search needs the **Storage Blob Data Contributor** role.

Azure AI Search can extract and index both text and images from PDF documents stored in Azure Blob Storage. This tutorial shows you how to build a multimodal indexing pipeline by embedding both text and images into a unified semantic search index.

- Azure AI Multimodal Embeddings: https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/how-to/image-retrieval?tabs=csharp

In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
import json
import requests

In [None]:
# Load variables from a local .env file into the process environment so the
# os.getenv() calls below can read them.
# NOTE(review): override=True is expected to let .env values replace variables
# that are already set in the environment -- confirm against python-dotenv docs.
load_dotenv(override=True)

def check_empty(variable_name, value):
    """Print "<variable_name> is empty." when *value* is falsy.

    Nothing is printed for a truthy value. The function always returns None.
    """
    if value:
        return
    print(f"{variable_name} is empty.")
  
# Service configuration read from the environment. Every value is kept as a
# module-level constant because other cells in this notebook refer to them by name.

# Azure AI Search
AZURE_SEARCH_SERVICE_NAME = os.getenv("AZURE_SEARCH_SERVICE_NAME")
AZURE_SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
AZURE_SEARCH_INDEX_DOCINTELL_IMAGE_VERBALIZATIONS = os.getenv("AZURE_SEARCH_INDEX_DOCINTELL_IMAGE_VERBALIZATIONS")
AZURE_SEARCH_API_KEY = os.getenv("AZURE_SEARCH_API_KEY")

# Azure Blob Storage (source documents)
BLOB_CONNECTION_STRING = os.getenv("BLOB_CONNECTION_STRING")
BLOB_CONTAINER_NAME = os.getenv("BLOB_CONTAINER_NAME")

# Azure OpenAI
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_MODEL_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
# os.getenv returns a string (or None); this value is not converted to int here.
AZURE_OPENAI_EMBEDDING_DIMENSIONS = os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")

# Azure AI services (read once; the endpoint was previously fetched twice)
AZURE_AI_SERVICES_ENDPOINT = os.getenv("AZURE_AI_SERVICES_ENDPOINT")
AZURE_AI_SERVICES_API_KEY = os.getenv("AZURE_AI_SERVICES_API_KEY")

# Report each setting that is missing or blank, once per setting.
# AZURE_SEARCH_SERVICE_ENDPOINT is included because it is the base URL of the
# REST calls made later in the notebook.
for _setting_name, _setting_value in (
    ("AZURE_SEARCH_SERVICE_NAME", AZURE_SEARCH_SERVICE_NAME),
    ("AZURE_SEARCH_SERVICE_ENDPOINT", AZURE_SEARCH_SERVICE_ENDPOINT),
    ("AZURE_SEARCH_API_KEY", AZURE_SEARCH_API_KEY),
    ("BLOB_CONNECTION_STRING", BLOB_CONNECTION_STRING),
    ("BLOB_CONTAINER_NAME", BLOB_CONTAINER_NAME),
    ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
    ("AZURE_OPENAI_EMBEDDING_MODEL_NAME", AZURE_OPENAI_EMBEDDING_MODEL_NAME),
    ("AZURE_OPENAI_EMBEDDING_DIMENSIONS", AZURE_OPENAI_EMBEDDING_DIMENSIONS),
    ("AZURE_OPENAI_KEY", AZURE_OPENAI_KEY),
    ("AZURE_SEARCH_INDEX_DOCINTELL_IMAGE_VERBALIZATIONS", AZURE_SEARCH_INDEX_DOCINTELL_IMAGE_VERBALIZATIONS),
    ("AZURE_AI_SERVICES_ENDPOINT", AZURE_AI_SERVICES_ENDPOINT),
    ("AZURE_AI_SERVICES_API_KEY", AZURE_AI_SERVICES_API_KEY),
):
    check_empty(_setting_name, _setting_value)



In [None]:
import requests

# Create (or update) the Azure AI Search data source that points at the blob
# container holding the source documents.

azure_search_api_key = AZURE_SEARCH_API_KEY
base_url = AZURE_SEARCH_SERVICE_ENDPOINT
index_name = AZURE_SEARCH_INDEX_DOCINTELL_IMAGE_VERBALIZATIONS

# The data source is named "<index_name>-datasource".
url = f"{base_url}/datasources/{index_name}-datasource?api-version=2025-05-01-preview"
print(url)

datasource_definition = {
    "description": None,
    "type": "azureblob",
    "subtype": None,
    "credentials": {
        "connectionString": BLOB_CONNECTION_STRING
    },
    "container": {
        "name": BLOB_CONTAINER_NAME,
        "query": None
    },
    "dataChangeDetectionPolicy": None,
    "dataDeletionDetectionPolicy": None,
    "encryptionKey": None,
    "identity": None
}
payload = json.dumps(datasource_definition)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
}

# requests applies no timeout by default, so an unresponsive service would
# block this cell forever; bound the wait explicitly.
response = requests.put(url, headers=headers, data=payload, timeout=60)

print("Status Code:", response.status_code)
print(response.text)

In [None]:
import requests

# Create (or update) the search index that stores both text chunks and images.
# These four names are module-level on purpose: the skillset cell reuses
# modelVersion, cognitiveServicesUrl and cognitiveServicesKey.
vectorizer = "my-vectorizer"
cognitiveServicesKey = AZURE_AI_SERVICES_API_KEY
cognitiveServicesUrl = AZURE_AI_SERVICES_ENDPOINT
modelVersion = "2023-04-15"

url = f"{base_url}/indexes/{index_name}/?api-version=2025-05-01-preview"

index_definition = {
    "fields": [
        {
            "name": "content_id",
            "type": "Edm.String",
            "retrievable": True,
            "key": True,
            "analyzer": "keyword"
        },
        {
            "name": "text_document_id",
            "type": "Edm.String",
            "searchable": False,
            "filterable": True,
            "retrievable": True,
            "stored": True,
            "sortable": False,
            "facetable": False
        },
        {
            "name": "document_title",
            "type": "Edm.String",
            "searchable": True
        },
        {
            "name": "image_document_id",
            "type": "Edm.String",
            "filterable": True,
            "retrievable": True
        },
        {
            "name": "content_text",
            "type": "Edm.String",
            "searchable": True,
            "retrievable": True
        },
        {
            # Single vector field shared by text and image documents.
            "name": "content_embedding",
            "type": "Collection(Edm.Single)",
            "dimensions": 1024,
            "searchable": True,
            "retrievable": True,
            "vectorSearchProfile": "hnsw"
        },
        {
            "name": "content_path",
            "type": "Edm.String",
            "searchable": False,
            "retrievable": True
        },
        {
            "name": "offset",
            "type": "Edm.String",
            "searchable": False,
            "retrievable": True
        },
        {
            "name": "location_metadata",
            "type": "Edm.ComplexType",
            "fields": [
                {
                    "name": "page_number",
                    "type": "Edm.Int32",
                    "searchable": False,
                    "retrievable": True
                },
                {
                    "name": "bounding_polygons",
                    "type": "Edm.String",
                    "searchable": False,
                    "retrievable": True,
                    "filterable": False,
                    "sortable": False,
                    "facetable": False
                }
            ]
        }
    ],
    "vectorSearch": {
        "profiles": [
            {
                "name": "hnsw",
                "algorithm": "defaulthnsw",
                "vectorizer": vectorizer
            }
        ],
        "algorithms": [
            {
                "name": "defaulthnsw",
                "kind": "hnsw",
                "hnswParameters": {
                    "m": 4,
                    "efConstruction": 400,
                    "metric": "cosine"
                }
            }
        ],
        "vectorizers": [
            {
                # Query-time vectorizer referenced by the "hnsw" profile above.
                "name": vectorizer,
                "kind": "aiServicesVision",
                "aiServicesVisionParameters": {
                    "resourceUri": cognitiveServicesUrl,
                    "apiKey": cognitiveServicesKey,
                    "modelVersion": modelVersion
                }
            }
        ]
    },
    "semantic": {
        "defaultConfiguration": "semanticconfig",
        "configurations": [
            {
                "name": "semanticconfig",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "document_title"
                    },
                    # The chunk text is listed as a content field so semantic
                    # ranking, captions and answers (requested by the query
                    # cells of this notebook) have body text to work with.
                    "prioritizedContentFields": [
                        {
                            "fieldName": "content_text"
                        }
                    ],
                    "prioritizedKeywordsFields": []
                }
            }
        ]
    }
}
payload = json.dumps(index_definition)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
}

# requests applies no timeout by default; bound the wait so the cell cannot hang.
response = requests.put(url, headers=headers, data=payload, timeout=60)

print("Status Code:", response.status_code)
print(response.text)


In [None]:
import requests

# Create (or update) the skillset: extract text and images, chunk the text,
# embed both with the vision vectorize skill, reshape image metadata, and
# project the results into the index and into a knowledge store.

imageProjectionContainer = "imageprojection2"
storageConnectionString = BLOB_CONNECTION_STRING

url = f"{base_url}/skillsets/{index_name}-skillset?api-version=2025-05-01-preview"

skillset_definition = {
    "description": "A test skillset",
    "skills": [
        {
            "@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
            "name": "document-extraction-skill",
            "description": "Document extraction skill to exract text and images from documents",
            "parsingMode": "default",
            "dataToExtract": "contentAndMetadata",
            "configuration": {
                "imageAction": "generateNormalizedImages",
                "normalizedImageMaxWidth": 2000,
                "normalizedImageMaxHeight": 2000
            },
            "context": "/document",
            "inputs": [
                {
                    "name": "file_data",
                    "source": "/document/file_data"
                }
            ],
            "outputs": [
                {
                    "name": "content",
                    "targetName": "extracted_content"
                },
                {
                    "name": "normalized_images",
                    "targetName": "normalized_images"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
            "name": "split-skill",
            "description": "Split skill to chunk documents",
            "context": "/document",
            "defaultLanguageCode": "en",
            "textSplitMode": "pages",
            "maximumPageLength": 2000,
            "pageOverlapLength": 200,
            "unit": "characters",
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/extracted_content",
                    "inputs": []
                }
            ],
            "outputs": [
                {
                    "name": "textItems",
                    "targetName": "pages"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Vision.VectorizeSkill",
            "name": "text-embedding-skill",
            "description": "Vision Vectorization skill for text",
            "context": "/document/pages/*",
            "modelVersion": modelVersion,
            "inputs": [
                {
                    "name": "text",
                    "source": "/document/pages/*"
                }
            ],
            "outputs": [
                {
                    "name": "vector",
                    "targetName": "text_vector"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Vision.VectorizeSkill",
            "name": "image-embedding-skill",
            "description": "Vision Vectorization skill for images",
            "context": "/document/normalized_images/*",
            "modelVersion": modelVersion,
            "inputs": [
                {
                    "name": "image",
                    "source": "/document/normalized_images/*"
                }
            ],
            "outputs": [
                {
                    "name": "vector",
                    "targetName": "image_vector"
                }
            ]
        },
        {
            "@odata.type": "#Microsoft.Skills.Util.ShaperSkill",
            "name": "shaper-skill",
            "description": "Shaper skill to reshape the data to fit the index schema",
            "context": "/document/normalized_images/*",
            "inputs": [
                {
                    "name": "normalized_images",
                    "source": "/document/normalized_images/*",
                    "inputs": []
                },
                {
                    # Prefix the image path with the knowledge-store container
                    # name; this value is projected into "content_path" below.
                    "name": "imagePath",
                    "source": f"='{imageProjectionContainer}/'+$(/document/normalized_images/*/imagePath)",
                    "inputs": []
                },
                {
                    # A data URI must begin with the literal "data:" scheme,
                    # not with a storage container name.
                    "name": "dataUri",
                    "source": "='data:image/jpeg;base64,'+$(/document/normalized_images/*/data)",
                    "inputs": []
                },
                {
                    "name": "location_metadata",
                    "sourceContext": "/document/normalized_images/*",
                    "inputs": [
                        {
                            "name": "page_number",
                            "source": "/document/normalized_images/*/pageNumber"
                        },
                        {
                            "name": "bounding_polygons",
                            "source": "/document/normalized_images/*/boundingPolygon"
                        }
                    ]
                }
            ],
            "outputs": [
                {
                    "name": "output",
                    "targetName": "new_normalized_images"
                }
            ]
        }
    ],
    "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.AIServicesByKey",
        "subdomainUrl": cognitiveServicesUrl,
        "key": cognitiveServicesKey
    },
    "indexProjections": {
        "selectors": [
            {
                # One index document per text chunk.
                "targetIndexName": index_name,
                "parentKeyFieldName": "text_document_id",
                "sourceContext": "/document/pages/*",
                "mappings": [
                    {
                        "name": "content_embedding",
                        "source": "/document/pages/*/text_vector"
                    },
                    {
                        "name": "content_text",
                        "source": "/document/pages/*"
                    },
                    {
                        "name": "document_title",
                        "source": "/document/document_title"
                    }
                ]
            },
            {
                # One index document per extracted image.
                "targetIndexName": index_name,
                "parentKeyFieldName": "image_document_id",
                "sourceContext": "/document/normalized_images/*",
                "mappings": [
                    {
                        "name": "content_embedding",
                        "source": "/document/normalized_images/*/image_vector"
                    },
                    {
                        "name": "content_path",
                        "source": "/document/normalized_images/*/new_normalized_images/imagePath"
                    },
                    {
                        "name": "location_metadata",
                        "source": "/document/normalized_images/*/new_normalized_images/location_metadata"
                    },
                    {
                        "name": "document_title",
                        "source": "/document/document_title"
                    }
                ]
            }
        ],
        "parameters": {
            "projectionMode": "skipIndexingParentDocuments"
        }
    },
    "knowledgeStore": {
        "storageConnectionString": storageConnectionString,
        "projections": [
            {
                # File projection: normalized images go to the
                # imageProjectionContainer blob container.
                "files": [
                    {
                        "storageContainer": imageProjectionContainer,
                        "source": "/document/normalized_images/*"
                    }
                ]
            }
        ]
    }
}
payload = json.dumps(skillset_definition)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
}

# Send the definition once (the same PUT used to be issued twice in a row).
# requests applies no timeout by default; bound the wait so the cell cannot hang.
response = requests.put(url, headers=headers, data=payload, timeout=60)

print("Status Code:", response.status_code)
print(response.text)

In [None]:


# Create (or update) the indexer that ties the data source, skillset and index
# together. All three are referenced by the names derived from index_name.
url = f"{base_url}/indexers/{index_name}-indexer/?api-version=2025-05-01-preview"
print(url)

indexer_definition = {
    "dataSourceName": f"{index_name}-datasource",
    "skillsetName": f"{index_name}-skillset",
    "targetIndexName": f"{index_name}",
    "parameters": {
        "maxFailedItems": -1,
        "maxFailedItemsPerBatch": 0,
        "batchSize": 1,
        "configuration": {
            # The skillset's document extraction skill takes
            # /document/file_data as its input.
            "allowSkillsetToReadFileData": True
        }
    },
    "fieldMappings": [
        {
            # The blob file name becomes the document title.
            "sourceFieldName": "metadata_storage_name",
            "targetFieldName": "document_title"
        }
    ],
    "outputFieldMappings": []
}
payload = json.dumps(indexer_definition)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
}

# requests applies no timeout by default; bound the wait so the cell cannot hang.
response = requests.put(url, headers=headers, data=payload, timeout=60)

print("Status Code:", response.status_code)
# Print the body as the other REST cells do; on failure it carries the
# service's error details, which the status code alone does not.
print(response.text)

## Vector Search

- Pure vector search using a vectorizable text query: you only pass in the query text, and the index's vectorizer handles the query vectorization.

- **Filter to get images only**

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.identity import DefaultAzureCredential

# Pure vector search, filtered to documents that have an image_document_id.
# The query is passed as plain text inside a VectorizableTextQuery that
# targets the "content_embedding" field.
credential = DefaultAzureCredential()
query = "What does Bisphenol-A-based epoxy look like"
endpoint = AZURE_SEARCH_SERVICE_ENDPOINT
index_name = AZURE_SEARCH_INDEX_DOCINTELL_IMAGE_VERBALIZATIONS
print(f"Using index: {index_name}")

search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=5,
    fields="content_embedding",
    exhaustive=True,
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    filter="image_document_id ne null",
    top=5,
)

# Keep only the fields of interest from each hit; missing fields fall back to
# "" for the text and None for everything else.
formatted_results = [
    {
        "content_text": hit.get('content_text', ""),
        "image_document_id": hit.get('image_document_id', None),
        "content_id": hit.get('content_id', None),
        "content_path": hit.get('content_path', None),
    }
    for hit in results
]

print("Results:")
for entry in formatted_results:
    print('------')
    print(entry)

## Hybrid Search

In [None]:
# Hybrid search: the same text is used for the keyword query and, through a
# VectorizableTextQuery, for the vector query.
query = "Who wrote the intro into Chemistry and TEchnology of Eposxy-Resins?"

search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="content_embedding",
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["document_title", "content_text", "content_path"],
    top=1,
)

# Print the text of each returned hit (at most one, since top=1).
for hit in results:
    print(f"content: {hit['content_text']}")

In [None]:
## Hybrid Search + Semantic Reranking

In [None]:
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)

# Hybrid search (keyword + vector) with semantic reranking, extractive
# captions and extractive answers.
query = "Who wrote the intro into Chemistry and TEchnology of Eposxy-Resins?"

search_client = SearchClient(endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=1,
    fields="content_embedding",
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["document_title", "content_text", "content_path"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='semanticconfig',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=1,
)

# Semantic answers: show the highlighted form when there is one, otherwise
# the plain text.
semantic_answers = results.get_answers()
if not semantic_answers:
    print("No semantic answers found.\n")
else:
    print(f"Semantic Answers Count: {len(semantic_answers)}\n")
    for answer in semantic_answers:
        print(f"Semantic Answer: {answer.highlights or answer.text}")
        print(f"Semantic Answer Score: {answer.score}\n")

# Per-hit output: text, reranker score, and the first caption if any.
for hit in results:
    print(f"context: {hit['content_text']}")

    print(f"Reranker Score: {hit['@search.reranker_score']}")

    captions = hit["@search.captions"]
    if captions:
        first_caption = captions[0]
        print(f"Caption: {first_caption.highlights or first_caption.text}\n")

## Hybrid Search + Semantic Reranking

In [None]:
from azure.search.documents.models import (
    QueryType,
    QueryCaptionType,
    QueryAnswerType
)

# Hybrid search (keyword + vector) with semantic reranking. This variant also
# prints the raw semantic-answers object before formatting it.
query = "Who wrote the intro into Chemistry and TEchnology of Eposxy-Resins?"

search_client = SearchClient(endpoint, index_name, credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="content_embedding", exhaustive=True)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["document_title", "content_text", "content_path"],
    query_type=QueryType.SEMANTIC,
    semantic_configuration_name='semanticconfig',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=1
)

semantic_answers = results.get_answers()

print(semantic_answers)
print('-----------')
if semantic_answers:
    for answer in semantic_answers:
        # Prefer the highlighted form of the answer when it is available.
        if answer.highlights:
            print(f"Semantic Answer: {answer.highlights}")
        else:
            print(f"Semantic Answer: {answer.text}")
        print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    # Only fields listed in `select` are read here. The earlier lookup of
    # result['chunk'] was removed: "chunk" is not selected, so it raised
    # KeyError; the chunk text is already printed from content_text.
    print(f"context: {result['content_text']}")

    print(f"Reranker Score: {result['@search.reranker_score']}")

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")