## Tutorial: Index mixed content using image verbalizations and the Document Extraction skill

- Image Verbalization

- Azure OpenAI generates a text description of each image so that the image can be retrieved through search.

In [None]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
import json
import requests

In [None]:
# Load settings from a local .env file into the process environment.
# override=True is python-dotenv's switch for letting values from .env
# replace variables that are already set in the environment.
load_dotenv(override=True) # take environment variables from .env.

def check_empty(variable_name, value):
    """Print a warning naming the setting when its value is falsy.

    Args:
        variable_name: Label shown in the warning message.
        value: The value to test. ``None``, ``""`` and every other falsy
            value count as empty.

    Returns:
        None. Nothing is printed when ``value`` is truthy.
    """
    if value:
        return
    print(f"{variable_name} is empty.")
  
# --- Azure AI Search -------------------------------------------------------
AZURE_SEARCH_SERVICE_NAME = os.getenv("AZURE_SEARCH_SERVICE_NAME")
# Base URL of every REST request in this notebook and of the query client.
AZURE_SEARCH_SERVICE_ENDPOINT = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
# Name of the target index; the datasource, skillset and indexer names are
# derived from it in later cells.
AZURE_SEARCH_INDEX_DOC_EXTRACT_IMAGE_VERB = os.getenv("AZURE_SEARCH_INDEX_DOC_EXTRACT_IMAGE_VERB")
AZURE_SEARCH_API_KEY = os.getenv("AZURE_SEARCH_API_KEY")

# --- Blob storage holding the source documents -----------------------------
BLOB_CONNECTION_STRING = os.getenv("BLOB_CONNECTION_STRING")
BLOB_CONTAINER_NAME = os.getenv("BLOB_CONTAINER_NAME")

# --- Azure OpenAI (embeddings and chat completions) ------------------------
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_MODEL_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
# os.getenv returns a string (or None), so this is not an int.
AZURE_OPENAI_EMBEDDING_DIMENSIONS = os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS")
AZURE_OPENAI_CHATCOMPLETIONS_ENDPOINT = os.getenv("AZURE_OPENAI_CHATCOMPLETIONS_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")

# --- Azure AI services -----------------------------------------------------
# Read once; the original cell loaded and checked this endpoint twice.
AZURE_AI_SERVICES_ENDPOINT = os.getenv("AZURE_AI_SERVICES_ENDPOINT")
AZURE_AI_SERVICES_API_KEY = os.getenv("AZURE_AI_SERVICES_API_KEY")

# Report every setting that is missing or empty. Each setting is checked
# exactly once, and AZURE_SEARCH_SERVICE_ENDPOINT is included because a
# missing value would otherwise only surface as a malformed request URL.
for _setting_name, _setting_value in (
    ("AZURE_SEARCH_SERVICE_NAME", AZURE_SEARCH_SERVICE_NAME),
    ("AZURE_SEARCH_SERVICE_ENDPOINT", AZURE_SEARCH_SERVICE_ENDPOINT),
    ("AZURE_SEARCH_API_KEY", AZURE_SEARCH_API_KEY),
    ("BLOB_CONNECTION_STRING", BLOB_CONNECTION_STRING),
    ("BLOB_CONTAINER_NAME", BLOB_CONTAINER_NAME),
    ("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT),
    ("AZURE_OPENAI_EMBEDDING_MODEL_NAME", AZURE_OPENAI_EMBEDDING_MODEL_NAME),
    ("AZURE_OPENAI_EMBEDDING_DIMENSIONS", AZURE_OPENAI_EMBEDDING_DIMENSIONS),
    ("AZURE_AI_SERVICES_ENDPOINT", AZURE_AI_SERVICES_ENDPOINT),
    ("AZURE_OPENAI_KEY", AZURE_OPENAI_KEY),
    ("AZURE_OPENAI_CHATCOMPLETIONS_ENDPOINT", AZURE_OPENAI_CHATCOMPLETIONS_ENDPOINT),
    ("AZURE_SEARCH_INDEX_DOC_EXTRACT_IMAGE_VERB", AZURE_SEARCH_INDEX_DOC_EXTRACT_IMAGE_VERB),
    ("AZURE_AI_SERVICES_API_KEY", AZURE_AI_SERVICES_API_KEY),
):
    check_empty(_setting_name, _setting_value)



## Create the Datasource

In [None]:
# Bare expression: in a notebook this displays the configured container name
# as the cell output.
BLOB_CONTAINER_NAME

In [None]:
import requests

# Short aliases for the settings loaded from the environment; the later REST
# cells reuse base_url, index_name, index and azure_search_api_key.
base_url = AZURE_SEARCH_SERVICE_ENDPOINT
azure_search_api_key = AZURE_SEARCH_API_KEY
index_name = index = AZURE_SEARCH_INDEX_DOC_EXTRACT_IMAGE_VERB

# The datasource is named "<index_name>-datasource".
url = f"{base_url}/datasources/{index_name}-datasource?api-version=2025-05-01-preview"
print(url)

headers = {"api-key": azure_search_api_key, "Content-Type": "application/json"}

# Blob datasource definition; the optional properties are sent as explicit
# JSON nulls.
payload = json.dumps(
    {
        "description": None,
        "type": "azureblob",
        "subtype": None,
        "credentials": {"connectionString": BLOB_CONNECTION_STRING},
        "container": {"name": BLOB_CONTAINER_NAME, "query": None},
        "dataChangeDetectionPolicy": None,
        "dataDeletionDetectionPolicy": None,
        "encryptionKey": None,
        "identity": None,
    }
)

# PUT the definition, then show the HTTP status and the service's reply.
response = requests.put(url, headers=headers, data=payload)

print("Status Code:", response.status_code)
print(response.text)

## Create the index (the vectorizer here is Azure OpenAI, not the Azure AI multimodal model)

In [None]:
import requests

# Name of the query-time vectorizer; it is declared under "vectorizers" and
# referenced by the "hnsw" vector search profile below.
vectorizer = "my-vectorizer"
# Azure OpenAI resource and key; the skillset cell reuses both names.
openAIResourceUri = AZURE_OPENAI_ENDPOINT
openAIKey = AZURE_OPENAI_KEY

# No "/" before "?": this matches the URL shape used for the datasource and
# skillset requests in this notebook.
url = f"{base_url}/indexes/{index_name}?api-version=2025-05-01-preview"

payload = json.dumps(
    {
        "fields": [
            # Document key of each indexed chunk (text page or image).
            {
                "name": "content_id",
                "type": "Edm.String",
                "retrievable": True,
                "key": True,
                "analyzer": "keyword",
            },
            # Parent key filled in by the skillset's text projection.
            {
                "name": "text_document_id",
                "type": "Edm.String",
                "searchable": False,
                "filterable": True,
                "retrievable": True,
                "stored": True,
                "sortable": False,
                "facetable": False,
            },
            {
                "name": "document_title",
                "type": "Edm.String",
                "searchable": True,
            },
            # Parent key filled in by the skillset's image projection.
            {
                "name": "image_document_id",
                "type": "Edm.String",
                "filterable": True,
                "retrievable": True,
            },
            # Text of a page chunk, or the generated description of an image.
            {
                "name": "content_text",
                "type": "Edm.String",
                "searchable": True,
                "retrievable": True,
            },
            # Embedding of content_text. 3072 is the same dimension count the
            # skillset cell requests from its embedding skills.
            {
                "name": "content_embedding",
                "type": "Collection(Edm.Single)",
                "dimensions": 3072,
                "searchable": True,
                "retrievable": True,
                "vectorSearchProfile": "hnsw",
            },
            {
                "name": "content_path",
                "type": "Edm.String",
                "searchable": False,
                "retrievable": True,
            },
            {
                "name": "offset",
                "type": "Edm.String",
                "searchable": False,
                "retrievable": True,
            },
            {
                "name": "location_metadata",
                "type": "Edm.ComplexType",
                "fields": [
                    {
                        "name": "page_number",
                        "type": "Edm.Int32",
                        "searchable": False,
                        "retrievable": True,
                    },
                    {
                        "name": "bounding_polygons",
                        "type": "Edm.String",
                        "searchable": False,
                        "retrievable": True,
                        "filterable": False,
                        "sortable": False,
                        "facetable": False,
                    },
                ],
            },
        ],
        "vectorSearch": {
            "profiles": [
                {
                    "name": "hnsw",
                    "algorithm": "defaulthnsw",
                    "vectorizer": vectorizer,
                }
            ],
            "algorithms": [
                {
                    "name": "defaulthnsw",
                    "kind": "hnsw",
                    "hnswParameters": {
                        "m": 4,
                        "efConstruction": 400,
                        "metric": "cosine",
                    },
                }
            ],
            "vectorizers": [
                {
                    "name": vectorizer,
                    "kind": "azureOpenAI",
                    "azureOpenAIParameters": {
                        "resourceUri": openAIResourceUri,
                        "deploymentId": "text-embedding-3-large",
                        "apiKey": openAIKey,
                        "modelName": "text-embedding-3-large",
                    },
                }
            ],
        },
        "semantic": {
            "defaultConfiguration": "semanticconfig",
            "configurations": [
                {
                    "name": "semanticconfig",
                    "prioritizedFields": {
                        "titleField": {"fieldName": "document_title"},
                        # NOTE(review): no content or keyword fields are
                        # prioritized; confirm this is intended.
                        "prioritizedContentFields": [],
                        "prioritizedKeywordsFields": [],
                    },
                }
            ],
        },
    }
)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
}

# PUT the index definition, then show the HTTP status and the service's reply.
response = requests.put(url, headers=headers, data=payload)

print("Status Code:", response.status_code)
print(response.text)


## Skillset

TODO: Add the data URI back in if the image data doesn't land in blob storage.

In [None]:
import requests



# Build and PUT the skillset "<index_name>-skillset". It extracts text and
# images from each document, chunks and embeds the text, asks a chat
# completions endpoint to describe each image, embeds those descriptions,
# and projects both kinds of chunk into the index named by `index`.

# Container used twice below: as the prefix of each image path built by the
# shaper skill and as the target of the knowledge store file projection.
imageProjectionContainer = "imageprojection2"
storageConnectionString = BLOB_CONNECTION_STRING
chatcompletionsuri = AZURE_OPENAI_CHATCOMPLETIONS_ENDPOINT


url = '{0}/skillsets/{1}-skillset?api-version=2025-05-01-preview'.format(base_url, index_name)

payload = json.dumps(
{
  "description": "A test skillset",
  "skills": [
    # 1) Document extraction: reads /document/file_data and emits the text as
    #    "extracted_content" and the images as "normalized_images"
    #    (normalized to at most 2000 x 2000).
    {
      "@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
      "name": "document-extraction-skill",
      "description": "Document extraction skill to exract text and images from documents",
      "parsingMode": "default",
      "dataToExtract": "contentAndMetadata",
      "configuration": {
          "imageAction": "generateNormalizedImages",
          "normalizedImageMaxWidth": 2000,
          "normalizedImageMaxHeight": 2000
      },
      "context": "/document",
      "inputs": [
        {
          "name": "file_data",
          "source": "/document/file_data"
        }
      ],
      "outputs": [
        {
          "name": "content",
          "targetName": "extracted_content"
        },
        {
          "name": "normalized_images",
          "targetName": "normalized_images"
        }
      ]
    },
    # 2) Text split: chunks "extracted_content" into /document/pages, at most
    #    2000 characters per page with a 200-character overlap.
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "split-skill",
      "description": "Split skill to chunk documents",
      "context": "/document",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 2000,
      "pageOverlapLength": 200,
      "unit": "characters",
      "inputs": [
        {
          "name": "text",
          "source": "/document/extracted_content",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    }, 
    # 3) Text embedding: one 3072-dimension vector per page, stored as
    #    "text_vector" under each /document/pages/* node.
    {
    "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
    "name": "text-embedding-skill",
    "description": "Embedding skill for text",
    "context": "/document/pages/*",
    "inputs": [
        {
        "name": "text",
        "source": "/document/pages/*"
        }
    ],
    "outputs": [
        {
        "name": "embedding",
        "targetName": "text_vector"
        }
    ],
    "resourceUri": openAIResourceUri,
    "deploymentId": "text-embedding-3-large",
    "apiKey": openAIKey,
    "dimensions": 3072,
    "modelName": "text-embedding-3-large"
    },
    # 4) Image verbalization: sends each normalized image's data together with
    #    the system and user messages to the chat completions URI and stores
    #    the reply as "verbalizedImage".
    #    NOTE(review): sources beginning with "=" look like inline literal
    #    expressions rather than enrichment paths; confirm against the skill
    #    reference.
    {
    "@odata.type": "#Microsoft.Skills.Custom.ChatCompletionSkill",
    "name": "genAI-prompt-skill",
    "description": "GenAI Prompt skill for image verbalization",
    "uri": chatcompletionsuri,
    "timeout": "PT1M",
    "apiKey": openAIKey,
    "context": "/document/normalized_images/*",
    "inputs": [
        {
        "name": "systemMessage",
        "source": "='You are tasked with generating concise, accurate descriptions of images, figures, diagrams, or charts in documents. The goal is to capture the key information and meaning conveyed by the image without including extraneous details like style, colors, visual aesthetics, or size.\n\nInstructions:\nContent Focus: Describe the core content and relationships depicted in the image.\n\nFor diagrams, specify the main elements and how they are connected or interact.\nFor charts, highlight key data points, trends, comparisons, or conclusions.\nFor figures or technical illustrations, identify the components and their significance.\nClarity & Precision: Use concise language to ensure clarity and technical accuracy. Avoid subjective or interpretive statements.\n\nAvoid Visual Descriptors: Exclude details about:\n\nColors, shading, and visual styles.\nImage size, layout, or decorative elements.\nFonts, borders, and stylistic embellishments.\nContext: If relevant, relate the image to the broader content of the technical document or the topic it supports.\n\nExample Descriptions:\nDiagram: \"A flowchart showing the four stages of a machine learning pipeline: data collection, preprocessing, model training, and evaluation, with arrows indicating the sequential flow of tasks.\"\n\nChart: \"A bar chart comparing the performance of four algorithms on three datasets, showing that Algorithm A consistently outperforms the others on Dataset 1.\"\n\nFigure: \"A labeled diagram illustrating the components of a transformer model, including the encoder, decoder, self-attention mechanism, and feedforward layers.\"'"
        },
        {
        "name": "userMessage",
        "source": "='Please describe this image.'"
        },
        {
        "name": "image",
        "source": "/document/normalized_images/*/data"
        }
        ],
        "outputs": [
            {
            "name": "response",
            "targetName": "verbalizedImage"
            }
        ]
    },    
    # 5) Verbalized-image embedding: one 3072-dimension vector per image
    #    description, stored as "verbalizedImage_vector".
    {
    "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
    "name": "verblized-image-embedding-skill",
    "description": "Embedding skill for verbalized images",
    "context": "/document/normalized_images/*",
    "inputs": [
        {
        "name": "text",
        "source": "/document/normalized_images/*/verbalizedImage",
        "inputs": []
        }
    ],
    "outputs": [
        {
        "name": "embedding",
        "targetName": "verbalizedImage_vector"
        }
    ],
    "resourceUri": openAIResourceUri,
    "deploymentId": "text-embedding-3-large",
    "apiKey": openAIKey,
    "dimensions": 3072,
    "modelName": "text-embedding-3-large"
    },
    # 6) Shaper: builds "new_normalized_images" per image, carrying the image
    #    node, an imagePath prefixed with the projection container name, and a
    #    location_metadata object (page_number, bounding_polygons).
    {
      "@odata.type": "#Microsoft.Skills.Util.ShaperSkill",
      "name": "shaper-skill",
      "description": "Shaper skill to reshape the data to fit the index schema",
      "context": "/document/normalized_images/*",
      "inputs": [
        {
          "name": "normalized_images",
          "source": "/document/normalized_images/*",
          "inputs": []
        },
        {
          "name": "imagePath",
          # str.format substitutes the container name into the expression
          # before the payload is serialized.
          "source": "='{imageProjectionContainer}/'+$(/document/normalized_images/*/imagePath)".format(imageProjectionContainer=imageProjectionContainer),
          "inputs": []
        },
        {
          "name": "location_metadata",
          "sourceContext": "/document/normalized_images/*",
          "inputs": [
            {
              "name": "page_number",
              "source": "/document/normalized_images/*/pageNumber"
            },
            {
              "name": "bounding_polygons",
              "source": "/document/normalized_images/*/boundingPolygon"
            }              
          ]
        }        
      ],
      "outputs": [
        {
          "name": "output",
          "targetName": "new_normalized_images"
        }
      ]
    }      
  ], 
   # Index projections: two selectors write into the same index. The first
   # emits one entry per text page, the second one entry per image.
   "indexProjections": {
      "selectors": [
        # Text pages: parent key goes to "text_document_id".
        {
          "targetIndexName": index,
          "parentKeyFieldName": "text_document_id",
          "sourceContext": "/document/pages/*",
          "mappings": [    
            {
            "name": "content_embedding",
            "source": "/document/pages/*/text_vector"
            },                      
            {
              "name": "content_text",
              "source": "/document/pages/*"
            },             
            {
              "name": "document_title",
              "source": "/document/document_title"
            }   
          ]
        },        
        # Images: parent key goes to "image_document_id"; content_text holds
        # the generated description and content_path the shaped image path.
        {
          "targetIndexName": index,
          "parentKeyFieldName": "image_document_id",
          "sourceContext": "/document/normalized_images/*",
          "mappings": [    
            {
            "name": "content_text",
            "source": "/document/normalized_images/*/verbalizedImage"
            },  
            {
            "name": "content_embedding",
            "source": "/document/normalized_images/*/verbalizedImage_vector"
            },                                           
            {
              "name": "content_path",
              "source": "/document/normalized_images/*/new_normalized_images/imagePath"
            },                    
            {
              "name": "document_title",
              "source": "/document/document_title"
            },
            {
              "name": "location_metadata",
              "source": "/document/normalized_images/*/new_normalized_images/location_metadata"
            }            
          ]
        }
      ],
      "parameters": {
        "projectionMode": "skipIndexingParentDocuments"
      }
  },  
  # Knowledge store: file projection of every normalized image into the
  # imageProjectionContainer of the configured storage account.
  "knowledgeStore": {
    "storageConnectionString": storageConnectionString,
    "projections": [
      {
        "files": [
          {
            "storageContainer": imageProjectionContainer,
            "source": "/document/normalized_images/*"
          }
        ]
      }
    ]
  }
}

)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
    }

# PUT the skillset definition, then show the HTTP status and the service's reply.
response = requests.request("PUT", url, headers=headers, data=payload)

print("Status Code:", response.status_code)
print(response.text)


In [None]:


# Create (or update) the indexer "<index_name>-indexer", which ties together
# the datasource, skillset and index created by the earlier cells.
# No "/" before "?": this matches the URL shape used for the datasource and
# skillset requests in this notebook.
url = f"{base_url}/indexers/{index_name}-indexer?api-version=2025-05-01-preview"
print(url)

payload = json.dumps(
    {
        "dataSourceName": f"{index_name}-datasource",
        "skillsetName": f"{index_name}-skillset",
        "targetIndexName": index_name,
        "parameters": {
            # NOTE(review): -1 is understood to mean "no limit on failed
            # items"; confirm against the Create Indexer reference.
            "maxFailedItems": -1,
            "maxFailedItemsPerBatch": 0,
            "batchSize": 1,
            "configuration": {
                # The skillset's document extraction skill takes
                # /document/file_data as its input; presumably this flag is
                # what exposes that node. Verify in the skill documentation.
                "allowSkillsetToReadFileData": True
            },
        },
        # The blob name becomes the document title, which the skillset's
        # index projections read from /document/document_title.
        "fieldMappings": [
            {
                "sourceFieldName": "metadata_storage_name",
                "targetFieldName": "document_title",
            }
        ],
        "outputFieldMappings": [],
    }
)

headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
}

response = requests.put(url, headers=headers, data=payload)

print("Status Code:", response.status_code)
# Also print the body, as the other REST cells do, so that a rejected request
# shows the service's error message and not only the status code.
print(response.text)

In [None]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery
from azure.identity import DefaultAzureCredential

# NOTE(review): the REST cells authenticate with the search API key, whereas
# this cell signs in through DefaultAzureCredential. Confirm the identity in
# use has query access to the search service.
credential = DefaultAzureCredential()

# Pure vector search. Another sample question to try:
#   "what does the filament winding process scheme look like"
query = "What does compound (BDMA) structure look like"
endpoint = AZURE_SEARCH_SERVICE_ENDPOINT
index_name = AZURE_SEARCH_INDEX_DOC_EXTRACT_IMAGE_VERB
print(f"Using index: {index_name}")

search_client = SearchClient(endpoint, index_name, credential=credential)

# The query text is sent as-is and compared against "content_embedding".
# To pass a precomputed embedding instead, build a VectorizedQuery (called
# RawVectorQuery in older preview SDKs; verify for the installed version)
# with fields="content_embedding".
vector_query = VectorizableTextQuery(
    text=query,
    k_nearest_neighbors=5,
    fields="content_embedding",
    exhaustive=True,
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    top=5,
)

# (label, result field, message printed when the field is absent or null)
_result_fields = (
    ("image id", "image_document_id", "image_document_id is missing"),
    ("content id", "content_id", "content id is missing"),
    ("path", "content_path", "path is missing"),
)

for result in results:
    print('------')
    print(result['content_text'])
    for _label, _field, _missing_message in _result_fields:
        # .get() lets a field that is absent from the result reach the
        # "missing" message instead of raising KeyError.
        _value = result.get(_field)
        print(f"{_label} = {_value}" if _value is not None else _missing_message)