In [51]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
import json
import requests

In [52]:
load_dotenv(override=True) # take environment variables from .env.

def check_empty(variable_name, value):  
    if not value:  
        print(f"{variable_name} is empty.")  
  
AZURE_SEARCH_SERVICE_NAME = os.getenv("AZURE_SEARCH_SERVICE_NAME")


AZURE_SEARCH_INDEX = os.getenv("AZURE_INDEX_BASE")


AZURE_SEARCH_API_KEY = os.getenv("AZURE_SEARCH_API_KEY")
BLOB_CONNECTION_STRING = os.getenv("BLOB_CONNECTION_STRING")
BLOB_CONTAINER_NAME = os.getenv("BLOB_CONTAINER_NAME") 

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_MODEL_NAME = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME")
AZURE_OPENAI_EMBEDDING_DIMENSIONS = os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS")

check_empty("AZURE_SEARCH_SERVICE_NAME", AZURE_SEARCH_SERVICE_NAME)
check_empty("AZURE_SEARCH_INDEX", AZURE_SEARCH_INDEX)
check_empty("AZURE_SEARCH_API_KEY", AZURE_SEARCH_API_KEY)
check_empty("BLOB_CONNECTION_STRING", BLOB_CONNECTION_STRING)
check_empty("BLOB_CONTAINER_NAME", BLOB_CONTAINER_NAME)
check_empty("AZURE_OPENAI_ENDPOINT", AZURE_OPENAI_ENDPOINT)
check_empty("AZURE_OPENAI_EMBEDDING_MODEL_NAME", AZURE_OPENAI_EMBEDDING_MODEL_NAME)
check_empty("AZURE_OPENAI_EMBEDDING_DIMENSIONS", AZURE_OPENAI_EMBEDDING_DIMENSIONS)



print(AZURE_SEARCH_INDEX)

index04


## Create a data source

In [None]:
def create_datasource(service_name, index_name, search_api_key, storage_connectionstring, storage_container):
    endpoint = "https://{}.search.windows.net/".format(service_name)
    url = '{0}/datasources/{1}-datasource?api-version=2025-05-01-preview'.format(endpoint, index_name)
    print(url)
    payload = json.dumps({
        "name": index_name + "-datasource",
        "description": None,
        "type": "azureblob",
        "subtype": None,
        "indexerPermissionOptions": [],
        "credentials": {
            "connectionString": storage_connectionstring
        },
        "container": {
            "name": storage_container,
            "query": None
        },
        "dataChangeDetectionPolicy": None,
        "dataDeletionDetectionPolicy": {
            "@odata.type": "#Microsoft.Azure.Search.NativeBlobSoftDeleteDeletionDetectionPolicy"
        },
        "encryptionKey": None,
        "identity": None
        })
    headers = {
    'api-key': search_api_key,
    'Content-Type': 'application/json'
            }


    response = requests.request("PUT", url, headers=headers, data=payload)

    if response.status_code == 201 or response.status_code == 204:
        return response, True
    else:
        return response, False

## Create Index

In [None]:
def create_index_semantic(service_name, index,azure_search_api_key, openai_api_base, text_embbeding_model, text_embbeding_model_dimensions):

    endpoint = "https://{}.search.windows.net/".format(service_name)
    url = '{0}/indexes/{1}/?api-version=2025-05-01-preview'.format(endpoint, index)

    resourceUri = openai_api_base
    deploymentId =  text_embbeding_model

    print(url)

    payload = json.dumps({

  "name": index,
  "fields": [
    {
      "name": "chunk_id",
      "type": "Edm.String",
      "searchable": True,
      "filterable": False,
      "retrievable": True,
      "stored": True,
      "sortable": True,
      "facetable": False,
      "key": True,
      "analyzer": "keyword",
      "synonymMaps": []
    },
    {
      "name": "parent_id",
      "type": "Edm.String",
      "searchable": False,
      "filterable": True,
      "retrievable": True,
      "stored": True,
      "sortable": False,
      "facetable": False,
      "key": False,
      "synonymMaps": []
    },
    {
      "name": "chunk",
      "type": "Edm.String",
      "searchable": True,
      "filterable": False,
      "retrievable": True,
      "stored": True,
      "sortable": False,
      "facetable": False,
      "key": False,
      "synonymMaps": []
    },
    {
      "name": "title",
      "type": "Edm.String",
      "searchable": True,
      "filterable": False,
      "retrievable": True,
      "stored": True,
      "sortable": False,
      "facetable": False,
      "key": False,
      "synonymMaps": []
    },
    {
      "name": "text_vector",
      "type": "Collection(Edm.Single)",
      "searchable": True,
      "filterable": False,
      "retrievable": True,
      "stored": True,
      "sortable": False,
      "facetable": False,
      "key": False,
      "dimensions": text_embbeding_model_dimensions,
      "vectorSearchProfile": "{index}-azureOpenAi-text-profile".format(index=index),
      "synonymMaps": []
    }
  ],
  "scoringProfiles": [],
  "suggesters": [],
  "analyzers": [],
  "normalizers": [],
  "tokenizers": [],
  "tokenFilters": [],
  "charFilters": [],
  "similarity": {
    "@odata.type": "#Microsoft.Azure.Search.BM25Similarity"
  },
  "semantic": {
    "defaultConfiguration": "{index}-semantic-configuration".format(index=index),
    "configurations": [
      {
        "name": "{index}-semantic-configuration".format(index=index),
        "flightingOptIn": False,
        "rankingOrder": "BoostedRerankerScore",
        "prioritizedFields": {
          "titleField": {
            "fieldName": "title"
          },
          "prioritizedContentFields": [
            {
              "fieldName": "chunk"
            }
          ],
          "prioritizedKeywordsFields": []
        }
      }
    ]
  },
  "vectorSearch": {
    "algorithms": [
      {
        "name": "{index}-algorithm".format(index=index),
        "kind": "hnsw",
        "hnswParameters": {
          "metric": "cosine",
          "m": 4,
          "efConstruction": 400,
          "efSearch": 500
        }
      }
    ],
    "profiles": [
      {
        "name": "{index}-azureOpenAi-text-profile".format(index=index),
        "algorithm": "{index}-algorithm".format(index=index),
        "vectorizer": "{index}-azureOpenAi-text-vectorizer".format(index=index),
      }
    ],
    "vectorizers": [
      {
        "name": "{index}-azureOpenAi-text-vectorizer".format(index=index),
        "kind": "azureOpenAI",
        "azureOpenAIParameters": {
        "resourceUri": openai_api_base,
        "deploymentId": deploymentId,
        "modelName": deploymentId
        }
      }
    ],
    "compressions": []
  }
})
    headers = {
    'api-key': azure_search_api_key,
    'Content-Type': 'application/json'
    }

    response = requests.request("PUT", url, headers=headers, data=payload)

    if response.status_code == 201 or response.status_code == 204:
        return response, True
    else:
        print('************************')
        print(response.status_code)
        print(response.text)
        return response, False

## Create a Skillset

In [None]:
def create_skillset(service_name, search_api_key, index, openai_api_base, text_embbeding_model, dimensions):

    endpoint = "https://{}.search.windows.net/".format(service_name)
    resourceUri = openai_api_base
    deploymentId =  text_embbeding_model
    modelName =  text_embbeding_model


    url = '{0}/skillsets/{1}-skillset?api-version=2025-05-01-preview'.format(endpoint, index)
    print(url)
    payload = json.dumps(
        {

  "name": "{index}-skillset".format(index=index),,
  "description": "Skillset to chunk documents and generate embeddings",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#1",
      "description": "Split skill to chunk documents",
      "context": "/document",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 2000,
      "pageOverlapLength": 500,
      "maximumPagesToTake": 0,
      "unit": "characters",
      "inputs": [
        {
          "name": "text",
          "source": "/document/content",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#2",
      "context": "/document/pages/*",
      "resourceUri": resourceUri,
      "deploymentId": deploymentId,
      "dimensions": dimensions,
      "modelName": modelName,
      "inputs": [
        {
          "name": "text",
          "source": "/document/pages/*",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "text_vector"
        }
      ]
    }
  ],
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": index,
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/pages/*",
        "mappings": [
          {
            "name": "text_vector",
            "source": "/document/pages/*/text_vector",
            "inputs": []
          },
          {
            "name": "chunk",
            "source": "/document/pages/*",
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/title",
            "inputs": []
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  }
}
    )
    
    headers = {
        'Content-Type': 'application/json',
        'api-key': '{0}'.format(search_api_key)
    }


    
    response = requests.request("PUT", url, headers=headers, data=payload)


    if response.status_code == 201 or response.status_code == 204:
        return response, True
    else:

        return response, False

## Create the Indexer

In [56]:
def create_indexer(service_name, index, search_key):
        endpoint = "https://{}.search.windows.net/".format(service_name)
        url = '{0}/indexers/{1}-indexer/?api-version=2024-11-01-preview'.format(endpoint, index)
        print(url)

        payload = json.dumps({
            "name": "{0}-indexer".format(index),
            "description": "",
            "dataSourceName": "{0}-datasource".format(index),
            "skillsetName": "{0}-skillset".format(index),
            "targetIndexName": "{0}".format(index),
            "disabled": False,
            "schedule": None,
            "parameters": {
                "batchSize": None,
                "maxFailedItems": 0,
                "maxFailedItemsPerBatch": 0,
                "base64EncodeKeys": None,
                "configuration": {
                "parsingMode": "default",
                "excludedFileNameExtensions": "",
                "indexedFileNameExtensions": "",
                "failOnUnsupportedContentType": False,
                "failOnUnprocessableDocument": False,
                "indexStorageMetadataOnlyForOversizedDocuments": False,
                "firstLineContainsHeaders": True,
                "dataToExtract": "contentAndMetadata",
                "imageAction": "generateNormalizedImagePerPage",
                "allowSkillsetToReadFileData": False,
                "pdfTextRotationAlgorithm": "none",
                "executionEnvironment": "standard"
                }
            },
            "fieldMappings": [],
            "outputFieldMappings": [],
            "cache": None,
            "encryptionKey": None
            })
        headers = {
        'Content-Type': 'application/json',
        'api-key': '{0}'.format(search_key)
        }


        response = requests.request("PUT", url, headers=headers, data=payload)


        if response.status_code == 201 or response.status_code == 204:
            print('good')
            return response, True
        else:
            print(response.status_code)
            print('************************')
            print(response.status_code)
            print(response.text)
            return response, False

In [57]:

create_datasource(AZURE_SEARCH_SERVICE_NAME, AZURE_SEARCH_INDEX, AZURE_SEARCH_API_KEY, BLOB_CONNECTION_STRING, BLOB_CONTAINER_NAME)

https://mmxdisaisearch.search.windows.net//datasources/index04-datasource?api-version=2025-05-01-preview


(<Response [201]>, True)

In [58]:

create_index_semantic(AZURE_SEARCH_SERVICE_NAME, AZURE_SEARCH_INDEX,AZURE_SEARCH_API_KEY, AZURE_OPENAI_ENDPOINT,  AZURE_OPENAI_EMBEDDING_MODEL_NAME, AZURE_OPENAI_EMBEDDING_DIMENSIONS)

https://mmxdisaisearch.search.windows.net//indexes/index04/?api-version=2025-05-01-preview


(<Response [201]>, True)

In [59]:
#service_name, search_api_key, index, openai_api_base, text_embbeding_model, azure_ai_services_endpoint
create_skillset(AZURE_SEARCH_SERVICE_NAME, AZURE_SEARCH_API_KEY, AZURE_SEARCH_INDEX,  AZURE_OPENAI_ENDPOINT, AZURE_OPENAI_EMBEDDING_MODEL_NAME, AZURE_OPENAI_EMBEDDING_DIMENSIONS)

https://mmxdisaisearch.search.windows.net//skillsets/index04-skillset?api-version=2025-05-01-preview


(<Response [201]>, True)

In [60]:
#create_indexer(service_name, index, search_key)
create_indexer(AZURE_SEARCH_SERVICE_NAME, AZURE_SEARCH_INDEX, AZURE_SEARCH_API_KEY)

https://mmxdisaisearch.search.windows.net//indexers/index04-indexer/?api-version=2024-11-01-preview
good


(<Response [201]>, True)