In [None]:
import json
import requests
from dotenv import load_dotenv
import os

In [None]:
from dotenv import dotenv_values, load_dotenv

# Parsed copy of the .env file as a plain dict. This does NOT modify os.environ;
# it is kept so the raw file contents can be inspected from the notebook.
config = dotenv_values(".env")

# dotenv_values() only *returns* the values. load_dotenv() is what exports them
# into os.environ so the os.getenv() lookups below can actually see them.
# Variables already present in the real environment are left untouched.
load_dotenv(".env")

# --- Form Recognizer ---
FORMS_RECOGNIZER_ENDPOINT = os.getenv('FORMS_RECOGNIZER_ENDPOINT')
FORMS_RECOGNIZER_KEY = os.getenv('FORMS_RECOGNIZER_KEY')

# --- Cognitive Search ---
COG_SEARCH_ENDPOINT = os.getenv('COG_SEARCH_ENDPOINT')
COG_SEARCH_INDEX_NAME = os.getenv('COG_SEARCH_INDEX_NAME')
COG_SEARCH_KEY = os.getenv('COG_SEARCH_KEY')

# --- OpenAI / embeddings ---
TEXT_EMBEDDING_MODEL = os.getenv('TEXT_EMBEDDING_MODEL')
OPENAI_API_BASE = os.getenv('OPENAI_API_BASE')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
OPENAI_API_VERSION = os.getenv('OPENAI_API_VERSION')
OPENAI_API_TYPE = os.getenv('OPENAI_API_TYPE')

# --- Blob storage ---
STORAGE_ACCOUNT = os.getenv('STORAGE_ACCOUNT')
STORAGE_ACCOUNT_CONTAINER = os.getenv('STORAGE_ACCOUNT_CONTAINER')
STORAGE_CONNECTION_STRING = os.getenv('STORAGE_CONNECTION_STRING')

# URL of the function app used by the custom skill (passed as the skill's "uri").
functionAppUrlAndKey = os.getenv('functionAppUrlAndKey')

# Sanity check of the loaded configuration. The endpoint is not a secret, but the
# connection string and the function URL embed keys, and cell output is saved with
# the notebook, so only report whether those two are present.
print(COG_SEARCH_ENDPOINT)
print('STORAGE_CONNECTION_STRING set:', bool(STORAGE_CONNECTION_STRING))
print('functionAppUrlAndKey set:', bool(functionAppUrlAndKey))

In [None]:
def create_datasource(COG_SEARCH_ENDPOINT, index_name, search_api_key, storage_connectionstring, storage_container):
    """Create or update the blob data source ``<index_name>-datasource``.

    Sends a PUT to ``<COG_SEARCH_ENDPOINT>/datasources/<index_name>-datasource``
    (api-version 2023-07-01-Preview) and prints the request URL.

    Args:
        COG_SEARCH_ENDPOINT: Base URL of the search service.
        index_name: Index name used as the prefix of the data source name.
        search_api_key: Value sent in the ``api-key`` header.
        storage_connectionstring: Connection string placed in the credentials.
        storage_container: Name of the blob container to read from.

    Returns:
        tuple: ``(response, ok)`` where ``ok`` is True only for HTTP 201 or 204.
    """
    url = f'{COG_SEARCH_ENDPOINT}/datasources/{index_name}-datasource?api-version=2023-07-01-Preview'
    print(url)

    datasource_definition = {
        "description": "Demo files to demonstrate cognitive search capabilities.",
        "type": "azureblob",
        "credentials": {"connectionString": storage_connectionstring},
        "container": {"name": storage_container},
    }
    request_headers = {
        'api-key': search_api_key,
        'Content-Type': 'application/json',
    }

    response = requests.request(
        "PUT",
        url,
        headers=request_headers,
        data=json.dumps(datasource_definition),
    )

    # 201 = created, 204 = updated; anything else is reported as a failure.
    return response, response.status_code in (201, 204)

In [None]:
def create_skillset(COG_SEARCH_ENDPOINT, index, cognitive_search_key, embeddingFunctionAppUriAndKey):
    """Create or update the skillset ``<index>-skillset``.

    The skillset holds a single custom Web API skill that POSTs to
    ``embeddingFunctionAppUriAndKey`` with the blob name as its ``source`` input
    and exposes three outputs: ``embeddings``, ``embeddings_text`` and
    ``vector_search_keys``. The request URL and the response body are printed.

    Args:
        COG_SEARCH_ENDPOINT: Base URL of the search service.
        index: Index name used as the prefix of the skillset name.
        cognitive_search_key: Value sent in the ``api-key`` header.
        embeddingFunctionAppUriAndKey: URI the custom skill calls.

    Returns:
        tuple: ``(response, ok)`` where ``ok`` is True only for HTTP 201 or 204.
    """
    url = f'{COG_SEARCH_ENDPOINT}/skillsets/{index}-skillset?api-version=2023-07-01-Preview'
    print(url)

    # Each skill output is published under a target name equal to its own name.
    skill_outputs = [
        {"name": output_name, "targetName": output_name}
        for output_name in ("embeddings", "embeddings_text", "vector_search_keys")
    ]

    embedding_skill = {
        "@odata.type": "#Microsoft.Skills.Custom.WebApiSkill",
        "uri": embeddingFunctionAppUriAndKey,
        "httpMethod": "POST",
        "timeout": "PT230S",
        "batchSize": 1,
        "degreeOfParallelism": 1,
        "name": "Embeddings",
        "description": "",
        "context": "/document",
        "inputs": [
            {"name": "source", "source": "/document/metadata_storage_name"},
        ],
        "outputs": skill_outputs,
    }

    skillset_definition = {
        "@odata.context": f"{COG_SEARCH_ENDPOINT}/$metadata#skillsets/$entity",
        "@odata.etag": "\"0x8DB2B4BF82370CF\"",
        "name": f"{index}-skillset",
        "description": "Skillset using form recognizer",
        "skills": [embedding_skill],
        "cognitiveServices": None,
        "knowledgeStore": None,
        "encryptionKey": None,
    }

    request_headers = {
        'Content-Type': 'application/json',
        'api-key': f'{cognitive_search_key}',
    }

    response = requests.request(
        "PUT",
        url,
        headers=request_headers,
        data=json.dumps(skillset_definition),
    )
    print(response.text)

    # 201 = created, 204 = updated; anything else is reported as a failure.
    return response, response.status_code in (201, 204)

In [None]:
def update_index_semantic(COG_SEARCH_ENDPOINT, index, cognitive_search_key):
    """Create or update the search index named ``index``.

    The index carries the blob content and metadata fields plus three
    ``Collection(Edm.String)`` fields (``embeddings_text``, ``embeddings``,
    ``vector_search_keys``). ``metadata_storage_path`` is the key field. The
    semantic section is sent with no configurations. The request URL is printed.

    Args:
        COG_SEARCH_ENDPOINT: Base URL of the search service.
        index: Name of the index to create or update.
        cognitive_search_key: Value sent in the ``api-key`` header.

    Returns:
        tuple: ``(response, ok)`` where ``ok`` is True only for HTTP 201 or 204.
    """
    url = f'{COG_SEARCH_ENDPOINT}/indexes/{index}/?api-version=2023-07-01-Preview'
    print(url)

    def make_field(name, edm_type, searchable=False, retrievable=True, key=False, analyzer=None):
        # All fields share one shape; only these few attributes vary between them.
        return {
            "name": name,
            "type": edm_type,
            "searchable": searchable,
            "filterable": False,
            "retrievable": retrievable,
            "sortable": False,
            "facetable": False,
            "key": key,
            "indexAnalyzer": None,
            "searchAnalyzer": None,
            "analyzer": analyzer,
            "synonymMaps": [],
        }

    lucene = "standard.lucene"
    string_collection = "Collection(Edm.String)"

    index_fields = [
        make_field("content", "Edm.String", searchable=True, analyzer=lucene),
        make_field("metadata_storage_last_modified", "Edm.DateTimeOffset", retrievable=False),
        make_field("metadata_storage_name", "Edm.String"),
        make_field("metadata_storage_path", "Edm.String", key=True),
        make_field("metadata_creation_date", "Edm.DateTimeOffset", retrievable=False),
        make_field("embeddings_text", string_collection, searchable=True, analyzer=lucene),
        make_field("embeddings", string_collection, searchable=True, analyzer=lucene),
        make_field("vector_search_keys", string_collection, searchable=True, analyzer=lucene),
    ]

    index_definition = {
        "name": index,
        "defaultScoringProfile": "",
        "fields": index_fields,
        "scoringProfiles": [],
        "corsOptions": None,
        "suggesters": [],
        "semantic": {"defaultConfiguration": None, "configurations": []},
        "analyzers": [],
        "tokenizers": [],
        "tokenFilters": [],
        "charFilters": [],
        "encryptionKey": None,
        "similarity": {
            "@odata.type": "#Microsoft.Azure.Search.BM25Similarity",
            "k1": None,
            "b": None,
        },
    }

    request_headers = {
        'api-key': cognitive_search_key,
        'Content-Type': 'application/json',
    }

    response = requests.request(
        "PUT",
        url,
        headers=request_headers,
        data=json.dumps(index_definition),
    )

    # 201 = created, 204 = updated; anything else is reported as a failure.
    return response, response.status_code in (201, 204)

In [None]:
def create_indexer(COG_SEARCH_ENDPOINT, index, search_key):
    """Create or update the indexer ``<index>-indexer``.

    The indexer wires ``<index>-datasource`` through ``<index>-skillset`` into
    the index named ``index``. ``metadata_storage_path`` is mapped through
    ``base64Encode``, and the three skill outputs are mapped to the index fields
    of the same name. Prints the request URL, then ``good`` on success or the
    HTTP status code on failure.

    Args:
        COG_SEARCH_ENDPOINT: Base URL of the search service.
        index: Index name; also the prefix of the indexer, data source and skillset names.
        search_key: Value sent in the ``api-key`` header.

    Returns:
        tuple: ``(response, ok)`` where ``ok`` is True only for HTTP 201 or 204.
    """
    url = f'{COG_SEARCH_ENDPOINT}/indexers/{index}-indexer/?api-version=2023-07-01-Preview'
    print(url)

    # Enriched-document outputs copied to index fields with matching names.
    output_mappings = [
        {"sourceFieldName": f"/document/{field_name}", "targetFieldName": field_name}
        for field_name in ("embeddings", "embeddings_text", "vector_search_keys")
    ]

    indexer_definition = {
        "name": f"{index}-indexer",
        "description": "",
        "dataSourceName": f"{index}-datasource",
        "skillsetName": f"{index}-skillset",
        "targetIndexName": f"{index}",
        "disabled": None,
        "schedule": None,
        "parameters": {
            "batchSize": None,
            "maxFailedItems": 0,
            "maxFailedItemsPerBatch": 0,
            "base64EncodeKeys": None,
            "configuration": {
                "dataToExtract": "contentAndMetadata",
                "parsingMode": "default",
                "imageAction": "generateNormalizedImages",
            },
        },
        "fieldMappings": [
            {
                "sourceFieldName": "metadata_storage_path",
                "targetFieldName": "metadata_storage_path",
                "mappingFunction": {"name": "base64Encode", "parameters": None},
            },
        ],
        "outputFieldMappings": output_mappings,
        "cache": None,
        "encryptionKey": None,
    }

    request_headers = {
        'Content-Type': 'application/json',
        'api-key': f'{search_key}',
    }

    response = requests.request(
        "PUT",
        url,
        headers=request_headers,
        data=json.dumps(indexer_definition),
    )

    # 201 = created, 204 = updated; anything else is reported as a failure.
    succeeded = response.status_code in (201, 204)
    print('good' if succeeded else response.status_code)
    return response, succeeded

In [None]:
# Creating vector search index
def index_for_vectors(service_name_endpoint, index, cognitive_search_key):
    """Create or update the vector-enabled index ``<index>-vector``.

    Builds the JSON index definition with five plain fields (``key``, ``index``,
    ``title``, ``content``, ``path``) and two 1536-dimension vector fields
    (``titleVector``, ``contentVector``) bound to the HNSW configuration
    ``my-vector-config``, plus the semantic configuration ``my-semantic-config``.
    Prints the request URL, and on failure also prints the status code and body.

    Args:
        service_name_endpoint: Base URL of the search service.
        index: Base index name; ``-vector`` is appended to it.
        cognitive_search_key: Value sent in the ``api-key`` header.

    Returns:
        tuple: ``(response, ok)`` where ``ok`` is True only for HTTP 201 or 204.
    """
    url = f'{service_name_endpoint}/indexes/{index}-vector/?api-version=2023-07-01-Preview'
    print(url)

    vector_config_name = "my-vector-config"

    def plain_field(name, edm_type, searchable, key=False):
        # Retrievable, non-filterable/facetable/sortable scalar field.
        return {
            "name": name,
            "type": edm_type,
            "searchable": searchable,
            "retrievable": True,
            "key": key,
            "filterable": False,
            "facetable": False,
            "sortable": False,
        }

    def vector_field(name):
        # Embedding field bound to the HNSW configuration declared below.
        return {
            "name": name,
            "type": "Collection(Edm.Single)",
            "searchable": True,
            "retrievable": True,
            "dimensions": 1536,
            "vectorSearchConfiguration": vector_config_name,
        }

    index_definition = {
        "name": index + "-vector",
        "defaultScoringProfile": "",
        "fields": [
            plain_field("key", "Edm.String", False, key=True),
            plain_field("index", "Edm.Int32", False),
            plain_field("title", "Edm.String", True),
            plain_field("content", "Edm.String", True),
            plain_field("path", "Edm.String", True),
            vector_field("titleVector"),
            vector_field("contentVector"),
        ],
        "corsOptions": {
            "allowedOrigins": ["*"],
            "maxAgeInSeconds": 60,
        },
        "vectorSearch": {
            "algorithmConfigurations": [
                {
                    "name": vector_config_name,
                    "kind": "hnsw",
                    "hnswParameters": {
                        "m": 4,
                        "efConstruction": 400,
                        "metric": "cosine",
                    },
                },
            ],
        },
        "semantic": {
            "configurations": [
                {
                    "name": "my-semantic-config",
                    "prioritizedFields": {
                        "titleField": {"fieldName": "title"},
                        "prioritizedContentFields": [{"fieldName": "content"}],
                        "prioritizedKeywordsFields": [{"fieldName": "content"}],
                    },
                },
            ],
        },
    }

    request_headers = {
        'api-key': cognitive_search_key,
        'Content-Type': 'application/json',
    }

    response = requests.request(
        "PUT",
        url,
        headers=request_headers,
        data=json.dumps(index_definition),
    )

    # 201 = created, 204 = updated; anything else is reported as a failure.
    if response.status_code in (201, 204):
        return response, True

    # Surface the service's error details so a failed run can be diagnosed.
    print('************************')
    print(response.status_code)
    print(response.text)
    return response, False
In [None]:
# Create (or update) the "<index>-vector" index and show the HTTP response.
response, success = index_for_vectors(
    COG_SEARCH_ENDPOINT,
    COG_SEARCH_INDEX_NAME,
    COG_SEARCH_KEY,
)
print(response)

In [None]:
# Register the blob container as the data source for the indexer pipeline.
response, success = create_datasource(
    COG_SEARCH_ENDPOINT,
    COG_SEARCH_INDEX_NAME,
    COG_SEARCH_KEY,
    STORAGE_CONNECTION_STRING,
    STORAGE_ACCOUNT_CONTAINER,
)
print(response)


In [None]:
# Only continue when the previous step reported success.
if success:
    # functionAppUrlAndKey is deliberately not printed: it is a secret-bearing
    # URL and cell output is saved with the notebook.
    response, success = create_skillset(
        COG_SEARCH_ENDPOINT,
        COG_SEARCH_INDEX_NAME,
        COG_SEARCH_KEY,
        functionAppUrlAndKey,
    )
    print(response)


In [None]:
# Only continue when the previous step reported success.
if success:
    response, success = update_index_semantic(
        COG_SEARCH_ENDPOINT,
        COG_SEARCH_INDEX_NAME,
        COG_SEARCH_KEY,
    )
    print(response)

In [None]:
# Only continue when the previous step reported success.
if success:
    response, success = create_indexer(
        COG_SEARCH_ENDPOINT,
        COG_SEARCH_INDEX_NAME,
        COG_SEARCH_KEY,
    )
    print(response)