In [24]:
import os
import re
import requests
import sys
from num2words import num2words
import os
import pandas as pd
import numpy as np
import tiktoken
from openai import AzureOpenAI
import json
from dotenv import load_dotenv
load_dotenv("credentials.env")

True

In [27]:
aoai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
aoai_api_key = os.environ["AZURE_OPENAI_API_KEY"]
deployment_name = "gpt-35-turbo"
aoai_api_version = "2024-02-15-preview" #Using latest version as of this date. Older versions may not work with the code as syntax has changed drastically.
embedding_deployment = "text-embedding-ada-002"
# Azure AI Search setup
search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"] # Add your Azure AI Search endpoint here
search_key = os.environ["AZURE_SEARCH_KEY"] # Add your Azure AI Search admin key here
search_index_name = "contosobenefits-vindex"; # Add your Azure AI Search index name here
AZURE_SEARCH_API_VERSION="2024-05-01-preview"

blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = "contoso-benefits"
blob_sas_token = os.environ["BLOB_SAS_TOKEN"]

# Create an Azure OpenAI client.  
client = AzureOpenAI(        
    #base_url=f"{aoai_endpoint}/openai/deployments/{deployment_name}/extensions/",
    azure_endpoint=aoai_endpoint,    
    api_key=aoai_api_key,    
    api_version=aoai_api_version
)

Create a blob data source connector on Azure AI Search


In [21]:
from azure.core.credentials import AzureKeyCredential

# Create a client
search_key_credential = AzureKeyCredential(search_key)
search_client = SearchClient(endpoint=search_endpoint,
                      index_name=search_index_name,
                      credential=search_key_credential)

In [28]:
# Setup the Payloads header
headers = {'Content-Type': 'application/json','api-key': search_key}
params = {'api-version': AZURE_SEARCH_API_VERSION}

create data source

In [38]:
search_datasource_name = f"{search_index_name}-source"

In [29]:
# The following code sends the json paylod to Azure Search engine to create the Datasource

datasource_payload = {
    "name": search_datasource_name,
    "description": "Demo files to demonstrate cognitive search capabilities.",
    "type": "azureblob",
    "credentials": {
        "connectionString": blob_connection_string
    },
    "dataDeletionDetectionPolicy" : {
        "@odata.type" :"#Microsoft.Azure.Search.NativeBlobSoftDeleteDeletionDetectionPolicy" # this makes sure that if the item is deleted from the source, it will be deleted from the index
    },
    "container": {
        "name": blob_container_name
    }
}
r = requests.put(os.environ['AZURE_SEARCH_ENDPOINT'] + "/datasources/" + search_datasource_name,
                 data=json.dumps(datasource_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


201 - Successfully created

204 - Succesfully overwritten

40X - Authentication Error

create index

In [30]:
# Create an index
# Queries operate over the searchable fields and filterable fields in the index
index_payload = {
    "name": search_index_name,
    "vectorSearch": {
        "algorithms": [
            {
                "name": "myalgo",
                "kind": "hnsw"
            }
        ],
        "vectorizers": [
            {
                "name": "openai",
                "kind": "azureOpenAI",
                "azureOpenAIParameters":
                {
                    "resourceUri" : aoai_endpoint,
                    "apiKey" : aoai_api_key,
                    "deploymentId" : embedding_deployment,
                    "modelName" : embedding_deployment,
     
                }
            }
        ],
        "profiles": [
            {
                "name": "myprofile",
                "algorithm": "myalgo",
                "vectorizer":"openai"
            }
        ]
    },
    "semantic": {
        "configurations": [
            {
                "name": "my-semantic-config",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "chunk"
                        }
                    ],
                    "prioritizedKeywordsFields": []
                }
            }
        ]
    },
    "fields": [
        {"name": "id", "type": "Edm.String", "key": "true", "analyzer": "keyword", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false","facetable": "false"},
        {"name": "ParentKey", "type": "Edm.String", "searchable": "true", "retrievable": "true", "facetable": "false", "filterable": "true", "sortable": "false"},
        {"name": "title", "type": "Edm.String", "searchable": "true", "retrievable": "true", "facetable": "false", "filterable": "true", "sortable": "false"},
        {"name": "name", "type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},
        {"name": "location", "type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},   
        {"name": "chunk","type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},
        {
            "name": "chunkVector",
            "type": "Collection(Edm.Single)",
            "dimensions": 1536, # IMPORTANT: Make sure these dimmensions match your embedding model name
            "vectorSearchProfile": "myprofile",
            "searchable": "true",
            "retrievable": "true",
            "filterable": "false",
            "sortable": "false",
            "facetable": "false"
        }
    ]
}

r = requests.put(search_endpoint + "/indexes/" + search_index_name,
                 data=json.dumps(index_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


In [None]:
#r.text

Create Skillset - OCR, Text Splitter, AzureOpenAIEmbeddingSkill

In [36]:
skillset_name = "contoso-benefits-skill"

# Create a skillset
skillset_payload = {
"name": skillset_name,
"description": "e2e Skillset for RAG - Files",
"skills":
[
    {
        "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
        "description": "Extract text (plain and structured) from image.",
        "context": "/document/normalized_images/*",
        "defaultLanguageCode": "en",
        "detectOrientation": True,
        "inputs": [
            {
                "name": "image",
                "source": "/document/normalized_images/*"
            }
        ],
            "outputs": [
            {
                "name": "text",
                "targetName" : "images_text"
            }
        ]
    },
    {
        "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
        "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field. This is useful for PDF and other file formats that supported embedded images.",
        "context": "/document",
        "insertPreTag": " ",
        "insertPostTag": " ",
        "inputs": [
            {
                "name":"text", "source": "/document/content"
            },
            {
                "name": "itemsToInsert", "source": "/document/normalized_images/*/images_text"
            },
            {
                "name":"offsets", "source": "/document/normalized_images/*/contentOffset"
            }
        ],
        "outputs": [
            {
                "name": "mergedText", 
                "targetName" : "merged_text"
            }
        ]
    },
    {
        "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
        "context": "/document",
        "textSplitMode": "pages",  # although it says "pages" it actally means chunks, not actual pages
        "maximumPageLength": 5000, # 5000 characters is default and a good choice
        "pageOverlapLength": 750,  # 15% overlap among chunks
        "defaultLanguageCode": "en",
        "inputs": [
            {
                "name": "text",
                "source": "/document/merged_text"
            }
        ],
        "outputs": [
            {
                "name": "textItems",
                "targetName": "chunks"
            }
        ]
    },
    {
        "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
        "description": "Azure OpenAI Embedding Skill",
        "context": "/document/chunks/*",
        "resourceUri": aoai_endpoint,
        "apiKey": aoai_api_key,
        "deploymentId": embedding_deployment,
        "modelName": embedding_deployment,
        "inputs": [
            {
                "name": "text",
                "source": "/document/chunks/*"
            }
        ],
        "outputs": [
            {
                "name": "embedding",
                "targetName": "vector"
            }
        ]
    }
],
"indexProjections": {
    "selectors": [
        {
            "targetIndexName": search_index_name,
            "parentKeyFieldName": "ParentKey",
            "sourceContext": "/document/chunks/*",
            "mappings": [
                {
                    "name": "title",
                    "source": "/document/title"
                },
                {
                    "name": "name",
                    "source": "/document/name"
                },
                {
                    "name": "location",
                    "source": "/document/location"
                },
                {
                    "name": "chunk",
                    "source": "/document/chunks/*"
                },
                {
                    "name": "chunkVector",
                    "source": "/document/chunks/*/vector"
                }
            ]
        }
    ],
    "parameters": {
        "projectionMode": "skipIndexingParentDocuments"
    }
},
"cognitiveServices": {
    "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
    "description": os.environ['COG_SERVICES_NAME'],
    "key": os.environ['COG_SERVICES_KEY']
}
}

r = requests.put(search_endpoint + "/skillsets/" + skillset_name,
                data=json.dumps(skillset_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


In [37]:
r.text

'{"@odata.context":"https://cog-search-hf3ytnqvd5vby.search.windows.net/$metadata#skillsets/$entity","@odata.etag":"\\"0x8DCB01D2C1EEF6A\\"","name":"contoso-benefits-skill","description":"e2e Skillset for RAG - Files","skills":[{"@odata.type":"#Microsoft.Skills.Vision.OcrSkill","name":null,"description":"Extract text (plain and structured) from image.","context":"/document/normalized_images/*","textExtractionAlgorithm":null,"lineEnding":null,"defaultLanguageCode":"en","detectOrientation":true,"inputs":[{"name":"image","source":"/document/normalized_images/*","sourceContext":null,"inputs":[]}],"outputs":[{"name":"text","targetName":"images_text"}]},{"@odata.type":"#Microsoft.Skills.Text.MergeSkill","name":null,"description":"Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field. This is useful for PDF and other file formats that supported embedded images.","context":"/document","insertPreTag":" ","insertPostTa

Create and Run the Indexer - (runs the pipeline)

The three components you have created thus far (data source, skillset, index) are inputs to an indexer. Creating the indexer on Azure Cognitive Search is the event that puts the entire pipeline into motion.

In [39]:
search_indexer_name = search_index_name + "-indexer"
# Create an indexer
indexer_payload = {
    "name": search_indexer_name,
    "dataSourceName": search_datasource_name,
    "targetIndexName": search_index_name,
    "skillsetName": skillset_name,
    "schedule" : { "interval" : "PT30M"}, # How often do you want to check for new content in the data source
    "fieldMappings": [
        {
          "sourceFieldName" : "metadata_title",
          "targetFieldName" : "title"
        },
        {
          "sourceFieldName" : "metadata_storage_name",
          "targetFieldName" : "name"
        },
        {
          "sourceFieldName" : "metadata_storage_path",
          "targetFieldName" : "location"
        }
    ],
    "outputFieldMappings":[],
    "parameters":
    {
        "maxFailedItems": -1,
        "maxFailedItemsPerBatch": -1,
        "configuration":
        {
            "dataToExtract": "contentAndMetadata",
            "imageAction": "generateNormalizedImages"
        }
    }
}

r = requests.put(search_endpoint + "/indexers/" + search_indexer_name ,
                 data=json.dumps(indexer_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


In [None]:
# Uncomment if you find an error
# r.text

Note: If you get a 400 unauthorize error, make sure that you are using the Azure Search MANAGEMENT KEY, not the QUERY key

In [41]:
# Optionally, get indexer status to confirm that it's running
try:
    r = requests.get(search_endpoint + "/indexers/" + search_indexer_name +
                     "/status", headers=headers, params=params)
    # pprint(json.dumps(r.json(), indent=1))
    print(r.status_code)
    print("Status:",r.json().get('lastResult').get('status'))
    print("Items Processed:",r.json().get('lastResult').get('itemsProcessed'))
    print(r.ok)
    
except Exception as e:
    print("Wait a few seconds until the process starts and run this cell again.")

200
Status: success
Items Processed: 6
True


When the indexer finishes running we will have all documents indexed in the AI Search Engine!.

In [None]:
results = client.search(search_text="luxury")

for result in results:
    print("{}: {})".format(result["hotelId"], result["hotelName"]))

Perform vector similarity search

In [51]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure Vector Search
query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"
# if use_ocr:
#     query = "Who is the national director?"
  
search_client = SearchClient(search_endpoint, search_index_name, credential=search_key_credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["ParentKey", "title", "chunk"],
    top=1
)  


    
# for result in results:      
#     # print(f"ParentKey: {result['ParentKey']}")  
#     print(f"title: {result['title']}")  
    # print(f"Score: {result['@search.score']}")  
    # print(f"Content: {result['chunk']}")   

AttributeError: 'SearchItemPaged' object has no attribute 'get_results'

In [54]:
query = "Which is more comprehensive, Northwind Health Plus vs Northwind Standard?"

agg_search_results = dict()
k = 10


search_payload = {
    "search": query, # Text query
    "select": "id, title, name, location, chunk",
    "queryType": "semantic",
    "vectorQueries": [{"text": query, "fields": "chunkVector", "kind": "text", "k": k, 
                        "threshold": { 
                                "kind": "vectorSimilarity", 
                                "value": 0.8 
                            }
                        }], # Vector query
    "semanticConfiguration": "my-semantic-config",
    "captions": "extractive",
    "answers": "extractive",
    "count":"true",
    "top": k
}

r = requests.post(search_endpoint + "/indexes/" + search_index_name + "/docs/search",
                    data=json.dumps(search_payload), headers=headers, params=params)
print(r.status_code)

search_results = r.json()
agg_search_results[search_index_name]=search_results
print("Index:", search_index_name, "Results Found: {}, Results Returned: {}".format(search_results['@odata.count'], len(search_results['value'])))

200
Index: contosobenefits-vindex Results Found: 143, Results Returned: 10


In [55]:
agg_search_results

{'contosobenefits-vindex': {'@odata.context': "https://cog-search-hf3ytnqvd5vby.search.windows.net/indexes('contosobenefits-vindex')/$metadata#docs(*)",
  '@odata.count': 143,
  '@search.answers': [{'key': 'd2779449d2d0_aHR0cHM6Ly82d2hxand0YWVvZ2Npc2EuYmxvYi5jb3JlLndpbmRvd3MubmV0L2NvbnRvc28tYmVuZWZpdHMvQmVuZWZpdF9PcHRpb25zLnBkZg2_chunks_0',
    'text': 'Northwind Standard only offers coverage for doctor visits and lab  tests Northwind Health Plus is a comprehensive plan that offers more coverage than Northwind Standard Northwind Health Plus offers coverage for emergency services, mental health and substance abuse  coverage, and out-of-network services, while Northwind Standard does not Northwind H...',
    'highlights': '<em>Northwind Standard</em> only offers coverage for doctor visits and lab  tests Northwind Health Plus is a comprehensive plan that offers more coverage than Northwind Standard Northwind Health Plus offers coverage for emergency services, mental health and substance a

Display the top results  based on the score

In [57]:
from collections import OrderedDict
from IPython.display import display, HTML, Markdown

display(HTML('<h4>Top Answers</h4>'))

for index,search_results in agg_search_results.items():

    for result in search_results['@search.answers']:
        if result['score'] > 0.5: # Show answers that are at least 50% of the max possible score=1
            display(HTML('<h5>' + 'Answer - score: ' + str(round(result['score'],2)) + '</h5>'))
            display(HTML(result['text']))

            
print("\n\n")
display(HTML('<h4>Top Results</h4>'))

content = dict()
ordered_content = OrderedDict()


for index,search_results in agg_search_results.items():
    for result in search_results['value']:
        if result['@search.rerankerScore'] > 1:# Show answers that are at least 25% of the max possible score=4
            content[result['id']]={
                                    "title": result['title'],
                                    "chunk": result['chunk'], 
                                    "name": result['name'], 
                                    "location": result['location'] ,
                                    "caption": result['@search.captions'][0]['text'],
                                    "score": result['@search.rerankerScore'],
                                    "index": index
                                    }
    
#After results have been filtered we will Sort and add them as an Ordered list\n",
for id in sorted(content, key= lambda x: content[x]["score"], reverse=True):
    ordered_content[id] = content[id]
    url = str(ordered_content[id]['location']) + os.environ['BLOB_SAS_TOKEN']
    title = str(ordered_content[id]['title']) if (ordered_content[id]['title']) else ordered_content[id]['name']
    score = str(round(ordered_content[id]['score'],2))
    display(HTML('<h5><a href="'+ url + '">' + title + '</a> - score: '+ score + '</h5>'))
    display(HTML(ordered_content[id]['caption']))




