In [82]:
import os
import re
import requests
import sys
import pandas as pd
import numpy as np
from openai import AzureOpenAI
import json
from dotenv import load_dotenv
import azure.search.documents as azsearch
load_dotenv("credentials.env", override=True)

True

In [83]:
aoai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
aoai_api_key = os.environ["AZURE_OPENAI_API_KEY"]
deployment_name = "gpt-4o"
aoai_api_version = os.environ["AZURE_OPENAI_API_VERSION"] #Using latest version as of this date. Older versions may not work with the code as syntax has changed drastically.
embedding_deployment = "text-embedding-ada-002"
# Azure AI Search setup
search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"] # Add your Azure AI Search endpoint here
search_key = os.environ["AZURE_SEARCH_KEY"] # Add your Azure AI Search admin key here
search_index_name = "elonmusk-vindex-6whqjwtaeogci"; # Add your Azure AI Search index name here
AZURE_SEARCH_API_VERSION="2024-05-01-preview"

blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]
blob_container_name = "elonmusk-wiki"
blob_sas_token = os.environ["BLOB_SAS_TOKEN"]

# Create an Azure OpenAI client.  
client = AzureOpenAI(        
    #base_url=f"{aoai_endpoint}/openai/deployments/{deployment_name}/extensions/",
    azure_endpoint=aoai_endpoint,    
    api_key=aoai_api_key,    
    api_version=aoai_api_version
)

Create a blob data source connector on Azure AI Search


In [84]:
from azure.core.credentials import AzureKeyCredential

# Create a client
search_key_credential = AzureKeyCredential(search_key)
search_client = azsearch.SearchClient(endpoint=search_endpoint,
                      index_name=search_index_name,
                      credential=search_key_credential)

In [85]:
# Setup the Payloads header
headers = {'Content-Type': 'application/json','api-key': search_key}
params = {'api-version': AZURE_SEARCH_API_VERSION}

create data source

In [41]:
search_datasource_name = f"{search_index_name}-source"

In [172]:
# The following code sends the json paylod to Azure Search engine to create the Datasource

datasource_payload = {
    "name": search_datasource_name,
    "description": "elon musk wiki files to demonstrate cognitive search capabilities.",
    "type": "azureblob",
    "credentials": {
        "connectionString": blob_connection_string
    },
    "dataDeletionDetectionPolicy" : {
        "@odata.type" :"#Microsoft.Azure.Search.NativeBlobSoftDeleteDeletionDetectionPolicy" # this makes sure that if the item is deleted from the source, it will be deleted from the index
    },
    "container": {
        "name": blob_container_name
    }
}
r = requests.put(os.environ['AZURE_SEARCH_ENDPOINT'] + "/datasources/" + search_datasource_name,
                 data=json.dumps(datasource_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


201 - Successfully created

204 - Succesfully overwritten

40X - Authentication Error

create index

In [173]:
# Create an index
# Queries operate over the searchable fields and filterable fields in the index
index_payload = {
    "name": search_index_name,
    "vectorSearch": {
        "algorithms": [
            {
                "name": "myalgo",
                "kind": "hnsw"
            }
        ],
        "vectorizers": [
            {
                "name": "openai",
                "kind": "azureOpenAI",
                "azureOpenAIParameters":
                {
                    "resourceUri" : aoai_endpoint,
                    "apiKey" : aoai_api_key,
                    "deploymentId" : embedding_deployment,
                    "modelName" : embedding_deployment,
     
                }
            }
        ],
        "profiles": [
            {
                "name": "myprofile",
                "algorithm": "myalgo",
                "vectorizer":"openai"
            }
        ]
    },
    "semantic": {
        "configurations": [
            {
                "name": "my-semantic-config",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "chunk"
                        }
                    ],
                    "prioritizedKeywordsFields": []
                }
            }
        ]
    },
    "fields": [
        {"name": "id", "type": "Edm.String", "key": "true", "analyzer": "keyword", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false","facetable": "false"},
        {"name": "ParentKey", "type": "Edm.String", "searchable": "true", "retrievable": "true", "facetable": "false", "filterable": "true", "sortable": "false"},
        {"name": "title", "type": "Edm.String", "searchable": "true", "retrievable": "true", "facetable": "false", "filterable": "true", "sortable": "false"},
        {"name": "name", "type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},
        {"name": "filepath", "type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},   
        {"name": "chunk","type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},
        {
            "name": "chunkVector",
            "type": "Collection(Edm.Single)",
            "dimensions": 1536, # IMPORTANT: Make sure these dimmensions match your embedding model name
            "vectorSearchProfile": "myprofile",
            "searchable": "true",
            "retrievable": "true",
            "filterable": "false",
            "sortable": "false",
            "facetable": "false"
        }
    ]
}

r = requests.put(search_endpoint + "/indexes/" + search_index_name,
                 data=json.dumps(index_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


In [174]:
r.text

'{"@odata.context":"https://6whqjwtaeogci-search.search.windows.net/$metadata#indexes/$entity","@odata.etag":"\\"0x8DD2C32B4E565F8\\"","name":"elonmusk-vindex-6whqjwtaeogci","defaultScoringProfile":null,"fields":[{"name":"id","type":"Edm.String","searchable":true,"filterable":false,"retrievable":true,"stored":true,"sortable":false,"facetable":false,"key":true,"indexAnalyzer":null,"searchAnalyzer":null,"analyzer":"keyword","normalizer":null,"dimensions":null,"vectorSearchProfile":null,"vectorEncoding":null,"synonymMaps":[]},{"name":"ParentKey","type":"Edm.String","searchable":true,"filterable":true,"retrievable":true,"stored":true,"sortable":false,"facetable":false,"key":false,"indexAnalyzer":null,"searchAnalyzer":null,"analyzer":null,"normalizer":null,"dimensions":null,"vectorSearchProfile":null,"vectorEncoding":null,"synonymMaps":[]},{"name":"title","type":"Edm.String","searchable":true,"filterable":true,"retrievable":true,"stored":true,"sortable":false,"facetable":false,"key":false,"

Create Skillset - OCR, Text Splitter, AzureOpenAIEmbeddingSkill

In [175]:
skillset_name = "elonmusk-wiki-skill"

# Create a skillset
skillset_payload = {
"name": skillset_name,
"description": "e2e Skillset for RAG - Files",
"skills":
[
    {
        "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
        "description": "Extract text (plain and structured) from image.",
        "context": "/document/normalized_images/*",
        "defaultLanguageCode": "en",
        "detectOrientation": True,
        "inputs": [
            {
                "name": "image",
                "source": "/document/normalized_images/*"
            }
        ],
            "outputs": [
            {
                "name": "text",
                "targetName" : "images_text"
            }
        ]
    },
    {
        "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
        "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field. This is useful for PDF and other file formats that supported embedded images.",
        "context": "/document",
        "insertPreTag": " ",
        "insertPostTag": " ",
        "inputs": [
            {
                "name":"text", "source": "/document/content"
            },
            {
                "name": "itemsToInsert", "source": "/document/normalized_images/*/images_text"
            },
            {
                "name":"offsets", "source": "/document/normalized_images/*/contentOffset"
            }
        ],
        "outputs": [
            {
                "name": "mergedText", 
                "targetName" : "merged_text"
            }
        ]
    },
    {
        "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
        "context": "/document",
        "textSplitMode": "pages",  # although it says "pages" it actally means chunks, not actual pages
        "maximumPageLength": 500, # 5000 characters is default and a good choice
        "pageOverlapLength": 50,  # 15% overlap among chunks
        "defaultLanguageCode": "en",
        "inputs": [
            {
                "name": "text",
                "source": "/document/merged_text"
            }
        ],
        "outputs": [
            {
                "name": "textItems",
                "targetName": "chunks"
            }
        ]
    },
    {
        "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
        "description": "Azure OpenAI Embedding Skill",
        "context": "/document/chunks/*",
        "resourceUri": aoai_endpoint,
        "apiKey": aoai_api_key,
        "deploymentId": embedding_deployment,
        "modelName": embedding_deployment,
        "inputs": [
            {
                "name": "text",
                "source": "/document/chunks/*"
            }
        ],
        "outputs": [
            {
                "name": "embedding",
                "targetName": "vector"
            }
        ]
    }
],
"indexProjections": {
    "selectors": [
        {
            "targetIndexName": search_index_name,
            "parentKeyFieldName": "ParentKey",
            "sourceContext": "/document/chunks/*",
            "mappings": [
                {
                    "name": "title",
                    "source": "/document/title"
                },
                {
                    "name": "name",
                    "source": "/document/name"
                },
                {
                    "name": "filepath",
                    "source": "/document/location"
                },
                {
                    "name": "chunk",
                    "source": "/document/chunks/*"
                },
                {
                    "name": "chunkVector",
                    "source": "/document/chunks/*/vector"
                }
            ]
        }
    ],
    "parameters": {
        "projectionMode": "skipIndexingParentDocuments"
    }
},
"cognitiveServices": {
    "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
    "description": os.environ['COG_SERVICES_NAME'],
    "key": os.environ['COG_SERVICES_KEY']
}
}

r = requests.put(search_endpoint + "/skillsets/" + skillset_name,
                data=json.dumps(skillset_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


In [176]:
r.text

'{"@odata.context":"https://6whqjwtaeogci-search.search.windows.net/$metadata#skillsets/$entity","@odata.etag":"\\"0x8DD2C32BD9AD1CC\\"","name":"elonmusk-wiki-skill","description":"e2e Skillset for RAG - Files","skills":[{"@odata.type":"#Microsoft.Skills.Vision.OcrSkill","name":null,"description":"Extract text (plain and structured) from image.","context":"/document/normalized_images/*","textExtractionAlgorithm":null,"lineEnding":null,"defaultLanguageCode":"en","detectOrientation":true,"inputs":[{"name":"image","source":"/document/normalized_images/*","sourceContext":null,"inputs":[]}],"outputs":[{"name":"text","targetName":"images_text"}]},{"@odata.type":"#Microsoft.Skills.Text.MergeSkill","name":null,"description":"Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field. This is useful for PDF and other file formats that supported embedded images.","context":"/document","insertPreTag":" ","insertPostTag":" ",

Create and Run the Indexer - (runs the pipeline)

The three components you have created thus far (data source, skillset, index) are inputs to an indexer. Creating the indexer on Azure Cognitive Search is the event that puts the entire pipeline into motion.

In [177]:
search_indexer_name = search_index_name + "-indexer"
# Create an indexer
indexer_payload = {
    "name": search_indexer_name,
    "dataSourceName": search_datasource_name,
    "targetIndexName": search_index_name,
    "skillsetName": skillset_name,
    "schedule" : { "interval" : "PT30M"}, # How often do you want to check for new content in the data source
    "fieldMappings": [
        {
          "sourceFieldName" : "metadata_title",
          "targetFieldName" : "title"
        },
        {
          "sourceFieldName" : "metadata_storage_name",
          "targetFieldName" : "name"
        },
        {
          "sourceFieldName" : "metadata_storage_name",
          "targetFieldName" : "filepath"
        },
        {
          "sourceFieldName" : "id",
          "targetFieldName" : "chunk_id"
        }
    ],
    "outputFieldMappings":[],
    "parameters":
    {
        "maxFailedItems": -1,
        "maxFailedItemsPerBatch": -1,
        "configuration":
        {
            "dataToExtract": "contentAndMetadata",
            "imageAction": "generateNormalizedImages"
        }
    }
}

r = requests.put(search_endpoint + "/indexers/" + search_indexer_name ,
                 data=json.dumps(indexer_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)

201
True


In [140]:
# Uncomment if you find an error
r.text

'{"error":{"code":"","message":"This indexer refers to a skillset \'elonmusk-wiki-skill\' that doesn\'t exist"}}'

Note: If you get a 400 unauthorize error, make sure that you are using the Azure Search MANAGEMENT KEY, not the QUERY key

In [171]:
# Optionally, get indexer status to confirm that it's running
try:
    r = requests.get(search_endpoint + "/indexers/" + search_indexer_name +
                     "/status", headers=headers, params=params)
    # pprint(json.dumps(r.json(), indent=1))
    print(r.status_code)
    print("Status:",r.json().get('lastResult').get('status'))
    print("Items Processed:",r.json().get('lastResult').get('itemsProcessed'))
    print(r.ok)
    
except Exception as e:
    print("Wait a few seconds until the process starts and run this cell again.")

200
Wait a few seconds until the process starts and run this cell again.


When the indexer finishes running we will have all documents indexed in the AI Search Engine!.

In [161]:
results = search_client.search(search_text="elon musk", top=5)

for result in results:
    print("{})".format(result))

{'filepath': None, 'chunkVector': [-0.009935394, -0.0195259, 0.013424057, -0.008715025, -0.00033162197, 0.0018073397, -0.040802766, 0.014670955, -0.014339333, 0.006884472, 0.014418922, 0.02615834, 0.0045830156, 0.021528898, -0.010664962, -0.009013485, 0.03934363, -0.00090118265, 0.011036378, -0.00736864, -0.010844038, -0.0043077692, -0.022908445, 0.001721118, 0.0017111693, -0.009278783, 0.002100825, -0.03366626, 0.01957896, -0.003415706, 0.011600136, -0.018610625, -0.010452724, -0.008469624, -0.0072359913, 0.010068042, -0.0061814333, -0.011341471, 0.026728729, -0.020467708, 0.014206684, 0.04135989, 0.0032366302, -0.007832911, -0.024168609, 0.0036975848, -0.011494017, -0.0012941547, -0.00019596785, 0.012197056, 0.038919155, 0.024128813, -0.010989952, -0.006387039, -0.017549435, -0.020189146, -0.01517502, 0.020215675, 0.01620968, 0.019353457, 0.01960549, -0.013954652, -0.024221668, 0.009252253, -0.013477116, 0.016594363, -0.037964083, 0.020069761, -0.00026674842, 0.008144636, 0.034117267

Perform vector similarity search

In [164]:
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizableTextQuery

# Pure Vector Search
query = "Who is Elon Musk?"
# if use_ocr:
#     query = "Who is the national director?"
  
search_client = SearchClient(search_endpoint, search_index_name, credential=search_key_credential)
vector_query = VectorizableTextQuery(text=query, k_nearest_neighbors=1, fields="vector", exhaustive=True)
# Use the below query to pass in the raw vector query instead of the query vectorization
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k_nearest_neighbors=3, fields="vector")
  
results = search_client.search(  
    search_text=None,  
    vector_queries= [vector_query],
    select=["ParentKey", "title", "chunk"],
    top=1
)  


    
# for result in results:      
#     # print(f"ParentKey: {result['ParentKey']}")  
#     print(f"title: {result['title']}")  
    # print(f"Score: {result['@search.score']}")  
    # print(f"Content: {result['chunk']}")   

In [165]:
query = "Which is Elon Musk?"

agg_search_results = dict()
k = 10


search_payload = {
    "search": query, # Text query
    "select": "id, title, name, filepath, chunk", # Select fields to return
    "queryType": "semantic",
    "vectorQueries": [{"text": query, "fields": "chunkVector", "kind": "text", "k": k, 
                        "threshold": { 
                                "kind": "vectorSimilarity", 
                                "value": 0.8 
                            }
                        }], # Vector query
    "semanticConfiguration": "my-semantic-config",
    "captions": "extractive",
    "answers": "extractive",
    "count":"true",
    "top": k
}

r = requests.post(search_endpoint + "/indexes/" + search_index_name + "/docs/search",
                    data=json.dumps(search_payload), headers=headers, params=params)
print(r.status_code)

search_results = r.json()
agg_search_results[search_index_name]=search_results
print("Index:", search_index_name, "Results Found: {}, Results Returned: {}".format(search_results['@odata.count'], len(search_results['value'])))

200
Index: elonmusk-vindex-6whqjwtaeogci Results Found: 956, Results Returned: 10


In [166]:
agg_search_results

{'elonmusk-vindex-6whqjwtaeogci': {'@odata.context': "https://6whqjwtaeogci-search.search.windows.net/indexes('elonmusk-vindex-6whqjwtaeogci')/$metadata#docs(*)",
  '@odata.count': 956,
  '@search.answers': [{'key': 'c75748b74652_aHR0cHM6Ly82d2hxand0YWVvZ2Npc2EuYmxvYi5jb3JlLndpbmRvd3MubmV0L2Vsb25tdXNrLXdpa2kvRWxvbl9NdXNrLnBkZg2_chunks_0',
    'text': '... chairman of X (formerly Twitter)  Founder of The Boring Company, X Corp., and xAI  Co-founder of Neuralink, OpenAI, Zip2, and X.com (part of PayPal)  President of the Musk Foundation  Elon Musk Elon Reeve Musk (/ˈiːlɒn mʌsk/; born June 28, 1971 businessman known for his key roles in the space company SpaceX and the automotive company Tesla, Inc',
    'highlights': '',
    'score': 0.9950000047683716}],
  'value': [{'@search.score': 0.028431374579668045,
    '@search.rerankerScore': 3.356907844543457,
    '@search.captions': [{'text': '...lvania (BA, BS)  Occupation Businessman  Title Founder, CEO, and chief engineer  of SpaceX  CEO an

Display the top results  based on the score

In [169]:
from collections import OrderedDict
from IPython.display import display, HTML, Markdown

display(HTML('<h4>Top Answers</h4>'))

for index,search_results in agg_search_results.items():

    for result in search_results['@search.answers']:
        if result['score'] > 0.5: # Show answers that are at least 50% of the max possible score=1
            display(HTML('<h5>' + 'Answer - score: ' + str(round(result['score'],2)) + '</h5>'))
            display(HTML(result['text']))

            
print("\n\n")
display(HTML('<h4>Top Results</h4>'))

content = dict()
ordered_content = OrderedDict()


for index,search_results in agg_search_results.items():
    for result in search_results['value']:
        if result['@search.rerankerScore'] > 1:# Show answers that are at least 25% of the max possible score=4
            content[result['id']]={
                                    "title": result['title'],
                                    "chunk": result['chunk'], 
                                    "name": result['name'], 
                                    "filepath": result['filepath'] ,
                                    "caption": result['@search.captions'][0]['text'],
                                    "score": result['@search.rerankerScore'],
                                    "index": index
                                    }
    
#After results have been filtered we will Sort and add them as an Ordered list\n",
for id in sorted(content, key= lambda x: content[x]["score"], reverse=True):
    ordered_content[id] = content[id]
    url = str(ordered_content[id]['filepath']) +  os.environ['BLOB_SAS_TOKEN']
    
    title = str(ordered_content[id]['title']) if (ordered_content[id]['title']) else ordered_content[id]['name']
    score = str(round(ordered_content[id]['score'],2))
    display(HTML('<h5><a href="'+ url + '">' + title + '</a> - score: '+ score + '</h5>'))
    #use bs4 to parse the html content and display the text
    from bs4 import BeautifulSoup
    htmlContent = ordered_content[id]['caption']
    soup = BeautifulSoup(htmlContent, features="html.parser")
    contextTxt = soup.get_text()
    display(contextTxt)






'...lvania (BA, BS)  Occupation Businessman  Title Founder, CEO, and chief engineer  of SpaceX  CEO and product architect of Tesla, Inc.  Owner, CTO and executive chairman of X (formerly Twitter)  Founder of The Boring Company, X Corp., and xAI  Co-founder of Neuralink, OpenAI, Zip2, and X.com (part of PayPal)  President of the Musk Foundation .'

"Elon Reeve Musk was born on June 28, 1971, in Pretoria, South Africa's administrative capital.[3][4] He is of British and Pennsylvania Dutch ancestry.[5][6] His mother, Maye (née Haldeman), is a model and dietitian born in Saskatchewan, Canada, and raised in South Africa.[7][8][9] His father, Errol Musk, is a South African electromechanical."

"2012)    (m. 2013; div. 2016)   Children 12[1]  Parents Errol Musk (father)  Maye Musk (mother)  Relatives Kimbal Musk (brother)  Tosca Musk (sister)  Lyndon Rive (cousin)  Awards Full list  Elon Musk's voice  Elon Musk speaking about India and his meeting with its prime minister, Narendra Modi  Recorded June 20, 2023  Signature  In 2015, he."

'"Elon Musk: \'I am moderate\' " (https://www.thenews.com.pk/latest/1065654-elon-musk-i-am- moderate). www.thenews.com.pk. Archived (https://web.archive.org/web/20240812153155/h ttps://www.thenews.com.pk/latest/1065654-elon-musk-i-am-moderate) from the original on August 12, 2024. "Elon Musk, SpaceX Founder, Battles Entrenched Rivals Over NASA.'

'and its people.[476]: 207–208  In 2022, Musk wrote an article for China Cyberspace, the official publication of Cyberspace Administration of China, which enforces Internet censorship in China. His writing the article was described as conflicting with his advocacy for free speech.[477][478] Musk later advocated for Taiwan to become a "special.'

'a "scam" designed to influence the election.[655] in November 2024, he was sued again regarding the lottery[656] and two US senators called for a probe into alleged contacts with Vladimir Putin.[657]. Although his ventures have been highly influential within their separate industries starting in the 2000s, Musk only became a public figure in the.'

'remarks on the issue.[391][392] In the same interview, Musk stated that the economy would collapse without oil and gas, repeating previous statements that it was wrong to "vilify" the oil and gas industries.[390][393][394][395]  Musk has long promoted the colonization of Mars and argues that humanity should become a "multiplanetary species".[396]..'

'and the hub of a dark money network supporting Trump. The organization created "Progress 2028", which promoted misinformation about the agenda of Democratic presidential candidate Harris, and was presented as the left\'s response to Project 2025, Musk has also promoted the Fair Election Fund, which is heavily funded by Building America\'s Future,.'

'Doubt Ur Vibe", featuring his own lyrics and vocals.[318]. While Guardian critic Alexi Petridis described it as "indistinguishable ... from umpteen competent but unthri... Musk uses a private jet owned by Falcon Landing LLC, a SpaceX-linked company, and acquired a second jet in August 2020.[320][321] His heavy use of the jets—which flew over.'

'[cs.CE (https://arxiv.org/archive/cs.C E)].  277. "Revealed: Elon Musk Explains the Hyperloop, the Solar- Powered High-Speed Future of Inter-City Transportation" (https://web.archive.org/web/2015 0127202031/http://www.businessweek.com/articles/2013-08-12/revealed-elon-musk-explain s-the-hyperloop). Bloomberg BusinessWeek. Archived from the.'