In [24]:
# Prerequisite (uncomment and run once): %pip install azure-search-documents==11.4.0

In [25]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.search.documents.indexes.models import (
    SearchFieldDataType,
    SearchIndex,
    SimpleField,
    SearchableField,
    SearchField,
    SearchField,  
    VectorSearch,  
    HnswAlgorithmConfiguration, 
    VectorSearchProfile
)
import pandas as pd
import os
import json
from datetime import datetime
import dotenv


# Load environment variables via python-dotenv. The helper functions in this
# notebook read SEARCH_KEY and SEARCH_ENDPOINT from os.environ, so both must be
# defined before those helpers are called.
dotenv.load_dotenv()

True

In [37]:
# Read the five trivia CSV files and stack them into one DataFrame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every file contributes its own 0..k labels, so row labels repeat and
# label-based lookups (e.g. data.loc[0]) return several rows.
filenames = ['trivia_questions_1.csv', 'trivia_questions_2.csv', 'trivia_questions_3.csv', 'trivia_questions_4.csv', 'trivia_questions_5.csv']
data = pd.concat([pd.read_csv(f) for f in filenames], ignore_index=True)
data.head()

Unnamed: 0,Question,Answer,AnswerType,QuestionId,Entity,BackgroundContent,Embeddings
0,Forbes Magazine named which author as the top ...,E L JAMES,WikipediaEntity,sfq_21348,e l james,"E L James, the author of the popular Fifty Sha...","[-0.009561179205775261, -0.016303369775414467,..."
1,Pancetta is a type of what?,Bacon,WikipediaEntity,qf_301,bacon,Pancetta is a type of bacon that is commonly u...,"[0.0036497116088867188, 0.011051403358578682, ..."
2,What is the oldest medical journal in the Unit...,New England Journal of Medicine,WikipediaEntity,jp_4293,new england journal of medicine,The New England Journal of Medicine (NEJM) is ...,"[-0.013680185191333294, 0.017804736271500587, ..."
3,The songs 'If I Loved You' and 'When The Child...,'CAROUSEL',WikipediaEntity,odql_13140,carousel,Carousel is a classic stage musical that was f...,"[-0.008880705572664738, -0.015163435600697994,..."
4,What is the surname of the TV sibling characte...,GRIFFIN,WikipediaEntity,sfq_22341,griffin,The surname of the TV sibling characters Chris...,"[0.002033527009189129, -0.0027350776363164186,..."


In [27]:
# Keep only the columns that are uploaded to the search index: the document
# key, the searchable text, and the embedding.
data = data.loc[:, ['QuestionId', 'BackgroundContent', 'Embeddings']]

### Helper Functions

In [28]:
def create_vector_index(stem_name, user_fields, vector_dimensions=1536):
    """
    Creates a timestamped search index with a vector field and returns its name.

    The index always contains a string key field ``QuestionId`` and a vector
    field ``Embeddings``; ``user_fields`` adds further fields to the schema.

    Args:
    stem_name (str): Prefix of the index name. The current local time, formatted
        as ``%Y%m%d%H%M%S``, is appended after a hyphen.
    user_fields (dict): Maps a field name to one of 'string', 'int', 'datetime',
        'double' or 'bool'.
    vector_dimensions (int): Number of dimensions of the ``Embeddings`` vector
        field. Defaults to 1536.

    Returns:
    str: The name of the index as reported by the service.

    Raises:
    ValueError: If ``user_fields`` contains a type name that is not supported.
    KeyError: If SEARCH_KEY or SEARCH_ENDPOINT is missing from the environment.
    """
    # Validate first so that a typo in a type name fails loudly instead of
    # silently producing an index that lacks the field.
    supported_types = ('string', 'int', 'datetime', 'double', 'bool')
    unsupported = {name: kind for name, kind in user_fields.items() if kind not in supported_types}
    if unsupported:
        raise ValueError(
            f'Unsupported field type(s) {unsupported}; expected one of {supported_types}'
        )

    # Get the search key and endpoint from environment variables
    search_key = os.environ['SEARCH_KEY']
    search_endpoint = os.environ['SEARCH_ENDPOINT']

    # Create the index name by appending the current timestamp to the stem name
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    index_name = f'{stem_name}-{timestamp}'

    # Define the fields for the index, starting with the document key
    fields = [SimpleField(name="QuestionId", type=SearchFieldDataType.String, key=True)]

    # Full-text search applies to string fields only, so every other type is
    # declared as a filterable SimpleField rather than a SearchableField.
    non_string_types = {
        'int': SearchFieldDataType.Int32,
        'datetime': SearchFieldDataType.DateTimeOffset,
        'double': SearchFieldDataType.Double,
        'bool': SearchFieldDataType.Boolean,
    }

    # Add user-defined fields to the index
    for field, field_type in user_fields.items():
        if field_type == 'string':
            fields.append(SearchableField(name=field, type=SearchFieldDataType.String, searchable=True, filterable=True))
        else:
            fields.append(SimpleField(name=field, type=non_string_types[field_type], filterable=True))

    # Add a field for vector embeddings, bound to the "vector-config" profile below
    fields.append(
        SearchField(
            name="Embeddings",
            type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
            searchable=True,
            vector_search_dimensions=vector_dimensions,
            vector_search_profile_name="vector-config",
        )
    )

    # Define vector search configurations: one HNSW algorithm and one profile using it
    vector_search = VectorSearch(
        algorithms=[HnswAlgorithmConfiguration(name="algorithm-config")],
        profiles=[VectorSearchProfile(name="vector-config", algorithm_configuration_name="algorithm-config")],
    )

    # Create the search index with the specified fields and vector search configuration
    index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)

    # Use the client as a context manager so its connection is closed afterwards
    credential = AzureKeyCredential(search_key)
    with SearchIndexClient(endpoint=search_endpoint, credential=credential) as client:
        result = client.create_or_update_index(index)

    return result.name

def insert_documents_vector(documents, index_name):
    """
    Uploads a batch of documents to the specified search index.

    Args:
    documents (list): The list of documents (dicts) to upload.
    index_name (str): The name of the search index.

    Returns:
    list: The value returned by ``SearchClient.upload_documents`` for the batch,
        or an empty list when ``documents`` is empty (no request is sent).

    Raises:
    KeyError: If SEARCH_KEY or SEARCH_ENDPOINT is missing from the environment.
    """
    # Nothing to upload: skip the round trip to the service entirely.
    if not documents:
        return []

    # Get the search key and endpoint from environment variables
    search_key = os.environ['SEARCH_KEY']
    search_endpoint = os.environ['SEARCH_ENDPOINT']

    credential = AzureKeyCredential(search_key)

    # A new client is built on every call, so close it when done; otherwise each
    # batch leaves an open connection behind.
    with SearchClient(endpoint=search_endpoint, index_name=index_name, credential=credential) as client:
        # Upload the documents to the search index
        return client.upload_documents(documents=documents)

def split_list(input_list, size):
    """
    Splits a sequence into consecutive chunks of at most ``size`` items.

    Args:
    input_list (list): The sequence to split.
    size (int): Maximum number of items per chunk.

    Returns:
    list: The chunks in their original order; the last one may be shorter.
    """
    chunks = []
    for start in range(0, len(input_list), size):
        stop = start + size
        chunks.append(input_list[start:stop])
    return chunks

In [29]:
# Extra (non-key, non-vector) fields to add to the index schema
fields = {
    'BackgroundContent': 'string'
}

index_name = create_vector_index('trivia', fields)

# The CSV stores each embedding as a JSON-encoded string; decode it into a
# Python list before uploading.
documents = data.to_dict('records')
for doc in documents:
    doc['Embeddings'] = json.loads(doc['Embeddings'])

# Upload in batches of 50 documents
split_documents = split_list(documents, 50)

# Ids of documents that could not be uploaded even on their own
failed_question_ids = []

for batch in split_documents:
    try:
        insert_documents_vector(batch, index_name)
    except Exception:
        # The whole batch was rejected: retry one document at a time so that a
        # single bad document does not block the rest of the batch.
        for doc in batch:
            try:
                insert_documents_vector([doc], index_name)
            except Exception as doc_error:
                failed_question_ids.append(doc['QuestionId'])
                # Include the exception so the failure can actually be diagnosed
                print(f"Error with Question: {doc['QuestionId']} ({type(doc_error).__name__}: {doc_error})")


Error with Question: qz_2768
Error with Question: sfq_12491
Error with Question: wh_974
Error with Question: qb_3618
Error with Question: odql_1625
Error with Question: qz_6077
Error with Question: sfq_22389
Error with Question: sfq_7383
Error with Question: qw_16567
Error with Question: jp_1381
Error with Question: odql_3551
Error with Question: sfq_23723
Error with Question: qz_1430
Error with Question: odql_8725
Error with Question: qg_2585
Error with Question: sfq_739
Error with Question: qw_14136
Error with Question: qf_1162
Error with Question: jp_2475
Error with Question: qw_16467
Error with Question: sfq_21826
Error with Question: sfq_22996
Error with Question: sfq_21710
Error with Question: qb_275
Error with Question: qz_4240
Error with Question: sfq_13530
Error with Question: sfq_22250
Error with Question: odql_3310
