# Store chunks into Vector Database using Azure Cognitive Search (ACS)

In [None]:
import os
import re
import pandas as pd
import json  
import openai  
from dotenv import dotenv_values
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryCaptionResult,
    QueryAnswerResult,
    SemanticErrorMode,
    SemanticErrorReason,
    SemanticSearchResultsType,
    QueryType,
    VectorizedQuery,
    VectorQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    ExhaustiveKnnAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticPrioritizedFields,
    SemanticField,  
    SearchField,  
    SemanticSearch,
    VectorSearch,  
    HnswAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    SemanticField,  
    SearchField,  
    VectorSearch,  
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
)  
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ast import literal_eval
import requests

In [None]:
# Name of the search index that the cells below create (or update), upload
# documents into, and query.
index_name = "msft-transcripts-1"

# Load environment variables and keys 

In [None]:
# Location of the .env file; change this to your own .env file name.
env_name = "../../.env"
config = dotenv_values(env_name)

# KEYS_FROM selects where the settings come from: Azure Key Vault when it is
# "KEYVAULT", otherwise the .env file itself.
if config['KEYS_FROM'] == "KEYVAULT":
    print('keyvault was selected.')
    keyVaultName = config["KEY_VAULT_NAME"]
    KVUri = f"https://{keyVaultName}.vault.azure.net"

    credential = DefaultAzureCredential()
    client = SecretClient(vault_url=KVUri, credential=credential)

    def _read_setting(secret_name, env_key):
        """Return the value of the Key Vault secret ``secret_name``."""
        return client.get_secret(secret_name).value
else:
    def _read_setting(secret_name, env_key):
        """Return the ``env_key`` entry of the parsed .env file."""
        return config[env_key]

# Azure OpenAI
openai.api_type = _read_setting("OPENAI-API-TYPE", "OPENAI_API_TYPE")
openai.api_key = _read_setting("OPENAI-API-KEY", "OPENAI_API_KEY")
openai.api_base = _read_setting("OPENAI-API-BASE", "OPENAI_API_BASE")
openai.api_version = _read_setting("OPENAI-API-VERSION", "OPENAI_API_VERSION")
deployment_embedding = _read_setting("OPENAI-DEPLOYMENT-EMBEDDING", "OPENAI_DEPLOYMENT_EMBEDDING")

# Cognitive Search
cogsearch_key = _read_setting("COGSEARCH-API-KEY", "COGSEARCH_API_KEY")
service_endpoint = _read_setting("COGSEARCH-ADDRESS", "COGSEARCH_ADDRESS")


In [None]:
def createEmbeddings(text, endpoint, api_key, api_version, embedding_model_deployment, timeout=None):
    """Request embeddings for ``text`` from an Azure OpenAI embeddings deployment.

    Args:
        text: Value sent as the ``input`` field of the request payload.
        endpoint: Base URL of the Azure OpenAI resource. A trailing ``/`` is
            stripped so the request URL never contains ``//``.
        api_key: Key sent in the ``api-key`` request header.
        api_version: Value of the ``api-version`` query parameter.
        embedding_model_deployment: Name of the embeddings deployment.
        timeout: Timeout forwarded to ``requests.post``. The default ``None``
            keeps the previous behaviour of waiting indefinitely.

    Returns:
        A list containing the ``embedding`` value of every item in the
        response's ``data`` array.

    Raises:
        Exception: If the service answers with a status code other than 200.
            The message carries the status code and the response body.
    """
    base_url = endpoint.rstrip("/")
    request_url = (
        f"{base_url}/openai/deployments/{embedding_model_deployment}"
        f"/embeddings?api-version={api_version}"
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    request_payload = {
        'input': text
    }
    embedding_response = requests.post(
        request_url, json=request_payload, headers=headers, timeout=timeout
    )

    if embedding_response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. a gateway error
        # page); fall back to the raw text so the real failure is reported
        # instead of a JSON decoding error.
        try:
            error_detail = embedding_response.json()
        except ValueError:
            error_detail = embedding_response.text
        raise Exception(
            f"failed to get embedding (HTTP {embedding_response.status_code}): {error_detail}"
        )

    data_values = embedding_response.json()["data"]
    return [data_value["embedding"] for data_value in data_values]


#### Store the embeddings in Azure Cognitive Search Vector Store

[AzureCogSearch](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) provides a simple interface to create a vector database and to store and retrieve data using vector search. You can read more about Vector Search [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).

There are two steps to store data in AzureCogSearch vector database:
- First, we create the index (or schema) of the vector database
- Second, we add the chunked documents and their embeddings to the vector datastore

In [None]:
# Load the chunk/embedding table from CSV; this DataFrame is what gets uploaded
# to the search index further down.
df_chunks_embedding = pd.read_csv('AnalyzedPDF/ChunksEmbedding.csv')

In [None]:
df_chunks_embedding.head(3)
# Preview the first three rows to check the loaded columns.
# Original note: columns should look like the following with order preserved
# Id, Chunk, PageNumber, LineNumber, DocId, Embedding
# NOTE(review): batch_append_payload further down reads the columns Ticker,
# Year, Quarter, Chunk, PageNumber, LineNumber and Embedding, and derives "Id"
# from the DataFrame index -- the column list above looks stale; confirm
# against the CSV.

In [None]:

# Build the index definition (fields, vector search, semantic search) and
# create or update it on the search service.
index_client = SearchIndexClient(
    endpoint=service_endpoint,
    credential=AzureKeyCredential(cogsearch_key),
)

string_type = SearchFieldDataType.String

# Field order: Id, Ticker, Year, Quarter, Chunk, PageNumber, LineNumber, Embedding.
fields = [
    SimpleField(name="Id", type=string_type, key=True, sortable=True, filterable=True, facetable=True),
    *(
        SearchableField(name=column, type=string_type, filterable=True)
        for column in ("Ticker", "Year", "Quarter")
    ),
    SearchableField(name="Chunk", type=string_type, searchable=True),
    *(
        SearchableField(name=column, type=string_type, filterable=True)
        for column in ("PageNumber", "LineNumber")
    ),
    # Vector field; it is tied to the "my-vector-config" profile declared below.
    SearchField(
        name="Embedding",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="my-vector-config",
    ),
]

# Semantic configuration: "Ticker" acts as the title field, "Chunk" as content.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        title_field=SemanticField(field_name="Ticker"),
        prioritized_content_fields=[SemanticField(field_name="Chunk")],
    ),
)
semantic_search = SemanticSearch(configurations=[semantic_config])

# Vector search: one HNSW algorithm (cosine metric) exposed through one profile.
vector_search = VectorSearch(
    profiles=[
        VectorSearchProfile(
            name="my-vector-config",
            algorithm_configuration_name="myHnsw",
        ),
    ],
    algorithms=[
        HnswAlgorithmConfiguration(
            name="myHnsw",
            kind=VectorSearchAlgorithmKind.HNSW,
            parameters=HnswParameters(
                m=4,
                ef_construction=400,
                ef_search=500,
                metric=VectorSearchAlgorithmMetric.COSINE,
            ),
        ),
    ],
)

# Assemble the index and push it to the service.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_search=semantic_search,
)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')

In [None]:

## Upload data to Index
def batch_append_payload(df, search_client, batch_size=1000):
    """Upload the rows of ``df`` to the search index in batches.

    Every row becomes one document with the keys ``Id``, ``Ticker``, ``Year``,
    ``Quarter``, ``Chunk``, ``PageNumber``, ``LineNumber`` and ``Embedding``.
    ``Id`` is the string form of the row's DataFrame index label, and
    ``Embedding`` is parsed from its string representation with
    ``ast.literal_eval``.

    Args:
        df: DataFrame providing the columns ``Ticker``, ``Year``, ``Quarter``,
            ``Chunk``, ``PageNumber``, ``LineNumber`` and ``Embedding``.
        search_client: Object exposing ``upload_documents(documents)``.
        batch_size: Maximum number of documents per upload call. Defaults to
            1000, the per-insertion limit noted by the original author.

    Returns:
        The documents of the final, partial batch. The list is empty when
        ``df`` is empty or its row count is an exact multiple of
        ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    value_list = []
    for index, row in df.iterrows():
        value_list.append(
            {
                "Id": str(index),
                "Ticker": row["Ticker"],
                "Year": str(row["Year"]),
                "Quarter": str(row["Quarter"]),
                "Chunk": row["Chunk"],
                "PageNumber": str(row["PageNumber"]),
                "LineNumber": str(row["LineNumber"]),
                "Embedding": literal_eval(row['Embedding']),
            }
        )

        # Flush as soon as a full batch has accumulated.
        if len(value_list) >= batch_size:
            search_client.upload_documents(value_list)
            print(f"Uploaded {len(value_list)} payload")
            value_list = []

    # Only send the remainder when there is one: an empty batch is a pointless
    # request and may be rejected by the service.
    if value_list:
        search_client.upload_documents(value_list)
        print(f"Uploaded {len(value_list)} payload")

    return value_list


# Client bound to the index named by index_name; used to upload documents.
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(cogsearch_key)
)
# Upload every row of the DataFrame. Note that the return value is only the
# list sent in the final upload call, not every uploaded document.
payload = batch_append_payload(df_chunks_embedding, search_client)
 
# print(f"Uploaded {len(payload)} payload") 


# Initialize the Search Client for the four search types below

In [None]:
# Client used by all of the query cells that follow.
search_client = SearchClient(
    endpoint=service_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(cogsearch_key),
)

## Search Types 1: Pure Vector Search

In [None]:
# Pure vector search: embed the question and look up its nearest chunks.
query = "Microsoft earnings call for year 2023 for Quarter 2"

# Keep the first vector returned for the query text.
query_embedding = createEmbeddings(
    text=query,
    endpoint=openai.api_base,
    api_key=openai.api_key,
    api_version=openai.api_version,
    embedding_model_deployment=deployment_embedding,
)[0]

vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=3,
    fields="Embedding",
)

# No search text is supplied, so only the vector query drives the search.
results = search_client.search(search_text=None, vector_queries=[vector_query])

# Print the first hit only.
for result in results:
    for field_name in ("Ticker", "Quarter", "Year", "Chunk"):
        print(result[field_name])
    break

# Search Types 2: Pure Filter

In [None]:
# Pure filter: no search text and no vector query, only a filter expression
# over the Ticker, Year and Quarter fields.
filter_expression = "(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') "
results = search_client.search(search_text=None, filter=filter_expression)

# Print the first hit only.
for result in results:
    for label in ("Ticker", "Quarter", "Year"):
        print(f"{label}: {result[label]}")
    print(result['Chunk'])
    print()
    break


# Search Types 3: Vector Search with filters

In [None]:
# Vector search with a filter: nearest-neighbour lookup on the embedding,
# combined with a filter expression over Ticker, Year and Quarter.
query = "What are the KPIs?"

# Keep the first vector returned for the query text.
query_embedding = createEmbeddings(
    text=query,
    endpoint=openai.api_base,
    api_key=openai.api_key,
    api_version=openai.api_version,
    embedding_model_deployment=deployment_embedding,
)[0]

vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=3,
    fields="Embedding",
)

results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    filter="(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') ",
    top=3,
)

# Print the first hit only.
for result in results:
    for label in ("Ticker", "Quarter", "Year"):
        print(f"{label}: {result[label]}")
    print(result['Chunk'])
    print()
    break

# Search Types 4: Hybrid Search with filters

In [None]:
# Hybrid search with a filter: the question is sent both as search text and as
# a vector query, together with a filter expression.
query = "What are the KPIs?"

# Keep the first vector returned for the query text.
query_embedding = createEmbeddings(
    text=query,
    endpoint=openai.api_base,
    api_key=openai.api_key,
    api_version=openai.api_version,
    embedding_model_deployment=deployment_embedding,
)[0]

vector_query = VectorizedQuery(
    vector=query_embedding,
    k_nearest_neighbors=3,
    fields="Embedding",
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    filter="(Ticker eq 'MSFT') and (Year eq '23') and (Quarter eq '1') ",
    top=3,
)

# Print the first hit only.
for result in results:
    for label in ("Ticker", "Quarter", "Year"):
        print(f"{label}: {result[label]}")
    print(result['Chunk'])
    print()
    break