# CosmosDB - MongoDB + Azure Cognitive Search
This sample shows how to create and use a search index on Azure Cognitive Search when your data is in CosmosDB - MongoDB.

### Requirements
1. Install python packages in requirements.txt
2. Retrieve your CosmosDB connection string and update the value in your .env file. Your connection string can be found on [portal.azure.com](https://portal.azure.com)
   1. Look for your CosmosDB - MongoDB account and navigate to the "Connection strings" section. Copy either the primary or the secondary connection string into your .env file


In [None]:
import pandas as pd
import pymongo
import openai
import requests
import json
from dotenv import dotenv_values

### Load environment variables and keys

In [None]:
# Settings for this notebook are read from a .env file into a dict-like `config`.
env_name = "example.env" # name of your .env file
config = dotenv_values(env_name)

# Cosmos DB (MongoDB API) connection string and Azure OpenAI settings.
# Each key below is looked up directly, so it has to be present in the .env file.
cosmos_connection_string = config['cosmos_connection_string']
openai_api_key = config['openai_api_key']
openai_api_base = config['openai_api_base']
openai_api_version = config['openai_api_version']
openai_deployment_embedding = config['openai_deployment_embedding']

# Azure Cognitive Search service name (used as the host prefix in request URLs) and API key
cogsearch_name = config['cogsearch_name']
cogsearch_api_key = config['cogsearch_api_key']

# Your database and collection names
db_name = "reviews"
collection_name = "food_reviews"
# Your cognitive search index name (must only contain lowercase, numbers, and dashes)
# NOTE(review): the value below contains underscores although the rule above
# only mentions dashes — confirm against the Azure Cognitive Search naming rules.
cogsearch_index_name = "ind_food_review_mongodb"

# We are using text-embedding-ada-002
# This length is passed as the vector field's "dimensions" when the search index is created.
embedding_length = 1536

### Establish a connection to the database

In [None]:
# Open a MongoDB client using the Cosmos DB connection string loaded from the .env file.
client = pymongo.MongoClient(cosmos_connection_string)

### Create the database and collection

In [None]:
# Make sure the target database exists before it is used.
db = client[db_name]
existing_databases = client.list_database_names()
if db_name in existing_databases:
    print("Using database: '{}'.\n".format(db_name))
else:
    # Custom action that creates the database with 400 RU of throughput,
    # which can be shared across the database's collections.
    create_db_command = {"customAction": "CreateDatabase", "offerThroughput": 400}
    db.command(create_db_command)
    print("Created db '{}' with shared throughput.\n".format(db_name))

In [None]:
# Make sure the target collection exists before it is used.
collection = db[collection_name]
existing_collections = db.list_collection_names()
if collection_name in existing_collections:
    print("Using collection: '{}'.\n".format(collection_name))
else:
    # Custom action that creates an unsharded collection which uses the
    # database's shared throughput.
    create_collection_command = {
        "customAction": "CreateCollection",
        "collection": collection_name,
    }
    db.command(create_collection_command)
    print("Created collection '{}'.\n".format(collection_name))

In [None]:
# Handle to the reviews collection, looked up by database and collection name.
collection = client[db_name][collection_name]

### Create an index on Id and insert our dataframe to the collection

In [None]:
# Load the review data set used throughout the rest of the notebook.
df = pd.read_csv('../../DataSet/Reviews_small.csv')

# Only seed the collection when it is empty, so that re-running this cell
# does not insert the same documents twice.
if collection.count_documents({}) != 0:
    print(f"Collection '{collection_name}' already contains documents.\n")
else:
    collection.create_index('Id')
    batch_size = 1000
    # Insert the dataframe in slices of `batch_size` rows.
    for starting_index in range(0, len(df), batch_size):
        print(f"Inserting documents with index {starting_index} to {starting_index + batch_size} into collection '{collection_name}'.\n")
        batch = df.iloc[starting_index:starting_index + batch_size]
        collection.insert_many(batch.to_dict('records'))

### Create content and generate embeddings

In [None]:
# Build the text to embed: product id, score and review text joined into
# one string per row.
score_as_text = df['Score'].astype(str)
df['combined'] = (
    'productid: ' + df['ProductId']
    + ' ' + 'score: ' + score_as_text
    + ' ' + 'text: ' + df['Text']
)
df['combined'].head()

In [None]:
# Configure the openai client for Azure using the key, endpoint and API
# version loaded from the .env file.
openai.api_type = "azure"
openai.api_key = openai_api_key
openai.api_base = openai_api_base
openai.api_version = openai_api_version

def createEmbeddings(text):
    """Return the embedding of `text` from the configured embedding deployment.

    Sends `text` to `openai.Embedding.create` using the deployment named by
    `openai_deployment_embedding` and returns the `embedding` entry of the
    first item in the response's `data` list.
    """
    return openai.Embedding.create(
        engine=openai_deployment_embedding,
        input=text,
    )['data'][0]['embedding']

# Start with an empty column so that each cell can hold a whole vector.
df['embeddings'] = None
# Embed the combined text of every row, one request per row.
for row_label, combined_text in zip(df.index, df['combined']):
    df.at[row_label, 'embeddings'] = createEmbeddings(combined_text)

df.head()

### Store the embeddings in Azure Cognitive Search Vector Store

[Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) provides a simple interface to create a vector database, store and retrieve data using vector search. You can read more about Vector search [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).

There are two steps to store data in the Azure Cognitive Search vector database:
- First, we create the index (or schema) of the vector database
- Second, we add the chunked documents and their embeddings to the vector datastore

### Create search index

In [None]:
# Create the Cognitive Search index with two fields: the key `id` and the
# vector field `contentVector`. Note the data type of each field below.

url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}?api-version=2023-07-01-Preview"

# Document key, stored as a string.
id_field = {
    "name": "id",
    "type": "Edm.String",
    "key": True,
    "filterable": True,
}

# Embedding vector; its dimension count must match the embedding model.
vector_field = {
    "name": "contentVector",
    "type": "Collection(Edm.Single)",
    "searchable": True,
    "retrievable": True,
    "dimensions": embedding_length,
    "vectorSearchConfiguration": "vectorConfig",
}

# Algorithm configuration referenced by name from the vector field above.
vector_search_settings = {
    "algorithmConfigurations": [
        {"name": "vectorConfig", "kind": "hnsw"},
    ],
}

semantic_settings = {
    "configurations": [
        {
            "name": "my-semantic-config",
            "prioritizedFields": {
                "prioritizedContentFields": [
                    {"fieldName": "id"},
                ],
            },
        },
    ],
}

payload = json.dumps({
    "name": cogsearch_index_name,
    "fields": [id_field, vector_field],
    "vectorSearch": vector_search_settings,
    "semantic": semantic_settings,
})
headers = {
    'Content-Type': 'application/json',
    'api-key': cogsearch_api_key,
}

response = requests.request("PUT", url, headers=headers, data=payload)
# Show the outcome of the request so that failures are visible in the notebook.
print(response.status_code)
print(response.text)

### Insert embeddings in search index by batch

In [None]:
def batch_append_payload(df):
    """Build the JSON payload for a batch insertion of embeddings to Cognitive Search.

    Every row becomes one "upload" action whose `contentVector` is the row's
    `embeddings` value. The search document id is the row's `Id` value when
    the dataframe has an `Id` column, so that ids returned by a search can be
    looked up against the `Id` field of the stored documents. Without an
    `Id` column the dataframe index is used instead.

    Note: max 1000 rows per insertion.

    Args:
        df: dataframe with an 'embeddings' column and, optionally, an 'Id'
            column.

    Returns:
        A JSON string of the form {"value": [<upload action>, ...]}.
    """
    # Prefer the data set's own identifier over the positional row index.
    doc_ids = df['Id'] if 'Id' in df.columns else df.index
    value_list = [
        {
            "id": str(doc_id),
            "contentVector": embedding,
            "@search.action": "upload",
        }
        for doc_id, embedding in zip(doc_ids, df['embeddings'])
    ]
    print('payload of size {}'.format(len(value_list)))
    # Only preview the first and last entries when there is something to show.
    if value_list:
        print('start: {}'.format(value_list[0]))
        print('end: {}'.format(value_list[-1]))
    return json.dumps({"value": value_list})

def BatchInsertToCogSearch(df, batch_size=1000):
    """Upload the embeddings in `df` to the Cognitive Search index in batches.

    The dataframe is split into slices of at most `batch_size` rows, because
    a single insertion is limited to 1000 rows. Each slice is turned into a
    payload by `batch_append_payload` and posted to the index.

    Args:
        df: dataframe to upload; the column name must be 'embeddings'.
        batch_size: maximum number of rows sent per request (positive int).

    Returns:
        "Success" when every batch is answered with status 200 or 201 (or
        when `df` is empty and nothing has to be sent), "Failure" as soon as
        one batch is answered with any other status.
    """
    if len(df) == 0:
        # Nothing to upload; do not send an empty request.
        return "Success"

    url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview"
    headers = {
        'Content-Type': 'application/json',
        'api-key': cogsearch_api_key,
    }

    for start in range(0, len(df), batch_size):
        payload = batch_append_payload(df.iloc[start:start + batch_size])
        response = requests.request("POST", url, headers=headers, data=payload)
        print(response.json())

        if response.status_code not in (200, 201):
            # Stop at the first rejected batch; later batches are not sent.
            return "Failure"

    return "Success"

In [None]:
# Upload the embeddings of the whole dataframe; the returned status string
# ("Success" or "Failure") is shown as the cell output.
BatchInsertToCogSearch(df)

### User Query

In [None]:
# Free-text question that is embedded and used for the vector search below.
userQuestion = "Great taffy"
retrieve_k = 3 # Retrieve the top 3 documents from vector database

In [None]:
# retrieve k chunks
def retrieve_k_chunk(k, questionEmbedding):
    """Run a vector search on the index and return the parsed JSON response.

    Args:
        k: number of nearest entries to request.
        questionEmbedding: embedding vector of the question, compared
            against the `contentVector` field.

    Returns:
        The response body decoded from JSON. The HTTP status code is
        printed but not checked.
    """
    search_url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview"

    vector_query = {
        "value": questionEmbedding,
        "fields": "contentVector",
        "k": k,
    }
    request_headers = {
        'Content-Type': 'application/json',
        'api-key': cogsearch_api_key,
    }

    response = requests.request(
        "POST",
        search_url,
        headers=request_headers,
        data=json.dumps({"vector": vector_query}),
    )
    print(response.status_code)
    return json.loads(response.text)

# Generate embeddings for the question and retrieve the top k document chunks
# `output` holds the parsed JSON body of the search response.
questionEmbedding = createEmbeddings(userQuestion)
output = retrieve_k_chunk(retrieve_k, questionEmbedding)

In [None]:
# Search document ids were uploaded as strings; convert them to int so they
# can be used in the 'Id' lookup against the collection.
matching_ids = [int(value['id']) for value in output['value']]
matching_ids

### Retrieve text from database

In [None]:
# Fetch the stored review documents whose 'Id' is one of the matched ids.
documents = list(collection.find({'Id': {'$in': matching_ids}}))

In [None]:
# Put the retrieved documents into a dataframe for inspection and prompt building.
df_retrieved = pd.DataFrame(documents)
df_retrieved.head()

## OPTIONAL: Offer Response to User's Question
To offer a response, one can either follow a simple prompting method as shown below or use the approaches offered by other libraries, such as [langchain](https://python.langchain.com/en/latest/index.html).

In [None]:
# Prompt template: the {context} and {query} placeholders are filled in with
# str.format once the context has been built from the retrieved rows.
template = """
    context :{context}
    Answer the question based on the context above. Provide the product id associated with the answer as well. If the
    information to answer the question is not present in the given context then reply "I don't know".
    Query: {query}
    Answer: """

In [None]:
# Create context for the prompt by combining the productid, score, and text of retrieved rows
# (same layout as the text that was embedded), one retrieved row per line.
df_retrieved['combined'] = 'productid: ' + df_retrieved['ProductId'] + ' ' + 'score: ' + df_retrieved['Score'].astype(str) + ' ' + 'text: ' + df_retrieved['Text']
context = '\n'.join(df_retrieved['combined'])

print(context)

In [None]:
# Fill the template with the retrieved context and the user's question.
prompt = template.format(context=context, query=userQuestion)
print(prompt)

In [None]:
# Send the prompt to the completion deployment named in the .env file
# (key "openai_deployment_completion").
response = openai.Completion.create(
    engine= config["openai_deployment_completion"],
    prompt=prompt,
    max_tokens=1024,
    n=1,
    stop=None,
    temperature=1,
)

# Print the text of the first returned choice.
print(response['choices'][0]['text'])

In [None]:
# Close the MongoDB client now that the notebook is done with the database.
client.close()