In [1]:
import pandas as pd
import pymongo
import openai
import requests
import json
from dotenv import dotenv_values

### Load environment variables and keys

In [2]:
# specify the name of the .env file name 
env_name = "llm.env"
config = dotenv_values(env_name)

cosmos_connection_string = config['cosmos_connection_string']
openai_api_key = config['openai_api_key']
openai_api_base = config['openai_api_base']
openai_api_version = config['openai_api_version']
openai_deployment_embedding = config['openai_deployment_embedding']

# Azure Cognitive Search
cogsearch_name = config['cogsearch_name']
cogsearch_api_key = config['cogsearch_api_key']

# Your database and collection names
db_name = "reviews"
collection_name = "food_reviews"
# Your cognitive search index name (must only contain lowercase, numbers, and dashes)
cogsearch_index_name = "ind_food_review_mongodb"

# We are using text-embedding-ada-002
embedding_length = 1536

### Establish a connection to the database

In [3]:
client = pymongo.MongoClient(cosmos_connection_string)

### Create the database and collection

In [4]:
# Create database if it doesn't exist
db = client[db_name]
if db_name not in client.list_database_names():
    # Create a database with 400 RU throughput that can be shared across
    # the DB's collections
    db.command({"customAction": "CreateDatabase", "offerThroughput": 400})
    print("Created db '{}' with shared throughput.\n".format(db_name))
else:
    print("Using database: '{}'.\n".format(db_name))

Using database: 'reviews'.



In [5]:
# Create collection if it doesn't exist
collection = db[collection_name]
if collection_name not in db.list_collection_names():
    # Creates a unsharded collection that uses the DBs shared throughput
    db.command(
        {"customAction": "CreateCollection", "collection": collection_name}
    )
    print("Created collection '{}'.\n".format(collection_name))
else:
    print("Using collection: '{}'.\n".format(collection_name))

Using collection: 'food_reviews'.



In [6]:
collection = client[db_name][collection_name]

### Create an index on Id and insert out dataframe to the collection

In [7]:
df = pd.read_csv('Reviews_small.csv')

if collection.count_documents({}) == 0:
    print(f"Inserting documents into collection '{collection_name}'.\n")
    collection.create_index('Id')
    collection.insert_many(df.to_dict('records'))
else:
    print(f"Collection '{collection_name}' already contains documents.\n")

Collection 'food_reviews' already contains documents.



### Create content and generate embeddings

In [8]:
# We will combine productid, score, and text into a single field to run embeddings on
df['combined'] = 'productid: ' + df['ProductId'] + ' ' + 'score: ' + df['Score'].astype(str) + ' ' + 'text: ' + df['Text']
df['combined'].head()

0    productid: B001E4KFG0 score: 5 text: I have bo...
1    productid: B00813GRG4 score: 1 text: Product a...
2    productid: B000LQOCH0 score: 4 text: This is a...
3    productid: B000UA0QIQ score: 2 text: If you ar...
4    productid: B006K2ZZ7K score: 5 text: Great taf...
Name: combined, dtype: object

In [9]:
openai.api_type = "azure"
openai.api_key = openai_api_key
openai.api_base = openai_api_base
openai.api_version = openai_api_version

def createEmbeddings(text):
    response = openai.Embedding.create(input=text , engine=openai_deployment_embedding)
    embeddings = response['data'][0]['embedding']
    return embeddings

df['embedding'] = None
# iterate over the dataframe and create embeddings for each row
for index, row in df.iterrows():
    df.at[index, 'embedding'] = createEmbeddings(row['combined'])
    
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,combined,embedding
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,productid: B001E4KFG0 score: 5 text: I have bo...,"[-0.005925467703491449, -0.0029136091470718384..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,productid: B00813GRG4 score: 1 text: Product a...,"[-0.025345874950289726, -0.015591269358992577,..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,productid: B000LQOCH0 score: 4 text: This is a...,"[0.011405655182898045, 0.007502448279410601, -..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,productid: B000UA0QIQ score: 2 text: If you ar...,"[0.008057798258960247, 0.006768684834241867, -..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,productid: B006K2ZZ7K score: 5 text: Great taf...,"[-0.023589162155985832, -0.009200308471918106,..."


### Store the embeddings in Azure Cognitive Search Vector Store

[Azure Cognitive Search](https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search) provides a simple interface to create a vector database, store and retrieve data using vector search. You can read more about Vector search [here](https://github.com/Azure/cognitive-search-vector-pr/tree/main).

There are two steps to store data in AzureCogSearch vector database:
- First, we create the index (or schema) of the vector database
- Second, we add the chunked documents and their embeddings to the vector datastore

In [10]:
# Create Index for Cog Search with fields as id,  and contentVector
# Note the datatypes for each field below

url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}?api-version=2023-07-01-Preview"
payload = json.dumps({
  "name": cogsearch_index_name,
  "fields": [
    {
      "name": "id",
      "type": "Edm.String",
      "key": True,
      "filterable": True
    },
    {
      "name": "contentVector",
      "type": "Collection(Edm.Single)",
      "searchable": True,
      "retrievable": True,
      "dimensions": embedding_length,
      "vectorSearchConfiguration": "vectorConfig"
    }
  ],
  "vectorSearch": {
    "algorithmConfigurations": [
      {
        "name": "vectorConfig",
        "kind": "hnsw",
      }
    ]
  },
  "semantic": {
    "configurations": [
      {
        "name": "my-semantic-config",
        "prioritizedFields": {
          "prioritizedContentFields": [
            {
              "fieldName": "id"
            }
          ],
        }
      }
    ]
  }
})
headers = {
  'Content-Type': 'application/json',
  'api-key': cogsearch_api_key
}

response = requests.request("PUT", url, headers=headers, data=payload)
print(response.status_code)
print(response.text)

204



In [11]:
def insertToCogSearch(idx, contentVector):
    url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/index?api-version=2023-07-01-Preview"

    payload = json.dumps({
    "value": [
        {
        "id": str(idx),
        "contentVector": contentVector,
        "@search.action": "upload"
        },
    ]
    })
    headers = {
    'Content-Type': 'application/json',
    'api-key': cogsearch_api_key,
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    # print(response.json())

    if response.status_code == 200 or response.status_code == 201:
        return "Success"
    else:
        return "Failure"

for index, row in df.iterrows():
    response = insertToCogSearch(row['Id'], row['embedding'])
    if response == "Failure":
        print(index, response)

### User Query

In [12]:
userQuestion = "Great taffy"
retrieve_k = 3 # Retrieve the top 3 documents from vector database

In [13]:
# retrieve k chnuks
def retrieve_k_chunk(k, questionEmbedding):
    # Retrieve the top K entries
    url = f"https://{cogsearch_name}.search.windows.net/indexes/{cogsearch_index_name}/docs/search?api-version=2023-07-01-Preview"

    payload = json.dumps({
    "vector": {
        "value": questionEmbedding,
        "fields": "contentVector",
        "k": k
    }
    })
    headers = {
    'Content-Type': 'application/json',
    'api-key': cogsearch_api_key,
    }

    response = requests.request("POST", url, headers=headers, data=payload)
    print(response.status_code)
    output = json.loads(response.text)
    return output

# Generate embeddings for the question and retrieve the top k document chunks
questionEmbedding = createEmbeddings(userQuestion)
output = retrieve_k_chunk(retrieve_k, questionEmbedding)

200


In [14]:
matching_ids = [int(value['id']) for value in output['value']]
matching_ids

[5, 8, 7]

### Retrieve text from database

In [15]:
documents = list(collection.find({'Id': {'$in': matching_ids}}))


In [16]:
df_retrieved = pd.DataFrame(documents)
df_retrieved.head()

Unnamed: 0,_id,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,648ccc21781ff3183a2498fe,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
1,648ccc21781ff3183a249900,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
2,648ccc21781ff3183a249901,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...


## OPTIONAL: Offer Response to User's Question
To offer a response, one can either follow a simple prompting method as shown below or leverage ways used by other libraries, such as [langchain](https://python.langchain.com/en/latest/index.html).

In [17]:
# create a prompt template 
template = """
    context :{context}
    Answer the question based on the context above. Provide the product id associated with the answer as well. If the
    information to answer the question is not present in the given context then reply "I don't know".
    Query: {query}
    Answer: """

In [18]:
# Create context for the prompt by combining the productid, score, and text of retrieved rows
df_retrieved['combined'] = 'productid: ' + df_retrieved['ProductId'] + ' ' + 'score: ' + df_retrieved['Score'].astype(str) + ' ' + 'text: ' + df_retrieved['Text']
context = '\n'.join(df_retrieved['combined'])

print(context)

productid: B006K2ZZ7K score: 5 text: Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.
productid: B006K2ZZ7K score: 5 text: This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served it at a beach-themed party and everyone loved it!
productid: B006K2ZZ7K score: 5 text: This taffy is so good.  It is very soft and chewy.  The flavors are amazing.  I would definitely recommend you buying it.  Very satisfying!!


In [19]:
prompt = template.format(context=context, query=userQuestion)
print(prompt)


    context :productid: B006K2ZZ7K score: 5 text: Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.
productid: B006K2ZZ7K score: 5 text: This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served it at a beach-themed party and everyone loved it!
productid: B006K2ZZ7K score: 5 text: This taffy is so good.  It is very soft and chewy.  The flavors are amazing.  I would definitely recommend you buying it.  Very satisfying!!
    Answer the question based on the context above. Provide the product id associated with the answer as well. If the
    information to answer the question is not present in the given context then reply "I don't know".
    Query: Great taffy
    Answer: 


In [20]:
response = openai.Completion.create(
    engine= config["openai_deployment_completion"],
    prompt=prompt,
    max_tokens=1024,
    n=1,
    stop=None,
    temperature=1,
)

print(response['choices'][0]['text'])

 Product ID: B006K2ZZ7K


In [21]:
client.close()