# CosmosDB for NoSQL RAG
<img src = "./cosmosdbrag.png">


### Installing important packages and libraries

In [None]:
%pip install azure-cosmos
%pip install openai

In [None]:
# Cosmos DB SDK: client entry point, partition-key definition and the SDK's exceptions module.
from azure.cosmos import CosmosClient, PartitionKey, exceptions
import os
import json
from dotenv import load_dotenv

# Load settings from a local .env file into the process environment so that the
# os.getenv(...) lookups in the following cells can read them.
load_dotenv()

### Creating a connection to CosmosDB via connection string
and creating a database if it doesn't exist

In [None]:
# Both settings come from the environment (populated by load_dotenv above).
database_name = os.getenv("DATABASE_NAME")
cosmosdb_connection_string = os.getenv("COSMOSDB_CONNECTION_STRING")

# Build the account-level client from the connection string.
client = CosmosClient.from_connection_string(cosmosdb_connection_string)

# Reuse the database when it already exists, otherwise create it.
database = client.create_database_if_not_exists(database_name)


### Defining the vector embedding policy 

In [None]:
# Partition key path used when the container is created.
pk = "/category"

# One embedding is declared: it lives at /vector, holds 1536 float32 values
# and is compared with the cosine distance function.
vector_embedding_policy = {
    "vectorEmbeddings": [
        dict(
            path="/vector",
            dataType="float32",
            distanceFunction="cosine",
            dimensions=1536,
        ),
    ],
}

### Creating a vector index with the DiskANN algorithm


In [None]:
# Indexing policy for the container.
# - includedPaths / excludedPaths: keep the regular index on every property
#   except the system _etag and the vector itself. The Cosmos DB vector-search
#   documentation recommends excluding the vector path from the regular index;
#   otherwise vector inserts incur a higher RU charge and latency.
# - vectorIndexes: a DiskANN vector index on the /vector path.
indexing_policy = {
    "includedPaths": [
        {"path": "/*"},
    ],
    "excludedPaths": [
        {"path": "/\"_etag\"/?"},
        {"path": "/vector/*"},
    ],
    "vectorIndexes": [
        {
            "path": "/vector",
            "type": "diskANN",
        },
    ],
}

### Creating a container inside the database

In [None]:
# Name of the container to create or reuse, read from the environment.
container_name = os.getenv("CONTAINER_NAME")

try:
    # Create the container with the partition key, indexing policy and vector
    # embedding policy defined above, or return the existing one.
    container = database.create_container_if_not_exists(
        id=container_name,
        partition_key=PartitionKey(path=pk),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy,
    )
except exceptions.CosmosHttpResponseError as e:
    # Report the service error, then re-raise: every later cell needs
    # `container`, so swallowing the failure here would only surface later as a
    # confusing NameError.
    print(f"Failed to create or open container {container_name!r}: {e}")
    raise

### Creating Azure OpenAI Client

In [None]:
from openai import AzureOpenAI

# API key and endpoint of the Azure OpenAI resource, read from the environment.
azure_openai_key = os.getenv("AZURE_OPENAI_KEY")
azure_openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")

# Collect the constructor arguments first, then build the client from them.
openai_client_settings = {
    "azure_endpoint": azure_openai_endpoint,
    "api_key": azure_openai_key,
    "api_version": "2024-02-15-preview",
}
azure_openai_client = AzureOpenAI(**openai_client_settings)

### Creating Embedding Generation Function
embedding engine to be used: text-embedding-ada-002 
<br>
vector dimensions: 1536

In [None]:
def generate_embeddings(client, text, model=None):
    """Return the embedding vector for ``text``.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``;
            the notebook passes its ``AzureOpenAI`` client.
        text: The text to embed.
        model: Embedding model/deployment name. When omitted, the value of the
            ``EMBEDDING_ENGINE`` environment variable is used.

    Returns:
        The embedding of the first entry in the response
        (``response.data[0].embedding``).

    Raises:
        ValueError: If ``model`` is not given and ``EMBEDDING_ENGINE`` is unset
            or empty.
    """
    embedding_model = model or os.getenv("EMBEDDING_ENGINE")
    if not embedding_model:
        # Fail early with a clear message instead of sending model=None to the API.
        raise ValueError(
            "No embedding model configured: pass `model` or set the "
            "EMBEDDING_ENGINE environment variable."
        )

    response = client.embeddings.create(
        input=text,
        model=embedding_model,
    )

    # Read the vector straight off the response object; there is no need to
    # dump the whole response to a dict just to pick one field.
    return response.data[0].embedding
    

### Loading food dataset
The food dataset is stored in `"./food_items.json"`.
<br>
We will generate a vector embedding for the `/description` field of each food object and store it in a new field, `/vector`.

In [None]:
import json
import uuid

file_path = "./food_items.json"

# Pass the encoding explicitly so the file is decoded the same way on every
# platform instead of relying on the locale's default encoding.
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)

# Enrich each food item in place with an embedding of its description and an
# id, then write it to the Cosmos DB container.
for obj in data:
    obj['vector'] = generate_embeddings(azure_openai_client, obj['description'])
    # NOTE(review): the id is a new random UUID on every run, so re-running this
    # cell presumably adds a second copy of each item rather than updating the
    # existing one — confirm before re-executing against a populated container.
    obj['id'] = str(uuid.uuid4())
    container.upsert_item(obj)

# Keep a local copy of the enriched dataset (items plus their vectors and ids).
with open("./new_dataset.json", 'w', encoding="utf-8") as f:
    json.dump(data, f)
    


### Generating vector embeddings for the user query

In [None]:
# The question to answer; it is embedded with the same function used for the
# food descriptions.
user_query = "are pizzas available? i am lactose intolerant"
user_embeddings = generate_embeddings(
    client=azure_openai_client,
    text=user_query,
)
print(user_embeddings)


### Sending a query to the database for the top matches, ranked by VectorDistance

In [None]:
# The query embedding is passed as the @embedding parameter rather than being
# formatted into the SQL text: the statement stays small and constant, and no
# value is ever spliced into the query string.
queryText = """SELECT TOP 5 c.category, c.name, c.description, c.price,
       VectorDistance(c.vector, @embedding) AS SimilarityScore
FROM c
ORDER BY VectorDistance(c.vector, @embedding)"""

results = container.query_items(
    query=queryText,
    parameters=[
        {"name": "@embedding", "value": user_embeddings},
    ],
    enable_cross_partition_query=True,
)

# Collect the matches; `dishes` is the context handed to the chat model later.
dishes = []
for item in results:
    print(item)
    dishes.append(item)
    


### Sending a call to our GPT engine for summarisation

In [None]:
# System prompt: tells the model to answer only from the supplied context and
# describes the fields of each food item. It has no placeholders, so a plain
# (non-f) string yields the same text.
system_message = """You are meant to behave as a RAG chatbot that derives its context from a database of food items stored in azure cosmosDB for noSQL API.
please asnwer strictly from the context from the database provided and if you dont have an answer please politely say so. dont include any extra 
information that is not in the context and dont include links as well.
the context passed to you will be in the form of a pythonic list with each object in the list containing details of a food item and
having structure as follows:

 "category": "the category of the food item like smoothies, burgers, etc",
 "name": "the name of the food item",
 "description": "the description of the food item",
"price": "the price of the food item in USD",


the pythonic list contains best 5 matches to the user query based on cosine similarity of the embeddings of the user query and the food item descriptions.
please structure your answers in a very professional manner and in such a way that the user does not get to know that its RAG working under the hood
and its as if they are talking to a human."""

# User turn: the original question plus the items returned by the vector query.
user_message = f""" the user query is: {user_query}
the context is : {dishes}"""

# Assemble the conversation and look up the chat deployment name before the call.
conversation = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": user_message},
]
gpt_deployment = os.getenv("GPT_ENGINE")

chat_completions_response = azure_openai_client.chat.completions.create(
    model=gpt_deployment,
    messages=conversation,
    temperature=0.7,
)

# Show the text of the first returned choice.
first_choice = chat_completions_response.choices[0]
print(first_choice.message.content)
    