# Retrieval Augmented Generation (RAG) and Vector Databases

In [11]:
%pip install --upgrade openai

Note: you may need to restart the kernel to use updated packages.


In [12]:
import os
import pandas as pd
import numpy as np
import openai

## Creating our Knowledge base

Creating a Azure Cosmos DB database


In [13]:
pip install azure-cosmos

Note: you may need to restart the kernel to use updated packages.


In [14]:
## create your cosmoss db on Azure CLI using the following commands
## az login
## az group create -n <resource-group-name> -l <location>
## az cosmosdb create -n <cosmos-db-name> -r <resource-group-name>
## az cosmosdb list-keys -n <cosmos-db-name> -g <resource-group-name>

## Once done navigate to data explorer and create a new database and a new container


In [15]:
from azure.cosmos import CosmosClient
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# Initialize Cosmos Client
url = os.environ['COSMOS_DB_ENDPOINT']
key = os.environ['COSMOS_DB_KEY']
client = CosmosClient(url, credential=key)

# Select database
database_name = 'rag-cosmos-db'
database = client.get_database_client(database_name)

# Select container
container_name = 'data'
container = database.get_container_client(container_name)



In [16]:
import pandas as pd

# List to store data before creating DataFrame
data = []

# Your file paths
data_paths = [
    "data/frameworks.md",# ?WT.mc_id=academic-105485-koreyst
    "data/own_framework.md",#?WT.mc_id=academic-105485-koreyst
    "data/perceptron.md"#?WT.mc_id=academic-105485-koreyst
]

# Read each file and collect content
for path in data_paths:
    with open(path, 'r', encoding='utf-8') as file:
        file_content = file.read()
        data.append({'path': path, 'text': file_content})

# Create DataFrame from the list
df = pd.DataFrame(data)

# Display first few rows
print(df.head())


                    path                                               text
0     data/frameworks.md  # Neural Network Frameworks\n\nAs we have lear...
1  data/own_framework.md  # Introduction to Neural Networks. Multi-Layer...
2     data/perceptron.md  # Introduction to Neural Networks: Perceptron\...


In [17]:
def split_text(text, max_length, min_length):
    words = text.split()
    chunks = []
    current_chunk = []

    for word in words:
        current_chunk.append(word)
        if len(' '.join(current_chunk)) < max_length and len(' '.join(current_chunk)) > min_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []

    # If the last chunk didn't reach the minimum length, add it anyway
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

# Assuming analyzed_df is a pandas DataFrame and 'output_content' is a column in that DataFrame
splitted_df = df.copy()
splitted_df['chunks'] = splitted_df['text'].apply(lambda x: split_text(x, 400, 300))

splitted_df

Unnamed: 0,path,text,chunks
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,[# Neural Network Frameworks As we have learne...
1,data/own_framework.md,# Introduction to Neural Networks. Multi-Layer...,[# Introduction to Neural Networks. Multi-Laye...
2,data/perceptron.md,# Introduction to Neural Networks: Perceptron\...,[# Introduction to Neural Networks: Perceptron...


In [18]:
# Assuming 'chunks' is a column of lists in the DataFrame splitted_df, we will split the chunks into different rows
flattened_df = splitted_df.explode('chunks')

flattened_df.head()

Unnamed: 0,path,text,chunks
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,# Neural Network Frameworks As we have learned...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,descent optimization While the `numpy` library...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,should give us the opportunity to compute grad...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,those computations on GPUs is very important. ...
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,"API, there is also higher-level API, called Ke..."


## Converting our text to embeddings

Converting out text  to embeddings, and storing them in our database in chunks

In [19]:
import os
from openai import AzureOpenAI
import numpy as np
from dotenv import load_dotenv
load_dotenv()

client = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version = os.getenv("AZURE_OPENAI_API_VERSION"),
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
)

In [23]:
def create_embeddings(text, model="text-embedding-ada-002"):
    # Create embeddings for each document chunk
    embeddings = client.embeddings.create(input = text, model=model).data[0].embedding
    return embeddings

#embeddings for the first chunk
create_embeddings(flattened_df['chunks'][0])

[-0.01710554026067257,
 0.002864499343559146,
 0.025373218581080437,
 -0.03888116404414177,
 0.006913489196449518,
 0.003886080114170909,
 -0.006200758274644613,
 -0.003273471025750041,
 -0.0029238935094326735,
 -0.02932378463447094,
 0.03480841591954231,
 0.02043161727488041,
 0.0014475224306806922,
 0.0029680149164050817,
 -0.01467546820640564,
 -0.010982843115925789,
 0.022182898595929146,
 0.009109378792345524,
 -0.02935093641281128,
 -0.020594527944922447,
 -0.035487208515405655,
 -0.0037978373002260923,
 0.013039580546319485,
 -0.034428294748067856,
 -0.030491305515170097,
 -0.001521341037005186,
 0.015435714274644852,
 -0.0435512475669384,
 -0.0076092504896223545,
 -0.014268193393945694,
 0.019752826541662216,
 0.012564427219331264,
 -0.012625518254935741,
 -0.01565292663872242,
 -0.0047888727858662605,
 0.011138965375721455,
 0.0013134611072018743,
 0.008247314020991325,
 -0.0002793650492094457,
 -0.0018072818638756871,
 0.040374506264925,
 0.011315451003611088,
 -0.00984247401

In [24]:
cat = create_embeddings("cat")
cat

[-0.0070539116859436035,
 -0.01734057068824768,
 -0.009698242880403996,
 -0.03073945827782154,
 -0.012484360486268997,
 0.0030714645981788635,
 -0.005111427512019873,
 -0.041118279099464417,
 -0.014561542309820652,
 -0.021268075332045555,
 0.019240519031882286,
 0.05075980722904205,
 -0.0012246867408975959,
 0.00255216914229095,
 -0.03845268115401268,
 -0.006057857070118189,
 0.035475149750709534,
 -0.004622261971235275,
 0.002374935196712613,
 -0.013455602340400219,
 -0.01894276589155197,
 0.00905311107635498,
 0.015894342213869095,
 -0.00870573241263628,
 -0.014731687493622303,
 0.0071425288915634155,
 0.013150759972631931,
 -0.013228743337094784,
 0.0028676455840468407,
 0.0048987469635903835,
 0.004033844918012619,
 -0.016801780089735985,
 -0.015752553939819336,
 -0.04304658621549606,
 -0.027123885229229927,
 -0.004278427921235561,
 0.008074779063463211,
 -0.009939280338585377,
 0.022076262161135674,
 -0.009124004282057285,
 0.004920014645904303,
 0.00036133575486019254,
 -0.012073

In [25]:
# create embeddings for the whole data chunks and store them in a list

embeddings = []
for chunk in flattened_df['chunks']:
    embeddings.append(create_embeddings(chunk))

# store the embeddings in the dataframe
flattened_df['embeddings'] = embeddings

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,# Neural Network Frameworks As we have learned...,"[-0.01710554026067257, 0.002864499343559146, 0..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,descent optimization While the `numpy` library...,"[-0.01482970081269741, 0.0016899447655305266, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,should give us the opportunity to compute grad...,"[-0.03680434077978134, -0.02070910856127739, 0..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,those computations on GPUs is very important. ...,"[-0.03173335641622543, -0.011053191497921944, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,"API, there is also higher-level API, called Ke...","[-0.008027797564864159, -0.0333440825343132, 0..."


# Retrieval

Vector search and similiarity between our prompt and the database

### Creating an search index and reranking

In [26]:
from sklearn.neighbors import NearestNeighbors

embeddings = flattened_df['embeddings'].to_list()

# Create the search index
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(embeddings)

# To query the index, you can use the kneighbors method
distances, indices = nbrs.kneighbors(embeddings)

# Store the indices and distances in the DataFrame
flattened_df['indices'] = indices.tolist()
flattened_df['distances'] = distances.tolist()

flattened_df.head()

Unnamed: 0,path,text,chunks,embeddings,indices,distances
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,# Neural Network Frameworks As we have learned...,"[-0.01710554026067257, 0.002864499343559146, 0...","[0, 2, 11, 3, 1]","[0.0, 0.5231289234277616, 0.5281664756122898, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,descent optimization While the `numpy` library...,"[-0.01482970081269741, 0.0016899447655305266, ...","[1, 0, 32, 2, 50]","[0.0, 0.5700021940324681, 0.5924127753624866, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,should give us the opportunity to compute grad...,"[-0.03680434077978134, -0.02070910856127739, 0...","[2, 3, 0, 5, 1]","[0.0, 0.5056656786084963, 0.5231289234277616, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,those computations on GPUs is very important. ...,"[-0.03173335641622543, -0.011053191497921944, ...","[3, 2, 0, 10, 11]","[0.0, 0.5056656786084963, 0.5459749903916924, ..."
0,data/frameworks.md,# Neural Network Frameworks\n\nAs we have lear...,"API, there is also higher-level API, called Ke...","[-0.008027797564864159, -0.0333440825343132, 0...","[4, 12, 10, 8, 9]","[0.0, 0.5191959112618413, 0.5522311510705009, ..."


In [27]:
# Your text question
question = "what is a perceptron?"

# Convert the question to a query vector
query_vector = create_embeddings(question)  # You need to define this function

# Find the most similar documents
distances, indices = nbrs.kneighbors([query_vector])

index = []
# Print the most similar documents
for i in range(3):
    index = indices[0][i]
    for index in indices[0]:
        print(flattened_df['chunks'].iloc[index])
        print(flattened_df['path'].iloc[index])
        print(flattened_df['distances'].iloc[index])
    else:
        print(f"Index {index} not found in DataFrame")

in our model, in which case the input vector would be a vector of size N. A perceptron is a **binary classification** model, i.e. it can distinguish between two classes of input data. We will assume that for each input vector x the output of our perceptron would be either +1 or -1, depending on the class.
data/perceptron.md
[0.0, 0.5277758037255733, 0.5363335862390093, 0.5444526711829064, 0.5541456450579713]
# Introduction to Neural Networks: Perceptron One of the first attempts to implement something similar to a modern neural network was done by Frank Rosenblatt from Cornell Aeronautical Laboratory in 1957. It was a hardware implementation called "Mark-1", designed to recognize primitive geometric figures,
data/perceptron.md
[0.0, 0.45856142626262997, 0.5236047423369777, 0.5627386585479264, 0.5634420757319366]
user to adjust the resistance of a circuit. > The New York Times wrote about perceptron at that time: *the embryo of an electronic computer that [the Navy] expects will be able

## Putting it all together to answer a question

In [28]:
import os
import openai

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")

In [30]:
user_input = "what is a perceptron?"

def chatbot(user_input):
    # Convert the question to a query vector
    query_vector = create_embeddings(user_input)

    # Find the most similar documents
    distances, indices = nbrs.kneighbors([query_vector])

    # add documents to query  to provide context
    history = []
    for index in indices[0]:
        history.append(flattened_df['chunks'].iloc[index])

    # combine the history and the user input
    history.append(user_input)

    # create a message object
    messages=[
        {"role": "system", "content": "You are an AI assiatant that helps with AI questions."},
        {"role": "user", "content": history[-1]}
    ]

    # use chat completion to generate a response
    response = client.chat.completions.create(
        model=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
        temperature=0.7,
        max_tokens=800,
        messages=messages
    )

    return response.choices[0].message

chatbot(user_input)

ChatCompletionMessage(content='A **perceptron** is a fundamental building block of artificial neural networks, especially in the context of machine learning and deep learning.\n\n### Definition\nA perceptron is a simple mathematical model that simulates a single neuron. It takes several input values, applies weights to them, sums them up, and passes the result through an activation function (usually a step or sigmoid function) to produce an output.\n\n### How it works\n\n1. **Inputs**: \\( x_1, x_2, ..., x_n \\)\n2. **Weights**: \\( w_1, w_2, ..., w_n \\)\n3. **Bias**: \\( b \\)\n4. **Summation**: \\( z = w_1x_1 + w_2x_2 + ... + w_nx_n + b \\)\n5. **Activation**: Output \\( y = f(z) \\), where \\( f \\) is typically a step function:  \n   - If \\( z > 0 \\), output 1  \n   - Else, output 0\n\n### Perceptron as a Linear Classifier\nA perceptron is a **linear binary classifier**: it can decide whether an input, represented by a vector of numbers, belongs to one class or another.\n\n### L

## Testing and evaluation

A basic example of how you can use Mean Average Precision (MAP) to evaluate the responses of your model based on their relevance.

In [31]:
from sklearn.metrics import average_precision_score

# Define your test cases
test_cases = [
    {
        "query": "What is a perceptron?",
        "relevant_responses": ["A perceptron is a type of artificial neuron.", "It's a binary classifier used in machine learning."],
        "irrelevant_responses": ["A perceptron is a type of fruit.", "It's a type of car."]
    },
    {
        "query": "What is machine learning?",
        "relevant_responses": ["Machine learning is a method of data analysis that automates analytical model building.", "It's a branch of artificial intelligence based on the idea that systems can learn from data, identify patterns and make decisions with minimal human intervention."],
        "irrelevant_responses": ["Machine learning is a type of fruit.", "It's a type of car."]
    },
    {
        "query": "What is deep learning?",
        "relevant_responses": ["Deep learning is a subset of machine learning in artificial intelligence (AI) that has networks capable of learning unsupervised from data that is unstructured or unlabeled.", "It's a type of machine learning."],
        "irrelevant_responses": ["Deep learning is a type of fruit.", "It's a type of car."]
    },
    {
        "query": "What is a neural network?",
        "relevant_responses": ["A neural network is a series of algorithms that endeavors to recognize underlying relationships in a set of data through a process that mimics the way the human brain operates.", "It's a type of machine learning."],
        "irrelevant_responses": ["A neural network is a type of fruit.", "It's a type of car."]
    }
]

# Initialize the total average precision
total_average_precision = 0

# Test the RAG application
for test_case in test_cases:
    query = test_case["query"]
    relevant_responses = test_case["relevant_responses"]
    irrelevant_responses = test_case["irrelevant_responses"]

    # Generate a response using your RAG application
    response = chatbot(query) 

    # Create a list of all responses and a list of true binary labels
    all_responses = relevant_responses + irrelevant_responses
    true_labels = [1] * len(relevant_responses) + [0] * len(irrelevant_responses)

    # Create a list of predicted scores based on whether the response is the generated response
    predicted_scores = [1 if resp == response else 0 for resp in all_responses]

    # Calculate the average precision for this query
    average_precision = average_precision_score(true_labels, predicted_scores)

    # Add the average precision to the total average precision
    total_average_precision += average_precision

# Calculate the mean average precision
mean_average_precision = total_average_precision / len(test_cases)

In [32]:
mean_average_precision

np.float64(0.5)