# Simple RAG

**Tech Stack** 
1. vectordatabase - ChramaDB
2. sentence embedding - all-MiniLM-L6-v2
3. llm - llama3-8b


In [None]:

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
load_dotenv()

llm = ChatGroq(model_name="Llama3-8b-8192")

# llm = ChatOllama(
#     model = "deepseek-r1:1.5b",
#     temperature = 0,
#     num_predict = 256,
#     # other params ...
# )



def embd_load_vectordb(filepath):
    # Initialize the embedding model
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # Load and split the PDF document
    docs = PyPDFLoader(filepath).load_and_split()
    # Create a Chroma vector store with a specified directory for persistence
    vectordb = Chroma.from_documents(docs, embedding, persist_directory="./test_db")
    print("Vector database created and persisted.")
    return vectordb


def load_vectordb():
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    loaded_db = Chroma(persist_directory="./test_db", embedding_function=embedding)
    return loaded_db
    
vectordb = load_vectordb()



def response_generator(vectordb, query, llm):
    template = """Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. {context} Question: {question} Helpful Answer:"""


    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    qa_chain = RetrievalQA.from_chain_type(llm, 
                                           retriever=vectordb.as_retriever(), 
                                           return_source_documents=True, 
                                           chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})

    ans = qa_chain.invoke(query)
    return ans["result"]


query = "what are the side effects of Ondansetron"
ans = response_generator(vectordb, query, llm)
print(ans)

In [4]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
load_dotenv()

True

In [12]:
llm = ChatGroq(model_name="Llama3-8b-8192")

# llm = ChatOllama(
#     model = "deepseek-r1:1.5b",
#     temperature = 0,
#     num_predict = 256,
#     # other params ...
# )


In [13]:
def embd_load_vectordb(filepath, vectordb_path):
    # Initialize the embedding model
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # Load and split the PDF document
    docs = PyPDFLoader(filepath).load_and_split()
    # Create a Chroma vector store with a specified directory for persistence
    vectordb = Chroma.from_documents(docs, embedding, persist_directory=vectordb_path)
    print("Vector database created and persisted.")
    return vectordb


In [14]:
def load_vectordb(vectordb_path):
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    loaded_db = Chroma(persist_directory=vectordb_path, embedding_function=embedding)
    return loaded_db

In [15]:
def response_generator(vectordb, query, llm):

    template = """
    You are an intelligent assistant designed to provide accurate and concise answers based on the context provided. 
    Follow these rules strictly:
    1. Use ONLY the information provided in the context to answer the question.
    2. If the context does not contain enough information to answer the question, say "I don't know."
    3. Do not make up or assume any information outside of the context.
    4. Keep your answer concise and to the point (maximum 3 sentences).

    Context:
    {context}

    Question:
    {question}

    Helpful Answer:
    """


    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    qa_chain = RetrievalQA.from_chain_type(llm, 
                                           retriever=vectordb.as_retriever(search_kwargs={"k": 3}), 
                                           return_source_documents=True, 
                                           chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})

    ans = qa_chain.invoke(query)
    return ans["result"]


In [16]:
vectordb_path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/test_db"

In [17]:
# emed_data = embd_load_vectordb("./knowledge/health_products_data.pdf")
emed_data = embd_load_vectordb("C:/Users/mayur/Desktop/Tech/Project/FracsNet/knowledge/health_products_data.pdf",vectordb_path)

vectordb = load_vectordb(vectordb_path)

Vector database created and persisted.


In [18]:
query = "what is ibuprofen"
answer = response_generator(vectordb, query, llm)
print(answer)

Ibuprofen is a non-steroidal anti-inflammatory drug (NSAID) used to reduce pain, fever, and inflammation.


In [19]:
import pandas as pd
import time

# List of questions
questions = [
    "Which pain reliever is LEAST suitable for someone with stomach ulcers?",
    "Ibuprofen and Aspirin share what warning?",
    "Name two medications that might interact with Warfarin.",
    "What is the first-line medication for Type 2 Diabetes?",
    "Which medication might treat both insomnia and depression?",
    "Difference between bronchodilator and inhaled corticosteroid?",
    "Long-term side effects of Prednisone?",
    "Contraindications for Clopidogrel vs. Warfarin?",
]

# Initialize an empty list to store results
results = []

# Generate answers and record time
for query in questions:
    start_time = time.time()  # Start time tracking
    answer = response_generator(vectordb, query, llm)  # Generate answer
    end_time = time.time()  # End time tracking
    time_taken = end_time - start_time  # Calculate time taken

    # Append results to the list
    results.append({
        "Question": query,
        "Simple RAG result": answer,
        "Simple RAG time": time_taken
    })

# Convert the list to a pandas DataFrame
dataset = pd.DataFrame(results)
path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/simple_RAG.csv"
# Save the dataset to a CSV file
dataset.to_csv(path, index=False)


In [20]:
import pandas as pd

# Set the maximum column width to a large value (e.g., 500 characters)
pd.set_option('display.max_colwidth', 1000)  # Or even higher if needed

df = pd.read_csv(path)
df

Unnamed: 0,Question,Simple RAG result,Simple RAG time
0,Which pain reliever is LEAST suitable for someone with stomach ulcers?,"Based on the context, Ibuprofen is the pain reliever that is LEAST suitable for someone with stomach ulcers. The context states that Ibuprofen has side effects such as stomach upset, nausea, and increased risk of heart attack/stroke, and it is contraindicated for individuals with a history of stomach ulcers.",0.922425
1,Ibuprofen and Aspirin share what warning?,"Ibuprofen and Aspirin are not mentioned in the given context. However, since Ibuprofen's interactions include ""May interact with blood thinners, aspirin, other NSAIDs"", we can infer that Ibuprofen may interact with Aspirin.",0.716047
2,Name two medications that might interact with Warfarin.,"Based on the provided context, I can help you with that.\n\nAccording to the information, medications that might interact with Warfarin include:\n\n1. Aspirin (mentioned in medication summary 36): May interact with blood thinners, including Warfarin.\n2. Clopidogrel (mentioned in medication summary 37): May interact with other medications that increase bleeding risk, including Warfarin.\n\nPlease note that this is based on the provided context, and it's essential to consult with a healthcare professional for personalized advice on medication interactions.",19.748091
3,What is the first-line medication for Type 2 Diabetes?,I don't know.,25.330218
4,Which medication might treat both insomnia and depression?,"Based on the provided context, both Zoloft and Prozac are mentioned as treating depression, and they both have insomnia as a common side effect. Additionally, Celexa and Paxil are also mentioned as treating depression and have insomnia as a common side effect. Therefore, the answer is that all four medications (Zoloft, Prozac, Celexa, and Paxil) might treat both insomnia and depression.",19.46325
5,Difference between bronchodilator and inhaled corticosteroid?,"I don't know. The provided context only discusses specific medications (salmeterol, albuterol, ipratropium, tiotropium, and montelukast) and does not mention bronchodilators or inhaled corticosteroids in general.",25.681642
6,Long-term side effects of Prednisone?,"According to the context, the long-term use of Prednisone can have significant side effects.",25.369845
7,Contraindications for Clopidogrel vs. Warfarin?,"I don't know.\n\nThe context only provides information about Enoxaparin, Aspirin, Clopidogrel, and Pravastatin. There is no information about Warfarin.",24.301037


# Advanced RAG (Dense Passage Retrieval (DPR) Technique)

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
import faiss
import numpy as np

# Load DPR Model
query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

documents = PyPDFLoader("./knowledge/health_products_data.pdf").load_and_split()
passages = [doc.page_content for doc in documents]  # Extract text from Document objects

# Encode Passages into Dense Vectors
passage_embeddings = passage_encoder.encode(passages, convert_to_numpy=True)

# Create FAISS Index
dimension = passage_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(passage_embeddings)

# Encode Query
query = "which medicine could be usefull for knee pain"
query_embedding = query_encoder.encode([query], convert_to_numpy=True)

# Perform Similarity Search
k = 2  # Retrieve top-2 passages
distances, indices = index.search(query_embedding, k)

# Print Results
print("Query:", query)
print("\nTop Relevant Passages:")
for i in range(k):
    print(f"{i+1}. {passages[indices[0][i]]} (Distance: {distances[0][i]:.4f})")


In [None]:
from sentence_transformers import SentenceTransformer
import faiss

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
load_dotenv()
import numpy as np

llm = ChatGroq(model_name="Llama3-8b-8192")

# llm = ChatOllama(
#     model = "deepseek-r1:1.5b",
#     temperature = 0,
#     num_predict = 256,
#     # other params ...
# )

# Load DPR Model
query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

documents = PyPDFLoader("./knowledge/health_products_data.pdf").load_and_split()
passages = [doc.page_content for doc in documents]  # Extract text from Document objects


# Encode Passages into Dense Vectors
passage_embeddings = passage_encoder.encode(passages, convert_to_numpy=True)

# Create FAISS Index
dimension = passage_embeddings.shape[1]
vectordb = faiss.IndexFlatL2(dimension)
vectordb.add(passage_embeddings)

def response_generator(passages, query, llm):

    query_embedding = query_encoder.encode([query], convert_to_numpy=True)
    # Perform Similarity Search
    k = 2  # Retrieve top-2 passages
    indices = vectordb.search(query_embedding, k)
    context = [passages[i] for i in indices[0]] 

    template = f"""
    You are an intelligent assistant designed to provide accurate and concise answers based on the context provided. 
    Follow these rules strictly:
    1. Use ONLY the information provided in the context to answer the question.
    2. If the context does not contain enough information to answer the question, say "I don't know."
    3. Do not make up or assume any information outside of the context.
    4. Keep your answer concise and to the point (maximum 3 sentences).

    Context:
    {context}

    Question:
    {query}

    Helpful Answer:
    """
    res = llm.invoke(template)
    return res.content


query = "which medicine is used to treat depression"
ans= response_generator(passages, query, llm)
print(ans)

In [21]:
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
import faiss
import numpy as np

from langchain_groq import ChatGroq


In [22]:
llm = ChatGroq(model_name="Llama3-8b-8192")

In [23]:
# Load DPR Model
query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')


In [24]:
data_path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/knowledge/health_products_data.pdf"
documents = PyPDFLoader(data_path).load_and_split()
passages = [doc.page_content for doc in documents]  # Extract text from Document objects

# Encode Passages into Dense Vectors
passage_embeddings = passage_encoder.encode(passages, convert_to_numpy=True)


In [25]:
# # Create FAISS Index
dimension = passage_embeddings.shape[1]
vectordb = faiss.IndexFlatL2(dimension)
vectordb.add(passage_embeddings)

In [26]:
def response_generator(passages, query, llm):
    query_embedding = query_encoder.encode([query], convert_to_numpy=True)

    # Perform Similarity Search
    k = 3  # Retrieve top-2 passages
    distances, indices = vectordb.search(query_embedding, k)

    # Extract relevant passages
    context = [passages[i] for i in indices[0].tolist()]  # Convert NumPy array to list

    # Construct prompt
    template = f"""
    You are an intelligent assistant designed to provide accurate and concise answers based on the context provided. 
    Follow these rules strictly:
    1. Use ONLY the information provided in the context to answer the question.
    2. If the context does not contain enough information to answer the question, say "I don't know."
    3. Do not make up or assume any information outside of the context.
    4. Keep your answer concise and to the point (maximum 3 sentences).

    Context:
    {context}

    Question:
    {query}

    Helpful Answer:
    """

    # Generate response using LLM
    res = llm.invoke(template)
    return res.content



In [27]:
query = "which medicine is used to treat depression"
ans = response_generator(passages, query, llm)
print(ans)

Wellbutrin (bupropion) is an antidepressant medication used to treat depression, as well as to aid in smoking cessation.


In [30]:
import pandas as pd
import time

# List of questions
questions = [
    "Which pain reliever is LEAST suitable for someone with stomach ulcers?",
    "Ibuprofen and Aspirin share what warning?",
    "Name two medications that might interact with Warfarin.",
    "What is the first-line medication for Type 2 Diabetes?",
    "Which medication might treat both insomnia and depression?",
    "Difference between bronchodilator and inhaled corticosteroid?",
    "Long-term side effects of Prednisone?",
    "Contraindications for Clopidogrel vs. Warfarin?",
]

# Initialize an empty list to store results
results = []

# Generate answers and record time
for query in questions:
    start_time = time.time()  # Start time tracking
    answer = response_generator(passages, query, llm) # Generate answer
    end_time = time.time()  # End time tracking
    time_taken = end_time - start_time  # Calculate time taken

    # Append results to the list
    results.append({
        "Question": query,
        "DPR RAG result": answer,
        "DPR RAG time": time_taken
    })

# Convert the list to a pandas DataFrame
dataset = pd.DataFrame(results)
path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/DPR_RAG.csv"
# Save the dataset to a CSV file
dataset.to_csv(path, index=False)


In [31]:
import pandas as pd

# Set the maximum column width to a large value (e.g., 500 characters)
pd.set_option('display.max_colwidth', 1000)  # Or even higher if needed

df = pd.read_csv(path)
df

Unnamed: 0,Question,DPR RAG result,DPR RAG time
0,Which pain reliever is LEAST suitable for someone with stomach ulcers?,"According to the context, Aspirin is mentioned as a medication used to reduce pain, fever, and inflammation. However, it is also mentioned as a medication that can cause stomach upset, bleeding, ulcers, and ringing in the ears. Therefore, it is likely that Aspirin is NOT suitable for someone with stomach ulcers.\n\nSo, the correct answer is: Aspirin.",3.263138
1,Ibuprofen and Aspirin share what warning?,"According to the context, Ibuprofen and Aspirin do not share any information as Ibuprofen is not mentioned in the context. However, Aspirin shares the warning ""Take with food to reduce stomach irritation. Avoid alcohol.""",24.703709
2,Name two medications that might interact with Warfarin.,"Based on the provided context, two medications that might interact with Warfarin are:\n\n1. Aspirin\n2. Antibiotics",26.579809
3,What is the first-line medication for Type 2 Diabetes?,I don't know. The provided context does not mention Type 2 Diabetes or any medications for its treatment.,24.780622
4,Which medication might treat both insomnia and depression?,"According to the context, Remeron (mirtazapine) is an antidepressant medication that also helps to improve sleep. It is used to treat depression and insomnia. Therefore, Remeron is the medication that might treat both insomnia and depression.",24.313662
5,Difference between bronchodilator and inhaled corticosteroid?,I don't know.,24.486315
6,Long-term side effects of Prednisone?,I don't know.,24.285315
7,Contraindications for Clopidogrel vs. Warfarin?,"Contraindications for Clopidogrel: Active bleeding, severe liver disease.\n\nContraindications for Warfarin: Active bleeding, severe liver disease, pregnancy.\n\nNote: These answers are based solely on the provided context and do not include any external information.",24.613664


# Comparison dataset of both Techniques

In [32]:
import pandas as pd


path1 = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/simple_RAG.csv"
path2 = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/DPR_RAG.csv"

df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)

# Rename specific columns
# df1 = df1.rename(columns={"Answer": "Simple RAG result", "Time Taken (seconds)": "simple RAG time"})
# df2 = df2.rename(columns={"Answer": "DPR RAG result", "Time Taken (seconds)": "DPR RAG time"})

In [33]:
# Merge the DataFrames on the common column
merged_df = pd.merge(df1, df2, on='Question', how='inner')  # You can change 'how' to 'left', 'right', or 'outer'
merged_df

Unnamed: 0,Question,Simple RAG result,Simple RAG time,DPR RAG result,DPR RAG time
0,Which pain reliever is LEAST suitable for someone with stomach ulcers?,"Based on the context, Ibuprofen is the pain reliever that is LEAST suitable for someone with stomach ulcers. The context states that Ibuprofen has side effects such as stomach upset, nausea, and increased risk of heart attack/stroke, and it is contraindicated for individuals with a history of stomach ulcers.",0.922425,"According to the context, Aspirin is mentioned as a medication used to reduce pain, fever, and inflammation. However, it is also mentioned as a medication that can cause stomach upset, bleeding, ulcers, and ringing in the ears. Therefore, it is likely that Aspirin is NOT suitable for someone with stomach ulcers.\n\nSo, the correct answer is: Aspirin.",3.263138
1,Ibuprofen and Aspirin share what warning?,"Ibuprofen and Aspirin are not mentioned in the given context. However, since Ibuprofen's interactions include ""May interact with blood thinners, aspirin, other NSAIDs"", we can infer that Ibuprofen may interact with Aspirin.",0.716047,"According to the context, Ibuprofen and Aspirin do not share any information as Ibuprofen is not mentioned in the context. However, Aspirin shares the warning ""Take with food to reduce stomach irritation. Avoid alcohol.""",24.703709
2,Name two medications that might interact with Warfarin.,"Based on the provided context, I can help you with that.\n\nAccording to the information, medications that might interact with Warfarin include:\n\n1. Aspirin (mentioned in medication summary 36): May interact with blood thinners, including Warfarin.\n2. Clopidogrel (mentioned in medication summary 37): May interact with other medications that increase bleeding risk, including Warfarin.\n\nPlease note that this is based on the provided context, and it's essential to consult with a healthcare professional for personalized advice on medication interactions.",19.748091,"Based on the provided context, two medications that might interact with Warfarin are:\n\n1. Aspirin\n2. Antibiotics",26.579809
3,What is the first-line medication for Type 2 Diabetes?,I don't know.,25.330218,I don't know. The provided context does not mention Type 2 Diabetes or any medications for its treatment.,24.780622
4,Which medication might treat both insomnia and depression?,"Based on the provided context, both Zoloft and Prozac are mentioned as treating depression, and they both have insomnia as a common side effect. Additionally, Celexa and Paxil are also mentioned as treating depression and have insomnia as a common side effect. Therefore, the answer is that all four medications (Zoloft, Prozac, Celexa, and Paxil) might treat both insomnia and depression.",19.46325,"According to the context, Remeron (mirtazapine) is an antidepressant medication that also helps to improve sleep. It is used to treat depression and insomnia. Therefore, Remeron is the medication that might treat both insomnia and depression.",24.313662
5,Difference between bronchodilator and inhaled corticosteroid?,"I don't know. The provided context only discusses specific medications (salmeterol, albuterol, ipratropium, tiotropium, and montelukast) and does not mention bronchodilators or inhaled corticosteroids in general.",25.681642,I don't know.,24.486315
6,Long-term side effects of Prednisone?,"According to the context, the long-term use of Prednisone can have significant side effects.",25.369845,I don't know.,24.285315
7,Contraindications for Clopidogrel vs. Warfarin?,"I don't know.\n\nThe context only provides information about Enoxaparin, Aspirin, Clopidogrel, and Pravastatin. There is no information about Warfarin.",24.301037,"Contraindications for Clopidogrel: Active bleeding, severe liver disease.\n\nContraindications for Warfarin: Active bleeding, severe liver disease, pregnancy.\n\nNote: These answers are based solely on the provided context and do not include any external information.",24.613664


# Modular RAG

# Graph RAG