# Simple RAG

**Tech Stack** 
1. vectordatabase - ChramaDB
2. sentence embedding - all-MiniLM-L6-v2
3. llm - llama3-8b


In [None]:

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
load_dotenv()

llm = ChatGroq(model_name="Llama3-8b-8192")

# llm = ChatOllama(
#     model = "deepseek-r1:1.5b",
#     temperature = 0,
#     num_predict = 256,
#     # other params ...
# )



def embd_load_vectordb(filepath):
    # Initialize the embedding model
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # Load and split the PDF document
    docs = PyPDFLoader(filepath).load_and_split()
    # Create a Chroma vector store with a specified directory for persistence
    vectordb = Chroma.from_documents(docs, embedding, persist_directory="./test_db")
    print("Vector database created and persisted.")
    return vectordb


def load_vectordb():
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    loaded_db = Chroma(persist_directory="./test_db", embedding_function=embedding)
    return loaded_db
    
vectordb = load_vectordb()



def response_generator(vectordb, query, llm):
    template = """Use the following pieces of context to answer the question at the end. 
    If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. {context} Question: {question} Helpful Answer:"""


    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    qa_chain = RetrievalQA.from_chain_type(llm, 
                                           retriever=vectordb.as_retriever(), 
                                           return_source_documents=True, 
                                           chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})

    ans = qa_chain.invoke(query)
    return ans["result"]


query = "what are the side effects of Ondansetron"
ans = response_generator(vectordb, query, llm)
print(ans)

In [36]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
load_dotenv()

True

In [37]:
llm = ChatGroq(model_name="Llama3-8b-8192")

# llm = ChatOllama(
#     model = "deepseek-r1:1.5b",
#     temperature = 0,
#     num_predict = 256,
#     # other params ...
# )


In [38]:
def embd_load_vectordb(filepath, vectordb_path):
    # Initialize the embedding model
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        # Load and split the PDF document
    docs = PyPDFLoader(filepath).load_and_split()
    # Create a Chroma vector store with a specified directory for persistence
    vectordb = Chroma.from_documents(docs, embedding, persist_directory=vectordb_path)
    print("Vector database created and persisted.")
    return vectordb


In [39]:
def load_vectordb(vectordb_path):
    embedding = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

    loaded_db = Chroma(persist_directory=vectordb_path, embedding_function=embedding)
    return loaded_db

In [40]:
def response_generator(vectordb, query, llm):

    template = """
    You are an intelligent assistant designed to provide accurate and concise answers based on the context provided. 
    Follow these rules strictly:
    1. Use ONLY the information provided in the context to answer the question.
    2. If the context does not contain enough information to answer the question, say "I don't know."
    3. Do not make up or assume any information outside of the context.
    4. Keep your answer concise and to the point (maximum 3 sentences).

    Context:
    {context}

    Question:
    {question}

    Helpful Answer:
    """


    QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"], template=template)

    qa_chain = RetrievalQA.from_chain_type(llm, 
                                           retriever=vectordb.as_retriever(search_kwargs={"k": 3}), 
                                           return_source_documents=True, 
                                           chain_type_kwargs={"prompt":QA_CHAIN_PROMPT})

    ans = qa_chain.invoke(query)
    return ans["result"]


In [41]:
vectordb_path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/test_db"

In [42]:
# emed_data = embd_load_vectordb("./knowledge/health_products_data.pdf")
emed_data = embd_load_vectordb("C:/Users/mayur/Desktop/Tech/Project/FracsNet/knowledge/health_products_data.pdf",vectordb_path)

vectordb = load_vectordb(vectordb_path)

Vector database created and persisted.


In [43]:
query = "what is ibuprofen"
answer = response_generator(vectordb, query, llm)
print(answer)

Ibuprofen is a non-steroidal anti-inflammatory drug (NSAID) used to reduce pain, fever, and inflammation.


In [44]:
import pandas as pd
import time

# List of questions
questions = [
    "Which pain reliever is LEAST suitable for someone with stomach ulcers?",
    "Ibuprofen and Aspirin share what warning?",
    "Name two medications that might interact with Warfarin.",
    "What is the first-line medication for Type 2 Diabetes?",
    "Which medication might treat both insomnia and depression?",
    "Difference between bronchodilator and inhaled corticosteroid?",
    "Long-term side effects of Prednisone?",
    "Contraindications for Clopidogrel vs. Warfarin?",
]

# Initialize an empty list to store results
results = []

# Generate answers and record time
for query in questions:
    start_time = time.time()  # Start time tracking
    answer = response_generator(vectordb, query, llm)  # Generate answer
    end_time = time.time()  # End time tracking
    time_taken = end_time - start_time  # Calculate time taken

    # Append results to the list
    results.append({
        "Question": query,
        "Answer": answer,
        "Time Taken (seconds)": time_taken
    })

# Convert the list to a pandas DataFrame
dataset = pd.DataFrame(results)
path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/simple_RAG.csv"
# Save the dataset to a CSV file
dataset.to_csv(path, index=False)


In [45]:
import pandas as pd

# Set the maximum column width to a large value (e.g., 500 characters)
pd.set_option('display.max_colwidth', 1000)  # Or even higher if needed

df = pd.read_csv(path)
df

Unnamed: 0,Question,Answer,Time Taken (seconds)
0,Which pain reliever is LEAST suitable for someone with stomach ulcers?,"According to the context, Ibuprofen is not suitable for someone with stomach ulcers, as it can cause stomach upset, nausea, and dizziness.",0.821572
1,Ibuprofen and Aspirin share what warning?,Ibuprofen and Aspirin both warn against taking alcohol.,22.337727
2,Name two medications that might interact with Warfarin.,"Based on the provided context, I can help you with that!\n\nTwo medications that might interact with Warfarin are:\n\n1. Aspirin (mentioned in medication summary 36)\n2. Clopidogrel (mentioned in medication summary 37)\n\nThese medications may interact with Warfarin, as they are both blood thinners and can increase the risk of bleeding when taken together.",21.394796
3,What is the first-line medication for Type 2 Diabetes?,I don't know. The provided context does not mention the first-line medication for Type 2 Diabetes.,25.257297
4,Which medication might treat both insomnia and depression?,"According to the provided context, medications that might treat both insomnia and depression are:\n\n* Zoloft (sertraline): Common side effects include insomnia and depression.\n* Prozac (fluoxetine): Common side effects include insomnia and depression.\n* Paxil (paroxetine): Common side effects include insomnia and depression.\n* Celexa (citalopram): Common side effects include insomnia and depression.\n\nPlease note that this is not a substitute for professional medical advice. If you are experiencing insomnia and depression, it is essential to consult with a healthcare professional for proper diagnosis and treatment.",19.942706
5,Difference between bronchodilator and inhaled corticosteroid?,I don't know. The provided context does not contain information on the difference between bronchodilators and inhaled corticosteroids.,26.205215
6,Long-term side effects of Prednisone?,"According to the provided context, Long-term use of Prednisone can have significant side effects.",25.946357
7,Contraindications for Clopidogrel vs. Warfarin?,I don't know.\n\nThe information provided only contains details about Clopidogrel and Warfarin is not mentioned.,24.450222


# Advanced RAG (Dense Passage Retrieval (DPR) Technique)

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
import faiss
import numpy as np

# Load DPR Model
query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

documents = PyPDFLoader("./knowledge/health_products_data.pdf").load_and_split()
passages = [doc.page_content for doc in documents]  # Extract text from Document objects

# Encode Passages into Dense Vectors
passage_embeddings = passage_encoder.encode(passages, convert_to_numpy=True)

# Create FAISS Index
dimension = passage_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(passage_embeddings)

# Encode Query
query = "which medicine could be usefull for knee pain"
query_embedding = query_encoder.encode([query], convert_to_numpy=True)

# Perform Similarity Search
k = 2  # Retrieve top-2 passages
distances, indices = index.search(query_embedding, k)

# Print Results
print("Query:", query)
print("\nTop Relevant Passages:")
for i in range(k):
    print(f"{i+1}. {passages[indices[0][i]]} (Distance: {distances[0][i]:.4f})")


In [None]:
from sentence_transformers import SentenceTransformer
import faiss

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
load_dotenv()
import numpy as np

llm = ChatGroq(model_name="Llama3-8b-8192")

# llm = ChatOllama(
#     model = "deepseek-r1:1.5b",
#     temperature = 0,
#     num_predict = 256,
#     # other params ...
# )

# Load DPR Model
query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

documents = PyPDFLoader("./knowledge/health_products_data.pdf").load_and_split()
passages = [doc.page_content for doc in documents]  # Extract text from Document objects


# Encode Passages into Dense Vectors
passage_embeddings = passage_encoder.encode(passages, convert_to_numpy=True)

# Create FAISS Index
dimension = passage_embeddings.shape[1]
vectordb = faiss.IndexFlatL2(dimension)
vectordb.add(passage_embeddings)

def response_generator(passages, query, llm):

    query_embedding = query_encoder.encode([query], convert_to_numpy=True)
    # Perform Similarity Search
    k = 2  # Retrieve top-2 passages
    indices = vectordb.search(query_embedding, k)
    context = [passages[i] for i in indices[0]] 

    template = f"""
    You are an intelligent assistant designed to provide accurate and concise answers based on the context provided. 
    Follow these rules strictly:
    1. Use ONLY the information provided in the context to answer the question.
    2. If the context does not contain enough information to answer the question, say "I don't know."
    3. Do not make up or assume any information outside of the context.
    4. Keep your answer concise and to the point (maximum 3 sentences).

    Context:
    {context}

    Question:
    {query}

    Helpful Answer:
    """
    res = llm.invoke(template)
    return res.content


query = "which medicine is used to treat depression"
ans= response_generator(passages, query, llm)
print(ans)

In [46]:
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
import faiss
import numpy as np

from langchain_groq import ChatGroq


In [47]:
llm = ChatGroq(model_name="Llama3-8b-8192")

In [48]:
# Load DPR Model
query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')


In [49]:
data_path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/knowledge/health_products_data.pdf"
documents = PyPDFLoader(data_path).load_and_split()
passages = [doc.page_content for doc in documents]  # Extract text from Document objects

# Encode Passages into Dense Vectors
passage_embeddings = passage_encoder.encode(passages, convert_to_numpy=True)


In [50]:
# # Create FAISS Index
dimension = passage_embeddings.shape[1]
vectordb = faiss.IndexFlatL2(dimension)
vectordb.add(passage_embeddings)

In [51]:
def response_generator(passages, query, llm):
    query_embedding = query_encoder.encode([query], convert_to_numpy=True)

    # Perform Similarity Search
    k = 3  # Retrieve top-2 passages
    distances, indices = vectordb.search(query_embedding, k)

    # Extract relevant passages
    context = [passages[i] for i in indices[0].tolist()]  # Convert NumPy array to list

    # Construct prompt
    template = f"""
    You are an intelligent assistant designed to provide accurate and concise answers based on the context provided. 
    Follow these rules strictly:
    1. Use ONLY the information provided in the context to answer the question.
    2. If the context does not contain enough information to answer the question, say "I don't know."
    3. Do not make up or assume any information outside of the context.
    4. Keep your answer concise and to the point (maximum 3 sentences).

    Context:
    {context}

    Question:
    {query}

    Helpful Answer:
    """

    # Generate response using LLM
    res = llm.invoke(template)
    return res.content



In [52]:
query = "which medicine is used to treat depression"
ans = response_generator(passages, query, llm)
print(ans)

According to the provided context, the medicines used to treat depression are:

* Wellbutrin (bupropion)
* Remeron (mirtazapine)
* Trazodone


In [53]:
import pandas as pd
import time

# List of questions
questions = [
    "Which pain reliever is LEAST suitable for someone with stomach ulcers?",
    "Ibuprofen and Aspirin share what warning?",
    "Name two medications that might interact with Warfarin.",
    "What is the first-line medication for Type 2 Diabetes?",
    "Which medication might treat both insomnia and depression?",
    "Difference between bronchodilator and inhaled corticosteroid?",
    "Long-term side effects of Prednisone?",
    "Contraindications for Clopidogrel vs. Warfarin?",
]

# Initialize an empty list to store results
results = []

# Generate answers and record time
for query in questions:
    start_time = time.time()  # Start time tracking
    answer = response_generator(passages, query, llm) # Generate answer
    end_time = time.time()  # End time tracking
    time_taken = end_time - start_time  # Calculate time taken

    # Append results to the list
    results.append({
        "Question": query,
        "Answer": answer,
        "Time Taken (seconds)": time_taken
    })

# Convert the list to a pandas DataFrame
dataset = pd.DataFrame(results)
path = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/DPR_RAG.csv"
# Save the dataset to a CSV file
dataset.to_csv(path, index=False)


In [54]:
import pandas as pd

# Set the maximum column width to a large value (e.g., 500 characters)
pd.set_option('display.max_colwidth', 1000)  # Or even higher if needed

df = pd.read_csv(path)
df

Unnamed: 0,Question,Answer,Time Taken (seconds)
0,Which pain reliever is LEAST suitable for someone with stomach ulcers?,"Based on the provided context, Aspirin is the pain reliever that is LEAST suitable for someone with stomach ulcers. The context states that Aspirin is contraindicated for individuals with a history of stomach ulcers.",15.99491
1,Ibuprofen and Aspirin share what warning?,"According to the context, Aspirin and Ibuprofen (not mentioned in the context, but assumed to share the warning) share the warning ""Avoid alcohol"".",24.793888
2,Name two medications that might interact with Warfarin.,"I can help with that!\n\nAccording to the provided context, two medications that might interact with Warfarin are:\n\n1. Aspirin\n2. Antibiotics\n\nThese medications can interact with Warfarin, which is an anticoagulant medication used to prevent blood clots.",27.045915
3,What is the first-line medication for Type 2 Diabetes?,I don't know. The provided context does not mention Type 2 Diabetes or any medications for it.,24.949702
4,Which medication might treat both insomnia and depression?,"Based on the provided context, the medication that might treat both insomnia and depression is Remeron (mirtazapine).",23.656723
5,Difference between bronchodilator and inhaled corticosteroid?,"I don't know. The provided context does not contain information about bronchodilators, inhaled corticosteroids, or their differences.",25.189033
6,Long-term side effects of Prednisone?,I don't know.,23.997596
7,Contraindications for Clopidogrel vs. Warfarin?,"Contraindications for Clopidogrel: Active bleeding, severe liver disease.\nContraindications for Warfarin: Active bleeding, severe liver disease, pregnancy.",24.694759


# Comparison dataset of both Techniques

In [61]:
import pandas as pd


path1 = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/simple_RAG.csv"
path2 = "C:/Users/mayur/Desktop/Tech/Project/FracsNet/RAG_tech_comparisons/DPR_RAG.csv"

df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)

# Rename specific columns
df1 = df1.rename(columns={"Answer": "Simple RAG result", "Time Taken (seconds)": "simple RAG time"})
df2 = df2.rename(columns={"Answer": "DPR RAG result", "Time Taken (seconds)": "DPR RAG time"})

In [62]:
# Merge the DataFrames on the common column
merged_df = pd.merge(df1, df2, on='Question', how='inner')  # You can change 'how' to 'left', 'right', or 'outer'
merged_df

Unnamed: 0,Question,Simple RAG result,simple RAG time,DPR RAG result,DPR RAG time
0,Which pain reliever is LEAST suitable for someone with stomach ulcers?,"According to the context, Ibuprofen is not suitable for someone with stomach ulcers, as it can cause stomach upset, nausea, and dizziness.",0.821572,"Based on the provided context, Aspirin is the pain reliever that is LEAST suitable for someone with stomach ulcers. The context states that Aspirin is contraindicated for individuals with a history of stomach ulcers.",15.99491
1,Ibuprofen and Aspirin share what warning?,Ibuprofen and Aspirin both warn against taking alcohol.,22.337727,"According to the context, Aspirin and Ibuprofen (not mentioned in the context, but assumed to share the warning) share the warning ""Avoid alcohol"".",24.793888
2,Name two medications that might interact with Warfarin.,"Based on the provided context, I can help you with that!\n\nTwo medications that might interact with Warfarin are:\n\n1. Aspirin (mentioned in medication summary 36)\n2. Clopidogrel (mentioned in medication summary 37)\n\nThese medications may interact with Warfarin, as they are both blood thinners and can increase the risk of bleeding when taken together.",21.394796,"I can help with that!\n\nAccording to the provided context, two medications that might interact with Warfarin are:\n\n1. Aspirin\n2. Antibiotics\n\nThese medications can interact with Warfarin, which is an anticoagulant medication used to prevent blood clots.",27.045915
3,What is the first-line medication for Type 2 Diabetes?,I don't know. The provided context does not mention the first-line medication for Type 2 Diabetes.,25.257297,I don't know. The provided context does not mention Type 2 Diabetes or any medications for it.,24.949702
4,Which medication might treat both insomnia and depression?,"According to the provided context, medications that might treat both insomnia and depression are:\n\n* Zoloft (sertraline): Common side effects include insomnia and depression.\n* Prozac (fluoxetine): Common side effects include insomnia and depression.\n* Paxil (paroxetine): Common side effects include insomnia and depression.\n* Celexa (citalopram): Common side effects include insomnia and depression.\n\nPlease note that this is not a substitute for professional medical advice. If you are experiencing insomnia and depression, it is essential to consult with a healthcare professional for proper diagnosis and treatment.",19.942706,"Based on the provided context, the medication that might treat both insomnia and depression is Remeron (mirtazapine).",23.656723
5,Difference between bronchodilator and inhaled corticosteroid?,I don't know. The provided context does not contain information on the difference between bronchodilators and inhaled corticosteroids.,26.205215,"I don't know. The provided context does not contain information about bronchodilators, inhaled corticosteroids, or their differences.",25.189033
6,Long-term side effects of Prednisone?,"According to the provided context, Long-term use of Prednisone can have significant side effects.",25.946357,I don't know.,23.997596
7,Contraindications for Clopidogrel vs. Warfarin?,I don't know.\n\nThe information provided only contains details about Clopidogrel and Warfarin is not mentioned.,24.450222,"Contraindications for Clopidogrel: Active bleeding, severe liver disease.\nContraindications for Warfarin: Active bleeding, severe liver disease, pregnancy.",24.694759


# Modular RAG

# Graph RAG