# AyurBot: Using a RAG-Based LLM

In [24]:
import os
from dotenv import load_dotenv

# Pull variables from a .env file into the process environment
load_dotenv()

# MongoDB settings and the LLM API token; each one is None when its variable is unset
MONGODB_CONNECTION_STRING = os.environ.get("MONGODB_URI")
MONGODB_DATABASE = os.environ.get("MONGODB_DATABASE")
MONGODB_COLLECTION = os.environ.get("MONGODB_COLLECTION")
API_TOKEN = os.environ.get("API_TOKEN")


## Text extraction from Book

In [None]:
# Extract the raw text from the source PDF and clean it for chunking
import re
import fitz  # PyMuPDF

# Open the source PDF with PyMuPDF; the handle is closed once extraction has finished.
# NOTE(review): this is an absolute, machine-specific path — point it at your local copy of the book.
pdf_document = fitz.open('/Users/mrinoyb2/git/AyurBot/Data/pdf/Ayurveda_Book.pdf')

# Function to preprocess and clean text
def preprocess_text_mupdf(text):
    """Normalise raw page text into a single line of plain ASCII prose.

    The substitutions are applied in order:
      1. collapse blank lines into a single newline,
      2. replace every run of characters other than ASCII letters, digits,
         basic punctuation (.,;:!?()'") and newlines with one space,
      3. collapse every run of whitespace (newlines included) into one space.

    Returns the result with leading and trailing whitespace removed.
    """
    substitutions = (
        (r'\n\s*\n', '\n'),
        (r'[^A-Za-z0-9.,;:!?()\'\"\n]+', ' '),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Extract and clean text, one page at a time
page_texts = []
try:
    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        page_text = preprocess_text_mupdf(page.get_text())
        # Pages with no text left after cleaning contribute nothing
        if page_text:
            page_texts.append(page_text)
finally:
    # Close the PDF document even if extraction fails part-way through
    pdf_document.close()

# Each cleaned page is stripped of surrounding whitespace, so the pages must be
# joined with a space; plain concatenation would fuse the last word of one page
# with the first word of the next.
cleaned_text_mupdf = " ".join(page_texts)

# Output only the first 1000 characters as a preview instead of dumping the whole book
print(cleaned_text_mupdf[:1000])

# Save the cleaned text to a file (the cleaned text is ASCII-only; the encoding is
# stated explicitly so the result does not depend on the platform default)
with open('/Users/mrinoyb2/git/AyurBot/Data/clean_text/Ayurveda_Book.txt', 'w', encoding='utf-8') as file:
    file.write(cleaned_text_mupdf)


## Store chunks in MongoDB database

In [None]:
import pymongo

# Connect to MongoDB using the connection string, database name and collection name
# read from the environment variables (update them to match your MongoDB setup)
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
collection = db[MONGODB_COLLECTION]

# Chunking the text by sentence to boost accurate retrieval
def chunk_by_sentence(text):
    """Split ``text`` into sentence-sized chunks.

    A sentence ends at a run of ``.``, ``!`` or ``?`` characters. The closing
    punctuation is kept with its sentence, surrounding whitespace is removed,
    and pieces that contain nothing but whitespace/punctuation are dropped.
    Any text after the last terminator becomes the final chunk.

    Note: every ``.`` is treated as a sentence end, so decimals and
    abbreviations are still split.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty sentence strings, in document order.
    """
    import re  # local import: the function must not rely on another cell having imported it

    sentences = []
    # Each match is a run of non-terminator characters plus the terminators that close it
    for match in re.finditer(r'[^.!?]+[.!?]*', text):
        sentence = match.group().strip()
        # Keep the piece only if something remains besides punctuation
        if sentence.strip('.!?'):
            sentences.append(sentence)
    return sentences
    

# Chunk the text
chunks = chunk_by_sentence(cleaned_text_mupdf)

# Store chunks in MongoDB.
# Each chunk is upserted under its index as `_id`, so re-running this cell overwrites
# the existing documents instead of failing with a duplicate-key error, and all writes
# are sent in bulk rather than with one round trip per chunk.
# A replaced document loses any previously stored `embedding` field, so the
# embedding step has to be run again after re-storing the chunks.
# NOTE: documents left over from an earlier run with more chunks are not removed.
operations = [
    pymongo.ReplaceOne({"_id": idx}, {"_id": idx, "text": chunk}, upsert=True)
    for idx, chunk in enumerate(chunks)
]
if operations:  # bulk_write rejects an empty list of operations
    collection.bulk_write(operations, ordered=False)

print(f"Total chunks stored in MongoDB: {len(chunks)}")

## Implement RAG

### Create sentence embeddings

In [None]:
from sentence_transformers import SentenceTransformer
import pymongo

# Connect to MongoDB; `chunks_collection` is the collection named by MONGODB_COLLECTION,
# i.e. the one holding the stored text chunks
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
chunks_collection = db[MONGODB_COLLECTION]

# Load the sentence transformer model used to embed each stored chunk
# (queries must be embedded with the same model name for the similarity scores to be meaningful)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to update documents with embeddings
def update_documents_with_embeddings(batch_size=128):
    """Compute and store an embedding for every document in ``chunks_collection``.

    Documents are processed in batches: each batch is encoded with a single
    ``model.encode`` call and written back with a single ``bulk_write``,
    instead of one encode call and one update per document.

    Args:
        batch_size: Number of documents to encode and write per batch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a document has no ``text`` field.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def _embed_and_store(batch):
        # Encode all texts of the batch at once, then set each document's `embedding`
        vectors = model.encode([doc['text'] for doc in batch], convert_to_tensor=False)
        chunks_collection.bulk_write(
            [
                pymongo.UpdateOne({'_id': doc['_id']}, {'$set': {'embedding': vector.tolist()}})
                for doc, vector in zip(batch, vectors)
            ],
            ordered=False,
        )

    batch = []
    # Only `_id` and `text` are needed, so existing embeddings are not fetched
    for document in chunks_collection.find({}, {'text': 1}):
        batch.append(document)
        if len(batch) == batch_size:
            _embed_and_store(batch)
            batch = []
    # Flush the final, partially filled batch
    if batch:
        _embed_and_store(batch)

# Run the embedding update: computes and stores an embedding for every document in the collection
update_documents_with_embeddings()


### Semantic search retrieval

In [25]:
from sentence_transformers import SentenceTransformer
import pymongo
import numpy as np
from scipy.spatial.distance import cosine

# Connect to MongoDB. The client, database and collection handles are created here,
# so this cell only needs the MONGODB_* settings loaded in the first cell.
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
chunks_collection = db[MONGODB_COLLECTION]

# Cache of loaded embedding models, keyed by model name
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name='all-MiniLM-L6-v2'):
    """Return the SentenceTransformer for ``model_name``, loading it only on first use."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


# Function to perform semantic search
def semantic_search(query, top_k=5):
    """Return the stored chunks most similar to ``query``.

    The query is embedded with the 'all-MiniLM-L6-v2' model and compared by
    cosine similarity against the ``embedding`` of every document in
    ``chunks_collection``. Documents without an embedding, and embeddings with
    zero norm (for which cosine similarity is undefined), are skipped.

    Args:
        query: The search text.
        top_k: Maximum number of results to return; values <= 0 yield ``[]``.

    Returns:
        A list of ``(_id, similarity, text)`` tuples sorted by similarity,
        highest first, with at most ``top_k`` entries.
    """
    if top_k <= 0:
        return []

    # Generate query embedding (the model is loaded once and reused across calls)
    query_embedding = np.asarray(
        _get_embedding_model().encode(query, convert_to_tensor=False), dtype=float
    )
    query_norm = np.linalg.norm(query_embedding)

    # Retrieve the embeddings from MongoDB and calculate similarity
    similarities = []
    cursor = chunks_collection.find(
        {'embedding': {'$exists': True}},  # chunks that have not been embedded yet cannot be scored
        {'text': 1, 'embedding': 1},
    )
    for document in cursor:
        doc_embedding = np.asarray(document['embedding'], dtype=float)
        denominator = query_norm * np.linalg.norm(doc_embedding)
        if denominator == 0:
            # A NaN score would make the sort order below unreliable
            continue
        similarity = float(np.dot(query_embedding, doc_embedding) / denominator)  # Higher score means more similar
        similarities.append((document['_id'], similarity, document['text']))

    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    # Return top_k most similar documents
    return similarities[:top_k]

# Example usage: list the best matches for a sample question
query = "How to cure backpain?"
results = semantic_search(query)
for idx, hit in enumerate(results, start=1):
    doc_id, similarity, text = hit
    # Only the first 100 characters of each chunk are shown, for brevity
    print(f"Result {idx} (Score: {similarity:.3f}): {text[:100]}...")



Result 1 (Score: 0.653): Whatever the cause of your backache, the following natural Ayurvedic home remedies will be helpful....
Result 2 (Score: 0.645): Following these guidelines will help you heal your aching back as well as avoid backpain in the futu...
Result 3 (Score: 0.563): Backache can also be relieved by the use of the herb musta, which is a muscle painkiller....
Result 4 (Score: 0.529): Some gentle yoga exercises can help with back pain....
Result 5 (Score: 0.519): For extra healing and muscle relaxation, apply the mahanarayan oil on your back and then follow with...


## Connect LLM model

In [26]:
import replicate
from replicate.client import Client
import os

# Function to generate an answer using LLama2 from Replicate
def generate_answer(question, max_context_length=1000, top_k=1):
    """Answer ``question`` with Llama 2 on Replicate, grounded in retrieved context.

    The ``top_k`` best chunks from ``semantic_search`` are joined with newlines,
    truncated to ``max_context_length`` characters and placed in the prompt
    together with the question. If retrieval returns nothing, the prompt
    contains the question only.

    Args:
        question: The user's question.
        max_context_length: Maximum number of context characters in the prompt.
        top_k: Number of retrieved chunks to use as context. The default of 1
            uses only the single best chunk.

    Returns:
        The generated answer, or a fixed apology string when the model returns
        nothing but whitespace.
    """
    # semantic_search returns (_id, similarity, text) tuples, best match first
    context_results = semantic_search(question, top_k=top_k)
    if context_results:
        context = "\n".join(text for _, _, text in context_results)
        # Truncate context if it exceeds the maximum length
        context = context[:max_context_length]
        prompt = f"[INST]\nQuestion: {question}\nContext: {context}\n[/INST]"
        print(prompt)
    else:
        prompt = f"[INST]\nQuestion: {question}\n[/INST]"  # Fallback in case no context is found

    client = Client(api_token=API_TOKEN)

    # Generate the answer using LLama2 from Replicate
    output = client.run(
        "nwhitehead/llama2-7b-chat-gptq:8c1f632f7a9df740bfbe8f6b35e491ddfe5c43a79b43f062f719ccbe03772b52",
        input={
            "seed": -1,
            "top_k": 20,
            "top_p": 1,
            "prompt": prompt,
            "max_tokens": 1024,
            "min_tokens": 1,
            "temperature": 0.5,
            "repetition_penalty": 1
        }
    )
    # The output is consumed piece by piece; concatenate the pieces into one string
    answer = "".join(output)

    # Handle the case where the answer is empty or only whitespace
    if not answer.strip():
        answer = "Sorry, I don't have an answer for that."

    return answer

In [27]:
# Example query: retrieve context for the question, ask the LLM and print its answer
query = "What is Ayurveda?"
answer = generate_answer(query)
print(answer)

[INST]
Question: What is Ayurveda?
Context: AYURVEDA.
[/INST]
  Ayurveda is a traditional system of medicine and healthy living that originated in India over 5,000 years ago. The term "Ayurveda" is derived from the Sanskrit words "ayur" meaning "life" and "veda" meaning "science" or "knowledge." It is based on the belief that the mind, body, and spirit are interconnected and that imbalances in any one of these areas can lead to disease.
Ayurveda emphasizes the use of natural remedies, such as herbs, yoga, and meditation, to promote overall wellness and balance in the body. It also focuses on prevention and early detection of disease, rather than just treating symptoms after they have already developed.
The core principles of Ayurveda are:
1. The body is made up of five elements (earth, water, fire, air, and ether) and three doshas (Vata, Pitta, and Kapha). Imbalances in these elements and doshas can lead to disease.
2. The body has a natural healing mechanism, and Ayurveda seeks to enh