## Text extraction from Book

In [5]:
# Function to extract the data from the file
%pip install PyMuPDF
import re
import fitz  # PyMuPDF

# Open the PDF file
pdf_document = fitz.open('/Users/mrinoyb2/git/AyurBot/Data/pdf/Ayurveda_Book.pdf')

# Function to preprocess and clean text
def preprocess_text_mupdf(text):
    # Remove headers/footers
    text = re.sub(r'\n\s*\n', '\n', text)  # Remove empty lines
    text = re.sub(r'[^A-Za-z0-9.,;:!?()\'\"\n]+', ' ', text)  # Remove special characters but keep punctuation
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    return text.strip()

# Extract and clean text
cleaned_text_mupdf = ""
for page_number in range(pdf_document.page_count):
    page = pdf_document.load_page(page_number)
    text = page.get_text()
    cleaned_text_mupdf += preprocess_text_mupdf(text)

# Close the PDF document
pdf_document.close()

# Output the first 
print(cleaned_text_mupdf)

# Save the cleaned text to a file
with open('/Users/mrinoyb2/git/AyurBot/Data/clean_text/Ayurveda_Book.txt', 'w') as file:
    file.write(cleaned_text_mupdf)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## Store chunks in MongoDB database

In [8]:

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import pymongo

# Connect to MongoDB (Update the connection string as per your MongoDB setup)
client = pymongo.MongoClient("mongodb+srv://mrinoybanerjee:Areracolony1@cluster0.njwesz0.mongodb.net/")
db = client["Ayurveda-text"]
collection = db["Ayurveda"]

# Function to chunk text by sentence
def chunk_by_sentence(text):
    return sent_tokenize(text)

# Chunk the text
chunks = chunk_by_sentence(cleaned_text_mupdf)

# Store chunks in MongoDB
for idx, chunk in enumerate(chunks):
    # Create a document for each chunk
    document = {"_id": idx, "text": chunk}
    # Insert the document into the collection
    collection.insert_one(document)

print(f"Total chunks stored in MongoDB: {len(chunks)}")

[nltk_data] Downloading package punkt to /Users/mrinoyb2/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Total chunks stored in MongoDB: 7326


## Implement RAG

### Create word embeddings

In [11]:
from sentence_transformers import SentenceTransformer
import pymongo

# Connect to MongoDB
mongo_client = pymongo.MongoClient("mongodb+srv://mrinoybanerjee:Areracolony1@cluster0.njwesz0.mongodb.net/")
db = mongo_client["Ayurveda-text"]
chunks_collection = db["Ayurveda"]

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to update documents with embeddings
def update_documents_with_embeddings():
    for document in chunks_collection.find():
        # Generate embedding
        embedding = model.encode(document['text'], convert_to_tensor=False)
        # Update document with embedding
        chunks_collection.update_one({'_id': document['_id']}, {'$set': {'embedding': embedding.tolist()}})

# Uncomment the following line to run the embedding update
update_documents_with_embeddings()


  from .autonotebook import tqdm as notebook_tqdm
modules.json: 100%|██████████| 349/349 [00:00<00:00, 167kB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 117kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 3.20MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 24.6kB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 348kB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:02<00:00, 35.8MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 468kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.79MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 6.22MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 56.2kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 81.0kB/s]


### Semantic search retrieval

In [12]:
from scipy.spatial.distance import cosine
import numpy as np

# Function to perform semantic search
def semantic_search(query, top_k=5):
    # Convert query to embedding
    query_embedding = model.encode(query, convert_to_tensor=False)
    
    # Retrieve all embeddings from MongoDB and calculate similarity
    similarities = []
    for document in chunks_collection.find():
        doc_embedding = np.array(document['embedding'])
        similarity = 1 - cosine(query_embedding, doc_embedding)  # Higher score means more similar
        similarities.append((document['_id'], similarity, document['text']))
    
    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Return top_k most similar documents
    return similarities[:top_k]

# Example usage
query = "What are the benefits of Ayurveda?"
results = semantic_search(query)
for idx, (doc_id, similarity, text) in enumerate(results, start=1):
    print(f"Result {idx} (Score: {similarity:.3f}): {text[:100]}...")  # Print the first 100 characters for brevity


Result 1 (Score: 0.817): AYURVEDA....
Result 2 (Score: 0.785): In this chapter we will consider some of the fundamental principles and approaches recommended by Ay...
Result 3 (Score: 0.779): The Hidden Secret of Ayurveda....
Result 4 (Score: 0.757): These principles are a key to healing with Ayurveda....
Result 5 (Score: 0.757): Ayurveda is a system of natural medicine, which means that you have to see what is happening in natu...


## Connect LLM model

In [13]:
%pip install transformers torch


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [15]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, pipeline

# Correctly initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B")

# Use the pipeline for simplicity
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Function to generate an answer (make sure to have your semantic_search function defined as before)
def generate_answer(question):
    # Assume semantic_search is defined and returns relevant context as a single string
    context = "Your context fetched from semantic_search"
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    
    # Generate the answer
    generated_answers = generator(prompt, max_length=150, num_return_sequences=1)
    answer = generated_answers[0]['generated_text']
    
    return answer

# Example query
query = "What is the history of Ayurveda?"
answer = generate_answer(query)
print(answer)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 