<a href="https://colab.research.google.com/github/nadertoti/ERP-Chatbot/blob/main/RAG_Chat_with_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install --quiet pymongo sentence_transformers einops langchain langchain_community pypdf huggingface_hub PyPDF2

In [None]:
# Ensure the vector search index exists
def create_vector_index():
    search_index_model = SearchIndexModel(
        definition={
            "fields": [
                {
                    "type": "vector",
                    "numDimensions": 384,  # Adjusted to match SentenceTransformer output
                    "path": "embedding",
                    "similarity": "cosine"
                }
            ]
        },
        name="vector_index",
        type="vectorSearch"
    )
    collection.create_search_index(model=search_index_model)

In [None]:
# Extract text from PDF
def extract_text_from_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text()
    return text

In [None]:
# Generate embeddings and save to MongoDB
def save_to_mongodb(text, model):
    sentences = text.split("\n")  # Split into chunks (modify for larger PDFs)
    for sentence in tqdm(sentences, desc="Processing text"):
        if sentence.strip():  # Skip empty lines
            embedding = model.encode(sentence).tolist()
            collection.insert_one({"text": sentence, "embedding": embedding})

In [17]:
import os
import numpy as np
import pymongo
from pymongo.operations import SearchIndexModel
from PyPDF2 import PdfReader
from huggingface_hub import InferenceClient
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# MongoDB connection setup
MONGO_URI = "mongodb+srv://nadertoti4:UU1h8bHKSG2msUhh@chatbot.3g0if.mongodb.net/?retryWrites=true&w=majority&appName=chatbot"
client = pymongo.MongoClient(MONGO_URI)
db = client["rag_db"]
collection = db["test"]

# Retrieve relevant documents using vector search
def get_query_results(query, model):
    query_embedding = model.encode(query).tolist()
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "exact": True,
                "limit": 5
            }
        },
        {
            "$project": {
                "_id": 0,
                "text": 1,
                "score": {"$meta": "vectorSearchScore"}
            }
        }
    ]
    results = collection.aggregate(pipeline)
    return list(results)

# Load Hugging Face LLM
def load_llm():
    os.environ["HF_TOKEN"] = "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"
    return InferenceClient(
        "mistralai/Mistral-7B-Instruct-v0.3",
        token=os.getenv("HF_TOKEN")
    )

# RAG pipeline for chat
def chat_with_pdf(query, model, llm):
    context_docs = get_query_results(query, model)
    context_string = " ".join([doc["text"] for doc in context_docs])
    prompt = f"""Use the following pieces of context to answer the question at the end.
        {context_string}
        Question: {query}
    """
    output = llm.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=150
    )
    return output.choices[0].message.content

# Main application loop
def main():
    # Load embedding model and LLM
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    llm = load_llm()

    # Create vector index
    # create_vector_index()

    print("Welcome to the PDF Chat Application!")
    while True:
        print("\nOptions:")
        print("1. Upload a PDF")
        print("2. Ask a question")
        print("3. Exit")
        choice = input("Enter your choice: ")

        if choice == "1":
            '''
            file_path = input("Enter the path to your PDF file: ")
            try:
                text = extract_text_from_pdf(file_path)
                save_to_mongodb(text, embedding_model)
                print("PDF content uploaded and processed successfully!")
            except Exception as e:
                print(f"Error processing PDF: {e}")
            '''
        elif choice == "2":
            query = input("Enter your question: ")
            try:
                response = chat_with_pdf(query, embedding_model, llm)
                print(f"\nResponse:\n{response}")
            except Exception as e:
                print(f"Error processing query: {e}")

        elif choice == "3":
            print("Exiting the application. Goodbye!")
            break

        else:
            print("Invalid choice. Please try again.")

# Run the application
if __name__ == "__main__":
    main()


Welcome to the PDF Chat Application!

Options:
1. Upload a PDF
2. Ask a question
3. Exit
Enter your choice: 1

Options:
1. Upload a PDF
2. Ask a question
3. Exit
Enter your choice: 3
Exiting the application. Goodbye!
