# Retrieval-Augmented Generation (RAG) Chatbot with PyPDFLoader

This notebook demonstrates a RAG pipeline for answering questions using the contents of a PDF file. It includes:
1. **Loading the PDF with PyPDFLoader.**
2. **Splitting text into chunks and populating ChromaDB.**
3. **Configuring a chatbot with embedding and generative models.**
4. **Querying the chatbot with technical questions.**

### Key Features:
- Simplified PDF loading with `PyPDFLoader`.
- Free, open-source embedding and generative models.
- Modular and reusable code structure.


# Load and proccess PDF

In [1]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document
import torch

# Load the PDF using PyPDFLoader
pdf_path = "../data/pdf/pythonlearn.pdf"
loader = PyPDFLoader(pdf_path)

# Load and split the document into pages
pages = loader.load()

print(f"Loaded {len(pages)} pages from the PDF.")

# Unir páginas con saltos de línea
#pdf_text = "\n\n".join([page.page_content for page in pages])

# Print first 1000 characters of our PDF
print(pages[:1000])

Loaded 241 pages from the PDF.


# Clean text

In [3]:
import re

def clean_text(text):
    # Reemplazar múltiples saltos de línea por uno solo
    text = re.sub(r'\n{2,}', '\n\n', text)
    # Reemplazar múltiples espacios por uno solo
    text = re.sub(r'[ ]{2,}', ' ', text)
    # Eliminar caracteres no imprimibles
    text = re.sub(r'[^\x20-\x7E\n]', '', text)
    return text.strip()

# Aplicar la limpieza al texto del PDF
cleaned_pdf_text = clean_text(pages)

print("Text cleaned correctly!")

print(cleaned_pdf_text[:1000])

TypeError: expected string or bytes-like object

# Create chunks

In [3]:
from langchain.text_splitter import CharacterTextSplitter

# Text splitter config
text_splitter = CharacterTextSplitter(
    chunk_size=10000
    chunk_overlap=0, 
    #separator="."  # If want to separate by something
)

# Split text into chunks
chunks = text_splitter.split_documents(pages)
print(f"Created {len(chunks)} chunks.")

print("Chunks generated correctly!")

Created 241 chunks.
Chunks generated correctly!


# Chroma DB

In [4]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Crear objetos Document
#documents = [Document(page_content=chunk) for chunk in chunks]

# Initialize HuggingFace embeddings
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Initialize ChromaDB
db = Chroma.from_documents(chunks, embedding_model, persist_directory="../data/chroma_db")

print("ChromaDB populated and persisted with HuggingFace embeddings.")

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


ChromaDB populated and persisted with HuggingFace embeddings.


# ChatBot config

In [5]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the generative model (e.g., Flan-T5)
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
generative_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

# Define the chatbot class
class GenerativeChatbot:
    def __init__(self, db, generative_model, tokenizer):
        self.db = db
        self.generative_model = generative_model
        self.tokenizer = tokenizer

    def retrieve_context(self, user_question, k=5):
        results = self.db.similarity_search(user_question, k=k)
        return results

    def format_context(self, docs):
        prompt = "\n"
        for doc in docs:
            #clean_content = clean_text(doc.page_content)
            prompt += f"Content:\n{doc.page_content}\n\n"  # Añade saltos claros entre fragmentos
        return prompt

        
    """def clean_response(self, response):
        # Reemplazar múltiples saltos de línea por uno solo
        response = re.sub(r'\n{2,}', '\n\n', response)
        # Reemplazar múltiples espacios por uno solo
        response = re.sub(r'[ ]{2,}', ' ', response)
        # Eliminar caracteres no imprimibles
        response = re.sub(r'[^\x20-\x7E\n]', '', response)
        return response.strip()"""
        
    def generate_response(self, user_question, formatted_context):
        # Generate a tailored prompt for Python content
        prompt = f"""
        You are a Python programming assistant.
    
        ## USER QUESTION:
        {user_question}
    
        ## CONTEXT:
        The following content has been retrieved from Python programming resources:
        '''
        {formatted_context}
        '''
    
        ## TASK:
        1. Use the CONTEXT provided to answer the user's question directly.
        2. Include Python code examples if applicable, using proper formatting (```python ... ```).
        3. If the CONTEXT does not contain the answer, respond with: "The provided context does not contain this information."
        """


        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        inputs = {key: value.to("cuda") for key, value in inputs.items()}
        outputs = self.generative_model.generate(inputs["input_ids"], max_length=500, num_beams=4, early_stopping=True)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        #response = self.clean_response(response)
        
        return response


# ChatBot query

In [6]:
import re
from langchain.docstore.document import Document

def clean_context(docs):
    cleaned_docs = []
    for doc in docs:
        # Si es un objeto Document
        if isinstance(doc, Document):
            clean_text = re.sub(r'/quotesingle.ts1', "'", doc.page_content)  # Reemplazar caracteres extraños
            clean_text = re.sub(r'[^\x20-\x7E]', '', clean_text)  # Eliminar caracteres no imprimibles
            clean_text = re.sub(r'\s+', ' ', clean_text).strip()  # Normalizar espacios
            cleaned_docs.append(Document(page_content=clean_text, metadata=doc.metadata))
        # Si es una cadena de texto
        elif isinstance(doc, str):
            clean_text = re.sub(r'/quotesingle.ts1', "'", doc)  # Reemplazar caracteres extraños
            clean_text = re.sub(r'[^\x20-\x7E]', '', clean_text)  # Eliminar caracteres no imprimibles
            clean_text = re.sub(r'\s+', ' ', clean_text).strip()  # Normalizar espacios
            cleaned_docs.append(clean_text)
    return cleaned_docs

In [18]:
# Instanciar el chatbot con los parámetros necesarios
chatbot = GenerativeChatbot(db=db, generative_model=generative_model, tokenizer=tokenizer)

# Pregunta del usuario
user_question = "How I define a dictionary in Python?"

# Recuperar documentos relevantes
retrieved_docs = db.similarity_search(user_question, k=5)

# Limpiar los documentos
cleaned_docs = clean_context(retrieved_docs)

# Formatear el contexto
formatted_context = chatbot.format_context(cleaned_docs)

# Generar la respuesta
response = chatbot.generate_response(user_question, formatted_context)

# Mostrar la respuesta
print("Response:\n", response)

Response:
 ## USER QUESTIONS: How I define a dictionary in Python? ## CONTEXT: The content has been retrieved from Python programming resources: ## CONTEXT:
