<a href="https://colab.research.google.com/github/nilea-cyber/muhozgu/blob/main/rag_with_chromadb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install chromadb



In [19]:
!pip install sentence-transformers



In [20]:
!pip install pypdf



In [21]:
!pip install pyPDF2



In [22]:
!pip install langchain



In [23]:
!pip install tiktoken



In [24]:
# Simple RAG with your own documents
import chromadb
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize components
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
chroma_client = chromadb.Client()

# Create a collection (like a database table)
collection = chroma_client.create_collection(name="my_knowledge_base")

# Sample documents (you can replace these with your own)
documents = [
    "Large Language Models (LLMs) are AI systems trained on vast amounts of text data.",
    "RAG stands for Retrieval-Augmented Generation, which combines retrieval and generation.",
    "Fine-tuning adapts pre-trained models to specific tasks using additional training data.",
    "ChromaDB is an open-source embedding database for AI applications.",
    "Vector databases store data as numerical vectors for efficient similarity search.",
    "Transformers are neural network architectures used in modern NLP models.",
    "BERT is a transformer-based model for natural language understanding tasks.",
    "Students can use Google Colab for free GPU access to train ML models."
]

# Generate embeddings
print("Generating embeddings for documents...")
embeddings = embedder.encode(documents).tolist()

# Add to ChromaDB
for i, (doc, emb) in enumerate(zip(documents, embeddings)):
    collection.add(
        embeddings=[emb],
        documents=[doc],
        metadatas=[{"source": "textbook"}],
        ids=[f"doc_{i}"]
    )

print(f"Added {len(documents)} documents to the database")

Generating embeddings for documents...
Added 8 documents to the database


In [25]:
# Query function
def query_rag(query_text, n_results=3):
    # Generate query embedding
    query_embedding = embedder.encode([query_text]).tolist()[0]

    # Search in ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )

    return results

# Test queries
test_queries = [
    "What are LLMs?",
    "Explain RAG",
    "How can students access free GPUs?"
]

for query in test_queries:
    print(f"\nQuery: '{query}'")
    results = query_rag(query, n_results=2)
    print("Retrieved documents:")
    for i, doc in enumerate(results['documents'][0]):
        print(f"  {i+1}. {doc}")


Query: 'What are LLMs?'
Retrieved documents:
  1. Large Language Models (LLMs) are AI systems trained on vast amounts of text data.
  2. RAG stands for Retrieval-Augmented Generation, which combines retrieval and generation.

Query: 'Explain RAG'
Retrieved documents:
  1. RAG stands for Retrieval-Augmented Generation, which combines retrieval and generation.
  2. BERT is a transformer-based model for natural language understanding tasks.

Query: 'How can students access free GPUs?'
Retrieved documents:
  1. Students can use Google Colab for free GPU access to train ML models.
  2. ChromaDB is an open-source embedding database for AI applications.


In [41]:
# COMPLETE PDF PROCESSING CODE - Run this as ONE CELL

# 1. Install required packages
!pip install pypdf sentence-transformers chromadb -q

# 2. Import everything
import chromadb
from sentence_transformers import SentenceTransformer
from PyPDF2 import PdfReader
from google.colab import files
import os

# 3. Initialize ChromaDB and Embedder
print("Initializing components...")
chroma_client = chromadb.Client()
# Fix: Use get_or_create=True to avoid error if collection already exists
collection = chroma_client.create_collection(name="pdf_documents", get_or_create=True)
embedder = SentenceTransformer('all-MiniLM-L6-v2')  # ← THIS LINE IS CRITICAL

print("Components initialized successfully!")

# 4. PDF processing function
def extract_text_from_pdf(pdf_path):
    """Extract text from PDF file"""
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

# 5. Upload and process PDF
print("\nUpload a PDF file:")
uploaded = files.upload()

# 6. Process each uploaded PDF
for filename in uploaded.keys():
    if filename.endswith('.pdf'):
        print(f"\nProcessing {filename}...")

        # Extract text
        text = extract_text_from_pdf(filename)
        print(f"Extracted {len(text)} characters")

        # Split text into chunks (better approach)
        sentences = text.replace('\n', ' ').split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk) + len(sentence) < 500:  # Max 500 chars per chunk
                current_chunk += sentence + ". "
            else:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence + ". "

        if current_chunk:
            chunks.append(current_chunk.strip())

        print(f"Created {len(chunks)} chunks")

        # Add chunks to ChromaDB
        for i, chunk in enumerate(chunks):
            embedding = embedder.encode([chunk]).tolist()[0]
            collection.add(
                embeddings=[embedding],
                documents=[chunk],
                metadatas=[
                    {
                        "source": filename,
                        "chunk_id": i,
                        "total_chunks": len(chunks)
                    }
                ],
                ids=[f"pdf_{filename}_{i}"]
            )

        print(f"Added {len(chunks)} chunks from {filename} to ChromaDB")

print("\n✅ PDF processing complete!")
print(f"Total documents in collection: {collection.count()}")

Initializing components...
Components initialized successfully!

Upload a PDF file:


Saving 2501.09223v1.pdf to 2501.09223v1.pdf

Processing 2501.09223v1.pdf...
Extracted 618724 characters
Created 1440 chunks
Added 1440 chunks from 2501.09223v1.pdf to ChromaDB

✅ PDF processing complete!
Total documents in collection: 1877


In [27]:
!pip install transformers



In [42]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

# Load a small, efficient language model for generation
generator = pipeline(
    "text-generation",
    model="microsoft/DialoGPT-small",  # Small model that fits in free Colab
    torch_dtype=torch.float16,
    device_map="auto" if torch.cuda.is_available() else None
)

def generate_answer_with_rag(query):
    # Step 1: Retrieve relevant documents
    results = query_rag(query, n_results=3)
    context = "\n".join(results['documents'][0])

    # Step 2: Create prompt with context
    prompt = f"""Based on the following context, answer the question.

Context:
{context}

Question: {query}

Answer: """

    # Step 3: Generate answer
    response = generator(
        prompt,
        max_length=200,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True
    )

    return response[0]['generated_text']

# Test the complete RAG pipeline
questions = [
    "What is RAG used for?",
    "Tell me about LLMs",
    "What database did we use for this project?"
]

for question in questions:
    print(f"\n{'='*50}")
    print(f"Question: {question}")
    print(f"{'='*50}")
    answer = generate_answer_with_rag(question)
    print(f"Answer: {answer}")

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Question: What is RAG used for?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: Based on the following context, answer the question.

Context:
Sometimes, LLMs must derive responses strictly from the pro vided texts, while at other times, they may need to generate responses using their pre-trained knowledge if the provided texts are insufﬁcient. There are many aspects of RAG, such as improvem ents to the retrieval systems, that cannot be covered in this chapter. Interested readers can re fer to surveys of RAG techniques for more information [ Li et al. ,2022 ;Gao et al. ,2023c ].
By drawing fro m external databases and documents, 3.2 Advanced Prompting Methods 135 RAG can signiﬁcantly improve the quality of responses, ensu ring they are both contextually rel- evant and factually correct. Such an approach is particular ly useful in scenarios that require high factual accuracy and up-to-date information, such as compl ex question answering. The concept of RAG has been mentioned several times in the pre vious sections and chapters.
In contrast, in RAG, the ret

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: Based on the following context, answer the question.

Context:
As a result, the explosion of resea rch interest in LLMs has also led to a vast number of new techniques and models. However, we do not a ttempt to provide a comprehen- sive literature review on all aspects of LLMs, given the rapi d evolution of the ﬁeld. Nevertheless, one can still gain knowledge about LLMs from general reviews [Zhao et al. ,2023 ;Minaee et al. , 2024 ] or more focused discussions on speciﬁc topics [ Ruan et al. ,2024 ].
This approach is simpl e and practical, as there have been a lot of well-developed LLMs and we just need to use them with n o or little modiﬁcation. An interesting issue, though not closely related to the discus sion here, arises: can an LLM that aligns with other LLMs outperform those LLMs? Probably not at ﬁrst g lance. In part, this is because the target LLM merely imitates other LLMs based on limited su pervision and thus cannot capture well the nuances of the behaviors of these

In [40]:
# Save the database to disk (in Colab's temporary storage)
persistent_client = chromadb.PersistentClient(path="./chroma_db")
persistent_collection = persistent_client.create_collection("persistent_knowledge", get_or_create=True)

# Copy from memory to persistent storage
# Explicitly include 'embeddings' to avoid 'NoneType' error
all_data = collection.get(include=['documents', 'embeddings', 'metadatas'])
for i, (doc, emb, meta) in enumerate(zip(all_data['documents'], all_data['embeddings'], all_data['metadatas'])):
    persistent_collection.add(
        embeddings=[emb],
        documents=[doc],
        metadatas=[meta],
        ids=[all_data['ids'][i]]
    )

print("Database saved to ./chroma_db")

# Load it back later
def load_persistent_db():
    client = chromadb.PersistentClient(path="./chroma_db")
    return client.get_collection("persistent_knowledge")

# Example usage in another session
# collection = load_persistent_db()

Database saved to ./chroma_db


In [32]:
!pip install beautifulsoup4 requests



In [43]:
import requests
from bs4 import BeautifulSoup

def scrape_webpage(url):
    """Scrape text content from a webpage"""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get text
        text = soup.get_text()

        # Clean up
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text
    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return ""

# Add Wikipedia page about LLMs
llm_wiki_text = scrape_webpage("https://en.wikipedia.org/wiki/Large_language_model")
if llm_wiki_text:
    # Split into chunks
    chunks = llm_wiki_text.split('\n')
    chunks = [chunk for chunk in chunks if len(chunk) > 50]

    # Add to database
    for i, chunk in enumerate(chunks[:20]):  # Limit to 20 chunks
        embedding = embedder.encode([chunk]).tolist()[0]
        collection.add(
            embeddings=[embedding],
            documents=[chunk],
            metadatas=[{"source": "wikipedia", "url": "https://en.wikipedia.org/wiki/Large_language_model"}],
            ids=[f"wiki_llm_{i}"]
        )
    print(f"Added {len(chunks[:20])} chunks from Wikipedia")


Added 1 chunks from Wikipedia


In [44]:
# Simple evaluation of your RAG system
def evaluate_rag(test_queries, expected_answers):
    """
    Simple evaluation function
    test_queries: list of questions
    expected_answers: list of expected answers (for manual comparison)
    """
    print("Evaluating RAG system...")
    print("-" * 50)

    for i, (query, expected) in enumerate(zip(test_queries, expected_answers)):
        print(f"\nTest {i+1}: {query}")

        # Get retrieved documents
        results = query_rag(query, n_results=3)
        retrieved_docs = results['documents'][0]

        # Check if expected concepts are in retrieved documents
        expected_concepts = expected.lower().split()
        found_concepts = []

        for doc in retrieved_docs:
            for concept in expected_concepts:
                if concept in doc.lower() and len(concept) > 3:
                    found_concepts.append(concept)

        # Calculate simple accuracy
        accuracy = len(set(found_concepts)) / len(set(expected_concepts)) if expected_concepts else 0
        print(f"Retrieval Accuracy: {accuracy:.2%}")
        print(f"Retrieved {len(retrieved_docs)} documents")

        # Generate and display answer
        answer = generate_answer_with_rag(query)
        print(f"Generated Answer: {answer[:200]}...")  # First 200 chars

# Sample evaluation
test_data = [
    ("What is RAG?", "retrieval augmented generation combines search with generation"),
    ("What are LLMs?", "large language models are AI systems trained on text")
]

queries = [q for q, _ in test_data]
answers = [a for _, a in test_data]

evaluate_rag(queries, answers)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Evaluating RAG system...
--------------------------------------------------

Test 1: What is RAG?
Retrieval Accuracy: 16.67%
Retrieved 3 documents


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Answer: Based on the following context, answer the question.

Context:
Sometimes, LLMs must derive responses strictly from the pro vided texts, while at other times, they may need to generate responses using ...

Test 2: What are LLMs?
Retrieval Accuracy: 44.44%
Retrieved 3 documents
Generated Answer: Based on the following context, answer the question.

Context:
As a result, the explosion of resea rch interest in LLMs has also led to a vast number of new techniques and models. However, we do not a...


In [45]:
# Create a student FAQ system with RAG

# Add student-related documents
student_docs = [
    "Google Colab provides free GPU access for 12 hours per session.",
    "Students can use Hugging Face for free pre-trained models and datasets.",
    "Fine-tuning requires labeled data and computational resources.",
    "Python is the primary language for machine learning projects.",
    "GitHub is used for version control and sharing code.",
    "Research papers are published on arXiv.org for free access.",
    "Kaggle offers datasets and competitions for practice.",
    "Office hours are Tuesdays and Thursdays from 2-4 PM.",
    "The final project deadline is December 15th.",
    "Group projects should have 3-4 students maximum."
]

# Add to collection
for i, doc in enumerate(student_docs):
    embedding = embedder.encode([doc]).tolist()[0]
    collection.add(
        embeddings=[embedding],
        documents=[doc],
        metadatas=[{"source": "student_faq", "category": "general"}],
        ids=[f"faq_{i}"]
    )

# Create a simple chat interface
def student_chatbot():
    print("Student FAQ Chatbot")
    print("Type 'exit' to quit")
    print("-" * 30)

    while True:
        user_input = input("\nYour question: ").strip()

        if user_input.lower() in ['exit', 'quit', 'bye']:
            print("Goodbye!")
            break

        if not user_input:
            continue

        # Get answer
        answer = generate_answer_with_rag(user_input)

        # Extract just the answer part (after "Answer: ")
        if "Answer:" in answer:
            answer = answer.split("Answer:")[-1].strip()

        print(f"\nBot: {answer[:300]}")  # Limit response length

# Uncomment to run the chatbot
# student_chatbot()