In [2]:
# ============================================
# STEP 1: INSTALLATION
# ============================================
# Run these in separate cells:

# Install required packages
!pip install pymilvus google-generativeai sentence-transformers pandas

# ============================================
# STEP 2: START MILVUS (LITE VERSION)
# ============================================
# Milvus Lite runs locally without Docker



In [4]:
!pip install chromadb google-generativeai sentence-transformers pandas


Collecting chromadb
  Using cached chromadb-1.1.1-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Using cached pybase64-1.4.2-cp310-cp310-win_amd64.whl.metadata (9.0 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.37.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Using cached opentelemetry_api-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using cached opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Using cached opentelemetry_sdk-1.37.0-py3-none-any.whl.metadata (1.5 kB)
Collecting pypika>

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.12.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.5 which is incompatible.


In [1]:
# STEP 2: IMPORT LIBRARIES
# ============================================

import chromadb
from chromadb.config import Settings
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import pandas as pd
from typing import List

print("✅ Libraries imported successfully!")



  from .autonotebook import tqdm as notebook_tqdm


✅ Libraries imported successfully!


In [2]:
# STEP 3: INITIALIZE CHROMADB
# ============================================

# Initialize ChromaDB client (stores in memory by default)
client = chromadb.Client()

# Or use persistent storage:
# client = chromadb.PersistentClient(path="./chroma_db")

print("✅ ChromaDB initialized!")

✅ ChromaDB initialized!


In [30]:
#import google.generativeai as genai

#genai.configure(api_key="AIzaSyB58xEsMAI0SW3oZiaMDMNhCeoQLyBvwV8")

# List all available models
#print("Available Gemini models:")
#for model in genai.list_models():
   # if 'generateContent' in model.supported_generation_methods:
       # print(f"  - {model.name}")

In [20]:
# STEP 4: CONFIGURE GEMINI
# ============================================

# Set your Gemini API key here
GEMINI_API_KEY = "AIzaSyB58xEsMAI0SW3oZiaMDMNhCeoQLyBvwV8"  # Replace with your actual key
genai.configure(api_key=GEMINI_API_KEY)

# Initialize Gemini model (using latest version)
gemini_model = genai.GenerativeModel('gemini-2.5-flash')

print("✅ Gemini configured!")

# ============================================

✅ Gemini configured!


In [21]:
##!pip install hf_xet

In [22]:
# STEP 5: LOAD EMBEDDING MODEL
# ============================================

# Load sentence transformer for creating embeddings
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("✅ Embedding model loaded!")


✅ Embedding model loaded!


In [23]:
# STEP 6: PREPARE YOUR DATASET
# ============================================

# Example dataset - Replace with your own data
documents = [
    "Python is a high-level programming language known for its simplicity and readability.",
    "Machine learning is a subset of artificial intelligence that enables computers to learn from data.",
    "ChromaDB is a vector database designed for AI applications and embeddings.",
    "Gemini is Google's advanced large language model for various AI tasks.",
    "RAG (Retrieval Augmented Generation) combines retrieval and generation for better AI responses.",
    "Vector embeddings represent text as numerical arrays in high-dimensional space.",
    "Jupyter notebooks are interactive coding environments popular in data science.",
    "Natural language processing helps computers understand and generate human language.",
    "Deep learning uses neural networks with multiple layers to process complex data.",
    "Transformers are a type of neural network architecture that revolutionized NLP."
]

# Load from CSV (uncomment to use):
# df = pd.read_csv('your_dataset.csv')
# documents = df['text_column'].tolist()

# Load from text file (uncomment to use):
# with open('your_file.txt', 'r') as f:
#     documents = [line.strip() for line in f if line.strip()]

print(f"✅ Loaded {len(documents)} documents")

# ============================================

✅ Loaded 10 documents


In [24]:
# STEP 7: CREATE CHROMADB COLLECTION
# ============================================

# Create or get collection
collection_name = "document_collection"

# Delete if exists (for fresh start)
try:
    client.delete_collection(name=collection_name)
except:
    pass

# Create new collection
collection = client.create_collection(
    name=collection_name,
    metadata={"description": "Document collection for RAG"}
)

print(f"✅ Collection '{collection_name}' created!")

# ============================================

✅ Collection 'document_collection' created!


In [25]:
# STEP 8: CREATE EMBEDDINGS & ADD TO CHROMADB
# ============================================

print("Creating embeddings and adding to ChromaDB...")

# Create embeddings
embeddings = embedding_model.encode(documents).tolist()

# Add to collection
collection.add(
    embeddings=embeddings,
    documents=documents,
    ids=[f"doc_{i}" for i in range(len(documents))],
    metadatas=[{"index": i, "source": "dataset"} for i in range(len(documents))]
)

print(f"✅ Added {len(documents)} documents to ChromaDB!")

# ============================================

Creating embeddings and adding to ChromaDB...
✅ Added 10 documents to ChromaDB!


In [26]:
# STEP 9: RAG QUERY FUNCTION
# ============================================

def rag_query(question: str, top_k: int = 3):
    """
    Perform RAG: Retrieve relevant documents and generate answer with Gemini
    
    Args:
        question: The question to answer
        top_k: Number of documents to retrieve
    
    Returns:
        Dictionary with question, retrieved docs, and answer
    """
    
    # Step 1: Convert question to embedding
    question_embedding = embedding_model.encode([question])[0].tolist()
    
    # Step 2: Search ChromaDB for similar documents
    results = collection.query(
        query_embeddings=[question_embedding],
        n_results=top_k
    )
    
    # Step 3: Extract retrieved documents
    retrieved_docs = results['documents'][0]
    distances = results['distances'][0]
    
    print("\n" + "="*60)
    print("📚 RETRIEVED DOCUMENTS:")
    print("="*60)
    for i, (doc, dist) in enumerate(zip(retrieved_docs, distances), 1):
        print(f"{i}. [Score: {1-dist:.3f}] {doc}")
    
    # Step 4: Create context from retrieved documents
    context = "\n\n".join(retrieved_docs)
    
    # Step 5: Create prompt for Gemini
    prompt = f"""Based on the following context, answer the question.
If the answer cannot be found in the context, say "I don't have enough information to answer that."

Context:
{context}

Question: {question}

Answer:"""
    
    # Step 6: Generate answer with Gemini
    try:
        response = gemini_model.generate_content(prompt)
        answer = response.text
    except Exception as e:
        answer = f"Error generating response: {e}"
    
    print("\n" + "="*60)
    print("🤖 GEMINI'S ANSWER:")
    print("="*60)
    print(answer)
    print("="*60 + "\n")
    
    return {
        "question": question,
        "retrieved_docs": retrieved_docs,
        "answer": answer,
        "scores": [1-d for d in distances]
    }

# ============================================

In [27]:
# STEP 10: TEST THE SYSTEM
# ============================================

print("\n🚀 TESTING RAG SYSTEM\n")

# Example queries
test_questions = [
    "What is ChromaDB?",
    "Tell me about machine learning",
    "How does RAG work?",
    "What is Python used for?"
]

for question in test_questions:
    print(f"\n❓ Question: {question}")
    result = rag_query(question, top_k=3)
    print("\n" + "-"*80 + "\n")

# ============================================


🚀 TESTING RAG SYSTEM


❓ Question: What is ChromaDB?

📚 RETRIEVED DOCUMENTS:
1. [Score: 0.241] ChromaDB is a vector database designed for AI applications and embeddings.
2. [Score: -0.661] Gemini is Google's advanced large language model for various AI tasks.
3. [Score: -0.718] Natural language processing helps computers understand and generate human language.

🤖 GEMINI'S ANSWER:
ChromaDB is a vector database designed for AI applications and embeddings.


--------------------------------------------------------------------------------


❓ Question: Tell me about machine learning

📚 RETRIEVED DOCUMENTS:
1. [Score: 0.538] Machine learning is a subset of artificial intelligence that enables computers to learn from data.
2. [Score: -0.015] Deep learning uses neural networks with multiple layers to process complex data.
3. [Score: -0.157] Natural language processing helps computers understand and generate human language.

🤖 GEMINI'S ANSWER:
Machine learning is a subset of artificial intell

In [28]:
# STEP 11: INTERACTIVE QUERY FUNCTION
# ============================================

def ask(question: str):
    """Simple wrapper for asking questions"""
    return rag_query(question, top_k=3)

print("✅ Setup complete!")
print("\n💡 Usage:")
print("   ask('Your question here')")
print("\nExample:")
print("   ask('What are transformers in deep learning?')")

# ============================================

✅ Setup complete!

💡 Usage:
   ask('Your question here')

Example:
   ask('What are transformers in deep learning?')


In [29]:
# STEP 12: ADD MORE DOCUMENTS (OPTIONAL)
# ============================================

def add_documents(new_docs: List[str]):
    """Add new documents to the collection"""
    
    # Get current count
    current_count = collection.count()
    
    # Create embeddings
    new_embeddings = embedding_model.encode(new_docs).tolist()
    
    # Add to collection
    collection.add(
        embeddings=new_embeddings,
        documents=new_docs,
        ids=[f"doc_{current_count + i}" for i in range(len(new_docs))],
        metadatas=[{"index": current_count + i, "source": "added"} for i in range(len(new_docs))]
    )
    
    print(f"✅ Added {len(new_docs)} new documents!")
    print(f"📊 Total documents: {collection.count()}")

# Example usage:
# add_documents(["New document 1", "New document 2"])

# ============================================
# STEP 13: SEARCH WITHOUT GEMINI (OPTIONAL)
# ============================================

def search_only(query: str, top_k: int = 5):
    """Search without generating an answer - just retrieval"""
    
    query_embedding = embedding_model.encode([query])[0].tolist()
    
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )
    
    print(f"\n🔍 Search results for: '{query}'")
    print("="*60)
    
    for i, (doc, dist) in enumerate(zip(results['documents'][0], results['distances'][0]), 1):
        score = 1 - dist
        print(f"{i}. [Score: {score:.3f}]")
        print(f"   {doc}\n")
    
    return results

# Example:
# search_only("artificial intelligence")

# ============================================
# STEP 14: VIEW COLLECTION INFO
# ============================================

def collection_info():
    """Display information about the collection"""
    count = collection.count()
    print(f"\n📊 Collection: {collection_name}")
    print(f"📄 Total documents: {count}")
    print(f"🔢 Embedding dimension: 384")
    
    # Get a sample
    if count > 0:
        sample = collection.get(limit=3)
        print(f"\n📝 Sample documents:")
        for i, doc in enumerate(sample['documents'], 1):
            print(f"   {i}. {doc[:100]}...")

# collection_info()

# ============================================
# STEP 15: CLEANUP (OPTIONAL)
# ============================================

def cleanup():
    """Delete the collection"""
    try:
        client.delete_collection(name=collection_name)
        print("✅ Collection deleted!")
    except Exception as e:
        print(f"❌ Error: {e}")

# Uncomment to clean up:
# cleanup()

print("\n" + "="*60)
print("🎉 ALL DONE! Your RAG system is ready!")
print("="*60)


🎉 ALL DONE! Your RAG system is ready!
