# RAG System - Interactive Exploration

This notebook demonstrates how to interact with all components of the RAG system:
- Document Processor (Docling)
- Embedding Model (Sentence Transformers)
- Vector Store (FAISS)
- RAG Pipeline (Complete system)
- REST API (FastAPI)

## Setup

In [None]:
import sys
from pathlib import Path
import numpy as np
import requests
import json

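# Add the parent of the current working directory (the project root when the
# notebook is run from its own folder) to sys.path so the `src` package imports resolve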
sys.path.insert(0, str(Path.cwd().parent))

from src.document_processor import DocumentProcessor
from src.embeddings import EmbeddingModel
from src.vector_store import FAISSVectorStore
from src.rag_pipeline import RAGPipeline
from src.config import get_settings

In [None]:
# Load the project settings and echo the values shown below
# (embedding model name, LLM model name, data directory).
settings = get_settings()
print(f"Embedding Model: {settings.embedding_model}")
print(f"LLM Model: {settings.model_name}")
print(f"Data Directory: {settings.data_dir}")

## 1. Document Processor (Docling)

Process documents and extract structured content.

In [None]:
# Initialize the document processor with explicit chunking parameters.
# NOTE(review): the unit of chunk_size / chunk_overlap (characters vs. tokens)
# is defined inside DocumentProcessor and is not visible here — confirm there.
doc_processor = DocumentProcessor(chunk_size=512, chunk_overlap=50)

# Example: Process a single document
# Left commented out because the path is a placeholder; point doc_path at a
# real file and uncomment to run.
# doc_path = Path("../data/raw/your_document.pdf")
# if doc_path.exists():
#     result = doc_processor.process_document(doc_path)
#     print(f"Title: {result['metadata']['title']}")
#     print(f"Format: {result['metadata']['format']}")
#     print(f"Chunks created: {result['metadata']['chunk_count']}")
#     print(f"\nFirst chunk preview:\n{result['chunks'][0][:200]}...")

print("Document processor initialized!")
# Read the values back from the instance to confirm what was configured.
print(f"Chunk size: {doc_processor.chunk_size}")
print(f"Chunk overlap: {doc_processor.chunk_overlap}")

In [None]:
# Example: Process all documents in a directory
# raw_dir = Path("../data/raw")
# if raw_dir.exists():
#     docs = doc_processor.process_directory(raw_dir)
#     print(f"Processed {len(docs)} documents")
#     for doc in docs:
#         print(f"- {doc['metadata']['title']}: {doc['metadata']['chunk_count']} chunks")

## 2. Embedding Model (Sentence Transformers)

Convert text to vector embeddings.

In [None]:
# Initialize the embedding model named in the project settings
embedding_model = EmbeddingModel(model_name=settings.embedding_model)

# Show the model name and its reported dimension; the dimension is reused
# further down to size the FAISS vector store.
print(f"Model: {embedding_model.model_name}")
print(f"Dimension: {embedding_model.dimension}")

In [None]:
# Example: Encode a single text
# encode() is given a one-element list; the result is indexed with [0] below
# to reach the embedding that belongs to that single text.
text = "What is Retrieval-Augmented Generation?"
embedding = embedding_model.encode([text])

print(f"Text: {text}")
print(f"Embedding shape: {embedding.shape}")
# Only the first 10 components are printed to keep the output short.
print(f"First 10 dimensions: {embedding[0][:10]}")

In [None]:
# Example: Encode multiple texts in one call
texts = [
    "Machine learning is a subset of AI",
    "Deep learning uses neural networks",
    "RAG combines retrieval and generation",
]

embeddings = embedding_model.encode_batch(texts)

print(f"Encoded {len(texts)} texts")
print(f"Embeddings shape: {embeddings.shape}")

# Cosine similarity of the first two embeddings:
# dot product divided by the product of the two vector norms.
first_vec, second_vec = embeddings[0], embeddings[1]
norm_product = np.linalg.norm(first_vec) * np.linalg.norm(second_vec)
similarity = np.dot(first_vec, second_vec) / norm_product
print(f"\nSimilarity between texts 1 and 2: {similarity:.4f}")

## 3. Vector Store (FAISS)

Store and search embeddings efficiently.

In [None]:
# Initialize the vector store, sized to the embedding model's dimension so the
# vectors produced above can be added to it.
vector_store = FAISSVectorStore(
    dimension=embedding_model.dimension,
    index_type="Flat"  # Use "IVFFlat" for large datasets
)

# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Vector store initialized")
print(f"Dimension: {vector_store.dimension}")
print(f"Index type: {vector_store.index_type}")

In [None]:
# Example: Add embeddings to the store
sample_texts = [
    "Python is a programming language",
    "JavaScript is used for web development",
    "Machine learning requires data",
    "Neural networks are used in deep learning",
    "RAG improves LLM accuracy"
]

# Create embeddings (one per sample text, in a single batch call)
sample_embeddings = embedding_model.encode_batch(sample_texts)

# Create one metadata dict per text, in the same order as sample_texts.
# The search cell further down reads the "chunk_text" and "file_path" keys back.
metadata = [
    {"file_path": f"sample_{i}.txt", "title": f"Sample {i}", "chunk_text": text}
    for i, text in enumerate(sample_texts)
]

# Add to vector store
vector_store.add_embeddings(sample_embeddings, metadata)

print(f"Added {len(sample_texts)} embeddings to the store")
# index.ntotal is read straight from the store's underlying index object.
print(f"Total vectors: {vector_store.index.ntotal}")

In [None]:
# Example: Search the vector store
query = "What is machine learning?"
top_k = 3  # number of results to request; also drives the heading printed below
query_embedding = embedding_model.encode([query])

results = vector_store.search(query_embedding, top_k=top_k)

print(f"Query: {query}\n")
# The heading is derived from top_k so it cannot drift from the actual request.
print(f"Top {top_k} results:")
# Each result is unpacked as a (metadata dict, score) pair.
# NOTE(review): whether a higher or lower score is better depends on
# FAISSVectorStore.search — confirm there.
for i, (doc, score) in enumerate(results, 1):
    print(f"\n{i}. Score: {score:.4f}")
    print(f"   Text: {doc['chunk_text']}")
    print(f"   Source: {doc['file_path']}")

In [None]:
# Get statistics
# Pretty-print whatever the vector store reports about itself as indented JSON.
stats = vector_store.get_stats()
print(json.dumps(stats, indent=2))

## 4. Complete RAG Pipeline

Use the full pipeline to index and query documents.

In [None]:
# Initialize the RAG pipeline with the models named in the project settings and
# the same chunking values (512 / 50) used for the standalone document
# processor earlier in this notebook.
rag_pipeline = RAGPipeline(
    embedding_model=settings.embedding_model,
    llm_model=settings.model_name,
    chunk_size=512,
    chunk_overlap=50
)

print("RAG pipeline initialized!")
# Dump the pipeline's self-reported statistics as indented JSON.
stats = rag_pipeline.get_stats()
print(json.dumps(stats, indent=2))

In [None]:
# Option 1: Load an index that was saved earlier
index_path = Path("../data/indices/main_index")

# Handle the "nothing to load" case first, then the normal path.
if not index_path.exists():
    print(f"No index found at {index_path}")
    print("You can build one using the cell below or run build_index.py")
else:
    rag_pipeline.load_index(index_path)
    print(f"Loaded existing index from {index_path}")
    # Re-read the stats so the vector count reflects the index just loaded.
    stats = rag_pipeline.get_stats()
    print(f"Total vectors: {stats['vector_store']['total_vectors']}")

In [None]:
# Option 2: Build new index from documents
# Uncomment to run:

# raw_dir = Path("../data/raw")
# if raw_dir.exists():
#     print(f"Indexing documents from {raw_dir}...")
#     rag_pipeline.index_documents(raw_dir)
#     
#     # Save the index
#     output_path = Path("../data/indices/notebook_index")
#     rag_pipeline.save_index(output_path)
#     print(f"\nIndex saved to {output_path}")
#     
#     stats = rag_pipeline.get_stats()
#     print(f"Total vectors: {stats['vector_store']['total_vectors']}")
#     print(f"Total documents: {stats['vector_store']['total_documents']}")

In [None]:
# Query the RAG system
question = "What is RAG and how does it work?"

# Note: This requires OPENAI_API_KEY to be set, so the query itself is left
# commented out; uncomment the lines below once the key is configured.
# result = rag_pipeline.query(question, top_k=5)

# print(f"Question: {question}\n")
# print(f"Answer:\n{result.answer}\n")
# print(f"\nSources ({len(result.sources)}):")
# for i, source in enumerate(result.sources, 1):
#     print(f"\n{i}. {source['title']}")
#     print(f"   Score: {source['score']:.4f}")
#     print(f"   Excerpt: {source['excerpt']}")

# print(f"\nQuery time: {result.query_time_ms:.2f}ms")

print("Uncomment the code above to run a query (requires API key)")

## 5. Interact with the REST API

Query the system via HTTP requests (requires API server to be running).

In [None]:
# API base URL
API_URL = "http://localhost:8000"

def check_api_health():
    """Return True when ``GET {API_URL}/health`` answers with HTTP 200.

    Returns:
        bool: True for a 200 response. False for any other status code or
        when the request fails at the network level (connection refused,
        timeout, ...).

    Only ``requests.RequestException`` is caught. KeyboardInterrupt and
    programming errors still propagate, so interrupting the kernel works
    and real bugs are not hidden behind a "not running" message.
    """
    try:
        # Short timeout so a dead server does not stall the notebook.
        response = requests.get(f"{API_URL}/health", timeout=5)
    except requests.RequestException:
        return False
    return response.status_code == 200

# Report the outcome; when the server is down, print the command that starts it.
api_is_up = check_api_health()
if not api_is_up:
    print("✗ API is not running")
    print("Start it with: docker compose -f docker/docker-compose.yml up -d")
else:
    print("✓ API is running")

In [None]:
# Health check
# A timeout plus explicit error handling lets "Run All" continue with a clear
# message when the API server is not reachable, instead of a traceback.
try:
    response = requests.get(f"{API_URL}/health", timeout=5)
    health_data = response.json()
except requests.RequestException as exc:
    print(f"Health check failed: {exc}")
else:
    print("API Health:")
    print(json.dumps(health_data, indent=2))

In [None]:
# Get statistics
# The timeout prevents an unreachable server from hanging the cell, and
# raise_for_status() turns HTTP error responses into a RequestException so
# they are reported here rather than parsed as if they were statistics.
try:
    response = requests.get(f"{API_URL}/stats", timeout=5)
    response.raise_for_status()
    stats_data = response.json()
except requests.RequestException as exc:
    print(f"Could not fetch statistics: {exc}")
else:
    print("System Statistics:")
    print(json.dumps(stats_data, indent=2))

In [None]:
# Query via API
query_data = {
    "question": "What is machine learning?",
    "top_k": 5
}

try:
    # json= serializes the payload and sets the JSON Content-Type header, so no
    # explicit headers argument is needed.
    # The timeout is longer than for the health check because answering is
    # presumably slower (assumed to involve an LLM call on the server).
    response = requests.post(f"{API_URL}/query", json=query_data, timeout=60)
except requests.RequestException as exc:
    # Connection refused / timeout: report it instead of raising a traceback.
    print(f"Request failed: {exc}")
else:
    if response.status_code == 200:
        result = response.json()

        print(f"Question: {query_data['question']}\n")
        print(f"Answer:\n{result['answer']}\n")
        print(f"\nSources ({len(result['sources'])}):")

        for i, source in enumerate(result['sources'], 1):
            print(f"\n{i}. {source['title']}")
            print(f"   Score: {source['score']:.4f}")
            print(f"   Excerpt: {source['excerpt']}")

        print(f"\nQuery time: {result['metadata']['query_time_ms']:.2f}ms")
    else:
        print(f"Error: {response.status_code}")
        print(response.text)

In [None]:
# Upload a document via API
# Uncomment to use (file_path is a placeholder — point it at a real PDF first):

# file_path = Path("../data/raw/your_document.pdf")
# if file_path.exists():
#     with open(file_path, "rb") as f:
#         files = {"file": (file_path.name, f, "application/pdf")}
#         response = requests.post(f"{API_URL}/documents/upload", files=files)
#     
#     if response.status_code == 200:
#         result = response.json()
#         print(json.dumps(result, indent=2))
#     else:
#         print(f"Error: {response.status_code}")
#         print(response.text)

print("Uncomment to upload a document")

## 6. Advanced: Direct FAISS Index Exploration

In [None]:
# Load and explore FAISS index directly
import faiss
import pickle

index_dir = Path("../data/indices/main_index")
faiss_file = index_dir / "index.faiss"
metadata_file = index_dir / "documents.pkl"

# Check the two files that are actually read, not just the directory, so a
# partially written index gives a clear message instead of a traceback.
if faiss_file.is_file() and metadata_file.is_file():
    # Load FAISS index
    faiss_index = faiss.read_index(str(faiss_file))

    # Load metadata
    # SECURITY: pickle.load can execute arbitrary code while loading. Only open
    # index directories that you created yourself or otherwise trust.
    with open(metadata_file, "rb") as f:
        documents = pickle.load(f)

    print("FAISS Index loaded")
    print(f"Total vectors: {faiss_index.ntotal}")
    print(f"Dimension: {faiss_index.d}")
    print(f"Total document metadata: {len(documents)}")

    # Show first document
    if documents:
        print("\nFirst document metadata:")
        # default=str keeps the preview from failing on values that json cannot
        # encode natively; the output is truncated to 300 characters.
        print(json.dumps(documents[0], indent=2, default=str)[:300] + "...")
elif index_dir.exists():
    print(f"Index directory {index_dir} is missing index.faiss or documents.pkl")
else:
    print(f"No index found at {index_dir}")

## 7. Utility Functions

In [None]:
def batch_query(questions, pipeline, top_k=5):
    """Run each question through ``pipeline.query`` and summarize the outcomes.

    Args:
        questions: Iterable of question strings.
        pipeline: Object with a ``query(question, top_k=...)`` method whose
            result has ``answer``, ``sources`` and ``query_time_ms`` attributes.
        top_k: Forwarded to ``pipeline.query`` for every question.

    Returns:
        A list with one dict per question, in input order. A successful entry
        has the keys "question", "answer", "num_sources" and "query_time_ms".
        A failed entry has "question" and "error" (the exception text), so one
        failing question does not abort the rest of the batch.
    """
    def _summarize(question):
        # Both the query and the attribute reads are guarded, so any failure
        # for this question becomes an error record.
        try:
            response = pipeline.query(question, top_k=top_k)
            return {
                "question": question,
                "answer": response.answer,
                "num_sources": len(response.sources),
                "query_time_ms": response.query_time_ms,
            }
        except Exception as exc:
            return {"question": question, "error": str(exc)}

    return [_summarize(question) for question in questions]

# Example usage:
# questions = [
#     "What is RAG?",
#     "How does FAISS work?",
#     "What is LangChain?"
# ]
# results = batch_query(questions, rag_pipeline)
# for r in results:
#     print(json.dumps(r, indent=2))

print("batch_query function defined")

In [None]:
def compare_embeddings(text1, text2, model):
    """Embed two texts with ``model`` and measure how close the vectors are.

    Args:
        text1: First text.
        text2: Second text.
        model: Object whose ``encode(list_of_texts)`` result can be indexed
            with ``[0]`` to obtain the vector for a single text.

    Returns:
        dict with the two input texts plus "cosine_similarity" and
        "l2_distance", both converted to plain Python floats.
    """
    # Each text is encoded on its own; [0] picks the single resulting vector.
    first = model.encode([text1])[0]
    second = model.encode([text2])[0]

    # Cosine similarity: dot product over the product of the vector norms.
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    cosine = np.dot(first, second) / norm_product

    # Euclidean (L2) distance between the two vectors.
    l2 = np.linalg.norm(first - second)

    return {
        "text1": text1,
        "text2": text2,
        "cosine_similarity": float(cosine),
        "l2_distance": float(l2),
    }

# Example usage:
# result = compare_embeddings(
#     "Machine learning is AI",
#     "Deep learning is part of AI",
#     embedding_model
# )
# print(json.dumps(result, indent=2))

print("compare_embeddings function defined")

## Summary

This notebook showed you how to:

1. **Document Processor**: Parse and chunk documents
2. **Embedding Model**: Convert text to vectors
3. **Vector Store**: Store and search embeddings in a FAISS index
4. **RAG Pipeline**: Complete end-to-end system
5. **REST API**: Interact via HTTP requests
6. **Direct FAISS**: Explore index files
7. **Utilities**: Helper functions for batch operations

### Next Steps:

- Add your documents to `data/raw/`
- Build the index with `build_index.py`
- Run queries and explore results
- Customize the pipeline for your use case