# Enterprise RAG System - Final Working Version
A complete retrieval-augmented generation (RAG) system built on Amazon Bedrock, using Titan Text Embeddings v2 for retrieval and Claude 3.5 Sonnet v2 for answer generation.

In [None]:
# Cell 1: Initialize Bedrock Client
import boto3
from botocore.config import Config
import json
from IPython.display import display, Markdown

# Create the Bedrock runtime client shared by the embedding model and the
# answer-generation calls. Retries are configured for up to 10 attempts in
# botocore's 'adaptive' mode.
bedrock = boto3.client(
    service_name='bedrock-runtime',
    region_name='ap-southeast-2',
    config=Config(
        retries={'max_attempts': 10, 'mode': 'adaptive'}
    )
)

print("✅ Bedrock client initialized successfully")
# Read the region through the public `meta` accessor rather than the private
# `_client_config` attribute, which is not part of the supported client API.
print(f"Region: {bedrock.meta.region_name}")

In [None]:
# Cell 2: Set Up Document Processor
import os
import shutil
import uuid
import subprocess
import sys
from datetime import datetime
from typing import List
import chromadb
from chromadb.config import Settings

try:
    from langchain_aws import BedrockEmbeddings
    print("✅ Using updated langchain_aws package")
except ImportError:
    from langchain.embeddings import BedrockEmbeddings
    print("⚠️ Using deprecated BedrockEmbeddings")

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# EPUB support is optional. Probe for the loader first; the flag records
# whether `UnstructuredEPubLoader` ended up importable.
epub_loader_available = False
try:
    from langchain.document_loaders import UnstructuredEPubLoader
except ImportError:
    print("⚠️ EPUB loader not available - will only support PDF")
else:
    epub_loader_available = True
    print("✅ EPUB loader available")

# The probe failed: attempt a one-off install of the EPUB dependencies and
# retry the import. Any failure here simply leaves the notebook PDF-only.
if not epub_loader_available:
    try:
        print("🔧 Installing EPUB dependencies...")
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "unstructured[epub]", "pypandoc"]
        )

        # Fetch a pandoc binary through pypandoc
        import pypandoc
        pypandoc.download_pandoc()

        from langchain.document_loaders import UnstructuredEPubLoader
        epub_loader_available = True
        print("✅ EPUB loader installed successfully")
    except Exception as install_error:
        print(f"⚠️ Could not install EPUB support: {install_error}")
        print("📝 Note: Only PDF files will be supported")

# Remove database directories left behind by earlier runs. A directory that
# cannot be removed is reported and skipped; setup continues regardless.
db_paths = ["./rag_db_clean", "./enterprise_chroma_db", "./enterprise_chroma_db_v2"]
for db_path in db_paths:
    if not os.path.exists(db_path):
        continue
    try:
        shutil.rmtree(db_path)
        print(f"🗑️ Removed old database: {db_path}")
    except Exception as cleanup_error:
        print(f"⚠️ Could not remove {db_path}: {cleanup_error}")

# Titan v2 embedding model, bound to the Bedrock client created in Cell 1
embeddings = BedrockEmbeddings(
    model_id="amazon.titan-embed-text-v2:0",
    client=bedrock,
    region_name="ap-southeast-2",
)

# Smoke-test the embedding call and report the vector length it returns
test_embedding = embeddings.embed_query("test")
print(f"✅ Embeddings working - dimension: {len(test_embedding)}")

# Every run writes to its own on-disk ChromaDB directory, named with the
# first 8 characters of a random UUID (the same suffix is reused for chunk ids)
unique_id = str(uuid.uuid4())[:8]
db_path = f"./rag_db_{unique_id}"

chroma_client = chromadb.PersistentClient(
    path=db_path,
    settings=Settings(anonymized_telemetry=False, is_persistent=True),
)

print(f"✅ ChromaDB client created at: {db_path}")
print(
    "✅ Document processor ready (supports PDF and EPUB)"
    if epub_loader_available
    else "✅ Document processor ready (PDF only)"
)

In [None]:
# Cell 3: Process Book and Create Collection with Performance Monitoring
import time

# Candidate locations for the book; PDFs are listed first so that a PDF wins
# whenever both formats are present.
book_paths = [
    '../book.pdf', './book.pdf', 'book.pdf',
    '../book.epub', './book.epub', 'book.epub'
]
file_type = None
book_path = next(
    (candidate for candidate in book_paths if os.path.exists(candidate)),
    None,
)

# Nothing found: stop the cell with an actionable error
if not book_path:
    print("❌ No book file found!")
    raise FileNotFoundError("Please place book.pdf or book.epub in the current directory")

# Derive the format from the extension and report what was picked up
file_type = 'epub' if book_path.endswith('.epub') else 'pdf'
file_size_mb = os.path.getsize(book_path) / (1024 * 1024)
print(f"📖 Found {file_type.upper()} at: {book_path} ({file_size_mb:.1f} MB)")

# Load the book with performance timing. `load_time` is recorded on every
# successful path (primary or fallback) because the performance summary later
# in this cell reads it; previously the fallback path left it undefined.
load_start_time = time.time()
try:
    if file_type == 'epub' and epub_loader_available:
        print("📚 Using EPUB loader (may be slower due to XML parsing overhead)")
        loader = UnstructuredEPubLoader(book_path)
    else:
        # An EPUB without a working loader cannot be read directly; raising
        # here hands control to the alternative-format search below.
        if file_type == 'epub':
            print("⚠️ EPUB loader not available, please convert to PDF or install pandoc")
            print("📝 To install pandoc: brew install pandoc (macOS) or apt-get install pandoc (Linux)")
            raise FileNotFoundError("Please use PDF format or install pandoc for EPUB support")
        print("📄 Using PDF loader (optimized for performance)")
        loader = PyPDFLoader(book_path)

    docs = loader.load()
    load_time = time.time() - load_start_time
    # A coarse clock can report 0.0s for a tiny file; avoid ZeroDivisionError.
    load_rate = len(docs) / load_time if load_time > 0 else float('inf')
    print(f"📄 Loaded {len(docs)} sections/pages in {load_time:.2f}s ({load_rate:.1f} pages/sec)")

except Exception as e:
    print(f"❌ Error loading {file_type.upper()} file: {e}")

    # Look for the same book in the other format.
    alt_paths = ['../book.pdf', './book.pdf', 'book.pdf'] if file_type == 'epub' else ['../book.epub', './book.epub', 'book.epub']
    alt_found = False

    for alt_path in alt_paths:
        if not os.path.exists(alt_path):
            continue

        alt_type = 'pdf' if alt_path.endswith('.pdf') else 'epub'
        # UnstructuredEPubLoader is only bound when the EPUB import succeeded,
        # so an EPUB alternative must be skipped rather than attempted.
        if alt_type == 'epub' and not epub_loader_available:
            print(f"⚠️ Skipping {alt_path}: EPUB loader not available")
            continue

        print(f"🔄 Trying alternative format: {alt_path}")
        try:
            fallback_start = time.time()
            if alt_type == 'pdf':
                loader = PyPDFLoader(alt_path)
            else:
                loader = UnstructuredEPubLoader(alt_path)
            docs = loader.load()
            fallback_time = time.time() - fallback_start
        except Exception as alt_e:
            print(f"❌ Alternative also failed: {alt_e}")
            continue

        # Success: switch the cell over to the alternative file.
        book_path = alt_path
        file_type = alt_type
        alt_found = True
        # Total load time covers the failed primary attempt plus the fallback.
        load_time = time.time() - load_start_time
        print(f"✅ Successfully loaded {alt_type.upper()}: {len(docs)} pages in {fallback_time:.2f}s")
        break

    if not alt_found:
        # Chain the original failure so the root cause stays in the traceback.
        raise RuntimeError("Could not load any book format. Please ensure you have a valid PDF file.") from e

# Split the loaded documents into overlapping chunks, timing the operation.
# Chunks are at most 1000 characters (measured with `len`) with a 200-character
# overlap; the splitter tries paragraph, line, sentence and word boundaries in
# that order.
chunk_start_time = time.time()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " "],
    length_function=len
)
chunks = text_splitter.split_documents(docs)
chunk_time = time.time() - chunk_start_time
# Splitting a small book can finish within one clock tick, leaving
# chunk_time == 0.0; guard the rate so the cell does not die on a division.
chunk_rate = len(chunks) / chunk_time if chunk_time > 0 else float('inf')
print(f"📝 Created {len(chunks)} chunks in {chunk_time:.2f}s ({chunk_rate:.1f} chunks/sec)")

# A new collection per run, named with the current Unix timestamp
collection_name = f"book_rag_{int(time.time())}"
collection = chroma_client.create_collection(collection_name)
print(f"✅ Created collection: {collection_name}")

# Embed and store the chunks batch by batch, reporting progress as we go
batch_size = 25  # Conservative batch size for all formats
total_processed = 0
embedding_start_time = time.time()
total_batches = -(-len(chunks) // batch_size)  # ceiling division

print(f"🔄 Processing {len(chunks)} chunks in batches of {batch_size}...")

for batch_offset in range(0, len(chunks), batch_size):
    batch = chunks[batch_offset:batch_offset + batch_size]
    batch_num = batch_offset // batch_size + 1

    batch_start = time.time()
    print(f"   Batch {batch_num}/{total_batches}: {len(batch)} chunks", end=" ")

    try:
        # Build the three parallel lists Chroma expects in a single pass;
        # chunk ids are global positions within `chunks`.
        documents, metadatas, ids = [], [], []
        for chunk_id, chunk in enumerate(batch, start=batch_offset):
            documents.append(chunk.page_content)
            metadatas.append({
                "source": f"book.{file_type}",
                "chunk_id": chunk_id,
                "page": getattr(chunk, 'metadata', {}).get('page', 0),
                "timestamp": datetime.now().isoformat(),
                "file_type": file_type
            })
            ids.append(f"chunk_{chunk_id}_{unique_id}")

        # Embed the batch, then store text, metadata and vectors together
        batch_embeddings = embeddings.embed_documents(documents)
        collection.add(
            documents=documents,
            metadatas=metadatas,
            ids=ids,
            embeddings=batch_embeddings
        )

        batch_time = time.time() - batch_start
        total_processed += len(batch)
        progress = (total_processed / len(chunks)) * 100
        print(f"({batch_time:.1f}s) ✅ {total_processed}/{len(chunks)} ({progress:.1f}%)")

    except Exception as batch_error:
        # The first failing batch ends ingestion; earlier batches stay stored
        print(f"   ❌ Batch {batch_num} failed: {batch_error}")
        break

# Wall-clock totals: embedding covers the batch loop only, total covers
# everything since loading started.
embedding_time = time.time() - embedding_start_time
total_time = time.time() - load_start_time

# Only claim success when every chunk made it into the collection. The batch
# loop stops at the first failing batch, and this line used to announce
# "Successfully processed" even for a partial ingest.
if total_processed == len(chunks):
    print(f"\n🎉 Successfully processed {total_processed} chunks from {file_type.upper()}!")
else:
    print(f"\n⚠️ Processed only {total_processed}/{len(chunks)} chunks from {file_type.upper()} - ingestion stopped early")

# Guard the rate against a zero elapsed time on coarse clocks.
overall_rate = total_processed / total_time if total_time > 0 else 0.0

print("📊 Performance Summary:")
print(f"   • File loading: {load_time:.2f}s")
print(f"   • Text chunking: {chunk_time:.2f}s")
print(f"   • Embedding + storage: {embedding_time:.2f}s")
print(f"   • Total time: {total_time:.2f}s")
print(f"   • Overall rate: {overall_rate:.1f} chunks/sec")

# Verify collection: ask the store how many documents it actually holds.
# A failure here is reported but does not abort the cell.
try:
    count = collection.count()
    print(f"✅ Collection verification: {count} documents stored")
except Exception as e:
    print(f"⚠️ Collection verification failed: {e}")

In [None]:
# Cell 4: RAG Engine
class RAGEngine:
    """Answer questions from a vector collection using a Bedrock chat model.

    The engine embeds the question, retrieves the nearest chunks from the
    collection, and asks the model to answer from those chunks only.

    Args:
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``
            and returning a dict with ``'documents'`` and ``'metadatas'`` lists.
        bedrock_client: Object exposing ``invoke_model(modelId=..., body=...,
            contentType=...)`` whose response has a readable ``'body'``.
        embeddings_model: Object exposing ``embed_query(text)``. It should be
            the same model used to embed the stored chunks.
        model_id: Bedrock model identifier used for answer generation.
        max_tokens: Default cap on the length of a generated answer.
    """

    DEFAULT_MODEL_ID = 'anthropic.claude-3-5-sonnet-20241022-v2:0'
    DEFAULT_MAX_TOKENS = 500

    def __init__(self, collection, bedrock_client, embeddings_model,
                 model_id: str = DEFAULT_MODEL_ID,
                 max_tokens: int = DEFAULT_MAX_TOKENS):
        self.collection = collection
        self.bedrock = bedrock_client
        self.embeddings = embeddings_model
        self.model_id = model_id
        self.max_tokens = max_tokens

    def query(self, question: str, k: int = 3, max_tokens=None):
        """Ask a question and get an answer with sources.

        Args:
            question: The question to answer.
            k: Number of chunks to retrieve as context.
            max_tokens: Optional per-call override of the answer length cap;
                ``None`` uses the value given to the constructor.

        Returns:
            A dict with ``'answer'`` (str) and ``'sources'`` (list of dicts
            holding a ``'content'`` preview of at most 200 characters plus
            ``'metadata'``). Failures are not raised: they come back as an
            ``'Error: ...'`` answer with no sources.
        """
        try:
            # Generate query embedding with same model as collection
            query_embedding = self.embeddings.embed_query(question)

            # Retrieve relevant chunks using the embedding
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=k
            )

            # Treat a missing or empty result list as "nothing found" instead
            # of letting an IndexError surface as an opaque error answer.
            documents = (results.get('documents') or [[]])[0]
            if not documents:
                return {'answer': 'No relevant context found', 'sources': []}
            metadatas = results['metadatas'][0]

            # Format context
            context = "\n\n".join(
                f"Source {position}: {document}"
                for position, document in enumerate(documents, start=1)
            )

            # Create prompt
            prompt = f"""Based on the following French context, answer the question in English. 
            Translate and summarize the relevant information.

            Context (in French):
            {context}

            Question (in English): {question}

            Please provide a detailed answer in English:"""

            # Get the answer from the configured Bedrock model
            response = self.bedrock.invoke_model(
                modelId=self.model_id,
                body=json.dumps({
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": self.max_tokens if max_tokens is None else max_tokens,
                    "anthropic_version": "bedrock-2023-05-31"
                }),
                contentType='application/json'
            )

            result = json.loads(response['body'].read())
            answer = result['content'][0]['text']

            # Format sources: long chunks are shortened to a 200-char preview
            sources = [{
                'content': document[:200] + "..." if len(document) > 200 else document,
                'metadata': metadata
            } for document, metadata in zip(documents, metadatas)]

            return {'answer': answer, 'sources': sources}

        except Exception as e:
            # Deliberate best-effort: callers display the message rather than
            # handling an exception.
            return {'answer': f'Error: {e}', 'sources': []}

# Wire the engine to the collection, Bedrock client and embedding model
# created in the earlier cells
rag = RAGEngine(
    collection=collection,
    bedrock_client=bedrock,
    embeddings_model=embeddings,
)
print("✅ RAG Engine ready!")

In [29]:
# Cell 5: Test the RAG System
def ask_question(question, k=3):
    """Query the module-level ``rag`` engine, print the result, and return it.

    Prints the question, the answer, and for each source its chunk id, page
    and content preview, framed by 80-character rules. ``k`` is forwarded to
    ``rag.query``. The raw result dict is returned unchanged.
    """
    result = rag.query(question, k=k)
    divider = "=" * 80

    print(divider)
    print(f"🤔 QUESTION: {question}")
    print(divider)
    print(f"\n💡 ANSWER:")
    print(result['answer'])

    # The sources section is omitted entirely when nothing was retrieved
    if result['sources']:
        print(f"\n📚 SOURCES:")
        for position, source in enumerate(result['sources'], start=1):
            metadata = source['metadata']
            print(f"\n{position}. Chunk {metadata['chunk_id']} (Page {metadata['page']})")
            print(f"   {source['content']}")

    print(divider)
    return result

# Smoke test: run each sample question through the full pipeline
test_questions = ["Provide me a summary of this book, chapter by chapter"]

print("🧪 Testing RAG System...\n")

for test_question in test_questions:
    ask_question(test_question)
    print("\n")

print("✅ RAG system is working! You can now use ask_question('your question') to query the book.")

🧪 Testing RAG System...

🤔 QUESTION: Provide me a summary of this book, chapter by chapter

💡 ANSWER:
Based on the given French context, I cannot provide a complete chapter-by-chapter summary of the book, as the excerpts only show introductory or concluding remarks about different sections. However, I can tell you what these passages reveal about the book's structure and themes:

1. The book appears to contain three main parts:

- A section about the United States (Source 1)
The author apologizes for the schematic nature of three chapters devoted to the US, acknowledging that the crisis in American society is complex enough to warrant its own book. The author prioritizes timely analysis over academic perfection due to ongoing war-related developments.

- A section about geopolitical relations (Source 2)
This appears to be a concluding section that chronologically analyzes the thirty years following the Cold War, focusing on how NATO became entangled in the "Ukrainian trap." It examines

In [None]:
# Performance Comparison and Optimization Tips
def compare_formats():
    """Print a static PDF-versus-EPUB performance briefing.

    Nothing is measured here: the function prints fixed explanatory text,
    optimization tips, and a hard-coded recap of the pipeline settings.
    """
    sections = (
        ("🔍 Why EPUB might be slower:", (
            "• UnstructuredEPubLoader parses XML/XHTML structure",
            "• Extracts from multiple files within EPUB archive",
            "• Converts complex formatting to plain text",
            "• Requires pandoc and additional dependencies",
            "• More metadata processing overhead",
        )),
        ("\n⚡ Why PDF is often faster:", (
            "• PyPDFLoader is highly optimized",
            "• Direct binary format parsing",
            "• Mature codebase with performance optimizations",
            "• Less dependency overhead",
        )),
        ("\n🚀 Optimization recommendations:", (
            "• Use PDF format for best performance",
            "• Consider pre-processing EPUB to text files",
            "• Increase batch_size if you have more memory",
            "• Use parallel processing for very large documents",
        )),
        # Show current settings (literal values, not read from the pipeline)
        ("\n⚙️ Current settings:", (
            "• Batch size: 25 chunks",
            "• Chunk size: 1000 characters",
            "• Chunk overlap: 200 characters",
            "• Embedding model: amazon.titan-embed-text-v2:0 (1024 dims)",
        )),
    )

    print("📊 PDF vs EPUB Performance Analysis")
    print("=" * 50)

    for heading, bullets in sections:
        print(heading)
        for bullet in bullets:
            print(bullet)


compare_formats()