In [7]:
# ============================================================================
# ADVANCED RAG: COMPRESSION TECHNIQUE (Agent vs Non-Agent)
# ============================================================================
# This notebook compares:
# 1. Basic RAG (no compression)
# 2. RAG with Contextual Compression (without agent)
# 3. Agentic RAG with Compression (with agent)

# Compression = Filter/compress retrieved docs to only relevant parts
# Agent = LLM decides retrieval strategy dynamically

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Compression imports
from langchain_classic.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_classic.retrievers.document_compressors import LLMChainExtractor


# Agent imports
from langchain_classic.agents import AgentExecutor, create_openai_tools_agent
from langchain_classic.tools.retriever import create_retriever_tool

import os
from dotenv import load_dotenv

# Load environment variables from a local .env file
# (presumably where the OpenAI credentials live — confirm for your setup).
load_dotenv()

# Model identifiers are named once here so they are easy to swap.
CHAT_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"

# Chat model shared by every pipeline in this notebook (generation, compression, agent).
llm = ChatOpenAI(temperature=0, model=CHAT_MODEL)
# Embedding model used to index and query the vector store.
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

print("✓ Imports and LLM loaded")

✓ Imports and LLM loaded


In [8]:
# ============================================================================
# SETUP: Load Documents and Create Vector Store
# ============================================================================

# Corpus location and chunking/retrieval settings, gathered in one place for tuning.
CORPUS_PATH = "rag_test.txt"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
TOP_K = 4

# Decode explicitly as UTF-8. Without an encoding, TextLoader falls back to the
# platform default, which can fail or garble the non-ASCII characters in the
# corpus (e.g. the rupee sign) on systems whose default is not UTF-8.
loader = TextLoader(CORPUS_PATH, encoding="utf-8")
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = text_splitter.split_documents(documents)

# Fail with a readable message instead of an obscure error from the index
# builder when the corpus file is empty.
if not chunks:
    raise ValueError(f"No text chunks were produced from {CORPUS_PATH!r}; is the file empty?")

vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)
# Similarity search returning the TOP_K closest chunks per query.
base_retriever = vector_store.as_retriever(search_kwargs={"k": TOP_K})

print(f"✓ Loaded {len(chunks)} chunks into vector store")

✓ Loaded 12 chunks into vector store


In [9]:
# ============================================================================
# METHOD 1: BASIC RAG (No Compression) - BASELINE
# ============================================================================
# Simple retrieve → generate pipeline
# Problem: Returns full chunks even if only a small part is relevant

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# Prompt that restricts the model to the supplied context; the compression
# chain later in this notebook reuses the same prompt object.
BASIC_TEMPLATE = """
Answer the question based only on the following context:
{context}

Question: {question}
"""
basic_prompt = ChatPromptTemplate.from_template(BASIC_TEMPLATE)

# The incoming question is fanned out: one branch retrieves chunks and joins
# them into a context string, the other forwards the question unchanged.
basic_inputs = {
    "context": base_retriever | format_docs,
    "question": RunnablePassthrough(),
}

# retrieve -> fill prompt -> generate -> plain string
basic_rag_chain = basic_inputs | basic_prompt | llm | StrOutputParser()

print("✓ Basic RAG chain created (no compression)")

✓ Basic RAG chain created (no compression)


In [10]:
# ============================================================================
# METHOD 2: RAG WITH CONTEXTUAL COMPRESSION (Without Agent)
# ============================================================================
# Compression extracts ONLY the relevant parts from retrieved documents
# 
# Flow:
#   Question → Retrieve chunks → LLM Compressor extracts relevant parts → Generate
#
# Benefits:
#   - Reduces noise in context
#   - Smaller final prompt (but the compressor adds one LLM call per retrieved chunk, so total cost can go up)
#   - More focused answers

# LLM-backed extractor: for each retrieved chunk it keeps only the passages
# relevant to the query.
compressor = LLMChainExtractor.from_llm(llm)

# Retriever that first runs the base similarity search, then passes the hits
# through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=base_retriever,
    base_compressor=compressor,
)

# Same shape as the basic chain; only the retriever feeding "context" differs.
compression_inputs = {
    "context": compression_retriever | format_docs,
    "question": RunnablePassthrough(),
}
compression_rag_chain = compression_inputs | basic_prompt | llm | StrOutputParser()

print(
    "✓ Compression RAG chain created (without agent)",
    "  - Uses LLMChainExtractor to compress retrieved docs",
    "  - Extracts only question-relevant content from each chunk",
    sep="\n",
)

✓ Compression RAG chain created (without agent)
  - Uses LLMChainExtractor to compress retrieved docs
  - Extracts only question-relevant content from each chunk


In [11]:
# ============================================================================
# METHOD 3: AGENTIC RAG WITH COMPRESSION (With Agent)
# ============================================================================
# Agent decides WHEN and HOW to retrieve
#
# Flow:
#   Question → Agent thinks → Decides to use retrieval tool → 
#   Compressed retrieval → Agent analyzes → May retrieve again → Generate
#
# Benefits:
#   - Can decide if retrieval is needed at all (note: rule 1 of the system prompt below forces a search every time)
#   - Can re-retrieve with different queries
#   - Can combine multiple retrieval strategies
#   - Self-correcting behavior

# Expose the compressing retriever to the agent as a named tool. The
# description is what the model reads when deciding whether to call it.
KB_TOOL_NAME = "lemobank_knowledge_base"
KB_TOOL_DESCRIPTION = (
    "Search for information about LemoBank products, policies, fees, and services. "
    "Use this tool when you need specific information about LemoBank."
)
retriever_tool = create_retriever_tool(
    compression_retriever,
    name=KB_TOOL_NAME,
    description=KB_TOOL_DESCRIPTION,
)
agent_tools = [retriever_tool]

# System instructions for the agent
AGENT_SYSTEM_MESSAGE = """You are a helpful LemoBank assistant. You have access to a knowledge base tool.

RULES:
1. ALWAYS use the knowledge base tool to find information before answering
2. If the first search doesn't find relevant info, try rephrasing your search
3. Only answer based on retrieved information
4. If information is not found, say "I don't have this information"
5. Cite your sources when possible"""

# system rules -> user question -> scratchpad slot for the agent's tool calls/results
agent_prompt = ChatPromptTemplate.from_messages([
    ("system", AGENT_SYSTEM_MESSAGE),
    ("human", "{input}"),
    ("placeholder", "{agent_scratchpad}"),
])

# Build the tools agent (create_openai_tools_agent rather than the deprecated
# functions agent) and wrap it in an executor; verbose=True prints each step.
agent = create_openai_tools_agent(llm, agent_tools, agent_prompt)
agent_executor = AgentExecutor(agent=agent, tools=agent_tools, verbose=True)

print(
    "✓ Agentic RAG created (with compression)",
    "  - Agent decides when to retrieve",
    "  - Can re-retrieve with different queries",
    "  - Self-correcting behavior",
    sep="\n",
)

✓ Agentic RAG created (with compression)
  - Agent decides when to retrieve
  - Can re-retrieve with different queries
  - Self-correcting behavior


In [12]:
# ============================================================================
# COMPARISON: Test All Three Methods
# ============================================================================

# Sample questions for the side-by-side comparison of the three pipelines.
test_questions = [
    "What is the latest LemoCard annual fee?",
    "Can I reverse a wallet transfer?",
    "What is the refund timeline and what affects it?",
]

def compare_methods(question):
    """Run one question through all three pipelines and print each answer.

    The pipelines are executed in order: basic RAG, compression RAG, then the
    agent executor. Each answer is printed under its own banner; nothing is
    returned.

    Args:
        question: The question string sent to every pipeline.
    """
    divider = "=" * 80
    print("\n" + divider)
    print(f"QUESTION: {question}")
    print(divider)

    # (banner, runner) pairs; each runner returns the text to display.
    # The agent executor returns a dict, so its answer is read from "output".
    pipelines = (
        (
            "\n--- METHOD 1: BASIC RAG (No Compression) ---",
            lambda q: basic_rag_chain.invoke(q),
        ),
        (
            "\n--- METHOD 2: COMPRESSION RAG (Without Agent) ---",
            lambda q: compression_rag_chain.invoke(q),
        ),
        (
            "\n--- METHOD 3: AGENTIC RAG (With Agent + Compression) ---",
            lambda q: agent_executor.invoke({"input": q})["output"],
        ),
    )

    for banner, run in pipelines:
        # Print the banner before invoking so verbose agent logs appear under it.
        print(banner)
        answer = run(question)
        print(f"Answer: {answer}")

# Run the comparison for the first question only; every call invokes all
# three pipelines, so each extra question means another round of LLM requests.
compare_methods(test_questions[0])


QUESTION: What is the latest LemoCard annual fee?

--- METHOD 1: BASIC RAG (No Compression) ---
Answer: The latest LemoCard annual fee is ₹1,499, effective from 2025-11-15.

--- METHOD 2: COMPRESSION RAG (Without Agent) ---
Answer: The latest LemoCard annual fee is ₹1,499.

--- METHOD 3: AGENTIC RAG (With Agent + Compression) ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `lemobank_knowledge_base` with `{'query': 'LemoCard annual fee'}`


[0m[36;1m[1;3m- LemoCard annual fee: ₹999
- LemoCard annual fee increased to ₹1,499

- “annual fee” may be phrased as “membership fee”

Q3: What is the *latest* LemoCard annual fee?  
Expected: ₹1,499 (effective 2025-11-15), not ₹999.[0m[32;1m[1;3m
Invoking: `lemobank_knowledge_base` with `{'query': 'latest LemoCard membership fee'}`


[0m[36;1m[1;3m- LemoCard annual fee increased to ₹1,499

Q3: What is the *latest* LemoCard annual fee?  
Expected: ₹1,499 (effective 2025-11-15), not ₹999.[0m[32;1m[1;3mThe late

In [13]:
# ============================================================================
# VISUALIZE: What Compression Actually Does
# ============================================================================
# Let's see the difference between raw retrieval vs compressed retrieval

question = "What is the annual fee?"
PREVIEW_CHARS = 300  # how much of each raw chunk to print

print("="*80)
print("RAW RETRIEVAL (No Compression)")
print("="*80)
raw_docs = base_retriever.invoke(question)
for i, doc in enumerate(raw_docs, start=1):
    text = doc.page_content
    print(f"\n[Chunk {i}] ({len(text)} chars)")
    # Add the ellipsis only when the chunk really was cut, so a short chunk
    # is not presented as if part of it were hidden.
    print(text[:PREVIEW_CHARS] + ("..." if len(text) > PREVIEW_CHARS else ""))

print("\n" + "="*80)
print("COMPRESSED RETRIEVAL")
print("="*80)
compressed_docs = compression_retriever.invoke(question)
for i, doc in enumerate(compressed_docs, start=1):
    print(f"\n[Compressed {i}] ({len(doc.page_content)} chars)")
    print(doc.page_content)

# Calculate compression ratio (character counts, not tokens)
raw_total = sum(len(d.page_content) for d in raw_docs)
compressed_total = sum(len(d.page_content) for d in compressed_docs)
print("\n--- COMPRESSION STATS ---")
print(f"Raw total: {raw_total} chars")
print(f"Compressed total: {compressed_total} chars")
if raw_total:
    print(f"Compression ratio: {(1 - compressed_total/raw_total)*100:.1f}% reduction")
else:
    # Nothing retrieved: the ratio is undefined, so report that instead of
    # raising ZeroDivisionError.
    print("Compression ratio: n/a (no text was retrieved)")

RAW RETRIEVAL (No Compression)

[Chunk 1] (860 chars)
[A2] Product Overview
LemoBank has three products:
1) LemoCard (credit card)
   - Annual fee: ₹999
   - Cashback: 1% on all spends
2) LemoPay (UPI + wallet)
   - Instant transfers within India
3) LemoVault (savings)
   - Base interest: 4% per annum
   - Bonus interest: +1% per annum if monthly avera...

[Chunk 2] (906 chars)
Incident: INC-2025-12-20-002
- Date: 2025-12-20
- Service affected: LemoCard statement generation
- Impact: statements for 1,200 users generated 1 day late
- Root cause: batch job misconfigured cron
- Status: Resolved

[A9] FAQ (simple Q/A)
Q: Can I change my registered phone number?
A: Yes. It requ...

[Chunk 3] (952 chars)
[B5] Multi-hop (definition + threshold)
Q5: Explain how monthly average balance is computed and why it matters for bonus interest.
Expected: formula from [A13] + tie to bonus threshold.

[B6] Table lookup
Q6: Give the Bengaluru branch address.
Expected: Indiranagar address from [A10].

[B7] 

In [None]:
# ============================================================================
# AGENT BEHAVIOR: Watch the Agent Think
# ============================================================================
# The agent can:
# 1. Decide if retrieval is needed
# 2. Rephrase queries if first search fails
# 3. Make multiple searches for complex questions

# Complex question that might need multiple retrievals
complex_question = "Compare the old and new annual fees, and explain the refund policy"

banner = "=" * 80
print(banner, "AGENTIC RAG: Complex Multi-Part Question", banner, sep="\n")
print(f"Question: {complex_question}\n")

# The executor was built with verbose=True, so its intermediate steps are
# printed as it runs; the final text is under the "output" key.
result = agent_executor.invoke({"input": complex_question})
final_answer = result["output"]
print(f"\nFINAL ANSWER: {final_answer}")

In [14]:
# ============================================================================
# SUMMARY: When to Use Each Method
# ============================================================================

# Static cheat-sheet comparing the three pipelines.
# In the "LLM Calls" row, k is the number of chunks the retriever returns
# (4 in this notebook). LLMChainExtractor runs the LLM once per retrieved
# chunk, so compression costs k calls plus one for generation. The agent pays
# those k calls on every search, on top of its own reasoning turns.
summary = """
╔══════════════════════════════════════════════════════════════════════════════╗
║                    RAG COMPRESSION: AGENT VS NON-AGENT                       ║
╠══════════════════════════════════════════════════════════════════════════════╣
║                                                                              ║
║  METHOD 1: BASIC RAG (No Compression)                                        ║
║  ├─ Flow: Question → Retrieve → Generate                                     ║
║  ├─ Pros: Fast, simple, low cost                                             ║
║  ├─ Cons: Full chunks (noisy), may exceed context limits                     ║
║  └─ Use when: Simple questions, small chunks, cost-sensitive                 ║
║                                                                              ║
║  METHOD 2: COMPRESSION RAG (Without Agent)                                   ║
║  ├─ Flow: Question → Retrieve → Compress → Generate                          ║
║  ├─ Pros: Focused context, saves tokens, better answers                      ║
║  ├─ Cons: Extra LLM call per retrieved chunk, slower                         ║
║  └─ Use when: Large chunks, need precise answers, have noisy docs            ║
║                                                                              ║
║  METHOD 3: AGENTIC RAG (With Agent + Compression)                            ║
║  ├─ Flow: Question → Agent decides → Retrieve → Compress → Analyze → ...     ║
║  ├─ Pros: Self-correcting, handles complex questions, can re-retrieve        ║
║  ├─ Cons: Slower, more expensive, unpredictable # of LLM calls               ║
║  └─ Use when: Complex multi-part questions, need reliability over speed      ║
║                                                                              ║
╠══════════════════════════════════════════════════════════════════════════════╣
║                           COMPARISON TABLE                                   ║
╠═══════════════════╦═══════════════╦═══════════════════╦══════════════════════╣
║ Feature           ║ Basic RAG     ║ Compression RAG   ║ Agentic RAG          ║
╠═══════════════════╬═══════════════╬═══════════════════╬══════════════════════╣
║ LLM Calls         ║ 1             ║ k+1 (k=4 chunks)  ║ k+2 or more (varies) ║
║ Latency           ║ Fast          ║ Medium            ║ Slow                 ║
║ Cost              ║ Low           ║ Medium            ║ High                 ║
║ Context Quality   ║ Noisy         ║ Focused           ║ Focused              ║
║ Self-Correction   ║ No            ║ No                ║ Yes                  ║
║ Multi-hop         ║ No            ║ No                ║ Yes                  ║
║ Query Rewriting   ║ No            ║ No                ║ Yes                  ║
╚═══════════════════╩═══════════════╩═══════════════════╩══════════════════════╝
"""

print(summary)


╔══════════════════════════════════════════════════════════════════════════════╗
║                    RAG COMPRESSION: AGENT VS NON-AGENT                       ║
╠══════════════════════════════════════════════════════════════════════════════╣
║                                                                              ║
║  METHOD 1: BASIC RAG (No Compression)                                        ║
║  ├─ Flow: Question → Retrieve → Generate                                     ║
║  ├─ Pros: Fast, simple, low cost                                             ║
║  ├─ Cons: Full chunks (noisy), may exceed context limits                     ║
║  └─ Use when: Simple questions, small chunks, cost-sensitive                 ║
║                                                                              ║
║  METHOD 2: COMPRESSION RAG (Without Agent)                                   ║
║  ├─ Flow: Question → Retrieve → Compress → Generate                          ║
║  ├─ Pros: Focused context