In [12]:
# ============================================================================
# COMPREHENSIVE RETRIEVER COMPARISON TABLE - BEAUTIFULLY FORMATTED
# ============================================================================
# Detailed table with all key characteristics of each retriever type

from tabulate import tabulate
import pandas as pd

# Section banner for the retriever comparison table.
# The title is centred with str.center (the same approach the DETAILED NOTES
# header uses) instead of hand-counted space padding, so it stays centred and
# the row stays exactly `banner_width` characters wide if the title changes.
banner_width = 150
banner_title = "LANGCHAIN RETRIEVER COMPARISON TABLE"
banner_rule = "█" * banner_width
banner_blank = "█" + " " * (banner_width - 2) + "█"
banner_title_row = "█" + banner_title.center(banner_width - 2) + "█"

print("\n" + banner_rule)
print(banner_blank)
print(banner_title_row)
print(banner_blank)
print(banner_rule + "\n")

# Rows for the retriever comparison table. Each row has one cell per entry in
# `headers`, in this order: retriever name, purpose, ideal usage scenario,
# relative cost, what it is good for, and where it is suitable.
# Embedded "\n" characters are deliberate manual line breaks inside a cell.
retriever_data = [
    [
        "Similarity Search",
        "Returns most similar documents\nbased on vector similarity",
        "General RAG, standard\napplications, prototyping",
        "Free",
        "Fast, simple search\nMVP, learning",
        "✓ Development\n✓ Production\n✓ Learning",
    ],
    [
        "Maximum Marginal\nRelevance (MMR)",
        "Returns diverse results\nwhile reducing redundancy",
        "Research, exploring multiple\nperspectives, recommendations",
        "Low",
        "Diverse results\nnon-redundant content",
        "✓ Research\n✓ Exploration\n✓ Recommendations",
    ],
    [
        "Similarity Score\nThreshold",
        "Returns only high-quality\nmatches above threshold",
        "Compliance systems,\ncritical applications",
        "Free",
        "Quality guarantees\nerror prevention",
        "✓ Compliance\n✓ Safety\n✓ Finance",
    ],
    [
        "BM25 Retriever",
        "Traditional keyword-based\nsearch (exact matching)",
        "Technical queries,\nacronyms, exact phrase",
        "Free",
        "Technical docs\ncode search, acronyms",
        "✓ Tech docs\n✓ Code\n✓ Structured data",
    ],
    [
        "Ensemble Retriever",
        "Combines multiple\nretrieval methods",
        "Hybrid search combining\nkeyword + semantic",
        "Medium",
        "Best of both worlds\nprecision + recall",
        "✓ Enterprise\n✓ Advanced search\n✓ Hybrid",
    ],
    [
        "Multi-Query\nRetriever",
        "Generates multiple query\nvariations for better recall",
        "Complex questions,\nambiguous queries",
        "High",
        "Understanding intent\nparaphrased queries",
        "✓ Q&A\n✓ Chat\n✓ Complex questions",
    ],
    [
        "Contextual\nCompression",
        "Compresses documents to\nextract relevant info only",
        "Reducing tokens,\nextracting key information",
        "High",
        "Token optimization\nsummarization",
        "✓ Cost-sensitive\n✓ Token limits\n✓ Extraction",
    ],
]

# Column titles, one per cell in each row above. Labels use the same Title Case
# convention as the vector-store table headers ("Vector Store", "Cost", ...).
headers = ["Retriever Type", "Purpose", "Ideal Usage Scenario", "Cost", "Good For", "Suitable"]

# Render the retriever rows as a centred grid table. The per-column width caps
# are passed to tabulate via `maxcolwidths`, one cap per column.
_retriever_table_options = {
    "headers": headers,
    "tablefmt": "grid",
    "stralign": "center",
    "maxcolwidths": [18, 28, 24, 12, 24, 28],
}
_retriever_table = tabulate(retriever_data, **_retriever_table_options)
print(_retriever_table)

# Blank line followed by a full-width closing rule for this section.
print()
print("█" * 150)

# ============================================================================
# VECTOR STORE COMPARISON TABLE
# ============================================================================
# Section banner for the vector store comparison table.
# The title is centred with str.center (the same approach the DETAILED NOTES
# header uses) instead of hand-counted space padding, so it stays centred and
# the row stays exactly `banner_width` characters wide if the title changes.
banner_width = 150
banner_title = "VECTOR STORE COMPARISON"
banner_rule = "█" * banner_width
banner_blank = "█" + " " * (banner_width - 2) + "█"
banner_title_row = "█" + banner_title.center(banner_width - 2) + "█"

print("\n" + banner_rule)
print(banner_blank)
print(banner_title_row)
print(banner_blank)
print(banner_rule + "\n")

# Column titles for the vector store table, one per field in each record below.
headers_vs = ["Vector Store", "Storage Type", "Speed", "Cost", "Best For", "Persistence"]

# Speed labels that appear in more than one row.
_SPEED_FASTEST = "⚡⚡⚡⚡⚡ Fastest"
_SPEED_FAST = "⚡⚡⚡⚡ Fast"

# One record per vector store, in the same field order as `headers_vs`.
_vector_store_records = (
    ("FAISS", "In-Memory", _SPEED_FASTEST, "Free", "Learning & Dev", "No"),
    ("Chroma", "Local/Cloud", _SPEED_FAST, "Free", "Small Projects", "Yes"),
    ("Pinecone", "Cloud", _SPEED_FAST, "$", "Enterprise Scale", "Yes"),
    ("Weaviate", "Cloud/Self-hosted", _SPEED_FAST, "$$", "Enterprise Scale", "Yes"),
    ("Qdrant", "Cloud/Self-hosted", _SPEED_FASTEST, "$$", "High Performance", "Yes"),
)

# Table rows as a list of lists, the shape the table printer is given.
vector_store_data = [list(record) for record in _vector_store_records]

# Render the vector store rows as a centred grid table. The per-column width
# caps are passed to tabulate via `maxcolwidths`, one cap per column.
_vector_store_table_options = {
    "headers": headers_vs,
    "tablefmt": "grid",
    "stralign": "center",
    "maxcolwidths": [15, 20, 25, 12, 20, 15],
}
_vector_store_table = tabulate(vector_store_data, **_vector_store_table_options)
print(_vector_store_table)

# Blank line followed by a full-width closing rule for this section.
print()
print("█" * 150)

# ============================================================================
# QUICK REFERENCE GUIDE
# ============================================================================
# Section banner for the quick reference guide.
# The title is centred with str.center (the same approach the DETAILED NOTES
# header uses) instead of hand-counted space padding, so it stays centred and
# the row stays exactly `banner_width` characters wide if the title changes.
banner_width = 150
banner_title = "QUICK REFERENCE GUIDE"
banner_rule = "█" * banner_width
banner_blank = "█" + " " * (banner_width - 2) + "█"
banner_title_row = "█" + banner_title.center(banner_width - 2) + "█"

print("\n" + banner_rule)
print(banner_blank)
print(banner_title_row)
print(banner_blank)
print(banner_rule + "\n")

# Separator cell placed between each need and its recommendation.
_ARROW = "→"

# Need -> recommended retriever / vector store setup, in display order.
_recommendations = {
    "Building a prototype/demo": "Similarity Search + FAISS",
    "Need diverse search results": "MMR Retriever",
    "Must filter low-quality results": "Similarity with Score Threshold",
    "Production system with storage": "Similarity Search + Chroma/Pinecone",
    "High-performance production": "Similarity Search + Qdrant",
    "Hybrid keyword + semantic search": "Ensemble Retriever (BM25 + Vector)",
    "Complex questions, need better recall": "Multi-Query Retriever",
    "Reduce token usage & API costs": "Contextual Compression Retriever",
}

# Three-cell rows: [need, arrow, recommendation].
quick_ref = [[need, _ARROW, choice] for need, choice in _recommendations.items()]

# Render the quick reference rows (no header row) as a left-aligned
# "fancy_grid" table with a per-column width cap for each of the three cells.
_quick_ref_table = tabulate(
    quick_ref,
    tablefmt="fancy_grid",
    stralign="left",
    maxcolwidths=[35, 3, 50],
)
print(_quick_ref_table)

# Blank line, full-width closing rule, blank line.
print("", "█" * 150, "", sep="\n")

# ============================================================================
# DETAILED NOTES
# ============================================================================
# Box-drawn header for the detailed notes section: top border, spacer,
# centred title, spacer, bottom border (followed by one blank line).
_box_inner_width = 148
_box_spacer = "│" + " " * _box_inner_width + "│"
_box_rows = (
    "┌" + "─" * _box_inner_width + "┐",
    _box_spacer,
    "│" + "DETAILED IMPLEMENTATION NOTES".center(_box_inner_width) + "│",
    _box_spacer,
    "└" + "─" * _box_inner_width + "┘\n",
)
for _box_row in _box_rows:
    print(_box_row)

# Implementation notes keyed by retriever title. Each value is a pre-formatted
# tree ("├─" / "└─") that starts with a newline and is printed verbatim, so the
# leading newline and the 4-space indentation inside the strings are intentional.
#
# NOTE(review): the "cosine similarity" wording under SIMILARITY SEARCH assumes
# a cosine metric; the actual distance metric depends on how the vector store
# is configured — confirm before relying on it.
notes = {
    "SIMILARITY SEARCH": """
    ├─ Fastest and simplest approach for RAG
    ├─ Uses cosine similarity between query and document embeddings
    ├─ Example: retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
    ├─ Perfect for: MVP, prototyping, general use cases
    └─ Trade-off: Simple but may miss semantic nuances""",

    "MMR (Maximum Marginal Relevance)": """
    ├─ Balances relevance with diversity of results
    ├─ fetch_k determines candidate pool size for diversity calculation
    ├─ Higher fetch_k = more diverse but slower
    ├─ Example: search_type="mmr", search_kwargs={"k": 4, "fetch_k": 20}
    └─ Perfect for: Research, exploration, recommendation systems""",

    # The threshold is passed inside search_kwargs, in the same form as the
    # MMR example above, rather than as a bare keyword argument.
    "SIMILARITY SCORE THRESHOLD": """
    ├─ Quality filter: only returns results above threshold
    ├─ Prevents low-relevance matches from affecting answers
    ├─ May return 0 results if threshold too high
    ├─ Example: search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.5}
    └─ Perfect for: Compliance, safety-critical, financial systems""",

    "BM25 RETRIEVER": """
    ├─ Traditional TF-IDF based keyword search
    ├─ Excellent for exact keyword and acronym matching
    ├─ Does NOT require embeddings (fast, efficient)
    ├─ Works well with technical documentation
    └─ Perfect for: Code search, technical docs, structured data""",

    "ENSEMBLE RETRIEVER": """
    ├─ Combines BM25 (keyword) + Vector (semantic) search
    ├─ Weights determine importance: [0.5, 0.5] = equal, [0.3, 0.7] = prefer semantic
    ├─ Best of both worlds: precision + recall
    ├─ Slightly slower due to multiple retriever calls
    └─ Perfect for: Enterprise applications, advanced search""",

    "MULTI-QUERY RETRIEVER": """
    ├─ LLM generates multiple question formulations
    ├─ Improves recall by capturing different query angles
    ├─ Slower due to LLM calls but better understanding
    ├─ Great for "Did you mean?" scenarios
    └─ Perfect for: Q&A systems, chat, complex questions""",

    "CONTEXTUAL COMPRESSION": """
    ├─ LLM extracts only the relevant parts of documents
    ├─ Significantly reduces token usage
    ├─ Slower but saves API costs on long documents
    ├─ Prevents hallucination from irrelevant document sections
    └─ Perfect for: Token-limited applications, cost optimization""",
}

# Emit every note as "► TITLE", then its pre-formatted body, then one blank
# separator line.
for _heading in notes:
    print(f"► {_heading}", notes[_heading], "", sep="\n")

# Full-width closing rule followed by a blank line.
print("█" * 150, end="\n\n")


██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████
█                                                                                                                                                    █
█                                                  LANGCHAIN RETRIEVER COMPARISON TABLE                                                              █
█                                                                                                                                                    █
██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

+--------------------+------------------------------+-------------------------+--------+--------------------------+------------------------------+
|    RetrieveType    |           Purpose            |  Ideal Usage Scenario   |  COST  |        