## LangChain RAG for Philately

In [None]:
import os
import weaviate
from philatelic_weaviate import *
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_weaviate import WeaviateVectorStore
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import HumanMessage
from langchain.retrievers import MultiQueryRetriever, EnsembleRetriever
from typing import List, Dict, Any
import time
import json
import re
from collections import defaultdict

# Startup banner: report the installed Weaviate client version and confirm
# that every import in this cell completed without raising.
print(f"📚 LangChain RAG for Philately - Weaviate v{weaviate.__version__}")
print("🔧 All modules imported successfully")

## 1. Settings

In [None]:
# ========================================================================================
# 🎛️ CONFIGURATION - Centralized Settings for RAG System
# ========================================================================================

# Environment variables - import them explicitly from philatelic_weaviate only when
# WEAVIATE_URL is not already bound in the current namespace.
if 'WEAVIATE_URL' not in locals():
    from philatelic_weaviate import WEAVIATE_URL, OPENAI_API_KEY

# Collection Settings - the name can be overridden through the
# WEAVIATE_COLLECTION_NAME environment variable and defaults to 'Oxcart'.
COLLECTION_NAME = os.getenv('WEAVIATE_COLLECTION_NAME', 'Oxcart')

# Document Limits (configurable in one place)
RAG_DOCUMENT_LIMIT = 30        # Default `limit` of the retriever functions in this file
RETRIEVER_TEST_LIMIT = 15      # For individual retriever testing
DEMO_LIMIT = 8                 # For demonstration purposes
SEARCH_MAX_LIMIT = 100         # Maximum for broad searches

# Content Processing
MAX_CONTENT_LENGTH = 400       # Default `max_length` of compress_content (characters)
PRESERVE_METADATA_ALWAYS = True  # Always show Scott numbers, years, etc.

# Logging Control
VERBOSE_LOGGING = False        # When True, log_info prints even without force=True
SHOW_PROGRESS = True          # Show progress indicators

# Test Query
TEST_QUERY = "Costa Rica 1907 2 colones stamp with original gum. Scott 68 issue of 1907"
TEST_MODE = "hybrid"

# Echo the effective configuration. The API key itself is never printed, only
# whether it is set; the test query is truncated to its first 50 characters.
print("⚙️ Configuration loaded:")
print(f"   🌐 Weaviate URL: {WEAVIATE_URL}")
print(f"   🔑 OpenAI API Key: {'✅ Set' if OPENAI_API_KEY else '❌ Missing'}")
print(f"   📊 RAG Document Limit: {RAG_DOCUMENT_LIMIT}")
print(f"   🧪 Test Limit: {RETRIEVER_TEST_LIMIT}")
print(f"   💬 Verbose Logging: {VERBOSE_LOGGING}")
print(f"   📝 Test Query: {TEST_QUERY[:50]}...")

In [None]:
# ========================================================================================
# 🔌 WEAVIATE & LANGCHAIN SETUP
# ========================================================================================

def log_info(message, force=False):
    """Print an informational line, but only in verbose mode or when forced.

    Args:
        message: Text to print after the info marker.
        force: When true, print even if VERBOSE_LOGGING is off.
    """
    if not (VERBOSE_LOGGING or force):
        return
    print(f"ℹ️ {message}")

def log_result(message):
    """Unconditionally print *message* prefixed with the success marker."""
    line = f"✅ {message}"
    print(line)

def log_error(message):
    """Unconditionally print *message* prefixed with the error marker."""
    line = f"❌ {message}"
    print(line)

# Connect to Weaviate.
# On success `client` holds whatever create_weaviate_client returns; on any
# exception during connection `client` is set to None so later cells can skip work.
try:
    client = create_weaviate_client(WEAVIATE_URL, OPENAI_API_KEY)    
    # Quick health check (Weaviate v4 collections API). A failure here is
    # reported but does not invalidate the client created above.
    try:
        # Report the object count when the collection already exists
        if client.collections.exists(COLLECTION_NAME):
            collection = client.collections.get(COLLECTION_NAME)
            total_objects = collection.aggregate.over_all(total_count=True).total_count
            log_result(f"Connected to Weaviate - Collection '{COLLECTION_NAME}' has {total_objects:,} documents")
        else:
            log_result(f"Connected to Weaviate - Collection '{COLLECTION_NAME}' will be created")
    except Exception as collection_check_error:
        # The detail goes to the verbose log; the user-facing line only says "status unknown"
        log_info(f"Collection check failed: {collection_check_error}, but client is connected")
        log_result(f"Connected to Weaviate - Collection '{COLLECTION_NAME}' status unknown")
        
except Exception as e:
    log_error(f"Weaviate connection failed: {e}")
    client = None

# Setup LangChain LLM and Embeddings.
# Both `llm` and `embeddings` end up as None when there is no Weaviate client
# or when constructing either object raises.
if client:
    try:
        llm = ChatOpenAI(
            model="gpt-5-nano", 
            api_key=OPENAI_API_KEY, 
            temperature=1,  # required for gpt-5-nano (original author's note)
            timeout=120.0,
            model_kwargs={
                "verbosity": "medium",
                "reasoning_effort" : "high"
            })
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large", api_key=OPENAI_API_KEY)
        log_result("LangChain LLM and Embeddings configured")
    except Exception as e:
        log_error(f"LangChain setup failed: {e}")
        llm = embeddings = None
else:
    llm = embeddings = None

In [None]:
# Ensure collection exists (skipped entirely when the Weaviate connection failed).
# create_oxcart_collection comes from philatelic_weaviate; its truthy/falsy return
# value is treated here as success/failure.
if client:
    collection_created = create_oxcart_collection(client, COLLECTION_NAME)
    if collection_created:
        # force=True so the confirmation shows even with VERBOSE_LOGGING off
        log_info(f"Collection '{COLLECTION_NAME}' ready", force=True)
    else:
        log_error(f"Failed to setup collection '{COLLECTION_NAME}'")

## 2. Core RAG Functions

Implementación optimizada y modular de los componentes RAG centrales.

In [None]:
# ========================================================================================
# 🛠️ UTILITY FUNCTIONS - Content Processing and Formatting
# ========================================================================================

def compress_content(content: str, max_length: int = MAX_CONTENT_LENGTH) -> str:
    """Truncate *content* to at most *max_length* characters.

    Content that already fits is returned unchanged. Otherwise the text is cut
    and the marker ``" [...]"`` is appended; the marker is counted against
    *max_length*, so the returned string never exceeds it. The cut prefers the
    last sentence terminator (``.``, ``!`` or ``?``) found within the final 50
    characters of the available budget, and falls back to a hard cut.

    Args:
        content: Text to shorten.
        max_length: Maximum length of the returned string, marker included.

    Returns:
        The original text, or a shortened version ending in ``" [...]"``.
        When *max_length* leaves no room for the marker, a plain hard cut
        without marker is returned.
    """
    if len(content) <= max_length:
        return content

    marker = " [...]"
    budget = max_length - len(marker)
    if budget <= 0:
        # Not even the marker fits: hard cut, never index with a negative bound.
        return content[:max(max_length, 0)]

    # Scan backwards so the break closest to the limit wins; clamp the window
    # start at 0 so a small max_length cannot produce negative indices.
    window_start = max(budget - 50, 0)
    for i in range(budget - 1, window_start - 1, -1):
        if content[i] in '.!?':
            return content[:i + 1] + marker

    return content[:budget] + marker

def format_philatelic_metadata(doc: Dict) -> List[str]:
    """Build display lines for the philatelic metadata fields present in *doc*.

    The fields ``scott_numbers``, ``years``, ``colors``, ``catalog_systems``
    and ``variety_classes`` are checked in that order; each non-empty one
    yields a line ``"<label>: v1, v2, ..."``. Every value is converted with
    ``str`` (previously only ``years`` was), so numeric entries no longer
    raise ``TypeError``, and a bare string value is treated as a single entry
    instead of being joined character by character.

    Args:
        doc: Mapping that may contain the metadata fields listed above.

    Returns:
        One formatted line per non-empty field; an empty list if none are set.
    """
    labelled_fields = (
        ('scott_numbers', '🔢 Scott'),
        ('years', '📅 Years'),
        ('colors', '🎨 Colors'),
        ('catalog_systems', '📖 Catalogs'),
        ('variety_classes', '🔀 Varieties'),
    )

    metadata_lines = []
    for key, label in labelled_fields:
        values = doc.get(key)
        if not values:
            continue
        if isinstance(values, str):
            values = [values]
        metadata_lines.append(f"{label}: {', '.join(map(str, values))}")

    return metadata_lines

def classify_document_authority(doc_id: str) -> tuple:
    """Map a document id to a ``(category, reliability_label)`` pair.

    The id is lower-cased and matched by substring against keyword groups in
    priority order: catalog, then literature, then collection. Ids matching
    no group are classified as ``'reference'``.
    """
    haystack = doc_id.lower()

    tiers = (
        (('scott', 'michel', 'catalog', 'mayer'),
         ('catalog', '⭐ AUTHORITATIVE')),
        (('postal history', 'literature', 'frajola'),
         ('literature', '🟢 RELIABLE')),
        (('collection', 'nordberg', 'escalante', 'pinto'),
         ('collection', '🟡 SUPPLEMENTARY')),
    )

    for keywords, verdict in tiers:
        for keyword in keywords:
            if keyword in haystack:
                return verdict

    return ('reference', '🟡 SUPPLEMENTARY')

# Confirm in the cell output that the definitions in this cell were executed.
log_result("Core utility functions loaded")

In [None]:
# ========================================================================================
# 📝 RAG PROMPT TEMPLATE - Professional Philatelic Consultation
# ========================================================================================

# Prompt text sent to the LLM. It has exactly two placeholders:
#   {context}  - the formatted source documents
#   {question} - the user's research query
# Everything else is literal instruction text and is part of runtime behaviour.
philatelic_rag_template = """You are a senior philatelic researcher and catalog specialist with expertise in Costa Rican stamps and postal history. Provide comprehensive, well-structured analysis based strictly on the source materials provided.

SOURCE MATERIALS:
{context}

RESEARCH QUERY: {question}

RESPONSE REQUIREMENTS:

FORMATTING & STRUCTURE:
• Use clear hierarchical organization with descriptive headers
• Group related information under logical categories using ## and **bold subheadings**
• Use bullet points (•) for individual facts and varieties
• Include relevant emojis for major sections (🔍 📮 📚 🎯) to enhance readability
• Bold key terms, catalog numbers, and important details
• KEEP SECTIONS CONCISE - avoid excessive repetition or overly detailed explanations

CITATION FORMAT:
• Every factual statement must include: (SOURCE, page #)
• Multiple sources: (SOURCE1, page #; SOURCE2, page #)
• Always cite catalog numbers (scott, yvert, michell, etc), varieties, dates, quantities, and technical specifications
• When quoting directly, use quotation marks around quoted text

CONTENT ORGANIZATION:
• Lead with the most direct answer to the query
• Organize by catalog numbers, chronological order, or logical categories as appropriate
• Include technical specifications: dates, quantities, colors, perforations, varieties
• Provide brief historical context and collecting significance
• Note relationships between issues, varieties, or catalog entries
• Address valuation or rarity when relevant to the query

RESPONSE LENGTH:
• Aim for clear, informative responses that are thorough but not excessive
• Eliminate redundant information and repetitive explanations
• Focus on the most relevant information that directly answers the query
• If information is extensive, prioritize the most important catalog entries and varieties

TECHNICAL STANDARDS:
• Use precise philatelic terminology (definitive, commemorative, variety, error, overprint, etc.)
• Specify exact catalog numbers with proper formatting (Scott C216, not just C216)
• Include denomination and color details when available
• Note printing quantities, dates, and technical varieties
• Distinguish between verified catalog facts and expert opinions
• Flag incomplete or uncertain information clearly

RESEARCH COMPLETENESS:
• If source materials are insufficient, state: "The provided documents do not contain sufficient information about..."
• Suggest what additional sources or information would be needed
• Note any gaps in catalog coverage or missing details

PROFESSIONAL TONE:
• Maintain authoritative but accessible language
• Present information objectively without unnecessary qualifiers
• Use active voice and clear, direct statements
• Avoid speculation beyond what sources support

RESPONSE:"""

# Wrap the text in a LangChain PromptTemplate; the declared input variables
# must match the two placeholders used in the template above.
rag_prompt = PromptTemplate(
    template=philatelic_rag_template,
    input_variables=["context", "question"]
)

## 3. Document Processing & RAG Chain

Funciones optimizadas para formateo de contexto y ejecución de RAG con hasta `RAG_DOCUMENT_LIMIT` documentos (30 por defecto).

In [None]:
# ========================================================================================
# 📄 OPTIMIZED DOCUMENT FORMATTING - For Academic Citation Style
# ========================================================================================

def format_docs_for_rag(docs_results: List[Dict]) -> str:
    """Render retrieved documents as one citation-ready context string.

    Each entry may be either an object exposing ``metadata`` and
    ``page_content`` attributes (the shape this function historically read)
    or a plain dict carrying ``doc_id`` / ``page_number`` and the text under
    ``text``, ``page_content`` or ``content`` (e.g. the dicts built by
    ``format_retriever_results``).

    Every document becomes a two-part section: a header line with its
    1-based position, source id and page, so the model can cite it as
    ``(SOURCE, page #)``, followed by its text. Sections are separated by a
    blank line.

    Args:
        docs_results: Retrieved documents, in the order they should appear.

    Returns:
        The formatted context, or ``"\\nNo source documents available."`` when
        *docs_results* is empty. The result is always a ``str``, as declared.
    """
    if not docs_results:
        return "\nNo source documents available."

    sections = []
    for doc_num, doc in enumerate(docs_results, 1):
        if isinstance(doc, dict):
            fields = doc
            content = doc.get('text') or doc.get('page_content') or doc.get('content', '')
        else:
            fields = getattr(doc, 'metadata', None) or {}
            content = getattr(doc, 'page_content', '')

        doc_id = fields.get('doc_id', 'Unknown')
        page = fields.get('page_number', 'N/A')
        sections.append(
            f"[Document {doc_num}] Source: {doc_id} | Page: {page}\n{content}"
        )

    return "\n\n".join(sections)

def create_rag_response(retriever_results: List[Dict], query: str) -> Dict:
    """Run the RAG chain (prompt -> LLM -> string parser) over retrieved documents.

    Args:
        retriever_results: Retrieved documents to use as context; passed as-is
            to ``format_docs_for_rag``.
        query: The user's question, forwarded to the prompt's ``question`` slot.

    Returns:
        A dict with the keys ``response``, ``generation_time`` (seconds,
        rounded to 2 decimals), ``context_docs_count`` and ``context_length``
        (characters of context text placed in the prompt). The same four keys
        are returned when *retriever_results* is empty, with a fixed
        "no documents" response and zeroed metrics, so callers can read them
        unconditionally.
    """
    if not retriever_results:
        return {
            "response": "No documents found for this query.",
            "generation_time": 0,
            "context_docs_count": 0,
            "context_length": 0,
        }

    context = format_docs_for_rag(retriever_results)
    # The prompt interpolates the context as text. Normalise it once so the
    # reported length is a character count even if the formatter hands back
    # a non-string object.
    context_text = context if isinstance(context, str) else str(context)

    rag_chain = (
        {"context": lambda _: context_text, "question": RunnablePassthrough()}
        | rag_prompt | llm | StrOutputParser()
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments during a long generation.
    start_time = time.perf_counter()
    response = rag_chain.invoke(query)
    generation_time = round(time.perf_counter() - start_time, 2)

    return {
        "response": response,
        "generation_time": generation_time,
        "context_docs_count": len(retriever_results),
        "context_length": len(context_text),
    }

## 4. Retriever Implementations

Tres enfoques de retrieval optimizados para consultas filatélicas.

In [None]:
# ========================================================================================
# 🔄 BASE RETRIEVER FUNCTIONS - Shared Logic
# ========================================================================================

def format_retriever_results(raw_results: List[Dict], retriever_type: str, **metadata) -> List[Dict]:
    """Normalise raw retriever hits into the compact result shape used downstream.

    Args:
        raw_results: Hit dicts; the keys ``score``, ``text_original``, ``text``
            and ``doc_id`` are read when present.
        retriever_type: Accepted for call compatibility; not included in the
            output.
        **metadata: Accepted for call compatibility; not included in the output.

    Returns:
        One dict per hit, in input order, with the keys ``rank`` (1-based),
        ``score`` (default ``0.0``), ``text`` and ``doc_id`` (default ``""``).
        ``text`` prefers ``text_original`` and falls back to ``text`` so that
        hits carrying only the page content are not returned with empty text.
    """
    return [
        {
            "rank": rank,
            "score": result.get("score", 0.0),
            "text": result.get("text_original") or result.get("text", ""),
            "doc_id": result.get("doc_id", ""),
        }
        for rank, result in enumerate(raw_results, 1)
    ]

def execute_with_fallback(primary_func, fallback_func, *args, **kwargs):
    """Call *primary_func*; if it raises, log the error and call *fallback_func*.

    Both callables receive the same positional and keyword arguments. An
    exception raised by the fallback itself propagates to the caller.
    """
    try:
        outcome = primary_func(*args, **kwargs)
    except Exception as exc:
        log_info(f"Primary method failed: {exc}, using fallback")
        outcome = fallback_func(*args, **kwargs)
    return outcome

# Confirm in the cell output that the definitions in this cell were executed.
log_result("Base retriever functions loaded")

### 4.1 Vector Store-Backed Retriever

In [None]:
def vector_store_retriever(client, query: str, limit: int = RAG_DOCUMENT_LIMIT, 
                          mode: str = "vector", alpha: float = 0.5) -> List[Dict]:
    """Retrieve documents through a LangChain retriever over the Weaviate collection.

    Args:
        client: Weaviate client handed to ``WeaviateVectorStore``.
        query: Search text.
        limit: Number of documents requested (``k``).
        mode: ``"hybrid"`` selects the ``"mmr"`` search type and forwards
            *alpha* as ``lambda_mult``; any other value selects ``"similarity"``.
        alpha: Only used when *mode* is ``"hybrid"``.

    Returns:
        The hits normalised by ``format_retriever_results``; an empty list if
        anything raises (the error is logged, not propagated).
    """
    log_info(f"Vector Store Retriever: {mode} mode, limit={limit}")

    try:
        store = WeaviateVectorStore(
            client=client,
            index_name=COLLECTION_NAME,
            text_key="text",
            embedding=embeddings,
        )

        use_mmr = mode == "hybrid"
        search_kwargs = {"k": limit, "lambda_mult": alpha} if use_mmr else {"k": limit}
        retriever = store.as_retriever(
            search_type="mmr" if use_mmr else "similarity",
            search_kwargs=search_kwargs,
        )

        # Flatten each document into one dict: explicit score/text first,
        # then every metadata field (metadata wins on key clashes).
        flattened = [
            {
                "score": hit.metadata.get("score", 0.0),
                "text": hit.page_content,
                **hit.metadata,
            }
            for hit in retriever.invoke(query)
        ]

        log_info(f"Retrieved {len(flattened)} documents")
        return format_retriever_results(flattened, "vector_store", search_mode=mode)

    except Exception as exc:
        log_error(f"Vector store retriever failed: {exc}")
        return []

### 4.2 Multi-Query Retriever

In [None]:
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print each document's score, content, source id and page number.

    Entries are numbered from 1 and separated by a line of 100 dashes. The
    values come from ``metadata['quality_score']``, ``page_content``,
    ``metadata['doc_id']`` and ``metadata['page_number']``; missing metadata
    keys print as ``None``. The score is printed on its own line (it was
    previously glued directly onto the start of the content).

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping) and
            ``page_content``.
    """
    separator = f"\n{'-' * 100}\n"
    entries = [
        f"Document {position}:\n\n"
        f"Score: {doc.metadata.get('quality_score')}\n"
        f"{doc.page_content}\n"
        f"Source: {doc.metadata.get('doc_id')}\n"
        f" page: {doc.metadata.get('page_number')}"
        for position, doc in enumerate(docs, 1)
    ]
    print(separator.join(entries))

In [None]:
def search_stamps_with_compression(query, client, embeddings, llm, limit=100, 
                                 alpha=0.30, diversity_lambda=0.75):
    """
    Retrieve and compress documents for a philatelic query.

    Pipeline, as built below:
      1. A "precision" retriever (similarity search, k = limit // 2, with
         `alpha` forwarded as a search kwarg when it is not None).
      2. A "diversity" retriever (MMR search, k = limit // 2,
         lambda_mult = diversity_lambda).
      3. An EnsembleRetriever over both, weighted 0.7 / 0.3.
      4. A MultiQueryRetriever that asks `llm` for alternative phrasings of
         the query using a philatelic prompt.
      5. A ContextualCompressionRetriever whose LLMChainExtractor uses a
         separate gpt-4o-mini model (built with the module-level
         OPENAI_API_KEY, not the `llm` argument).
    The compressed documents are finally sorted by metadata 'quality_score',
    highest first; documents without that key sort as 0.0.

    Args:
        query (str): The stamp-related question.
        client: Weaviate client.
        embeddings: Embedding model handed to the vector store.
        llm: Language model used to generate the alternative queries.
        limit (int): Retrieval budget; each base retriever requests limit // 2.
        alpha (float): Forwarded as the `alpha` search kwarg of the precision
            retriever. The original note describes 0.30 as 30% vector / 70%
            keyword weighting - NOTE(review): confirm the vector store
            honours this kwarg.
        diversity_lambda (float): Forwarded as `lambda_mult` to the MMR
            retriever.

    Returns:
        list: Compressed documents ordered by descending quality_score.
    """
    # Function-scope imports so the cell works on its own.
    from langchain_weaviate import WeaviateVectorStore
    from langchain.retrievers import MultiQueryRetriever, ContextualCompressionRetriever, EnsembleRetriever
    from langchain.retrievers.document_compressors import LLMChainExtractor
    from langchain.prompts import PromptTemplate
    from langchain_openai import ChatOpenAI
    
    # Create the vector store over the configured collection
    vector_store = WeaviateVectorStore(
        client=client,
        index_name=COLLECTION_NAME,
        text_key="text",
        embedding=embeddings
    )
    
    # Search kwargs for the precision retriever; alpha is optional
    hybrid_kwargs = {"k": limit // 2}
    if alpha is not None:
        hybrid_kwargs["alpha"] = alpha  # Use 'alpha'
    
    # 1. Precision retriever (intended to capture exact numbers plus context)
    precision_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs=hybrid_kwargs
    )
    
    # 2. MMR retriever for diversity (intended to avoid near-duplicate stamps)
    diversity_retriever = vector_store.as_retriever(
        search_type="mmr",
        search_kwargs={"k": limit // 2, "lambda_mult": diversity_lambda}
    )
    
    # 3. Ensemble combining both strategies
    base_retriever = EnsembleRetriever(
        retrievers=[precision_retriever, diversity_retriever],
        weights=[0.7, 0.3]  # 70% precision retriever + 30% diversity retriever
    )
    
    # Query-expansion prompt specialised for philately
    query_prompt = PromptTemplate(
        input_variables=["question"],
        template="""You are a specialized philatelic researcher expert in stamp catalogues and varieties.
Generate 3 strategically different versions of the question to capture comprehensive stamp information:

ORIGINAL: {question}

Create variations that target:
1. CATALOG PRECISION: Focus on exact catalog numbers, dates, and technical specifications
2. CONTEXTUAL SEARCH: Include related series, printings, varieties, and historical context  
3. TERMINOLOGY ALTERNATIVES: Use alternative philatelic terms and synonyms

Consider these philatelic elements:
- Catalog systems: Scott, Michel, Yvert, SG, local catalogs
- Technical terms: definitive/commemorative, variety/error, overprint/surcharge
- Time references: issue dates, printing dates, first day covers
- Denominations: face values, colors, perforations

Alternative searches:
1.
2. 
3."""
    )
    
    # MultiQueryRetriever using the specialised prompt
    multi_retriever = MultiQueryRetriever.from_llm(
        retriever=base_retriever,
        llm=llm,
        prompt=query_prompt,
        parser_key="lines"
    )
    
    # Separate model for the compression/extraction step
    summarizer_llm = ChatOpenAI(
        model="gpt-4o-mini",
        api_key=OPENAI_API_KEY,
        temperature=0.1,  # Low temperature, intended to preserve exact numbers
        timeout=30
    )
    
    compressor = LLMChainExtractor.from_llm(summarizer_llm)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor,
        base_retriever=multi_retriever
    )
    
    # Run the search
    results = compression_retriever.invoke(query)

     # Reorder by quality_score if it exists (missing scores count as 0.0)
    def get_quality_score(doc):
        return doc.metadata.get('quality_score', 0.0)
    
    sorted_results = sorted(results, key=get_quality_score, reverse=True)
    return sorted_results

In [None]:
# Example query (in Spanish): asks who Alfonso Molina (Chacón) is and which
# philatelic articles he has written.
# NOTE: there must be no trailing comma after the string literal; with one,
# `query` becomes a 1-tuple instead of a str, and that tuple is then passed
# to the retriever and to the RAG prompt.
query = "Quien es alfonso molina o alfonso molina chacon o alfonso molina ch? Que articulos filatelicos o de estampillas ha escrito?"
compressed_docs = search_stamps_with_compression(
    query,
    client=client,
    embeddings=embeddings,
    llm=llm,
    alpha=0.30,             # forwarded as the `alpha` search kwarg of the precision retriever
    diversity_lambda=0.75,  # forwarded as `lambda_mult` to the MMR retriever
)


In [None]:
# Display the compressed documents returned by the search cell.
pretty_print_docs(compressed_docs)

In [None]:
# Generate the final answer from the compressed documents; as the last
# expression of the cell, the returned dict is shown as the cell output.
create_rag_response(compressed_docs,query)

In [None]:
def multi_query_retriever_langchain(client, query: str, limit: int = RAG_DOCUMENT_LIMIT, 
                          mode: str = "hybrid", alpha: float = 0.35) -> List[Dict]:
    """Retrieve documents with LangChain's MultiQueryRetriever over Weaviate.

    The LLM rewrites *query* into alternative phrasings using a philatelic
    prompt, each phrasing is run against the base retriever, and the merged
    hits are de-duplicated and normalised.

    Args:
        client: Weaviate client handed to ``WeaviateVectorStore``.
        query: Search text.
        limit: Number of documents requested per generated query (``k``).
        mode: ``"hybrid"`` selects the ``"mmr"`` search type and forwards
            *alpha* as ``lambda_mult``; any other value selects ``"similarity"``.
        alpha: Only used when *mode* is ``"hybrid"``.

    Returns:
        The unique hits normalised by ``format_retriever_results``.
    """
    # Create Weaviate vector store (langchain-weaviate integration)
    vector_store = WeaviateVectorStore(
        client=client,
        index_name=COLLECTION_NAME,
        text_key="text",
        embedding=embeddings
    )

    # Configure retriever based on mode
    search_config = {"k": limit}
    if mode == "hybrid":
        search_config["lambda_mult"] = alpha

    retriever = vector_store.as_retriever(
        search_type="mmr" if mode == "hybrid" else "similarity",
        search_kwargs=search_config
    )

    # Custom prompt for philatelic queries
    query_prompt = PromptTemplate(
        input_variables=["question"],
        template="""You are an expert in philatelic (stamp collecting) terminology. 
Generate 3 different versions of the given question that would retrieve similar stamp-related documents.

Consider:
- Alternative terminology (stamp/postage, variety/error, issue/emission)
- Date formats and variations
- Technical philatelic terms

Original question: {question}

Alternative questions:
1.
2.
3."""
    )

    # Create MultiQueryRetriever
    multi_retriever = MultiQueryRetriever.from_llm(
        retriever=retriever,
        llm=llm,
        prompt=query_prompt,
        parser_key="lines"  # Parse line by line
    )

    documents = multi_retriever.invoke(query)

    results = []
    seen_keys = set()
    for doc in documents:
        chunk_id = doc.metadata.get('chunk_id', doc.metadata.get('id'))
        # Documents without any id must not all collapse onto a single None
        # key (which would keep only the first one); fall back to the content.
        if chunk_id is not None:
            dedup_key = ("id", chunk_id)
        else:
            dedup_key = ("content", doc.page_content)
        if dedup_key in seen_keys:
            continue
        seen_keys.add(dedup_key)

        # Flatten to the shape format_retriever_results reads (top-level
        # score/text/doc_id); nesting the metadata left text and doc_id empty.
        results.append({
            "score": doc.metadata.get("score", 0.0),
            "text": doc.page_content,
            **doc.metadata,
        })

    log_info(f"Retrieved {len(results)} unique documents via LangChain")

    return format_retriever_results(results, "langchain_multi_query")

In [None]:
def multi_query_retriever_ensamble(client, query: str, limit: int = RAG_DOCUMENT_LIMIT, 
                          mode: str = "hybrid", alpha: float = 0.35, 
                          rrf_k: int = 60) -> List[Dict]:
    """
    Multi-query retriever with RRF (Reciprocal Rank Fusion) for philatelic research.

    Builds up to three base retrievers over the Weaviate collection
    (similarity, MMR and - in "hybrid" mode - similarity with an ``alpha``
    kwarg), wraps each in a MultiQueryRetriever driven by a philatelic
    query-expansion prompt, and fuses them with an EnsembleRetriever. If the
    ensemble fails, the first multi-query retriever is used on its own.

    Args:
        client: Weaviate client handed to ``WeaviateVectorStore``.
        query: Search text.
        limit: Base budget; every base retriever requests ``limit * 2`` hits.
        mode: ``"hybrid"`` adds ``fetch_k = limit * 3`` to the MMR retriever
            and enables the third, alpha-weighted retriever.
        alpha: Forwarded as the ``alpha`` search kwarg of the third retriever
            when not None.
        rrf_k: Passed to ``EnsembleRetriever`` as its ``c`` constant.

    Returns:
        The hits normalised by ``format_retriever_results``; an empty list
        when retrieval fails or returns nothing.
    """

    # Create Weaviate vector store
    vector_store = WeaviateVectorStore(
        client=client,
        index_name=COLLECTION_NAME,
        text_key="text",
        embedding=embeddings
    )

    # Build the base retrievers
    retrievers = []

    try:
        # 1. PRECISION: plain similarity search
        precision_retriever = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": limit * 2}
        )
        retrievers.append(precision_retriever)
        log_info("Created precision retriever (similarity)")

        # 2. DIVERSITY: MMR; in hybrid mode widen the candidate pool via fetch_k
        mmr_kwargs = {"k": limit * 2}
        if mode == "hybrid":
            mmr_kwargs["fetch_k"] = limit * 3

        diversity_retriever = vector_store.as_retriever(
            search_type="mmr",
            search_kwargs=mmr_kwargs
        )
        retrievers.append(diversity_retriever)
        log_info("Created diversity retriever (MMR)")

        # 3. HYBRID: only when requested; a failure here is reported and skipped
        if mode == "hybrid":
            try:
                hybrid_kwargs = {"k": limit * 2}
                if alpha is not None:
                    hybrid_kwargs["alpha"] = alpha  # 'alpha', not 'lambda_mult'

                hybrid_retriever = vector_store.as_retriever(
                    search_type="similarity",
                    search_kwargs=hybrid_kwargs
                )
                retrievers.append(hybrid_retriever)
                log_info("Created hybrid retriever")
            except Exception as e:
                print(f"Hybrid retriever not supported: {e}")

    except Exception as e:
        log_error(f"Error creating retrievers: {e}")
        # Fallback: discard whatever was built and use one basic similarity retriever
        basic_retriever = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": limit * 2}
        )
        retrievers = [basic_retriever]
        log_info("Using fallback basic retriever")

    # Query-expansion prompt for philatelic research
    query_prompt = PromptTemplate(
        input_variables=["question"],
        template="""You are a professional philatelic researcher and stamp collecting expert. Generate 3 alternative research queries for comprehensive stamp documentation retrieval.

PHILATELIC RESEARCH PRINCIPLES:
- Use both common and technical terminology
- Include catalog numbers, varieties, and printing details  
- Consider historical context and postal history
- Include alternative relevance from philatelic world vocabulary

Original query: {question}

Generate exactly 3 research-focused alternative queries:

1. TECHNICAL: Include Scott/Michel/SG numbers, perforations, watermarks, varieties, printing methods

2. HISTORICAL: Include historical period, postal usage, commemorative events, postal rates, printing runs (plates)

3. DESCRIPTIVE: Include colors, designs, denominations, condition terms, market aspects

Each query must be on a separate line and focus on different philatelic research angles."""
    )

    # Pre-bind so the "no documents" check below is valid on every path
    documents = []

    try:
        # One MultiQueryRetriever per base strategy
        multi_query_retrievers = []

        for i, base_retriever in enumerate(retrievers):
            try:
                multi_retriever = MultiQueryRetriever.from_llm(
                    retriever=base_retriever,
                    llm=llm,
                    prompt=query_prompt,
                    parser_key="lines",
                    include_original=True
                )
                multi_query_retrievers.append(multi_retriever)
                log_info(f"Created MultiQueryRetriever #{i+1}")
            except Exception as e:
                log_error(f"Failed to create MultiQueryRetriever #{i+1}: {e}")

        if not multi_query_retrievers:
            raise RuntimeError("No MultiQueryRetrievers could be created")

        # Preview of the generated queries. This is an extra LLM call made
        # only for display; its failure never blocks retrieval.
        print(f"\n🔍 ORIGINAL QUERY: {query}")
        try:
            llm_result = llm.invoke(query_prompt.format(question=query))
            generated_text = llm_result.content if hasattr(llm_result, 'content') else str(llm_result)
            print(f"🤖 GENERATED QUERIES:\n{generated_text}")
            print("-" * 50)
        except Exception as e:
            print(f"Could not preview queries: {e}")

        if len(multi_query_retrievers) > 1:
            try:
                ensemble_retriever = EnsembleRetriever(
                    retrievers=multi_query_retrievers,
                    weights=None,  # Equal weights
                    c=rrf_k,  # RRF constant
                )

                documents = ensemble_retriever.invoke(query)
                print(f"EnsembleRetriever with RRF returned {len(documents)} documents")

            except Exception as e:
                print(f"EnsembleRetriever failed: {e}")
                # Do not silently return nothing: a working multi-query
                # retriever exists, so use the first one on its own. An error
                # here is handled by the outer except below.
                documents = multi_query_retrievers[0].invoke(query)
                print(f"Single MultiQueryRetriever returned {len(documents)} documents")
        else:
            # Only one strategy available: no fusion needed
            print("Single retriever")
            documents = multi_query_retrievers[0].invoke(query)
            print(f"Single MultiQueryRetriever returned {len(documents)} documents")

    except Exception as e:
        print(f"All retrieval strategies failed: {e}")
        # Nothing left to try: report an empty result
        return []

    if not documents:
        print("No documents retrieved")
        return []

    # Flatten each document: explicit score/text first, then every metadata
    # field (metadata wins on key clashes).
    results = [
        {
            "score": doc.metadata.get("score", 0.0),
            "text": doc.page_content,
            **doc.metadata
        }
        for doc in documents
    ]

    log_info(f"Retrieved {len(results)} documents")
    return format_retriever_results(results, "multi_query")

In [None]:
def _parse_generated_queries(generated_text: str, max_queries: int = 3) -> List[str]:
    """Extract alternative search queries from the LLM's list-style answer.

    Lines shaped like ``1. query`` or ``1) query`` are preferred, and only the
    leading list marker is removed, so periods inside a query (for example
    "Scott No. 68") are left intact. When the answer has no numbered line at
    all, every non-empty line that is not a heading or bullet is used instead.

    Args:
        generated_text: Raw text returned by the LLM.
        max_queries: Maximum number of alternative queries to keep.

    Returns:
        Up to ``max_queries`` non-empty query strings, in the order given.
    """
    # One or two digits only, so a line starting with a year is not mistaken
    # for a list marker.
    numbered_item = re.compile(r'^\d{1,2}\s*[.)]\s*(.+)$')
    stripped_lines = [line.strip() for line in generated_text.splitlines()]

    candidates = [
        match.group(1).strip()
        for match in map(numbered_item.match, stripped_lines)
        if match
    ]
    if not candidates:
        candidates = [
            line for line in stripped_lines
            if line and not line.startswith(('#', '-', '*'))
        ]
    return [candidate for candidate in candidates if candidate][:max_queries]


def _make_candidate(doc, score, rank: int, query_idx: int, method: str) -> Dict[str, Any]:
    """Wrap one retrieved document with the bookkeeping needed for fusion."""
    return {
        # Prefer the stored id; otherwise fall back to a hash of the first
        # 100 characters of the text as a deduplication key.
        'doc_id': doc.metadata.get('id', hash(doc.page_content[:100])),
        'content': doc.page_content,
        'metadata': doc.metadata,
        'score': float(score),
        'rank': rank,                 # 0-based position inside its result list
        'query_idx': query_idx,
        'retrieval_method': method,
    }


def _collect_candidates(vector_store, queries: List[str], limit: int,
                        mode: str, alpha: float) -> List[Dict[str, Any]]:
    """Run every retrieval strategy for every query and return all raw hits.

    A failure of the similarity search skips the remaining strategies for that
    query; failures of MMR or hybrid search are logged and only skip that
    strategy.
    """
    candidates = []

    for query_idx, search_query in enumerate(queries):
        try:
            # Strategy 1: similarity search (the store returns a score per hit).
            similarity_hits = vector_store.similarity_search_with_score(
                query=search_query,
                k=limit * 2
            )
            for rank, (doc, score) in enumerate(similarity_hits):
                candidates.append(
                    _make_candidate(doc, score, rank, query_idx, 'similarity')
                )

            # Strategy 2: MMR search (for diversity).
            if mode in ("hybrid", "mmr"):
                try:
                    mmr_docs = vector_store.max_marginal_relevance_search(
                        query=search_query,
                        k=limit,
                        fetch_k=limit * 3,
                        lambda_mult=0.5  # Balance diversity vs relevance
                    )
                    for rank, doc in enumerate(mmr_docs):
                        # MMR returns no scores, so derive one from the rank.
                        candidates.append(
                            _make_candidate(doc, 1.0 / (rank + 1), rank, query_idx, 'mmr')
                        )
                except Exception as e:
                    log_error(f"MMR search failed for query {query_idx}: {e}")

            # Strategy 3: hybrid search, only when the store exposes it.
            if mode == "hybrid" and hasattr(vector_store, 'hybrid_search'):
                try:
                    hybrid_docs = vector_store.hybrid_search(
                        query=search_query,
                        alpha=alpha,  # Balance between vector and keyword search
                        limit=limit * 2
                    )
                    for rank, doc in enumerate(hybrid_docs):
                        candidates.append(
                            _make_candidate(
                                doc, doc.metadata.get('score', 0.5),
                                rank, query_idx, 'hybrid'
                            )
                        )
                except Exception as e:
                    log_info(f"Hybrid search not available: {e}")

        except Exception as e:
            log_error(f"Search failed for query '{search_query}': {e}")
            continue

    return candidates


def _fuse_candidates(candidates: List[Dict[str, Any]], rrf_k: int) -> List[tuple]:
    """Merge duplicate hits and rank documents by a fused score.

    For each unique document the fused score is
    ``0.7 * weighted_best_score + 0.3 * rrf_bonus`` where

    * ``weighted_best_score`` is the weight-normalised combination of the best
      score the document obtained under each retrieval method, and
    * ``rrf_bonus`` is Reciprocal Rank Fusion over every (query, method)
      result list the document appeared in: ``sum(1 / (rrf_k + rank + 1))``
      using the document's 0-based rank inside each list.

    Returns:
        ``(fused_score, first_hit)`` tuples sorted by descending score; ties
        keep first-seen order.
    """
    # NOTE(review): assumes a higher store score means a more relevant hit for
    # every method — confirm for similarity_search_with_score.
    method_weights = {
        'similarity': 0.4,
        'mmr': 0.3,
        'hybrid': 0.6
    }

    hits_by_doc = defaultdict(list)
    for candidate in candidates:
        hits_by_doc[candidate['doc_id']].append(candidate)

    fused = []
    for hits in hits_by_doc.values():
        best_score = {}   # method -> best score across all queries
        best_rank = {}    # (query_idx, method) -> best rank in that list

        for hit in hits:
            method = hit['retrieval_method']
            best_score[method] = max(hit['score'], best_score.get(method, hit['score']))
            list_key = (hit['query_idx'], method)
            best_rank[list_key] = min(hit['rank'], best_rank.get(list_key, hit['rank']))

        total_weight = sum(method_weights.get(method, 0.33) for method in best_score)
        combined_score = sum(
            score * method_weights.get(method, 0.33)
            for method, score in best_score.items()
        )
        if total_weight > 0:
            combined_score /= total_weight

        rrf_bonus = sum(1.0 / (rrf_k + rank + 1) for rank in best_rank.values())

        # Content and metadata are taken from the first occurrence.
        fused.append(((combined_score * 0.7) + (rrf_bonus * 0.3), hits[0]))

    fused.sort(key=lambda pair: pair[0], reverse=True)
    return fused


def multi_query_retriever_ensemble(client, query: str, limit: int = RAG_DOCUMENT_LIMIT, 
                                   mode: str = "hybrid", alpha: float = 0.45, 
                                   rrf_k: int = 60) -> List[Dict]:
    """
    Multi-query retriever with rank-based RRF scoring for philatelic research.

    The LLM rewrites the question into up to three alternative queries. The
    original query and the alternatives are each run through several retrieval
    strategies, duplicate documents are merged, and the merged documents are
    ranked by a fused score.

    Args:
        client: Weaviate client handed to ``WeaviateVectorStore``.
        query: The user's question.
        limit: Maximum number of documents returned. Each strategy fetches a
            multiple of this value per query.
        mode: ``"hybrid"`` runs similarity, MMR and (when the vector store has
            a ``hybrid_search`` method) hybrid search; ``"mmr"`` runs
            similarity and MMR; any other value runs similarity search only.
        alpha: Vector/keyword balance forwarded to ``hybrid_search``.
        rrf_k: Reciprocal Rank Fusion constant.

    Returns:
        The output of ``format_retriever_results`` for the top ``limit``
        documents, or ``[]`` when no strategy returned anything.
    """
    
    # Create Weaviate vector store
    vector_store = WeaviateVectorStore(
        client=client,
        index_name=COLLECTION_NAME,
        text_key="text",
        embedding=embeddings
    )
    
    # Enhanced philatelic query generation prompt
    query_prompt = PromptTemplate(
        input_variables=["question"],
        template="""You are an expert philatelic researcher. Analyze the user's question and generate 3-4 diverse search queries that will retrieve the most relevant information from a philatelic database.

    Original question: {question}

    Consider these aspects when generating queries:
    - What specific philatelic information is being sought?
    - What synonyms or related terms might be used in documentation?
    - What different perspectives (collector, dealer, historian, cataloger) might describe this?
    - What time periods, countries, or technical details are relevant?

    Generate your queries as a simple numbered list. Make each query substantially different from the others to maximize coverage:

    1. [First alternative query]
    2. [Second alternative query] 
    3. [Third alternative query]
    4. [Fourth alternative query if needed]

    Focus on creating queries that would find information even if documented differently than the original question."""
    )
    
    # Generate multiple queries using LLM; on any failure search with the
    # original query alone.
    try:
        llm_result = llm.invoke(query_prompt.format(question=query))
        generated_text = llm_result.content if hasattr(llm_result, 'content') else str(llm_result)
        
        all_queries = [query] + _parse_generated_queries(generated_text)
        
        log_info(f"Generated {len(all_queries)} queries total")
        print(f"🔍 Queries: {all_queries}")
        
    except Exception as e:
        log_error(f"Query generation failed: {e}")
        all_queries = [query]  # Fallback to original query only
    
    all_results = _collect_candidates(vector_store, all_queries, limit, mode, alpha)
    
    if not all_results:
        log_error("No results retrieved")
        return []
    
    ranked = _fuse_candidates(all_results, rrf_k)
    
    # Format results
    final_results = []
    for score, hit in ranked[:limit]:
        result = {
            'score': float(score),
            'text': hit['content'],
        }
        # Metadata fills in the remaining fields but must never overwrite the
        # fused score or the document text.
        result.update(
            (key, value) for key, value in hit['metadata'].items() if key not in result
        )
        final_results.append(result)
    
    log_info(f"Retrieved {len(final_results)} documents with RRF scoring")
    
    return format_retriever_results(final_results, "multi_query_rrf")

In [None]:
# Ad-hoc check of the multi-query ensemble retriever on an airmail question.
# The explicit 200 overrides the default limit; `client` is not defined in
# this cell and is expected to exist already.
query = "Research about the 1952 and 1953 Airmail Costa Rica Definitive Stamps. Mail plane type of 1934. C216-C219."
results_multi_query = multi_query_retriever_ensemble(client, query, 200)

In [None]:
# Bare expression: shows the retrieved documents as the cell output.
results_multi_query

In [None]:
# Generate an answer from the retrieved documents for the same query; the
# return value is shown as the cell output.
create_rag_response(results_multi_query,query)

### 4.3 Self-Querying Retriever

In [None]:
def _parse_filter_json(raw_text: str) -> Dict[str, Any]:
    """Parse the LLM's filter answer into a dict.

    Chat models often wrap JSON in Markdown code fences or surround it with
    prose, so only the span from the first ``{`` to the last ``}`` is parsed
    when such a span exists.

    Raises:
        json.JSONDecodeError: If the remaining text is not valid JSON.

    Returns:
        The parsed object, or ``{}`` when the JSON is valid but not an object.
    """
    text = raw_text.strip()
    first_brace = text.find('{')
    last_brace = text.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        text = text[first_brace:last_brace + 1]
    parsed = json.loads(text)
    return parsed if isinstance(parsed, dict) else {}


def _build_weaviate_filters(extracted: Dict[str, Any]) -> Dict[str, Any]:
    """Convert the LLM-extracted fields into the filter dict passed to the search.

    Only the first Scott number and the first colour are used; several years
    collapse to a ``[min, max]`` range. Values of an unexpected type are
    ignored instead of raising.
    """
    def as_list(value):
        # The prompt asks for lists, but tolerate a bare scalar so that a
        # string is never iterated character by character.
        if isinstance(value, (list, tuple)):
            return list(value)
        return [] if value is None else [value]

    filters = {}

    # bool is a subclass of int, so exclude it explicitly.
    years = [
        y for y in as_list(extracted.get("years"))
        if isinstance(y, int) and not isinstance(y, bool)
    ]
    if years:
        filters["year_range"] = [min(years), max(years)]

    scott_numbers = [s for s in as_list(extracted.get("scott_numbers")) if s]
    if scott_numbers:
        filters["scott_number"] = scott_numbers[0]

    colors = [c for c in as_list(extracted.get("colors")) if isinstance(c, str) and c]
    if colors:
        filters["color"] = colors[0].lower()

    # Boolean flags are applied only when explicitly true.
    for flag in ("has_varieties", "is_guanacaste"):
        if extracted.get(flag) is True:
            filters[flag] = True

    if extracted.get("chunk_type"):
        filters["chunk_type"] = extracted["chunk_type"]

    return filters


def self_querying_retriever(client, query: str, limit: int = RAG_DOCUMENT_LIMIT) -> List[Dict]:
    """Self-querying retriever with automatic filter extraction.

    Asks the LLM to extract structured filters (years, Scott numbers, colours,
    flags, chunk type) from the query, then runs a vector search restricted by
    those filters. If the LLM answer cannot be parsed the search runs without
    filters.

    Args:
        client: Weaviate client passed to ``search_chunks_semantic``.
        query: The user's question.
        limit: Maximum number of documents to retrieve.

    Returns:
        The output of ``format_retriever_results``. If anything in the
        filtered path raises, the result of ``execute_with_fallback`` for an
        unfiltered search is returned instead.
    """
    
    log_info(f"Self-Querying Retriever: extracting filters, limit={limit}")
    
    try:
        # Extract filters using LLM
        filter_prompt = f"""Extract philatelic search filters from this query. Respond in JSON format:

Query: "{query}"

{{
    "years": [integers or null],
    "scott_numbers": [strings or null], 
    "colors": [strings or null],
    "catalog_systems": [strings or null],
    "has_varieties": boolean or null,
    "is_guanacaste": boolean or null,
    "chunk_type": string or null
}}"""

        response = llm.invoke([HumanMessage(content=filter_prompt)])
        
        try:
            extracted_filters = _parse_filter_json(response.content)
            log_info(f"Extracted filters: {list(extracted_filters.keys())}")
        except json.JSONDecodeError:
            log_info("Could not parse filters, using no filters")
            extracted_filters = {}
        
        # Convert to Weaviate filters
        weaviate_filters = _build_weaviate_filters(extracted_filters)
        
        # Execute search with filters
        results = search_chunks_semantic(
            client=client,
            query=query,
            collection_name=COLLECTION_NAME,
            limit=limit,
            filters=weaviate_filters,
            mode="vector",
            distance_metric="cosine"
        )
        
        log_info(f"Retrieved {len(results)} documents with {len(weaviate_filters)} filter(s)")
        
        return format_retriever_results(
            results, 
            "self_querying", 
            applied_filters=weaviate_filters,
            filters_count=len(weaviate_filters)
        )
        
    except Exception as e:
        log_error(f"Self-querying retriever failed: {e}")
        # Fallback to simple search.
        # NOTE(review): execute_with_fallback is defined elsewhere; its calling
        # convention for the two callables is not visible here — confirm.
        return execute_with_fallback(
            lambda: format_retriever_results(
                search_chunks_semantic(client, query, COLLECTION_NAME, limit=limit), 
                "self_querying_fallback"
            ),
            lambda: [],
            client, query, COLLECTION_NAME, limit=limit
        )

## 5. RAG Evaluation System

Complete evaluation of the RAG pipeline, using `RAG_DOCUMENT_LIMIT` documents per retriever and an automated LLM judge.

In [None]:
# ========================================================================================
# 🏆 COMPLETE RAG EVALUATION - RAG_DOCUMENT_LIMIT Documents per Retriever
# ========================================================================================

def execute_rag_evaluation():
    """Run every retriever on TEST_QUERY and generate one RAG answer per retriever.

    Each retriever is called with ``RAG_DOCUMENT_LIMIT`` documents. A retriever
    that returns nothing, or whose retrieval or generation raises, is logged
    and skipped so the remaining retrievers are still evaluated.

    Returns:
        Dict keyed by the retriever name in snake_case (e.g. ``"vector_store"``).
        Each value holds the retriever name, the retrieved documents, the
        generated response, its generation time, the context document count,
        the context length and the retriever type. Returns ``{}`` when no
        Weaviate client is available.
    """
    
    if not client:
        log_error("No Weaviate client available for RAG evaluation")
        return {}
    
    if SHOW_PROGRESS:
        print("🚀 Executing complete RAG evaluation...")
        print(f"   📝 Query: {TEST_QUERY}")
        print(f"   📊 Document limit: {RAG_DOCUMENT_LIMIT} per retriever")
        print(f"   🎯 Mode: {TEST_MODE}")
    
    # Retrievers are wrapped in lambdas so each one runs inside the try block
    # below and a failure only affects that retriever.
    retrievers = [
        ("Vector Store", lambda: vector_store_retriever(client, TEST_QUERY, RAG_DOCUMENT_LIMIT, TEST_MODE)),
        ("Multi Query", lambda: multi_query_retriever_ensemble(client, TEST_QUERY, RAG_DOCUMENT_LIMIT)), #multi_query_retriever_langchain
        ("Self Querying", lambda: self_querying_retriever(client, TEST_QUERY, RAG_DOCUMENT_LIMIT))
    ]
    
    rag_results = {}
    
    for name, retriever_func in retrievers:
        try:
            if SHOW_PROGRESS:
                print(f"   🔄 {name} Retriever + RAG...")
            
            # Execute retriever
            retriever_results = retriever_func()
            
            if not retriever_results:
                log_error(f"{name}: No documents retrieved")
                continue
            
            # Execute RAG chain
            rag_result = create_rag_response(retriever_results, TEST_QUERY)
            
            # Store complete result under a snake_case key
            key = name.lower().replace(" ", "_")
            rag_results[key] = {
                "retriever_name": name,
                "retriever_results": retriever_results,
                "rag_response": rag_result["response"],
                "generation_time": rag_result["generation_time"],
                "context_docs_count": rag_result["context_docs_count"],
                "context_length": rag_result.get("context_length", 0),
                # The first document is taken as representative of the whole list.
                "retriever_type": retriever_results[0].get("retriever_type", "unknown")
            }
            
            log_result(f"{name}: {rag_result['context_docs_count']} docs → {len(rag_result['response'])} chars ({rag_result['generation_time']}s)")
            
        except Exception as e:
            log_error(f"{name} RAG failed: {e}")
            continue
    
    if SHOW_PROGRESS and rag_results:
        print(f"🎉 Generated {len(rag_results)} RAG responses for comparison")
    
    return rag_results

# Execute RAG evaluation when the cell runs. The result is kept in a global
# that the display, judge and citation-analysis cells below read.
rag_evaluation_results = execute_rag_evaluation()

In [None]:
# for x in rag_evaluation_results['vector_store']['retriever_results']:
#     print(x['doc_id'])
#     print(x['text'])

In [None]:
# ========================================================================================
# 📊 RESPONSE DISPLAY & COMPARISON
# ========================================================================================

def display_rag_responses(rag_results: Dict, show_full_responses: bool = True):
    """Print each retriever's RAG answer with its basic statistics.

    Args:
        rag_results: Mapping of retriever key to a result dict containing
            "retriever_name", "rag_response", "context_docs_count",
            "generation_time" and optionally "context_length".
        show_full_responses: When True the whole answer is printed; otherwise
            only the first 200 characters followed by "...".
    """
    if not rag_results:
        log_error("No RAG results to display")
        return

    header_rule = "=" * 70
    print("📊 RAG RESPONSE COMPARISON")
    print(header_rule)
    print(f"Query: {TEST_QUERY}")
    print(header_rule)

    for slug, entry in rag_results.items():
        name = entry["retriever_name"]
        answer = entry["rag_response"]
        n_docs = entry["context_docs_count"]
        elapsed = entry["generation_time"]
        context_chars = entry.get("context_length", 0)

        # Colour marker chosen from the result key
        if "vector" in slug:
            marker = "🔵"
        elif "multi" in slug:
            marker = "🟡"
        else:
            marker = "🟢"

        print(f"\n{marker} {name.upper()}")
        print("─" * 50)
        print(f"📄 Documents: {n_docs} | ⏱️ Time: {elapsed}s | 📝 Context: {context_chars:,} chars")

        if not show_full_responses:
            # Preview mode: truncate long answers
            if len(answer) > 200:
                preview = answer[:200] + "..."
            else:
                preview = answer
            print(f"📝 Preview: {preview}")
            continue

        print("\n📝 RESPONSE:")
        print("─" * 30)
        print(answer)
        print("─" * 50)

    print(f"\n✅ Displayed {len(rag_results)} RAG responses")

# Display responses only when the evaluation produced any.
# Pass show_full_responses=False for a 200-character preview of each answer.
if rag_evaluation_results:
    display_rag_responses(rag_evaluation_results, show_full_responses=True)

### 5.1 LLM Judge Evaluation

In [None]:
def llm_judge_evaluation(query: str, rag_results: Dict) -> Dict:
    """Ask the LLM to score and compare the RAG responses.

    Args:
        query: The original question the responses answer.
        rag_results: Mapping of retriever key to a result dict containing
            "retriever_name", "context_docs_count", "generation_time" and
            "rag_response". At least two entries are required.

    Returns:
        Dict with "evaluation" (the judge's text), "evaluation_time" (seconds,
        rounded to 2 decimals) and "responses_compared"; ``{}`` when fewer than
        two responses are given or the LLM call fails.
    """
    
    if len(rag_results) < 2:
        log_error("Need at least 2 responses for comparison")
        return {}
    
    log_info("Executing LLM judge evaluation...")
    
    # One numbered section per response, in the dict's iteration order
    responses_text = "".join(
        f"""
RESPONSE {i} - {result['retriever_name']}:
Documents Used: {result['context_docs_count']}
Generation Time: {result['generation_time']}s
Response: {result['rag_response']}

---
"""
        for i, result in enumerate(rag_results.values(), 1)
    )
    
    # One score line per response, so the requested output format always
    # matches the number of responses actually being compared.
    score_template = "\n".join(
        f"Response {i}: Accuracy=X, Citations=X, Completeness=X, Expertise=X (Total: X/40)"
        for i in range(1, len(rag_results) + 1)
    )
    
    # NOTE(review): the citation format named below ([Doc#-P.##-Score]) differs
    # from the "(Document Name, p. Page)" pattern used by analyze_citations —
    # confirm which one the RAG prompt actually requests.
    judge_prompt = f"""You are an expert philatelic consultant evaluating RAG system responses.

ORIGINAL QUESTION: {query}

Evaluate each response on these 4 key criteria (1-10 scale):
1. **ACCURACY** - Factual correctness of philatelic information
2. **CITATION QUALITY** - Proper use of [Doc#-P.##-Score] format
3. **COMPLETENESS** - Addresses all aspects of the question
4. **PHILATELIC EXPERTISE** - Professional terminology and domain knowledge

RESPONSES:
{responses_text}

Provide concise evaluation in this format:

SCORES:
{score_template}

WINNER: Response X ({{"retriever name"}})
REASONING: [Brief explanation of why this response is superior]

CITATION ANALYSIS: [Count and quality of citations in each response]"""
    
    try:
        start_time = time.time()
        judge_response = llm.invoke([HumanMessage(content=judge_prompt)])
        evaluation_time = round(time.time() - start_time, 2)
        
        return {
            "evaluation": judge_response.content,
            "evaluation_time": evaluation_time,
            "responses_compared": len(rag_results)
        }
        
    except Exception as e:
        log_error(f"Judge evaluation failed: {e}")
        return {}

def execute_judge_evaluation():
    """Run the LLM judge on the global RAG results and print its verdict.

    Returns:
        None when fewer than two RAG results are available; otherwise the
        value returned by ``llm_judge_evaluation`` (which may be empty).
    """
    enough_results = bool(rag_evaluation_results) and len(rag_evaluation_results) >= 2
    if not enough_results:
        log_info("Insufficient RAG results for judge evaluation")
        return None

    if SHOW_PROGRESS:
        print(f"⚖️ Running LLM Judge evaluation on {len(rag_evaluation_results)} responses...")

    verdict = llm_judge_evaluation(TEST_QUERY, rag_evaluation_results)

    # Nothing to report when the judge returned an empty result
    if not verdict:
        return verdict

    thin_rule = "─" * 50
    print("\n🏆 LLM JUDGE EVALUATION")
    print("=" * 50)
    print(f"⏱️ Evaluation time: {verdict['evaluation_time']}s")
    print(f"📊 Responses compared: {verdict['responses_compared']}")
    print("\n📋 DETAILED EVALUATION:")
    print(thin_rule)
    print(verdict['evaluation'])
    print(thin_rule)

    log_result("LLM Judge evaluation completed")
    return verdict

# Execute judge evaluation when the cell runs; judge_results is None when
# fewer than two RAG responses are available.
judge_results = execute_judge_evaluation()

### 5.2 Citation Analysis

In [None]:
def analyze_citations():
    """Report a rough citation rate for every RAG response.

    Citations are matched as "(Document Name, p. Page)". Factual claims are
    estimated by counting four-digit numbers, Scott numbers, colones amounts
    and the words "mint"/"used" (case-insensitive). The rate is citations per
    estimated claim, as a percentage, and is graded Excellent (>80),
    Good (>50) or Needs improvement.
    """
    if not rag_evaluation_results:
        log_info("No RAG results available for citation analysis")
        return

    print("🔍 CITATION QUALITY ANALYSIS")
    print("=" * 50)

    # Academic citation format: (Document Name, p. Page)
    citation_regex = r'\([^(),]+,\s*p\.\s*[^)]+\)'
    # Heuristic markers of a factual claim
    claim_regexes = (r'\b\d{4}\b', r'Scott\s+#?\d+', r'\b\d+\s*colones?\b', r'\bmint\b', r'\bused\b')

    for slug, entry in rag_evaluation_results.items():
        answer = entry["rag_response"]
        name = entry["retriever_name"]

        citations = re.findall(citation_regex, answer)
        claim_count = sum(
            len(re.findall(claim_regex, answer, re.IGNORECASE))
            for claim_regex in claim_regexes
        )

        # max(..., 1) avoids dividing by zero when no claims are detected
        rate = len(citations) / max(claim_count, 1) * 100

        if "vector" in slug:
            marker = "🔵"
        elif "multi" in slug:
            marker = "🟡"
        else:
            marker = "🟢"

        print(f"\n{marker} {name}:")
        print(f"   📊 Citations found: {len(citations)}")
        print(f"   📋 Estimated factual claims: {claim_count}")
        print(f"   📈 Citation rate: {rate:.1f}%")

        if citations:
            print(f"   📝 Sample citations: {citations[:2]}")

        # Quality assessment
        if rate > 80:
            grade = "✅ Excellent"
        elif rate > 50:
            grade = "🟡 Good"
        else:
            grade = "❌ Needs improvement"

        print(f"   🎯 Quality: {grade}")

    log_result("Citation analysis completed for academic format")

# Run citation analysis on the global rag_evaluation_results when the cell runs.
analyze_citations()