## LangChain RAG for Philately

In [1]:
import weaviate
from philatelic_weaviate import *
weaviate.__version__

Philatelic Weaviate Integration v2.1 cargado exitosamente
Funciones disponibles:
   - create_weaviate_client()
   - create_oxcart_collection()
   - index_philatelic_document()
   - search_chunks_semantic()
   - get_collection_stats()


'4.16.9'

## 1. Settings

In [2]:
COLLECTION_NAME = os.getenv('WEAVIATE_COLLECTION_NAME', 'Oxcart')

In [3]:
# Conectar a Weaviate
print("🔌 Conectando a Weaviate...")

try:
    client = create_weaviate_client(WEAVIATE_URL, OPENAI_API_KEY)
    print("✅ Conexión exitosa")
    
    # Verificar que Weaviate esté funcionando
    meta = client.get_meta()
    print(f"📊 Weaviate versión: {meta.get('version', 'unknown')}")
    
    # Verificar si la colección existe
    try:
        collections = client.collections.list_all()
        collection_names = [col.name for col in collections]
        
        if COLLECTION_NAME in collection_names:
            collection = client.collections.get(COLLECTION_NAME)
            total_objects = collection.aggregate.over_all(total_count=True).total_count
            print(f"📊 Colección '{COLLECTION_NAME}' existe con {total_objects} documentos")
        else:
            print(f"📝 Colección '{COLLECTION_NAME}' no existe (se creará durante la indexación)")
    except Exception as e:
        print(f"⚠️ No se pudo verificar colecciones: {e}")
        
except Exception as e:
    print(f"❌ Error conectando a Weaviate: {e}")
    print("💡 Asegúrate de que Weaviate esté corriendo:")
    print("   docker-compose up -d")
    client = None

🔌 Conectando a Weaviate...
Conectado a Weaviate en http://localhost:8083
✅ Conexión exitosa
📊 Weaviate versión: 1.32.4
⚠️ No se pudo verificar colecciones: 'str' object has no attribute 'name'


In [4]:
# Crear colección Oxcart
if client:
    print("\n🏗️ Configurando colección Oxcart...")
    
    collection_created = create_oxcart_collection(client, COLLECTION_NAME)
    
    if collection_created:
        print("✅ Colección lista para indexación")
        
        # Mostrar estadísticas de la colección
        stats = get_collection_stats(client, COLLECTION_NAME)
        if stats:
            print(f"📊 Chunks actuales en Weaviate: {stats.get('total_chunks', 0)}")
            if stats.get('documents'):
                print(f"📄 Documentos indexados: {list(stats['documents'].keys())}")
    else:
        print("❌ Error configurando colección")
        client = None
else:
    print("⚠️ Saltando configuración de colección (sin conexión)")


🏗️ Configurando colección Oxcart...
ADVERTENCIA: Coleccion 'Oxcart' ya existe
INFORMACION: Usando coleccion existente
✅ Colección lista para indexación
📊 Chunks actuales en Weaviate: 127025
📄 Documentos indexados: ['Mena 2014', 'Scott 2024', 'The Postal History Frajola Mayer', 'Nordberg Collection Gold', 'Pinto Collection', 'OXCART151', 'CRF 152', 'Mayer Costa Rica', 'CRF 75-76', 'CRF 56', 'CRF 49-50', 'OXCART100', 'CRF 44-45', 'CRF 65-66', 'CRF 144', 'CRF 151', 'CRF 155', 'CRF 77-82', 'CRF 67-68', 'CRF 85', 'Escalante Collection', 'CRF 69-70', 'CRF 10', 'CRF 43', 'CRF 83-84', 'CRF 150', 'CRF 31-34', 'CRF 88', 'OXCART144', 'CRF 64', 'Repertorio Filatelico 11-15', 'CRF 153', 'CRF 97', 'Repertorio Filatelico 16-20', 'CRF 63', 'CRF 143', 'CRF 39', 'CRF 05', 'Timbre40', 'CR Postal Stationary 1883-1953', 'Timbre37', 'CRF 147', 'OXCART160', 'CRF 91', 'CRF 40', 'CRF 11', 'OXCART156', 'CRF 35', 'CRF 41', 'CRF 89', 'CRF 12', 'CRF 53-54', 'OXCART155', 'CRF 74', 'CRF 145', 'CRF 61-62', 'CRF 148'

## 2. Weaviate + LangChain Retreiver Examples

In [5]:
test_query = "Costa Rica 1907 2 colones stamp with original gum. Scott 68 issue of 1907"
test_mode = "hybrid"

In [6]:
results = search_chunks_semantic(
                client, 
                test_query, 
                "Oxcart", 
                limit=50,
                filters=[],
                mode = test_mode,
                alpha= 0.45
                
            )
            
print(f"   📊 Resultados: {len(results)}")

for j, result in enumerate(results, 1):
    print(f"\n      🏷️ #{j} (Score: {result['score']:.3f})")
    print(f"         📄 Documento: {result['doc_id']}")
    print(f"         📋 Tipo: {result['chunk_type']}")
    print(f"         📄 Página: {result['page_number']}")
    
    # Mostrar metadatos relevantes
    if result.get('catalog_systems'):
        print(f"         📖 Catálogos: {result['catalog_systems']}")
    if result.get('scott_numbers'):
        print(f"         🔢 Scott: {result['scott_numbers']}")
    if result.get('years'):
        print(f"         📅 Años: {result['years']}")
    if result.get('colors'):
        print(f"         🎨 Colores: {result['colors']}")
    if result.get('variety_classes'):
        print(f"         🔀 Variedades: {result['variety_classes']}")
    
    # Texto truncado
    text = result.get('text', '')
    # if len(text) > 200:
    #     text = text[:200] + "..."
    print(f"         📝 Texto: {text}")
    print("**********************************************************************************************************")

   📊 Resultados: 50

      🏷️ #1 (Score: 0.550)
         📄 Documento: OXCART116
         📋 Tipo: text
         📄 Página: 25
         📖 Catálogos: ['Scott']
         🔢 Scott: ['2, 3, 4', '32-34, 35–44', '4, 1', '64, 65, 66', '68', '143-146', '32-34', '35–44', '68, 143-146']
         📅 Años: [1907]
         🎨 Colores: ['red']
         📝 Texto: Got any ideas?\n\nSuggestions for the improvement of the OXCART Postal Sales are always welcome!\nCondición: centrado fine.\n\n![Figure](figures/OXCART116_page_025_figure_000.png)\nCondición: centrado good.\n\n192 193 194.\n\n![Figure](figures/OXCART116_page_025_figure_004.png)\nCondición: centrado good.\n\n![Figure](figures/OXCART116_page_025_figure_006.png)\nCondición: centrado good.\n\n196 197. 198. 199. 200.\n\n![Figure](figures/OXCART116_page_025_figure_012.png)\nCondición: centrado good.\n\n![Figure](figures/OXCART116_page_025_figure_013.png)\nCondición: centrado good.\n\n![Figure](figures/OXCART116_page_025_figure_014.png)\nCondición: centra

## 3. LangChain Retrievers Setup

Configuración de LangChain con OpenAI y tres tipos diferentes de retrievers.

In [7]:
# LangChain imports
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_weaviate import WeaviateVectorStore
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain.schema.retriever import BaseRetriever
from typing import List, Dict, Any

print("🚀 LangChain modules imported successfully")

# Setup OpenAI LLM para LangChain
llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=OPENAI_API_KEY,
    temperature=0
)

# Setup OpenAI Embeddings para LangChain 
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=OPENAI_API_KEY
)

print("✅ LangChain LLM and Embeddings configured")
print(f"   🤖 LLM Model: gpt-4o-mini")
print(f"   🔢 Embedding Model: text-embedding-3-large")

🚀 LangChain modules imported successfully
✅ LangChain LLM and Embeddings configured
   🤖 LLM Model: gpt-4o-mini
   🔢 Embedding Model: text-embedding-3-large


### 3.1 Vector Store-Backed Retriever

Retriever básico que usa el vector store de Weaviate con LangChain. Soporta los modos vector, hybrid y bm25.

In [8]:
def test_vector_store_retriever(client, query: str, collection_name: str = "Oxcart", 
                                mode: str = "vector", limit: int = 10, alpha: float = 0.5) -> List[Dict[str, Any]]:
    """
    Vector Store-Backed Retriever usando LangChain + Weaviate
    
    Args:
        client: Cliente Weaviate
        query: Consulta de búsqueda
        collection_name: Nombre de la colección
        mode: Modo de búsqueda ("vector", "hybrid", "bm25")
        limit: Número máximo de resultados
        alpha: Para modo hybrid (0.0=keyword, 1.0=vector)
    
    Returns:
        Lista de documentos con metadatos
    """
    print(f"🔍 Vector Store-Backed Retriever")
    print(f"   📝 Query: {query}")
    print(f"   🎯 Mode: {mode}")
    print(f"   📊 Limit: {limit}")
    
    try:
        # Crear el vector store de LangChain conectado a Weaviate
        vector_store = WeaviateVectorStore(
            client=client,
            index_name=collection_name,
            text_key="text",  # Campo que contiene el texto para búsqueda
            embedding=embeddings
        )
        
        # Configurar el retriever según el modo
        if mode == "vector":
            # Búsqueda por similitud vectorial
            retriever = vector_store.as_retriever(
                search_type="similarity",
                search_kwargs={"k": limit}
            )
        elif mode == "hybrid":
            # Búsqueda híbrida (vector + keyword)
            retriever = vector_store.as_retriever(
                search_type="mmr",  # Maximal Marginal Relevance para diversidad
                search_kwargs={
                    "k": limit,
                    "lambda_mult": alpha  # Similar a alpha en hybrid search
                }
            )
        elif mode == "bm25":
            # Para BM25 puro, usaremos similarity pero con configuración específica
            retriever = vector_store.as_retriever(
                search_type="similarity",
                search_kwargs={"k": limit}
            )
        else:
            raise ValueError(f"Modo no soportado: {mode}")
        
        # Ejecutar la búsqueda
        docs = retriever.invoke(query)
        
        # Formatear resultados para compatibilidad con formato existente
        results = []
        for i, doc in enumerate(docs):
            # Extraer metadatos del documento
            metadata = doc.metadata or {}
            
            result = {
                "rank": i + 1,
                "score": metadata.get("score", 0.0),  # LangChain puede incluir score en metadata
                "text": doc.page_content,
                "chunk_id": metadata.get("chunk_id", ""),
                "chunk_type": metadata.get("chunk_type", "text"),
                "doc_id": metadata.get("doc_id", ""),
                "page_number": metadata.get("page_number", 0),
                "catalog_systems": metadata.get("catalog_systems", []),
                "scott_numbers": metadata.get("scott_numbers", []),
                "years": metadata.get("years", []),
                "colors": metadata.get("colors", []),
                "variety_classes": metadata.get("variety_classes", []),
                "topics_primary": metadata.get("topics_primary", ""),
                "has_catalog": metadata.get("has_catalog", False),
                "has_varieties": metadata.get("has_varieties", False),
                "is_guanacaste": metadata.get("is_guanacaste", False),
                "quality_score": metadata.get("quality_score", 0.0),
                "retriever_type": "vector_store_backed",
                "search_mode": mode
            }
            results.append(result)
        
        print(f"   ✅ Found {len(results)} results")
        return results
        
    except Exception as e:
        print(f"   ❌ Error en Vector Store-Backed Retriever: {e}")
        return []

### 3.2 Multi-Query Retriever

Retriever que genera múltiples variaciones de la consulta original para mejorar la cobertura de resultados.

In [None]:
def test_multi_query_retriever(client, query: str, collection_name: str = "Oxcart", 
                              limit: int = 10) -> List[Dict[str, Any]]:
    """
    Multi-Query Retriever que genera múltiples variaciones de la consulta con logging completo
    
    Args:
        client: Cliente Weaviate
        query: Consulta original de búsqueda
        collection_name: Nombre de la colección
        limit: Número máximo de resultados
    
    Returns:
        Lista de documentos únicos combinados de múltiples consultas
    """
    print(f"🔍 Multi-Query Retriever")
    print(f"   📝 Original Query: {query}")
    print(f"   📊 Limit: {limit}")
    
    try:
        # Prompt personalizado para generar variaciones específicas de filatelia
        query_generation_prompt = f"""You are an AI language model assistant specialized in philatelic (stamp collecting) queries.
Your task is to generate multiple search query variations for finding stamp-related information.

Given a single philatelic query, generate exactly 3 alternative search queries that would find the same information.
Consider these philatelic aspects when creating variations:
- Different catalog systems (Scott, Michel, Yvert, SG)  
- Alternative terminology (stamp/postage stamp, issue/emission, variety/error)
- Years and date formats (1907 vs nineteenth century)
- Technical terms (perforations, watermarks, overprints)
- Condition terms (mint, used, hinged, never hinged)
- Regional names (Costa Rica/Costa Rican/CR)

Original query: "{query}"

Generate exactly 3 alternative queries that search for the same information but with different wording.
Provide only the alternative queries, one per line, without numbering or explanations:"""

        # Generar queries usando LLM
        from langchain_core.messages import HumanMessage
        
        query_response = llm.invoke([HumanMessage(content=query_generation_prompt)])
        query_variations_text = query_response.content.strip()
        
        # Parsear las variaciones de consulta
        query_lines = [line.strip() for line in query_variations_text.split('\n') if line.strip()]
        generated_queries = [q for q in query_lines if q and not q.startswith('#')]  # Filtrar líneas vacías o comentarios
        
        # Agregar la consulta original
        all_queries = [query] + generated_queries
        
        print(f"   🔄 Generated Query Variations:")
        print(f"   📋 Total Queries: {len(all_queries)}")
        for i, q in enumerate(all_queries, 1):
            query_type = "ORIGINAL" if i == 1 else f"VARIATION {i-1}"
            print(f"   {i}. [{query_type}]: {q}")
        
        # Ejecutar búsquedas para cada query
        all_results = []
        seen_chunk_ids = set()
        
        for i, search_query in enumerate(all_queries, 1):
            query_type = "ORIGINAL" if i == 1 else f"VAR{i-1}"
            print(f"\n   🔍 Executing Query {i} [{query_type}]...")
            print(f"       📝 Query: {search_query}")
            
            try:
                # Usar search_chunks_semantic para cada consulta
                query_results = search_chunks_semantic(
                    client=client,
                    query=search_query,
                    collection_name=collection_name,
                    limit=limit,  # Obtener más resultados de cada query para mejor cobertura
                    filters={},  # Sin filtros para Multi-Query
                    mode="vector",
                    distance_metric="cosine"
                )
                
                print(f"       ✅ Found {len(query_results)} results from this query")
                
                # Agregar resultados únicos (deduplicar por chunk_id)
                new_results = 0
                for result in query_results:
                    chunk_id = result.get("chunk_id", f"unknown_{len(all_results)}")
                    
                    if chunk_id not in seen_chunk_ids:
                        seen_chunk_ids.add(chunk_id)
                        
                        # Marcar de qué query proviene este resultado
                        result["source_query"] = search_query
                        result["source_query_type"] = query_type
                        result["query_index"] = i
                        
                        all_results.append(result)
                        new_results += 1
                
                print(f"       📊 Added {new_results} unique results (duplicates filtered)")
                
            except Exception as query_error:
                print(f"       ❌ Error executing query {i}: {query_error}")
                continue
        
        # Formatear resultados finales
        formatted_results = []
        for i, result in enumerate(all_results[:limit]):  # Limitar al número solicitado
            formatted_result = {
                "rank": i + 1,
                "score": result.get("score", 0.0),
                "text": result.get("text", ""),
                "chunk_id": result.get("chunk_id", ""),
                "chunk_type": result.get("chunk_type", "text"),
                "doc_id": result.get("doc_id", ""),
                "page_number": result.get("page_number", 0),
                "catalog_systems": result.get("catalog_systems", []),
                "scott_numbers": result.get("scott_numbers", []),
                "years": result.get("years", []),
                "colors": result.get("colors", []),
                "variety_classes": result.get("variety_classes", []),
                "topics_primary": result.get("topics_primary", ""),
                "has_catalog": result.get("has_catalog", False),
                "has_varieties": result.get("has_varieties", False),
                "is_guanacaste": result.get("is_guanacaste", False),
                "quality_score": result.get("quality_score", 0.0),
                "retriever_type": "multi_query_enhanced",
                "search_mode": "multi_vector",
                "source_query": result.get("source_query", ""),
                "source_query_type": result.get("source_query_type", ""),
                "query_index": result.get("query_index", 0)
            }
            formatted_results.append(formatted_result)
        
        print(f"\n   📊 MULTI-QUERY SUMMARY:")
        print(f"   🔍 Queries executed: {len(all_queries)}")
        print(f"   📄 Total unique results found: {len(all_results)}")
        print(f"   ✅ Final results returned: {len(formatted_results)}")
        print(f"   🎯 Deduplication saved: {len(all_results) - len(seen_chunk_ids)} duplicates filtered")
        
        return formatted_results
        
    except Exception as e:
        print(f"   ❌ Error en Multi-Query Retriever Enhanced: {e}")
        
        # Fallback a búsqueda simple
        print(f"   🔄 Falling back to simple vector search...")
        try:
            results = search_chunks_semantic(
                client=client,
                query=query,
                collection_name=collection_name,
                limit=limit,
                filters={},
                mode="vector",
                distance_metric="cosine"
            )
            
            formatted_results = []
            for i, result in enumerate(results):
                formatted_result = {
                    "rank": i + 1,
                    "score": result.get("score", 0.0),
                    "text": result.get("text", ""),
                    "chunk_id": result.get("chunk_id", ""),
                    "chunk_type": result.get("chunk_type", "text"),
                    "doc_id": result.get("doc_id", ""),
                    "page_number": result.get("page_number", 0),
                    "catalog_systems": result.get("catalog_systems", []),
                    "scott_numbers": result.get("scott_numbers", []),
                    "years": result.get("years", []),
                    "colors": result.get("colors", []),
                    "variety_classes": result.get("variety_classes", []),
                    "topics_primary": result.get("topics_primary", ""),
                    "has_catalog": result.get("has_catalog", False),
                    "has_varieties": result.get("has_varieties", False),
                    "is_guanacaste": result.get("is_guanacaste", False),
                    "quality_score": result.get("quality_score", 0.0),
                    "retriever_type": "multi_query_fallback",
                    "search_mode": "vector_fallback",
                    "source_query": query,
                    "source_query_type": "FALLBACK",
                    "query_index": 1
                }
                formatted_results.append(formatted_result)
            
            print(f"   ✅ Fallback found {len(formatted_results)} results")
            return formatted_results
            
        except Exception as fallback_error:
            print(f"   ❌ Fallback also failed: {fallback_error}")
            return []

### 3.3 Self-Querying Retriever

Retriever inteligente que extrae automáticamente filtros de metadatos desde la consulta en lenguaje natural.

In [None]:
def test_self_querying_retriever(client, query: str, collection_name: str = "Oxcart", 
                                limit: int = 10) -> List[Dict[str, Any]]:
    """
    Self-Querying Retriever PERSONALIZADO que extrae filtros automáticamente usando Weaviate v4 nativo
    
    Args:
        client: Cliente Weaviate
        query: Consulta en lenguaje natural
        collection_name: Nombre de la colección
        limit: Número máximo de resultados
    
    Returns:
        Lista de documentos con filtros aplicados automáticamente
    """
    print(f"🔍 Self-Querying Retriever (Custom Implementation)")
    print(f"   📝 Query: {query}")
    print(f"   📊 Limit: {limit}")
    
    try:
        # Prompt para extraer filtros de la consulta usando OpenAI
        filter_extraction_prompt = f"""Analyze this philatelic query and extract specific filters that can be applied to search:

Query: "{query}"

Extract the following information if present in the query (return "null" if not found):

1. Years mentioned (as integers): 
2. Scott catalog numbers mentioned:
3. Colors mentioned:
4. Catalog systems mentioned (Scott, Michel, Yvert, SG):
5. Is this about varieties/errors? (true/false):
6. Is this about Guanacaste period stamps? (true/false):
7. Chunk type needed (text, table, figure, caption):
8. Mint condition mentioned (never hinged, lightly hinged, hinged):
9. Used condition mentioned (postally used, CTO):

Respond in this exact JSON format:
{{
    "years": [list of integers or null],
    "scott_numbers": [list of strings or null],
    "colors": [list of strings or null],
    "catalog_systems": [list of strings or null],
    "has_varieties": boolean or null,
    "is_guanacaste": boolean or null,
    "chunk_type": string or null,
    "mint_status": string or null,
    "used_status": string or null
}}"""

        # Usar LLM para extraer filtros
        from langchain_core.messages import HumanMessage
        
        filter_response = llm.invoke([HumanMessage(content=filter_extraction_prompt)])
        filter_text = filter_response.content.strip()
        
        print(f"   🎯 LLM Filter Extraction Response:")
        print(f"   📄 {filter_text}")
        
        # Parsear respuesta JSON
        import json
        try:
            extracted_filters = json.loads(filter_text)
            print(f"   ✅ Successfully parsed filters: {extracted_filters}")
        except json.JSONDecodeError:
            print(f"   ⚠️ Could not parse LLM response as JSON, using no filters")
            extracted_filters = {}
        
        # Convertir filtros extraídos al formato que usa _build_filters() de philatelic_weaviate
        weaviate_filters = {}
        
        if extracted_filters.get("years") and isinstance(extracted_filters["years"], list):
            years = [y for y in extracted_filters["years"] if isinstance(y, int)]
            if len(years) == 1:
                weaviate_filters["year_range"] = [years[0], years[0]]  # Año exacto
            elif len(years) >= 2:
                weaviate_filters["year_range"] = [min(years), max(years)]  # Rango
        
        if extracted_filters.get("scott_numbers") and isinstance(extracted_filters["scott_numbers"], list):
            scott_nums = [s for s in extracted_filters["scott_numbers"] if s]
            if scott_nums:
                weaviate_filters["scott_number"] = scott_nums[0]  # Usar el primero
        
        if extracted_filters.get("colors") and isinstance(extracted_filters["colors"], list):
            colors = [c for c in extracted_filters["colors"] if c]
            if colors:
                weaviate_filters["color"] = colors[0].lower()  # Usar el primero
        
        if extracted_filters.get("catalog_systems") and isinstance(extracted_filters["catalog_systems"], list):
            cat_systems = [c for c in extracted_filters["catalog_systems"] if c]
            if cat_systems:
                weaviate_filters["catalog_system"] = cat_systems[0]  # Usar el primero
        
        if extracted_filters.get("has_varieties") is True:
            weaviate_filters["has_varieties"] = True
        
        if extracted_filters.get("is_guanacaste") is True:
            weaviate_filters["is_guanacaste"] = True
            
        if extracted_filters.get("chunk_type") and isinstance(extracted_filters["chunk_type"], str):
            weaviate_filters["chunk_type"] = extracted_filters["chunk_type"]
        
        print(f"   🔧 Applied Weaviate Filters: {weaviate_filters}")
        
        # Usar la función search_chunks_semantic existente con los filtros extraídos
        results = search_chunks_semantic(
            client=client,
            query=query,
            collection_name=collection_name,
            limit=limit,
            filters=weaviate_filters,
            mode="vector",  # Usar búsqueda vectorial por defecto
            distance_metric="cosine"
        )
        
        # Formatear resultados para compatibilidad
        formatted_results = []
        for i, result in enumerate(results):
            formatted_result = {
                "rank": i + 1,
                "score": result.get("score", 0.0),
                "text": result.get("text", ""),
                "chunk_id": result.get("chunk_id", ""),
                "chunk_type": result.get("chunk_type", "text"),
                "doc_id": result.get("doc_id", ""),
                "page_number": result.get("page_number", 0),
                "catalog_systems": result.get("catalog_systems", []),
                "scott_numbers": result.get("scott_numbers", []),
                "years": result.get("years", []),
                "colors": result.get("colors", []),
                "variety_classes": result.get("variety_classes", []),
                "topics_primary": result.get("topics_primary", ""),
                "has_catalog": result.get("has_catalog", False),
                "has_varieties": result.get("has_varieties", False),
                "is_guanacaste": result.get("is_guanacaste", False),
                "quality_score": result.get("quality_score", 0.0),
                "retriever_type": "self_querying_custom",
                "search_mode": "filtered_vector",
                "applied_filters": weaviate_filters  # Info adicional sobre filtros aplicados
            }
            formatted_results.append(formatted_result)
        
        print(f"   ✅ Found {len(formatted_results)} results with extracted filters")
        if weaviate_filters:
            print(f"   📋 Filters successfully applied: {len(weaviate_filters)} filter(s)")
        else:
            print(f"   📋 No filters extracted, performed standard vector search")
        
        return formatted_results
        
    except Exception as e:
        print(f"   ❌ Error en Self-Querying Retriever Custom: {e}")
        
        # Fallback a búsqueda simple usando search_chunks_semantic
        print(f"   🔄 Falling back to simple vector search...")
        try:
            results = search_chunks_semantic(
                client=client,
                query=query,
                collection_name=collection_name,
                limit=limit,
                filters={},  # Sin filtros
                mode="vector",
                distance_metric="cosine"
            )
            
            formatted_results = []
            for i, result in enumerate(results):
                formatted_result = {
                    "rank": i + 1,
                    "score": result.get("score", 0.0),
                    "text": result.get("text", ""),
                    "chunk_id": result.get("chunk_id", ""),
                    "chunk_type": result.get("chunk_type", "text"),
                    "doc_id": result.get("doc_id", ""),
                    "page_number": result.get("page_number", 0),
                    "catalog_systems": result.get("catalog_systems", []),
                    "scott_numbers": result.get("scott_numbers", []),
                    "years": result.get("years", []),
                    "colors": result.get("colors", []),
                    "variety_classes": result.get("variety_classes", []),
                    "topics_primary": result.get("topics_primary", ""),
                    "has_catalog": result.get("has_catalog", False),
                    "has_varieties": result.get("has_varieties", False),
                    "is_guanacaste": result.get("is_guanacaste", False),
                    "quality_score": result.get("quality_score", 0.0),
                    "retriever_type": "self_querying_fallback",
                    "search_mode": "vector_fallback",
                    "applied_filters": {}
                }
                formatted_results.append(formatted_result)
            
            print(f"   ✅ Fallback found {len(formatted_results)} results")
            return formatted_results
            
        except Exception as fallback_error:
            print(f"   ❌ Fallback also failed: {fallback_error}")
            return []

## 4. Testing LangChain Retrievers

Probando los tres tipos de retrievers con la consulta de prueba.

In [None]:
# Function to display results in a nice format
def display_retriever_results(results, retriever_name, max_results=5):
    """Display retriever results in a formatted way with enhanced info for new retrievers"""
    print(f"\n{'='*50}")
    print(f"📋 {retriever_name}")
    print(f"{'='*50}")
    print(f"📊 Total Results: {len(results)}")
    
    if not results:
        print("❌ No results found")
        return
    
    # Show top results
    for i, result in enumerate(results[:max_results], 1):
        print(f"\n🏷️ Result #{i}")
        print(f"   📊 Score: {result.get('score', 'N/A'):.3f}")
        print(f"   🔧 Retriever: {result.get('retriever_type', 'unknown')}")
        print(f"   📄 Document: {result.get('doc_id', 'N/A')}")
        print(f"   📋 Type: {result.get('chunk_type', 'N/A')}")
        print(f"   📄 Page: {result.get('page_number', 'N/A')}")
        
        # Show enhanced info for Multi-Query results
        if result.get('retriever_type') == 'multi_query_enhanced':
            print(f"   🔍 Source Query: {result.get('source_query_type', 'N/A')} - Query #{result.get('query_index', 'N/A')}")
            source_query = result.get('source_query', '')
            if len(source_query) > 100:
                source_query = source_query[:100] + "..."
            print(f"   📝 Generated From: {source_query}")
        
        # Show enhanced info for Self-Query results  
        if result.get('retriever_type') == 'self_querying_custom':
            applied_filters = result.get('applied_filters', {})
            if applied_filters:
                print(f"   🎯 Applied Filters: {list(applied_filters.keys())}")
                for filter_key, filter_value in applied_filters.items():
                    print(f"      • {filter_key}: {filter_value}")
            else:
                print(f"   🎯 Applied Filters: None extracted")
        
        # Show philatelic metadata if available
        if result.get('scott_numbers'):
            print(f"   🔢 Scott: {result['scott_numbers']}")
        if result.get('years'):
            print(f"   📅 Years: {result['years']}")
        if result.get('colors'):
            print(f"   🎨 Colors: {result['colors']}")
        if result.get('catalog_systems'):
            print(f"   📖 Catalogs: {result['catalog_systems']}")
        
        # Show truncated text
        text = result.get('text', '')
        if len(text) > 200:
            text = text[:200] + "..."
        print(f"   📝 Text: {text}")
    
    if len(results) > max_results:
        print(f"   ... and {len(results) - max_results} more results")

print("✅ Enhanced helper function loaded")

In [None]:
# Test all three LangChain retrievers with the existing test query
if client:
    print(f"🚀 Testing Enhanced LangChain Retrievers")
    print(f"📝 Test Query: {test_query}")
    print(f"🎯 Test Mode: {test_mode}")
    print(f"🔄 Collection: {COLLECTION_NAME}")
    
    # Test 1: Vector Store-Backed Retriever
    print(f"\n{'='*70}")
    print("🧪 TEST 1: Vector Store-Backed Retriever")
    print(f"{'='*70}")
    
    try:
        vector_results = test_vector_store_retriever(
            client=client,
            query=test_query,
            collection_name=COLLECTION_NAME,
            mode=test_mode,
            limit=10
        )
        display_retriever_results(vector_results, "Vector Store-Backed Retriever", max_results=3)
    except Exception as e:
        print(f"❌ Vector Store-Backed Retriever failed: {e}")
    
    # Test 2: Multi-Query Retriever (Enhanced with Logging)
    print(f"\n{'='*70}")
    print("🧪 TEST 2: Multi-Query Retriever (Enhanced)")
    print(f"{'='*70}")
    
    try:
        multi_query_results = test_multi_query_retriever(
            client=client,
            query=test_query,
            collection_name=COLLECTION_NAME,
            limit=10
        )
        display_retriever_results(multi_query_results, "Multi-Query Retriever (Enhanced)", max_results=3)
    except Exception as e:
        print(f"❌ Multi-Query Retriever failed: {e}")
    
    # Test 3: Self-Querying Retriever (Custom Implementation)
    print(f"\n{'='*70}")
    print("🧪 TEST 3: Self-Querying Retriever (Custom)")
    print(f"{'='*70}")
    
    try:
        self_query_results = test_self_querying_retriever(
            client=client,
            query=test_query,
            collection_name=COLLECTION_NAME,
            limit=10
        )
        display_retriever_results(self_query_results, "Self-Querying Retriever (Custom)", max_results=3)
    except Exception as e:
        print(f"❌ Self-Querying Retriever failed: {e}")
    
    print(f"\n{'='*70}")
    print("📊 SUMMARY OF ENHANCED RETRIEVERS")
    print(f"{'='*70}")
    print("✅ All LangChain retrievers have been tested with enhancements!")
    print("")
    print("🔧 ENHANCEMENTS IMPLEMENTED:")
    print("   • Vector Store-Backed: Direct Weaviate v4 integration (unchanged)")
    print("   • Multi-Query Enhanced: Shows generated query variations + source tracking")
    print("   • Self-Querying Custom: Native Weaviate v4 implementation with filter extraction")
    print("")
    print("💡 KEY IMPROVEMENTS:")
    print("   🔍 Multi-Query now displays all generated query variations")
    print("   🎯 Self-Querying uses native Weaviate v4 API (no compatibility issues)")
    print("   📊 Enhanced logging shows filter extraction and query sources")
    print("   🧹 Deduplication prevents duplicate results across query variations")
    print("   🔄 Fallback mechanisms ensure robust operation")
    
else:
    print("❌ No Weaviate client available - cannot test retrievers")