# OXCART Philatelic RAG System

Sistema completo de indexación y búsqueda semántica para documentos filatélicos.

**Funcionalidades:**
- 📄 Indexación automática de todos los JSONs philatelic
- 🔍 Búsqueda semántica avanzada con filtros filatélicos
- 🤖 RAG básico con LLM para responder preguntas
- 📊 Dashboard de estadísticas y validación
- 🌐 Interfaz Gradio para consultas interactivas

**Requisitos:**
- Weaviate corriendo en Docker: `docker-compose up -d`
- OpenAI API key configurada en `.env`
- JSONs philatelic en `results/final_jsons/`

## 1. Setup y Configuración

Configuración inicial del entorno y carga de librerías.

In [None]:
import os
import json
import glob
import time
from pathlib import Path
from typing import Dict, Any, List, Optional
from datetime import datetime

# Cargar variables de entorno
from dotenv import load_dotenv
load_dotenv()

# Imports de terceros
import pandas as pd

print("‚úÖ Imports b√°sicos completados")

In [None]:
# Read the runtime configuration from the environment (os.getenv), using
# local defaults for everything except the OpenAI key.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
WEAVIATE_URL = os.getenv('WEAVIATE_URL', 'http://localhost:8083')
PHILATELIC_JSONS_DIR = os.getenv('PHILATELIC_JSONS_DIR', './results/final_jsons')
COLLECTION_NAME = os.getenv('WEAVIATE_COLLECTION_NAME', 'Oxcart')

print(f"üîß Configuraci√≥n:")
print(f"   ‚Ä¢ Weaviate URL: {WEAVIATE_URL}")
print(f"   ‚Ä¢ JSONs Directory: {PHILATELIC_JSONS_DIR}")
print(f"   ‚Ä¢ Collection Name: {COLLECTION_NAME}")
print(f"   ‚Ä¢ OpenAI API Key: {'‚úÖ Configurada' if OPENAI_API_KEY else '‚ùå Falta configurar'}")

# NOTE: the warnings below use "\n" (a real newline). The previous "\\n"
# printed a literal backslash-n in front of each message.
if not OPENAI_API_KEY:
    print("\n‚ö†Ô∏è  IMPORTANTE: Configura tu OPENAI_API_KEY en el archivo .env")
    print("   Copia .env.example a .env y agrega tu API key")

# Check that the JSON directory exists and preview what it contains
if not os.path.exists(PHILATELIC_JSONS_DIR):
    print(f"\n‚ö†Ô∏è  Directorio {PHILATELIC_JSONS_DIR} no encontrado")
    print("   Aseg√∫rate de haber procesado documentos con el Dolphin parser")
else:
    json_files = glob.glob(os.path.join(PHILATELIC_JSONS_DIR, '*_final.json'))
    print(f"\nüìÅ Encontrados {len(json_files)} archivos JSON filat√©licos")
    if json_files:
        print("   Ejemplos:")
        # Only the first three file names are listed; the rest are counted
        for file in json_files[:3]:
            print(f"   ‚Ä¢ {os.path.basename(file)}")
        if len(json_files) > 3:
            print(f"   ‚Ä¢ ... y {len(json_files) - 3} m√°s")

In [None]:
# Reload the OXCART system modules so the latest code changes are picked up
import importlib

# Reload philatelic_weaviate so functions imported below come from fresh code
try:
    import philatelic_weaviate
    importlib.reload(philatelic_weaviate)
    print("üîÑ M√≥dulo philatelic_weaviate recargado")
except ImportError:
    # Nothing to reload; the from-import below still raises ImportError
    # if the module really is unavailable.
    pass

from philatelic_weaviate import (
    create_weaviate_client,
    create_oxcart_collection,
    index_philatelic_document,
    search_chunks_semantic,
    get_collection_stats,
    transform_chunk_to_weaviate
)

from philatelic_chunk_schema import (
    PhilatelicDocument,
    PhilatelicChunk,
    validate_chunk_structure,
    get_chunk_summary
)

print("‚úÖ M√≥dulos OXCART cargados exitosamente con las mejoras m√°s recientes")

## 2. Descubrimiento de Documentos

Escanear automáticamente todos los archivos JSON philatelic disponibles.

In [None]:
def discover_philatelic_jsons(directory: str) -> List[Dict[str, Any]]:
    """
    Discover every ``*_final.json`` file in ``directory`` and summarise it.

    Each file is loaded and its ``chunks`` list is scanned to count chunks by
    indexing state (``indexed`` flag) and truncation state (``truncated`` is
    True, False, or absent). A console summary is printed at the end.

    Args:
        directory: Folder to scan (non-recursive).

    Returns:
        One dict per readable file with the keys ``file_path``, ``file_name``,
        ``doc_id``, ``file_size_mb``, ``chunks_count``, ``chunks_indexed``,
        ``chunks_pending``, ``chunks_truncated``, ``chunks_not_truncated``,
        ``chunks_truncated_unknown``, ``page_count``, ``total_text_length``,
        ``avg_chunk_length``, ``chunk_types`` and ``data`` (the parsed JSON).
        Files that cannot be read or parsed are reported and skipped.
    """
    # tqdm only draws the progress bar; fall back to plain iteration when it
    # is not installed so discovery itself never depends on it.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **_kwargs):
            return iterable

    json_files = []

    # Look for *_final.json files
    pattern = os.path.join(directory, "*_final.json")
    philatelic_files = glob.glob(pattern)

    print(f"üîç Buscando archivos en: {directory}")
    print(f"üìã Patr√≥n de b√∫squeda: *_final.json")
    print(f"üìÑ Archivos encontrados: {len(philatelic_files)}")

    for file_path in tqdm(philatelic_files, desc="üìÑ Analizando documentos", unit="doc"):
        try:
            file_size = os.path.getsize(file_path) / (1024 * 1024)  # MB
            file_name = os.path.basename(file_path)
            doc_id = file_name.replace("_final.json", "")

            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            chunks = data.get("chunks", [])
            # Falls back to the chunk count when the file has no page_count
            page_count = data.get("page_count", len(chunks))

            total_text_length = 0
            chunk_types = {}
            chunks_indexed = 0
            chunks_pending = 0
            chunks_truncated = 0
            chunks_not_truncated = 0
            chunks_truncated_unknown = 0

            for chunk in chunks:
                # The trailing `or ""` covers an explicit null in both fields,
                # which previously raised inside len() and dropped the file.
                chunk_text = chunk.get("text", "") or chunk.get("content", "") or ""
                total_text_length += len(chunk_text)

                chunk_type = chunk.get("chunk_type", "text")
                chunk_types[chunk_type] = chunk_types.get(chunk_type, 0) + 1

                # Indexing state
                if chunk.get("indexed", False):
                    chunks_indexed += 1
                else:
                    chunks_pending += 1

                # Truncation state: True / False / not marked yet
                truncated_flag = chunk.get("truncated", None)
                if truncated_flag is True:
                    chunks_truncated += 1
                elif truncated_flag is False:
                    chunks_not_truncated += 1
                else:
                    chunks_truncated_unknown += 1

            avg_chunk_length = total_text_length / len(chunks) if chunks else 0

            json_files.append({
                "file_path": file_path,
                "file_name": file_name,
                "doc_id": doc_id,
                "file_size_mb": round(file_size, 2),
                "chunks_count": len(chunks),
                "chunks_indexed": chunks_indexed,
                "chunks_pending": chunks_pending,
                "chunks_truncated": chunks_truncated,
                "chunks_not_truncated": chunks_not_truncated,
                "chunks_truncated_unknown": chunks_truncated_unknown,
                "page_count": page_count,
                "total_text_length": total_text_length,
                "avg_chunk_length": round(avg_chunk_length, 1),
                "chunk_types": chunk_types,
                "data": data  # kept so the indexing step can reuse the parsed JSON
            })

        except Exception as e:
            # Best effort: report the unreadable file and keep scanning
            print(f"   ‚ùå Error procesando {file_path}: {e}")

    # Print the indexing / truncation summary
    if json_files:
        total_chunks = sum(f["chunks_count"] for f in json_files)
        total_indexed = sum(f["chunks_indexed"] for f in json_files)
        total_pending = sum(f["chunks_pending"] for f in json_files)
        total_truncated = sum(f["chunks_truncated"] for f in json_files)
        total_not_truncated = sum(f["chunks_not_truncated"] for f in json_files)
        total_truncated_unknown = sum(f["chunks_truncated_unknown"] for f in json_files)

        def pct(part):
            # Files with an empty "chunks" list can leave total_chunks at 0
            return (part / total_chunks) * 100 if total_chunks else 0.0

        print(f"\nüìä ESTADO DE INDEXACI√ìN:")
        print(f"   üì¶ Total chunks: {total_chunks:,}")
        print(f"   ‚úÖ Ya indexados: {total_indexed:,} ({pct(total_indexed):.1f}%)")
        print(f"   ‚è≥ Pendientes: {total_pending:,} ({pct(total_pending):.1f}%)")

        print(f"\nüîç TRAZABILIDAD DE TRUNCADO:")
        print(f"   ‚úÇÔ∏è Truncados: {total_truncated:,} ({pct(total_truncated):.1f}%)")
        print(f"   üìè No truncados: {total_not_truncated:,} ({pct(total_not_truncated):.1f}%)")
        print(f"   ‚ùì Sin procesar: {total_truncated_unknown:,} ({pct(total_truncated_unknown):.1f}%)")

        if total_pending == 0:
            print(f"   üéâ ¬°Todos los chunks est√°n indexados!")
        elif total_indexed > 0:
            print(f"   üîÑ Se continuar√° desde donde se qued√≥")

        # Chunks that carry an explicit truncated flag
        if total_truncated > 0 or total_not_truncated > 0:
            total_processed = total_truncated + total_not_truncated
            print(f"   üí° {total_processed:,} chunks con trazabilidad completa de truncado")
            if total_truncated > 0:
                print(f"   üìÑ Los chunks truncados mantienen texto original en 'text_original'")

    return json_files

print("‚úÖ Funci√≥n de descubrimiento mejorada con trazabilidad completa de truncado")

In [None]:
# Discover the documents on disk and print a summary with a cost estimate
discovered_files = discover_philatelic_jsons(PHILATELIC_JSONS_DIR)

# Embedding cost model. Defined before any branching because the closing
# notes always print CHARS_PER_TOKEN, even when nothing is pending
# (previously that raised NameError on a fully indexed corpus).
# text-embedding-3-large: $0.00013 per 1K tokens
EMBEDDING_MODEL = "text-embedding-3-large"
COST_PER_1K_TOKENS = 0.00013  # USD
CHARS_PER_TOKEN = 4  # rough approximation for Spanish text

print(f"\nüìä RESUMEN DE DESCUBRIMIENTO:")
print(f"   üìÑ Archivos encontrados: {len(discovered_files)}")

if discovered_files:
    # === BASIC STATISTICS ===
    total_chunks = sum(f["chunks_count"] for f in discovered_files)
    total_indexed = sum(f["chunks_indexed"] for f in discovered_files)
    total_pending = sum(f["chunks_pending"] for f in discovered_files)
    total_pages = sum(f["page_count"] for f in discovered_files)
    total_size = sum(f["file_size_mb"] for f in discovered_files)
    total_text_length = sum(f["total_text_length"] for f in discovered_files)

    # Files with an empty "chunks" list can leave total_chunks at 0
    indexed_pct = (total_indexed / total_chunks) * 100 if total_chunks else 0.0
    pending_pct = (total_pending / total_chunks) * 100 if total_chunks else 0.0

    print(f"   üì¶ Total chunks: {total_chunks:,}")
    print(f"   ‚úÖ Ya indexados: {total_indexed:,} ({indexed_pct:.1f}%)")
    print(f"   ‚è≥ Pendientes: {total_pending:,} ({pending_pct:.1f}%)")
    print(f"   üìÑ Total p√°ginas: {total_pages:,}")
    print(f"   üíæ Tama√±o total: {total_size:.1f} MB")

    # === ADVANCED CHUNK STATISTICS ===
    if total_chunks > 0:
        avg_chunk_size_global = total_text_length / total_chunks
        avg_chunks_per_doc = total_chunks / len(discovered_files)

        # Aggregate the chunk-type counts across all documents
        all_chunk_types = {}
        for f in discovered_files:
            for chunk_type, count in f["chunk_types"].items():
                all_chunk_types[chunk_type] = all_chunk_types.get(chunk_type, 0) + count

        # NOTE: the range is over per-document AVERAGE chunk lengths; the
        # individual chunk sizes are not kept by the discovery step.
        chunk_sizes = [f["avg_chunk_length"] for f in discovered_files if f["chunks_count"] > 0]
        min_chunk_size = min(chunk_sizes) if chunk_sizes else 0
        max_chunk_size = max(chunk_sizes) if chunk_sizes else 0

        print(f"\nüìä ESTAD√çSTICAS DE CHUNKS:")
        print(f"   üìè Tama√±o promedio global: {avg_chunk_size_global:.0f} caracteres")
        print(f"   üìà Promedio chunks/documento: {avg_chunks_per_doc:.1f}")
        print(f"   üìâ Rango de tama√±os: {min_chunk_size:.0f} - {max_chunk_size:.0f} chars")

        # Five most frequent chunk types
        print(f"   üè∑Ô∏è Tipos m√°s comunes:")
        sorted_types = sorted(all_chunk_types.items(), key=lambda x: x[1], reverse=True)
        for chunk_type, count in sorted_types[:5]:
            percentage = (count / total_chunks) * 100
            print(f"      ‚Ä¢ {chunk_type}: {count:,} ({percentage:.1f}%)")

    # === OPENAI COST ESTIMATE ===
    if total_pending > 0:
        print(f"\nüí∞ ESTIMACI√ìN DE COSTOS OPENAI (SOLO CHUNKS PENDIENTES):")

        # Pending text is estimated proportionally: each document contributes
        # its total text scaled by its share of pending chunks.
        pending_text_length = 0
        for f in discovered_files:
            if f["chunks_pending"] > 0:
                pending_ratio = f["chunks_pending"] / f["chunks_count"]
                pending_text_length += f["total_text_length"] * pending_ratio

        estimated_tokens = pending_text_length / CHARS_PER_TOKEN
        estimated_cost = (estimated_tokens / 1000) * COST_PER_1K_TOKENS

        print(f"   ü§ñ Modelo: {EMBEDDING_MODEL}")
        print(f"   üìù Caracteres pendientes: {pending_text_length:,.0f}")
        print(f"   üéØ Tokens estimados: {estimated_tokens:,.0f}")
        print(f"   üíµ Costo estimado: ${estimated_cost:.4f} USD")

        if estimated_cost > 0:
            cost_per_chunk = estimated_cost / total_pending
            docs_with_pending = sum(1 for f in discovered_files if f["chunks_pending"] > 0)
            cost_per_document = estimated_cost / docs_with_pending if docs_with_pending > 0 else 0

            print(f"   üìä Costo por chunk pendiente: ${cost_per_chunk:.6f} USD")
            print(f"   üìÑ Costo por documento con pendientes: ${cost_per_document:.4f} USD")

            # Reference bands for the total estimate
            if estimated_cost < 0.01:
                cost_range = "üíö Muy bajo"
            elif estimated_cost < 0.10:
                cost_range = "üíô Bajo"
            elif estimated_cost < 1.00:
                cost_range = "üíõ Moderado"
            elif estimated_cost < 5.00:
                cost_range = "üß° Alto"
            else:
                cost_range = "‚ù§Ô∏è Muy alto"

            print(f"   üìà Rango de costo: {cost_range}")
    else:
        print(f"\nüéâ ¬°No hay chunks pendientes para indexar!")
        print(f"   üí∞ Costo estimado: $0.00 USD")

    # === WARNINGS AND NOTES ===
    print(f"\n‚ö†Ô∏è NOTAS IMPORTANTES:")
    print(f"   ‚Ä¢ Solo se procesar√°n chunks pendientes (sin flag 'indexed': true)")
    print(f"   ‚Ä¢ Los chunks exitosos se marcar√°n autom√°ticamente como indexados")
    print(f"   ‚Ä¢ Los archivos JSON se actualizar√°n autom√°ticamente")
    print(f"   ‚Ä¢ Las futuras ejecuciones continuar√°n donde se qued√≥")
    print(f"   ‚Ä¢ Los costos son estimaciones basadas en {CHARS_PER_TOKEN} chars/token")

else:
    print(f"   ‚ö†Ô∏è No se encontraron archivos *_final.json en {PHILATELIC_JSONS_DIR}")
    print(f"   üí° Aseg√∫rate de haber procesado documentos con el Dolphin parser")

In [None]:
# Print one table row per discovered document (or a notice when there are none)
if not discovered_files:
    print("\n‚ùå No hay documentos para mostrar")
else:
    table_rows = []
    for doc_info in discovered_files:
        # Only the first three chunk types (dict order) go into the table
        leading_types = list(doc_info["chunk_types"].items())[:3]
        table_rows.append({
            "Documento": doc_info["doc_id"],
            "Chunks": doc_info["chunks_count"],
            "P√°ginas": doc_info["page_count"],
            "Tama√±o (MB)": doc_info["file_size_mb"],
            "Promedio chunk": doc_info["avg_chunk_length"],
            "Tipos principales": ", ".join(f"{name}: {qty}" for name, qty in leading_types),
        })
    files_df = pd.DataFrame(table_rows)

    print("\nüìã DOCUMENTOS ENCONTRADOS:")
    print(files_df.to_string(index=False))

## 3. Configuración de Weaviate

Conectar a Weaviate y crear la colección con esquema optimizado.

In [None]:
import weaviate

weaviate.__version__

In [None]:
# Connect to Weaviate and report whether the target collection exists
print("üîå Conectando a Weaviate...")

# Bound up front so the error handler can tell whether a connection was opened
client = None
try:
    client = create_weaviate_client(WEAVIATE_URL, OPENAI_API_KEY)
    print("‚úÖ Conexi√≥n exitosa")

    # Confirm the server answers
    meta = client.get_meta()
    print(f"üìä Weaviate versi√≥n: {meta.get('version', 'unknown')}")

    # Check whether the collection already exists.
    # collections.exists() is used instead of iterating collections.list_all():
    # in the v4 client list_all() returns a dict keyed by collection name, so
    # iterating it yields plain strings and `col.name` raised AttributeError.
    try:
        if client.collections.exists(COLLECTION_NAME):
            collection = client.collections.get(COLLECTION_NAME)
            total_objects = collection.aggregate.over_all(total_count=True).total_count
            print(f"üìä Colecci√≥n '{COLLECTION_NAME}' existe con {total_objects} documentos")
        else:
            print(f"üìù Colecci√≥n '{COLLECTION_NAME}' no existe (se crear√° durante la indexaci√≥n)")
    except Exception as e:
        # Non-fatal: the connection itself is fine, only the check failed
        print(f"‚ö†Ô∏è No se pudo verificar colecciones: {e}")

except Exception as e:
    print(f"‚ùå Error conectando a Weaviate: {e}")
    print("üí° Aseg√∫rate de que Weaviate est√© corriendo:")
    print("   docker-compose up -d")
    # If the client was created before the failure, release its connection
    # before discarding it (best effort; the original error is already shown).
    if client is not None:
        try:
            client.close()
        except Exception:
            pass
    client = None

In [None]:
#client.collections.delete(COLLECTION_NAME)

In [None]:
# Make sure the Oxcart collection exists; drop the client if setup fails
if not client:
    print("‚ö†Ô∏è Saltando configuraci√≥n de colecci√≥n (sin conexi√≥n)")
else:
    print("\nüèóÔ∏è Configurando colecci√≥n Oxcart...")

    collection_created = create_oxcart_collection(client, COLLECTION_NAME)

    if not collection_created:
        print("‚ùå Error configurando colecci√≥n")
        # Later cells treat a falsy client as "no connection"
        client = None
    else:
        print("‚úÖ Colecci√≥n lista para indexaci√≥n")

        # Show what the collection currently holds
        stats = get_collection_stats(client, COLLECTION_NAME)
        if stats:
            print(f"üìä Chunks actuales en Weaviate: {stats.get('total_chunks', 0)}")
            if stats.get('documents'):
                print(f"üìÑ Documentos indexados: {list(stats['documents'].keys())}")

## 4. Indexación Automática

Indexar automáticamente todos los documentos philatelic en Weaviate.

In [None]:
def index_all_documents(client, discovered_files: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Index every discovered document that still has pending chunks.

    Each document with ``chunks_pending > 0`` is passed to
    ``index_philatelic_document``. When that call reports chunks marked as
    indexed, the document JSON is written back to its ``file_path`` so later
    runs can skip those chunks. Progress is shown with tqdm bars and a summary
    is printed at the end.

    Args:
        client: Weaviate client; a falsy value aborts with an error dict.
        discovered_files: Entries as produced by ``discover_philatelic_jsons``
            (uses ``doc_id``, ``data``, ``chunks_pending`` and ``file_path``).

    Returns:
        ``{"error": ...}`` when there is no client or no files; a short dict
        with ``all_indexed: True`` when nothing is pending; otherwise a summary
        dict with totals, timing and per-document ``results``.
    """
    if not client:
        return {"error": "No hay conexi√≥n a Weaviate"}

    if not discovered_files:
        return {"error": "No hay documentos para indexar"}

    # Keep only the documents that still have pending chunks
    documents_to_process = [
        file_info for file_info in discovered_files
        if file_info.get("chunks_pending", 0) > 0
    ]
    total_pending_chunks = sum(f["chunks_pending"] for f in documents_to_process)

    if not documents_to_process:
        print("üéâ ¬°Todos los documentos ya est√°n completamente indexados!")
        return {
            "total_documents": len(discovered_files),
            "successful_documents": len(discovered_files),
            "failed_documents": 0,
            "total_chunks_indexed": 0,
            "total_chunks_pending": 0,
            "all_indexed": True
        }

    # Imported only once there is real work, so the guard paths above do not
    # need the progress-bar dependency.
    from tqdm import tqdm

    print(f"üöÄ INICIANDO INDEXACI√ìN MASIVA ROBUSTA CON MANEJO DE CHUNKS LARGOS")
    print(f"üìÑ Documentos con chunks pendientes: {len(documents_to_process)}")
    print(f"üì¶ Total chunks pendientes: {total_pending_chunks:,}")
    print(f"‚úÇÔ∏è Chunks largos ser√°n truncados autom√°ticamente a 12,000 caracteres")
    print("=" * 60)

    indexing_results = []
    total_chunks_indexed = 0
    total_chunks_failed = 0
    total_chunks_truncated = 0
    total_chars_saved = 0
    documents_updated = 0

    start_time = time.time()

    # Outer progress bar: one tick per document
    doc_pbar = tqdm(
        documents_to_process,
        desc="üìÑ Procesando documentos",
        unit="doc",
        position=0
    )

    for file_info in doc_pbar:
        doc_id = file_info["doc_id"]
        document = file_info["data"]
        chunks_pending = file_info["chunks_pending"]
        file_path = file_info["file_path"]

        doc_pbar.set_description(f"üìÑ Procesando {doc_id}")

        # Inner progress bar: chunks of the current document. Every document
        # reaching this loop has chunks_pending > 0 (see the filter above).
        chunk_pbar = tqdm(
            total=chunks_pending,
            desc=f"  üì¶ Indexando chunks",
            unit="chunk",
            position=1,
            leave=False
        )

        def update_chunk_progress(successful_count):
            # Passed to the indexer as progress_callback
            chunk_pbar.update(successful_count)

        try:
            result = index_philatelic_document(
                client,
                document,
                COLLECTION_NAME,
                progress_callback=update_chunk_progress
            )

            chunks_indexed = result.get("successful", 0)
            chunks_failed = len(result.get("errors", []))
            chunks_marked = result.get("chunks_marked_as_indexed", 0)

            # Truncation statistics reported by the indexer
            validation_stats = result.get("validation_stats", {})
            chunks_truncated_doc = validation_stats.get("truncated_chunks", 0)
            chars_saved_doc = validation_stats.get("total_chars_saved", 0)

            indexing_results.append({
                "doc_id": doc_id,
                "success": chunks_indexed > 0 or result.get("already_indexed", False),
                "chunks_indexed": chunks_indexed,
                "chunks_failed": chunks_failed,
                "chunks_marked": chunks_marked,
                "chunks_truncated": chunks_truncated_doc,
                "chars_saved": chars_saved_doc,
                "already_indexed": result.get("already_indexed", False),
                "errors": result.get("errors", []),
                "validation_stats": validation_stats
            })

            total_chunks_indexed += chunks_indexed
            total_chunks_failed += chunks_failed
            total_chunks_truncated += chunks_truncated_doc
            total_chars_saved += chars_saved_doc

            # Short status shown next to the document progress bar
            status_parts = []
            if chunks_marked > 0:
                status_parts.append(f"{chunks_marked} marcados")
            if chunks_truncated_doc > 0:
                status_parts.append(f"{chunks_truncated_doc} truncados")

            if status_parts:
                doc_pbar.set_postfix_str(f"‚úÖ {', '.join(status_parts)}")

            # Persist the document when chunks were marked as indexed.
            # presumably index_philatelic_document sets those flags on
            # `document` in place; verify against philatelic_weaviate.
            if chunks_marked > 0:
                try:
                    with open(file_path, 'w', encoding='utf-8') as f:
                        json.dump(document, f, ensure_ascii=False, indent=2)
                    documents_updated += 1
                except Exception as save_error:
                    print(f"‚ö†Ô∏è Error guardando {file_path}: {save_error}")
            elif result.get("already_indexed"):
                doc_pbar.set_postfix_str("‚úÖ Ya indexado")

        except Exception as e:
            # One failing document must not stop the batch
            print(f"‚ùå Error indexando {doc_id}: {e}")
            indexing_results.append({
                "doc_id": doc_id,
                "success": False,
                "error": str(e)
            })

        finally:
            chunk_pbar.close()

    doc_pbar.close()
    total_time = time.time() - start_time

    # Final summary
    successful_docs = sum(1 for r in indexing_results if r.get("success", False))
    total_chunks_attempted = total_chunks_indexed + total_chunks_failed

    summary = {
        "total_documents": len(discovered_files),
        "documents_processed": len(documents_to_process),
        "successful_documents": successful_docs,
        "failed_documents": len(documents_to_process) - successful_docs,
        "documents_updated": documents_updated,
        "total_chunks_indexed": total_chunks_indexed,
        "total_chunks_failed": total_chunks_failed,
        "total_chunks_truncated": total_chunks_truncated,
        "total_chars_saved": total_chars_saved,
        "total_time_seconds": total_time,
        "avg_time_per_document": total_time / len(documents_to_process) if documents_to_process else 0,
        "chunks_per_second": total_chunks_indexed / total_time if total_time > 0 else 0,
        "results": indexing_results
    }

    print("\n" + "=" * 60)
    print("üìä RESUMEN FINAL DE INDEXACI√ìN:")
    print(f"   ‚úÖ Documentos procesados: {len(documents_to_process)}")
    print(f"   ‚úÖ Documentos exitosos: {successful_docs}")
    print(f"   üíæ Archivos JSON actualizados: {documents_updated}")
    print(f"   üì¶ Chunks indexados: {total_chunks_indexed:,}")
    print(f"   ‚ùå Chunks fallidos: {total_chunks_failed:,}")

    # Truncation statistics, only when something was truncated
    if total_chunks_truncated > 0:
        print(f"   ‚úÇÔ∏è Chunks truncados exitosamente: {total_chunks_truncated:,}")
        print(f"   üíæ Caracteres removidos: {total_chars_saved:,}")
        # Guarded: truncated chunks can be reported while none were indexed
        # or failed, which previously divided by zero.
        truncation_rate = (total_chunks_truncated / total_chunks_attempted) * 100 if total_chunks_attempted > 0 else 0
        print(f"   üìä Tasa de truncado: {truncation_rate:.1f}%")
        print("   üí° Los chunks truncados mantienen informaci√≥n clave del inicio")

    print(f"   ‚è±Ô∏è Tiempo total: {total_time:.1f} segundos")
    print(f"   üöÄ Velocidad: {summary['chunks_per_second']:.1f} chunks/segundo")

    success_rate = (total_chunks_indexed / total_chunks_attempted) * 100 if total_chunks_attempted > 0 else 0
    print(f"   üìà Tasa de √©xito: {success_rate:.1f}%")

    return summary

print("‚úÖ Funci√≥n de indexaci√≥n mejorada con manejo inteligente de chunks largos")

In [None]:
# === SMOKE TEST FOR LONG-CHUNK HANDLING ===
# Set test_mode = True to index a single document and inspect truncation.
test_mode = False  # disabled by default
# Position in discovered_files of the document used for the test. It is
# clamped below so a corpus with fewer documents does not raise IndexError.
TEST_DOC_INDEX = 2
if test_mode and discovered_files:
    print("üß™ MODO PRUEBA: Probando manejo inteligente de chunks largos")
    # NOTE(review): this message says 30,000 characters while
    # index_all_documents announces truncation at 12,000; confirm the real limit.
    print("‚úÇÔ∏è Esta prueba validar√° que chunks > 30,000 caracteres sean truncados autom√°ticamente")
    test_files = [discovered_files[min(TEST_DOC_INDEX, len(discovered_files) - 1)]]
    indexing_summary = index_all_documents(client, test_files)

    print("\nüß™ AN√ÅLISIS DE PRUEBA:")
    # "results" is absent on error / all-indexed summaries and may be empty
    test_results = indexing_summary.get("results") if indexing_summary else None
    if test_results:
        result = test_results[0]
        if result.get("chunks_truncated", 0) > 0:
            print(f"‚úÖ √âXITO: {result['chunks_truncated']} chunks fueron truncados autom√°ticamente")
            print(f"üíæ Caracteres removidos: {result['chars_saved']:,}")
            print("‚úÖ No deber√≠a haber errores de 'maximum context length'")
        else:
            print("‚ÑπÔ∏è No se encontraron chunks que requirieran truncado en este documento")

        if result.get("chunks_indexed", 0) > 0:
            print(f"‚úÖ √âXITO: {result['chunks_indexed']} chunks indexados exitosamente")
        else:
            print("‚ö†Ô∏è No se index√≥ ning√∫n chunk - revisar errores")

    print("\nüß™ Prueba completada. Si no hay errores de 'maximum context length', la soluci√≥n funciona.")
elif test_mode and not discovered_files:
    print("‚ö†Ô∏è No hay documentos para probar")
else:
    print("‚ÑπÔ∏è Modo prueba desactivado (test_mode = False)")

# === INDEXACI√ìN COMPLETA ===

In [None]:
# Run the full indexing pass over every document with pending chunks
if client and discovered_files:
    print("üéØ ¬øProceder con la indexaci√≥n robusta?")

    # Only pending chunks count towards the work to do
    total_pending = sum(f.get("chunks_pending", 0) for f in discovered_files)
    docs_with_pending = sum(1 for f in discovered_files if f.get("chunks_pending", 0) > 0)

    if total_pending == 0:
        print("üéâ ¬°Todos los chunks ya est√°n indexados!")
        print("   No hay nada que procesar.")
        indexing_summary = {
            "all_indexed": True,
            "message": "Todos los chunks ya est√°n indexados"
        }
    else:
        print(f"   üìÑ Documentos con chunks pendientes: {docs_with_pending}")
        print(f"   üì¶ Total chunks pendientes: {total_pending:,}")

        # Rough time estimate assuming ~75 chunks per minute
        estimated_minutes = total_pending / 75
        print(f"   ‚è±Ô∏è Tiempo estimado: {estimated_minutes:.1f} minutos")
        print(f"   üîÑ Incluye reintentos autom√°ticos y manejo de rate limits")
        print(f"   üíæ Los archivos JSON se actualizar√°n autom√°ticamente")

        indexing_summary = index_all_documents(client, discovered_files)

        # Persist the run summary next to the notebook
        results_file = "indexing_results_robust.json"
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(indexing_summary, f, ensure_ascii=False, indent=2)
        # "\n" is a real newline here; "\\n" printed a literal backslash-n
        print(f"\nüíæ Resultados guardados en: {results_file}")

        # Report how many JSON files were rewritten with indexed flags
        if indexing_summary.get("documents_updated", 0) > 0:
            print(f"\nüìù ARCHIVOS JSON ACTUALIZADOS:")
            print(f"   ‚Ä¢ {indexing_summary['documents_updated']} archivos con chunks marcados como indexados")
            print(f"   ‚Ä¢ Las futuras ejecuciones saltar√°n autom√°ticamente estos chunks")

else:
    print("‚ö†Ô∏è No se puede proceder con la indexaci√≥n:")
    if not client:
        print("   - Sin conexi√≥n a Weaviate")
    if not discovered_files:
        print("   - No hay documentos para indexar")
    indexing_summary = None

## 5. Validación y Estadísticas

Verificar que la indexación fue exitosa y mostrar estadísticas detalladas.

In [None]:
# Compare what Weaviate reports against the chunks found on disk
if client:
    print("üîç VALIDANDO INDEXACI√ìN...")

    # NOTE(review): the third argument (2000) is passed positionally; its
    # meaning is defined in philatelic_weaviate.get_collection_stats.
    current_stats = get_collection_stats(client, COLLECTION_NAME, 2000)

    if current_stats:
        print(f"\nüìä ESTAD√çSTICAS DE WEAVIATE:")
        print(f"   üì¶ Total chunks indexados: {current_stats.get('total_chunks', 0):,}")
        print(f"   üìÑ Documentos √∫nicos: {current_stats.get('total_documents', 0)}")

        # Per-document chunk counts
        if current_stats.get('documents'):
            print(f"\nüìã DOCUMENTOS EN WEAVIATE:")
            for doc_id, chunk_count in current_stats['documents'].items():
                print(f"   ‚Ä¢ {doc_id}: {chunk_count:,} chunks")

        # Chunk-type counts
        if current_stats.get('chunk_types'):
            print(f"\nüè∑Ô∏è TIPOS DE CHUNKS:")
            for chunk_type, count in current_stats['chunk_types'].items():
                print(f"   ‚Ä¢ {chunk_type}: {count:,}")

        # Compare with the source files (only if discovery ran in this session)
        if 'discovered_files' in locals() and discovered_files:
            expected_chunks = sum(f["chunks_count"] for f in discovered_files)
            indexed_chunks = current_stats.get('total_chunks', 0)

            print(f"\nüîÑ COMPARACI√ìN:")
            print(f"   üì• Chunks esperados: {expected_chunks:,}")
            print(f"   üì§ Chunks indexados: {indexed_chunks:,}")

            if indexed_chunks == expected_chunks:
                print(f"   ‚úÖ ¬°Indexaci√≥n completa al 100%!")
            elif indexed_chunks > 0:
                if expected_chunks > 0:
                    coverage = (indexed_chunks / expected_chunks) * 100
                    print(f"   üìä Cobertura: {coverage:.1f}%")
                    if coverage < 100:
                        missing = expected_chunks - indexed_chunks
                        print(f"   ‚ö†Ô∏è Faltan {missing:,} chunks")
                else:
                    # Local files hold no chunks but Weaviate does: a coverage
                    # ratio would divide by zero, so say so instead.
                    print("   Cobertura no calculable: los archivos locales no contienen chunks")
            else:
                print(f"   ‚ùå No hay chunks indexados")
    else:
        print("‚ùå No se pudieron obtener estad√≠sticas de Weaviate")
else:
    print("‚ö†Ô∏è Sin conexi√≥n a Weaviate para validaci√≥n")

## 6. Pruebas de Búsqueda Semántica

Probar el sistema de búsqueda semántica con consultas filatélicas específicas.

In [None]:
# -*- coding: utf-8 -*-
"""
Thresholded + boosted + diversified Weaviate retrieval
- Works with Weaviate Python client v4 (collections.query.*)
- Backward-compatible name: search_chunks_semantic(...)
- Adds: min_score gating, domain boosts (Scott/year/quality), dedup, MMR, multi-mode fallback
"""

import re, time, math, hashlib
from typing import List, Dict, Any, Optional, Tuple
from collections import defaultdict

# ---- Weaviate v4 query helpers
try:
    from weaviate.classes import query as wv_query
    from weaviate.classes.query import Filter as WvFilter
except Exception:
    # If you import differently in your project, adjust these two imports accordingly.
    wv_query = None
    WvFilter = None


# =========================
# Utility helpers
# =========================

def _distance_to_similarity(distance: Optional[float], metric: str = "cosine") -> Optional[float]:
    """Convert a vector distance into a similarity value.

    Returns None when `distance` is None or when `metric` is not one of
    "cosine", "l2", "euclidean" or "dot". Only the cosine branch clamps its
    result to [0, 1]; the "dot" conversion is a heuristic.
    """
    if distance is None:
        return None

    value = float(distance)

    if metric == "dot":
        # Heuristic mapping; the result is not clamped.
        return 1.0 - (value / 2.0)
    if metric in ("l2", "euclidean"):
        return 1.0 / (1.0 + value)
    if metric != "cosine":
        return None

    # Cosine distance -> similarity, clamped to [0, 1].
    return max(0.0, min(1.0, 1.0 - value))

def _norm_score(raw: Optional[float]) -> float:
    """Clamp a raw relevance score into [0, 1].

    `None` and NaN both map to 0.0. Anything else is converted with
    `float()` and clipped, so values above 1 become 1.0 and negative values
    become 0.0.

    NaN needs its own check: `min(1.0, nan)` evaluates to 1.0, so without it
    an undefined score would be promoted to the best possible score.
    """
    if raw is None:
        return 0.0
    value = float(raw)
    if math.isnan(value):
        return 0.0
    return max(0.0, min(1.0, value))

def _text_hash(s: str) -> str:
    """Return the hex SHA-256 digest of `s` encoded as UTF-8.

    Characters that cannot be encoded are dropped (errors="ignore").
    """
    hasher = hashlib.sha256()
    hasher.update(s.encode("utf-8", errors="ignore"))
    return hasher.hexdigest()

def _extract_query_years(query: str) -> List[int]:
    """Return the standalone four-digit years (1800-2099) found in `query`.

    Years are returned as ints in order of appearance; duplicates are kept.
    """
    year_tokens = re.findall(r"\b(18\d{2}|19\d{2}|20\d{2})\b", query)
    return [int(token) for token in year_tokens]

def _year_overlap(query_years: List[int], hit_years: List[int]) -> bool:
    """Return True when the two year lists share at least one year.

    Entries whose string form is not all digits are ignored; the rest are
    compared as ints, so 1907 and "1907" count as the same year. An empty
    or missing list on either side yields False.
    """
    if not query_years or not hit_years:
        return False

    def _as_year_set(values):
        return {int(v) for v in values if str(v).isdigit()}

    wanted = _as_year_set(query_years)
    found = _as_year_set(hit_years)
    return not wanted.isdisjoint(found)

def _boosts(hit: Dict[str, Any], query: str, requested_scotts: Optional[List[str]], query_years: List[int]) -> float:
    """
    Compute the domain-aware score boost for one hit, capped at 0.30.

    - Scott match  : +0.15 when any requested Scott number equals one of the
                     hit's `scott_numbers` (compared stripped and lower-cased)
    - Year overlap : +0.08 when `query_years` and the hit's `years` intersect
    - Chunk quality: up to +0.07, proportional to `quality_score` clamped to [0, 1]

    `query` is accepted for signature compatibility but is not used here.
    A missing, non-numeric or NaN `quality_score` contributes no boost.
    """
    boost = 0.0

    # Scott boost
    if requested_scotts:
        hit_scotts = {str(s).strip().lower() for s in (hit.get("scott_numbers") or [])}
        wanted_scotts = {str(s).strip().lower() for s in requested_scotts}
        if hit_scotts & wanted_scotts:
            boost += 0.15

    # Year overlap boost
    if _year_overlap(query_years, hit.get("years") or []):
        boost += 0.08

    # Quality boost: only conversion failures are swallowed, not every exception.
    try:
        quality = float(hit.get("quality_score", 0.0))
    except (TypeError, ValueError, OverflowError):
        quality = 0.0
    # NaN must be excluded explicitly: min(1.0, nan) would clamp it to 1.0.
    if not math.isnan(quality):
        boost += 0.07 * max(0.0, min(1.0, quality))

    return min(boost, 0.30)

def _passes_content_gates(hit: Dict[str, Any], min_chars: int) -> Tuple[bool, str]:
    """Decide whether a hit's text is long and on-topic enough to keep.

    Returns `(True, "ok")` when the stripped text has at least `min_chars`
    characters and contains a philatelic keyword (case-insensitive).
    Otherwise returns `(False, reason)` with reason `"too_short<N"` or
    `"weak_domain_signal"`.

    The keyword guard accepts "stamp"/"stamps" and "issue"/"issues"/"issued";
    the earlier singular-only patterns rejected chunks that only used the
    plural or past-tense forms.
    """
    text = (hit.get("text") or "").strip()
    if len(text) < min_chars:
        return False, f"too_short<{min_chars}"
    # Gentle philately guard to prevent off-topic noise
    domain_pattern = r"\bstamps?\b|\bperforat|\bwatermark|\bscott\b|\bsurcharge|\bissue[sd]?\b"
    if not re.search(domain_pattern, text, flags=re.I):
        return False, "weak_domain_signal"
    return True, "ok"

def _dedup(hits: List[Dict[str, Any]], max_per_doc: int = 2) -> List[Dict[str, Any]]:
    """Drop exact-text duplicates and cap hits per (doc_id, page_number).

    Hits are processed in input order. A hit whose text hash was already
    kept is rejected as "dup_text"; a hit whose (doc_id, page_number) pair
    already has `max_per_doc` kept hits is rejected as "doc_cap". Rejected
    hits are annotated in place under "_reject_reason" and left out of the
    returned list.
    """
    kept: List[Dict[str, Any]] = []
    kept_per_page: Dict[Any, int] = {}
    kept_digests = set()

    for hit in hits:
        digest = _text_hash(hit.get("text") or "")
        if digest in kept_digests:
            hit["_reject_reason"] = "dup_text"
            continue

        page_key = (hit.get("doc_id"), hit.get("page_number"))
        already_kept = kept_per_page.get(page_key, 0)
        if already_kept >= max_per_doc:
            hit["_reject_reason"] = "doc_cap"
            continue

        kept_per_page[page_key] = already_kept + 1
        kept_digests.add(digest)
        kept.append(hit)

    return kept

def _mmr_select(candidates: List[Dict[str, Any]], k: int, lambda_diversity: float = 0.7) -> List[Dict[str, Any]]:
    """
    Pick up to `k` candidates with a lightweight MMR (maximal marginal relevance).

    Each round selects the remaining candidate that maximises
        lambda_diversity * final_score + (1 - lambda_diversity) * (1 - max_sim)
    where `max_sim` is the highest Jaccard similarity, on lower-cased
    alphanumeric token sets, between the candidate and anything already
    picked. Note that `lambda_diversity` weights *relevance*; lower values
    favour diversity. Ties keep the earlier candidate.

    Assumes each candidate carries a numeric 'final_score' (a missing key
    counts as 0.0). Returns the selected candidate dicts in pick order.

    Token sets are held in a local list instead of being written into the
    candidate dicts, so no helper key is left on unselected candidates.
    """
    if not candidates or k <= 0:
        return []

    token_sets = [
        set(re.findall(r"[a-z0-9]+", (c.get("text") or "").lower()))
        for c in candidates
    ]
    remaining = list(range(len(candidates)))
    picked: List[int] = []

    while remaining and len(picked) < k:
        best_idx, best_val = None, 0.0
        for idx in remaining:
            relevance = float(candidates[idx].get("final_score", 0.0))
            diversity_bonus = 0.0
            if picked:
                tokens = token_sets[idx]
                max_sim = 0.0
                for chosen_idx in picked:
                    other = token_sets[chosen_idx]
                    union_size = len(tokens | other) or 1  # both empty -> similarity 0
                    max_sim = max(max_sim, len(tokens & other) / union_size)
                diversity_bonus = 1.0 - max_sim  # prefer lower similarity
            val = lambda_diversity * relevance + (1 - lambda_diversity) * diversity_bonus
            # `best_idx is None` guarantees one pick per round, even if every
            # value is NaN or extremely negative.
            if best_idx is None or val > best_val:
                best_idx, best_val = idx, val
        picked.append(best_idx)
        remaining.remove(best_idx)

    return [candidates[i] for i in picked]


# =========================
# Filters (Weaviate v4)
# =========================

def _build_filters(filters: Optional[Dict[str, Any]]) -> Optional[Any]:
    """
    Build a Weaviate v4 filter expression (where clause) from a light dict.

    Supported keys in `filters`:
      - "year_range": (start, end)  -> year_start >= start AND year_end <= end
      - "scott_numbers": List[str]  -> `scott_numbers` contains any of the values
      - "catalog_system": str       -> `catalog_systems` contains the value
      - "chunk_type", "has_catalog", "has_prices", "has_varieties",
        "is_guanacaste"             -> the property equals the given value
        (skipped when the value is None or "")

    All clauses are AND-ed together. Returns None when `filters` is empty or
    yields no clause. Keys outside the list above are not applied; a warning
    is emitted so a dropped filter is visible instead of silently returning
    unfiltered results.

    NOTE(review): the equality keys are the scalar properties that
    `search_chunks_semantic` already requests from the collection; confirm
    their data types against the collection schema.
    """
    if not filters:
        return None

    import warnings  # local import: not part of this cell's top-level imports

    equality_keys = ("chunk_type", "has_catalog", "has_prices", "has_varieties", "is_guanacaste")
    known_keys = {"year_range", "catalog_system", "scott_numbers", *equality_keys}
    clauses = []

    if "year_range" in filters and isinstance(filters["year_range"], (tuple, list)) and len(filters["year_range"]) == 2:
        ys, ye = filters["year_range"]
        try:
            c1 = WvFilter.by_property("year_start").greater_than_equal(ys)
            c2 = WvFilter.by_property("year_end").less_than_equal(ye)
            clauses.append(c1)
            clauses.append(c2)
        except Exception:
            # Only reached if building the range clauses itself raises; this
            # does not detect a schema without year_start/year_end.
            # Coarse approximation: any listed year inside the range.
            year_list = list(range(int(ys), int(ye) + 1))
            clauses.append(WvFilter.by_property("years").contains_any(year_list))

    if filters.get("catalog_system"):
        clauses.append(WvFilter.by_property("catalog_systems").contains_any([filters["catalog_system"]]))

    if filters.get("scott_numbers"):
        clauses.append(WvFilter.by_property("scott_numbers").contains_any(list(filters["scott_numbers"])))

    for key in equality_keys:
        value = filters.get(key)
        if value is None or value == "":
            continue
        clauses.append(WvFilter.by_property(key).equal(value))

    ignored = sorted(str(k) for k in filters if k not in known_keys)
    if ignored:
        warnings.warn(
            "_build_filters: ignoring unsupported filter key(s): " + ", ".join(ignored),
            stacklevel=2,
        )

    if not clauses:
        return None

    # AND all clauses
    where = clauses[0]
    for clause in clauses[1:]:
        where = where & clause
    return where


# =========================
# Main: thresholded + diversified retrieval
# =========================

def search_chunks_semantic(
    client,
    query: str,
    collection_name: str = "Oxcart",
    limit: int = 5,
    filters: Optional[Dict[str, Any]] = None,
    mode: str = "vector",          # tried first; the other modes are fallbacks
    alpha: float = 0.35,           # passed to hybrid queries only
    distance_metric: str = "cosine",
    # --- Safety/quality knobs (tunable) ---
    min_score: float = 0.55,       # threshold AFTER boosts (0..1)
    min_chars: int = 280,          # tiny snippet filter
    mmr_lambda: float = 0.7,       # relevance weight handed to _mmr_select
    overfetch_factor: int = 3,     # fetch limit * overfetch_factor, then gate
    k_min: int = 3,                # minimum contexts needed
    requested_scotts: Optional[List[str]] = None,  # domain boost
) -> List[Dict[str, Any]]:
    """
    Retrieve chunks with thresholding, boosts, dedup, MMR and multi-stage fallback.

    Stages, stopping as soon as enough hits survive:
      1. Query with the requested `mode`, then the remaining modes out of
         "hybrid", "bm25", "vector" (any other mode name runs `near_text`),
         using `filters` as given.
      2. If still short and `filters` was provided, drop "scott_numbers" and
         retry every mode; then also drop "year_range" and retry. A
         relaxation that leaves no filter at all is still run (unfiltered).

    Within a stage each hit is scored as clamp(score + boost), gated on
    content and `min_score`, de-duplicated and MMR-selected. Across stages
    hits are merged by uuid keeping the higher `final_score`, sorted
    descending and truncated to `limit`.

    Args:
        client: connected Weaviate v4 client.
        query: free-text query; years found in it feed the year boost.
        collection_name: collection to search.
        limit: maximum number of results; values <= 0 return [] immediately.
        filters: light filter dict understood by `_build_filters`.
        k_min: number of surviving hits that ends the fallback chain
            (effectively capped at `limit`).
        Remaining parameters: see the inline comments in the signature.

    Returns:
        List of hit dicts (properties plus 'score', 'similarity',
        'distance', 'norm_score', 'boost', 'final_score', 'mode', 'stage'),
        best `final_score` first.

    NOTE(review): `_norm_score` clamps raw scores to [0, 1]; BM25 scores are
    presumably not bounded by 1, so strong BM25 hits may all saturate at
    1.0 - confirm against the Weaviate scoring docs.
    """
    if limit <= 0:
        # Nothing can be returned, so skip the round-trips entirely.
        return []

    coll = client.collections.get(collection_name)
    query_years = _extract_query_years(query)
    rejected_reasons = defaultdict(int)  # reason -> count, for debugging only
    fetch_limit = limit * overfetch_factor
    # Results are truncated to `limit`, so waiting for more than that many
    # hits could never succeed and would just run every fallback.
    enough = min(k_min, limit)

    return_props = [
        "chunk_id", "chunk_type", "text", "text_original", "doc_id", "page_number",
        "catalog_systems", "catalog_numbers", "scott_numbers", "years", "colors",
        "topics_primary", "variety_classes", "has_catalog", "has_prices", "has_varieties",
        "is_guanacaste", "quality_score",
    ]

    def _run(mode_local: str, label: str, where: Optional[Any]) -> List[Dict[str, Any]]:
        """Run one Weaviate query and flatten the objects into plain dicts."""
        shared = {
            "query": query,
            "limit": fetch_limit,
            "filters": where,
            "return_properties": return_props,
        }
        if mode_local == "hybrid":
            resp = coll.query.hybrid(
                alpha=alpha,
                return_metadata=wv_query.MetadataQuery(score=True, distance=True),
                **shared,
            )
        elif mode_local == "bm25":
            resp = coll.query.bm25(
                return_metadata=wv_query.MetadataQuery(score=True),
                **shared,
            )
        else:
            resp = coll.query.near_text(
                return_metadata=wv_query.MetadataQuery(distance=True),
                **shared,
            )

        hits = []
        for obj in (resp.objects or []):
            props = obj.properties or {}
            meta = getattr(obj, "metadata", None)
            distance = getattr(meta, "distance", None) if meta else None
            raw_score = getattr(meta, "score", None) if meta else None

            similarity = _distance_to_similarity(distance, metric=distance_metric)
            if raw_score is not None:
                base_score = raw_score
            elif similarity is not None:
                base_score = similarity
            else:
                base_score = 0.0

            # Prefer the original text (which carries the figure markdown).
            # The former figure "de-dup" pass only ever re-found figures that
            # were already in this same string, so it never changed the text.
            content = props.get("text_original", "") or props.get("text", "") or ""

            hits.append({
                "uuid": str(obj.uuid),
                "score": base_score,             # original score (hybrid/bm25) or similarity (vector)
                "similarity": similarity,
                "distance": distance,
                "chunk_id": props.get("chunk_id", ""),
                "chunk_type": props.get("chunk_type", ""),
                "text": content,
                "doc_id": props.get("doc_id", ""),
                "page_number": props.get("page_number", 0),
                "catalog_systems": props.get("catalog_systems", []),
                "catalog_numbers": props.get("catalog_numbers", []),
                "scott_numbers": props.get("scott_numbers", []),
                "years": props.get("years", []),
                "colors": props.get("colors", []),
                "topics_primary": props.get("topics_primary", ""),
                "variety_classes": props.get("variety_classes", []),
                "has_catalog": props.get("has_catalog", False),
                "has_prices": props.get("has_prices", False),
                "has_varieties": props.get("has_varieties", False),
                "is_guanacaste": props.get("is_guanacaste", False),
                "quality_score": props.get("quality_score", 0.0),
                "mode": mode_local,
                "stage": label,
            })
        return hits

    def _gate_and_rank(raw_hits: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Score, gate, de-duplicate and diversify one batch of raw hits."""
        gated = []
        for h in raw_hits:
            s_norm = _norm_score(h.get("score"))
            b = _boosts(h, query, requested_scotts, query_years)
            final = max(0.0, min(1.0, s_norm + b))
            ok, reason = _passes_content_gates(h, min_chars=min_chars)
            if not ok:
                h["_reject_reason"] = reason
                rejected_reasons[reason] += 1
                continue
            if final < min_score:
                h["_reject_reason"] = f"below_threshold<{min_score:.2f}>"
                rejected_reasons[h["_reject_reason"]] += 1
                continue

            h["norm_score"] = s_norm
            h["boost"] = b
            h["final_score"] = final
            gated.append(h)

        # dedup per doc & by text hash
        deduped = _dedup(gated, max_per_doc=2)
        # diversify with MMR
        return _mmr_select(
            sorted(deduped, key=lambda x: x["final_score"], reverse=True),
            k=min(limit, len(deduped)),
            lambda_diversity=mmr_lambda,
        )

    def _merge(pool: List[Dict[str, Any]], new_hits: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Merge by uuid keeping the best final_score; return the top `limit`."""
        by_uuid = {h["uuid"]: h for h in pool}
        for h in new_hits:
            current = by_uuid.get(h["uuid"])
            if current is None or h["final_score"] > current["final_score"]:
                by_uuid[h["uuid"]] = h
        ranked = sorted(by_uuid.values(), key=lambda x: x["final_score"], reverse=True)
        return ranked[:limit]

    # User-requested mode first (keeps backward compatibility), then the rest.
    modes_order = [mode] + [m for m in ("hybrid", "bm25", "vector") if m != mode]

    # Stage 1 uses the filters as given; later stages progressively relax
    # them (drop Scott, then year). Only relaxations that actually change
    # the filter set are scheduled, so no query is repeated unchanged.
    filter_stages = [(filters, "S", "filters")]
    if filters:
        relaxed = dict(filters)
        for droppable in ("scott_numbers", "year_range"):
            if droppable in relaxed:
                relaxed.pop(droppable)
                filter_stages.append((dict(relaxed), "Rx", "relaxed"))

    gathered: List[Dict[str, Any]] = []
    for stage_filters, prefix, suffix in filter_stages:
        # May be None: a fully relaxed stage runs unfiltered.
        where = _build_filters(stage_filters)
        for mi, m in enumerate(modes_order, 1):
            raw = _run(m, f"{prefix}{mi}:{m}+{suffix}", where)
            gathered = _merge(gathered, _gate_and_rank(raw))
            if len(gathered) >= enough:
                break
        if len(gathered) >= enough:
            break

    # `rejected_reasons` can be inspected here when debugging the gates.
    return gathered



In [None]:
# =========================
# Example usage: runs a live query, so it needs a connected Weaviate `client`
# (presumably created in an earlier cell - confirm before running).
# =========================
results = search_chunks_semantic(
    client=client,
    query="Costa Rica 1881-82 surcharges Scott 55",
    collection_name="Oxcart",
    limit=60,
    filters={},           # empty dict: no where clause is built
    mode="hybrid",
    alpha=0.35,
    min_score=0.55,
    min_chars=280,
    k_min=3,
    requested_scotts=[],  # empty list: the Scott boost is never applied
)

# One line per hit: boosted score, retrieval stage, source document/page and Scott numbers.
for r in results:
    print(f"{r['final_score']:.3f} | {r['stage']} | {r['doc_id']} p.{r['page_number']} | Scotts={r.get('scott_numbers')}")



In [None]:
def test_philatelic_searches(client) -> None:
    """
    Run a fixed set of sample philatelic queries and print their hits.

    Prints a message and returns at once when `client` is falsy. Each sample
    is sent through `search_chunks_semantic` against the "Oxcart" collection
    with limit=3. Any exception raised while searching or printing a sample
    is caught and reported, and the next sample is run.
    """
    if not client:
        print("‚ùå Sin conexi√≥n a Weaviate")
        return

    # Sample philatelic queries
    sample_searches = [
        {
            "name": "B√∫squeda general de sellos",
            "query": "stamps Scott catalog Costa Rica",
            "filters": None
        },
        {
            "name": "Cat√°logo Scott espec√≠fico",
            "query": "Scott catalog numbers",
            "filters": {"catalog_system": "Scott"}
        },
        {
            "name": "Sobrecargas y variedades",
            "query": "overprint surcharge variety error",
            "filters": {"has_varieties": True}
        },
        {
            "name": "Periodo Guanacaste",
            "query": "Guanacaste overprint historical Costa Rica",
            "filters": {"is_guanacaste": True}
        },
        {
            "name": "Especificaciones t√©cnicas",
            "query": "perforation paper printing watermark",
            "filters": {"has_technical_specs": True}
        },
        {
            "name": "Tablas con datos",
            "query": "catalog table prices values",
            "filters": {"chunk_type": "table"}
        }
    ]

    # Optional metadata printed only when the hit has a non-empty value:
    # (result key, printed label)
    optional_fields = (
        ("catalog_systems", "üìñ Cat√°logos"),
        ("scott_numbers", "üî¢ Scott"),
        ("years", "üìÖ A√±os"),
        ("colors", "üé® Colores"),
        ("variety_classes", "üîÄ Variedades"),
    )

    total = len(sample_searches)

    print("üîç EJECUTANDO B√öSQUEDAS DE PRUEBA")
    print("=" * 60)

    for position, sample in enumerate(sample_searches, 1):
        sample_filters = sample["filters"]

        print(f"\nüîé [{position}/{total}] {sample['name']}")
        print(f"   Query: \"{sample['query']}\"")
        if sample_filters:
            print(f"   Filtros: {sample_filters}")

        try:
            hits = search_chunks_semantic(
                client,
                sample["query"],
                "Oxcart",
                limit=3,
                filters=sample_filters,
            )

            print(f"   üìä Resultados: {len(hits)}")

            for rank, hit in enumerate(hits, 1):
                print(f"\n      üè∑Ô∏è #{rank} (Score: {hit['score']:.3f})")
                print(f"         üìÑ Documento: {hit['doc_id']}")
                print(f"         üìã Tipo: {hit['chunk_type']}")
                print(f"         üìÑ P√°gina: {hit['page_number']}")

                # Relevant metadata
                for field, label in optional_fields:
                    if hit.get(field):
                        print(f"         {label}: {hit[field]}")

                # Text preview, truncated to 200 characters
                preview = hit.get('text', '')
                preview = preview if len(preview) <= 200 else preview[:200] + "..."
                print(f"         üìù Texto: {preview}")

            if not hits:
                print("   ‚ö†Ô∏è No se encontraron resultados")

        except Exception as e:
            print(f"   ‚ùå Error en b√∫squeda: {e}")

        print("   " + "-" * 50)

# # Ejecutar pruebas de b√∫squeda
# if client:
#     test_philatelic_searches(client)
# else:
#     print("‚ö†Ô∏è No se pueden ejecutar b√∫squedas sin conexi√≥n a Weaviate")

In [None]:
# Ad-hoc hybrid query against the "Oxcart" collection; prints every hit in full.
# Needs a connected `client` (presumably created in an earlier cell - confirm).
results = search_chunks_semantic(
                client, 
                "Costa Rica 1907 2 colones stamp with original gum. Scott 68 issue of 1907", 
                "Oxcart", 
                limit=100,
                filters=[],  # NOTE(review): annotated as Optional[Dict]; an empty list only works because it is falsy
                mode = "hybrid",
                alpha= 0.45
                
            )
            
print(f"   üìä Resultados: {len(results)}")

for j, result in enumerate(results, 1):
    print(f"\n      üè∑Ô∏è #{j} (Score: {result['score']:.3f})")
    print(f"         üìÑ Documento: {result['doc_id']}")
    print(f"         üìã Tipo: {result['chunk_type']}")
    print(f"         üìÑ P√°gina: {result['page_number']}")
    
    # Show relevant metadata (only fields with a non-empty value)
    if result.get('catalog_systems'):
        print(f"         üìñ Cat√°logos: {result['catalog_systems']}")
    if result.get('scott_numbers'):
        print(f"         üî¢ Scott: {result['scott_numbers']}")
    if result.get('years'):
        print(f"         üìÖ A√±os: {result['years']}")
    if result.get('colors'):
        print(f"         üé® Colores: {result['colors']}")
    if result.get('variety_classes'):
        print(f"         üîÄ Variedades: {result['variety_classes']}")
    
    # Full text: the 200-character truncation below is intentionally disabled
    text = result.get('text', '')
    # if len(text) > 200:
    #     text = text[:200] + "..."
    print(f"         üìù Texto: {text}")
    print("**********************************************************************************************************")

## 7. Interfaz Gradio para RAG

Interfaz web interactiva para búsquedas semánticas y consultas RAG.

In [None]:
# try:
#     import gradio as gr
#     import openai
#     gradio_available = True
#     print("‚úÖ Gradio disponible")
# except ImportError:
#     print("‚ö†Ô∏è Gradio no est√° instalado")
#     print("üí° Para instalar: pip install gradio")
#     gradio_available = False

# # Configurar OpenAI para RAG
# if OPENAI_API_KEY:
#     openai.api_key = OPENAI_API_KEY
#     openai_available = True
# else:
#     openai_available = False
#     print("‚ö†Ô∏è OpenAI API key no configurada para RAG")

In [None]:
# import os
# from typing import Any, Dict, List, Tuple

# def search_and_answer(
#     query: str,
#     rag_system: Dict[str, Any],
#     use_filters: bool = False,
#     catalog_system: str = "",
#     chunk_type: str = "",
#     has_varieties: bool = False,
#     max_results: int = 10,
# ) -> Tuple[str, List[Dict[str, Any]], Dict[str, Any]]:
#     """
#     B√∫squeda sem√°ntica + RAG (OpenAI >= 1.0, modelo gpt-4o-mini).
#     Devuelve: (respuesta_rag, resultados(lista de dicts), metadatos(dict))
#     """
#     # Validaci√≥n de conexi√≥n
#     if not rag_system or not rag_system.get("client"):
#         meta = {"query": query, "total_results": 0, "max_results": max_results, "filters_used": {}, "context_length": 0}
#         return "‚ùå Error: Sin conexi√≥n a Weaviate", [], meta

#     client_wv = rag_system["client"]
#     collection_name = rag_system.get("collection_name", "Oxcart")

#     # Construir filtros
#     filt = None
#     if use_filters:
#         filt = {}
#         if catalog_system:
#             filt["catalog_system"] = catalog_system
#         if chunk_type:
#             filt["chunk_type"] = chunk_type
#         if has_varieties:
#             filt["has_varieties"] = True

#     # B√∫squeda sem√°ntica (usa tu funci√≥n ya definida)
#     results = search_chunks_semantic(
#         client=client_wv,
#         query=query,
#         collection_name=collection_name,
#         limit=int(max_results),
#         filters=filt,
#         mode = "hybrid",
#         alpha= 0.35
#     )

#     # Preparar contexto para RAG (top 3)
#     top = results[:3]
#     context = "\n\n".join(
#         f"Documento {r.get('doc_id', 'N/A')} (P√°gina {r.get('page_number', '¬ø?')}): {r.get('text','')}"
#         for r in top
#     )
#     context_len = len(context)

#     # Generar respuesta RAG (OpenAI >= 1.0.0)
#     rag_answer = "‚ö†Ô∏è No se encontraron resultados para generar respuesta"
#     openai_key = os.getenv("OPENAI_API_KEY")
#     if not results:
#         rag_answer = "‚ö†Ô∏è No se encontraron resultados para generar respuesta"
#     elif not openai_key:
#         rag_answer = "‚ö†Ô∏è RAG no disponible: OpenAI API key no configurada"
#     else:
#         try:
#             from openai import OpenAI
#             oa_client = OpenAI(api_key=openai_key)

#             system_prompt = (
#                 "You are an expert in costa rica philately (stamps, covers, etc). "
#                 "Only answer based with the information provided. If there is not enough info for answer please, "
#                 "answer with: 'I dont have information'. You must include any references about philatelic like scott catalogue references, dates, etc."
#             )

#             model = os.getenv("RAG_MODEL", "gpt-4o-mini")
#             resp = oa_client.chat.completions.create(
#                 model=model,
#                 messages=[
#                     {"role": "system", "content": system_prompt},
#                     {"role": "user", "content": f"Here is the information for your answers:\n{context}\n\nAnswer this only with the information provided: {query}"}
#                 ],
#                 temperature=0.3,
#                 max_tokens=1000,
#             )

#             rag_text = resp.choices[0].message.content if resp.choices else ""
#             if not rag_text:
#                 rag_text = "No se obtuvo texto de respuesta del modelo."

#             rag_answer = (
#                 "ü§ñ **Respuesta RAG:**\n\n"
#                 + rag_text
#                 + f"\n\nüìä *Basado en {len(results)} resultados de b√∫squeda*"
#             )
#         except Exception as e:
#             rag_answer = f"‚ùå Error generando respuesta RAG: {e}"

#     metadata = {
#         "query": query,
#         "total_results": len(results),
#         "max_results": int(max_results),
#         "filters_used": filt or {},
#         "context_length": context_len,
#     }
#     return rag_answer, results, metadata


In [None]:
# def get_collection_info() -> str:
#     """
#     Obtener informaci√≥n de la colecci√≥n para mostrar en la interfaz.
#     """
#     if not client:
#         return "‚ùå Sin conexi√≥n a Weaviate"
    
#     try:
#         stats = get_collection_stats(client, "Oxcart")
#         if stats:
#             info = f"üìä **Estad√≠sticas de la Colecci√≥n Oxcart:**\n\n"
#             info += f"üì¶ **Total chunks:** {stats['total_chunks']:,}\n"
#             info += f"üìÑ **Documentos:** {stats['total_documents']}\n\n"
            
#             if stats.get('documents'):
#                 info += "**Documentos indexados:**\n"
#                 for doc_id, count in stats['documents'].items():
#                     info += f"‚Ä¢ {doc_id}: {count:,} chunks\n"
            
#             return info
#         else:
#             return "‚ùå No se pudieron obtener estad√≠sticas"
#     except Exception as e:
#         return f"‚ùå Error: {e}"

# print("‚úÖ Funciones RAG definidas")

In [None]:
# stats = get_collection_stats(client, "Oxcart")
# stats['total_documents']
# stats['total_chunks']

In [None]:
# # Estructura que usan tus funciones de b√∫squeda/respuesta
# rag_system = {
#     "success": True,
#     "client": client,                    # para que search_and_answer pueda consultar
#     "collection_name": COLLECTION_NAME,  # nombre de la colecci√≥n
#     "weaviate_url": WEAVIATE_URL,        # info para la UI
#     "total_documents": stats['total_documents'],       # para mostrar estado
#     "total_chunks": stats['total_chunks'],        # opcional en la UI
#     # puedes a√±adir m√°s campos que tu search_and_answer necesite
# }

In [None]:
# import os
# import gradio as gr
# from typing import Dict, Any
# import threading
# import time

# def create_gradio_interface(rag_system: Dict[str, Any]) -> gr.Blocks:
#     """
#     Crea la interfaz Gradio para consultas RAG.
#     """

#     def gradio_search_and_answer(query, use_filters, catalog_system, chunk_type, has_varieties, max_results):
#         """
#         Wrapper para Gradio: llama a search_and_answer y formatea salidas.
#         """
#         if not rag_system:
#             return "‚ùå Sistema RAG no est√° configurado", "No hay resultados", "No hay metadatos"

#         # Llamada a tu funci√≥n (se asume definida en tu entorno)
#         answer, results, metadata = search_and_answer(
#             query=query,
#             rag_system=rag_system,
#             use_filters=use_filters,
#             catalog_system=catalog_system,
#             chunk_type=chunk_type,
#             has_varieties=has_varieties,
#             max_results=int(max_results),
#         )

#         # --- Formatear resultados de b√∫squeda ---
#         lines = []
#         if results:
#             for i, r in enumerate(results):
#                 doc_id = r.get("doc_id") or r.get("document_id", "N/A")
#                 chunk_type_val = r.get("chunk_type", "N/A")
#                 page_number = r.get("page_number", "N/A")
#                 catalogs = r.get("catalog_systems") or []
#                 scotts = r.get("scott_numbers") or []
#                 years = r.get("years") or []

#                 # Vista previa: usa content_preview si existe; si no, toma 'text'
#                 preview = r.get("content_preview")
#                 if not preview:
#                     text = r.get("text", "")
#                     preview = (text[:300] + "...") if len(text) > 300 else text

#                 block = []
#                 block.append(f"**Resultado {i+1}**")
#                 block.append(f"‚Ä¢ Documento: {doc_id}")
#                 block.append(f"‚Ä¢ Tipo: {chunk_type_val} | P√°gina: {page_number}")
#                 if catalogs:
#                     block.append(f"‚Ä¢ Cat√°logos: {', '.join(catalogs)}")
#                 if scotts:
#                     block.append(f"‚Ä¢ Scott: {', '.join(scotts)}")
#                 if years:
#                     block.append(f"‚Ä¢ A√±os: {', '.join(str(y) for y in years)}")
#                 block.append(f"‚Ä¢ Vista previa: {preview}")
#                 block.append("-" * 50)
#                 lines.append("\n".join(block))
#             search_output = "\n".join(lines)
#         else:
#             search_output = "No se encontraron resultados"

#         # --- Formatear metadatos ---
#         metadata = metadata or {}
#         metadata_output = (
#             "**Metadatos de la consulta:**\n"
#             f"‚Ä¢ Consulta: {metadata.get('query', 'N/A')}\n"
#             f"‚Ä¢ Resultados encontrados: {metadata.get('total_results', 0)}\n"
#             f"‚Ä¢ M√°ximo solicitado: {metadata.get('max_results', 'N/A')}\n"
#             f"‚Ä¢ Filtros usados: {metadata.get('filters_used', {})}\n"
#             f"‚Ä¢ Longitud del contexto: {metadata.get('context_length', 'N/A')} caracteres\n"
#         )

#         return answer, search_output, metadata_output

#     # Valores informativos del sistema
#     collection_name = rag_system.get("collection_name", "Oxcart")
#     total_docs = rag_system.get("total_documents", 0)
#     weaviate_url = rag_system.get("weaviate_url") or os.getenv("WEAVIATE_URL", "http://localhost:8080")

#     # --- UI ---
#     with gr.Blocks(title="OXCART RAG - Consultas Filat√©licas") as interface:
#         gr.Markdown(
#             "# üîç OXCART RAG - Sistema de Consultas Filat√©licas\n\n"
#             "Realiza consultas inteligentes sobre tu colecci√≥n de documentos filat√©licos "
#             "usando b√∫squeda sem√°ntica y respuestas generadas por IA."
#         )

#         with gr.Row():
#             with gr.Column(scale=2):
#                 # Input principal
#                 query_input = gr.Textbox(
#                     label="üí≠ Tu consulta filat√©lica",
#                     placeholder="Ej: ¬øQu√© sellos de Espa√±a de 1950 est√°n catalogados como Scott?",
#                     lines=2,
#                 )

#                 # Bot√≥n de b√∫squeda
#                 search_btn = gr.Button("üîç Buscar y Responder", variant="primary")

#                 # Consultas de ejemplo
#                 gr.Markdown("**üí° Consultas de ejemplo:**")
#                 example_queries = [
#                     "¬øQu√© sellos conmemorativos de Espa√±a est√°n en la colecci√≥n?",
#                     "Mu√©strame informaci√≥n sobre sellos con errores de perforaci√≥n",
#                     "¬øCu√°les son los sellos m√°s valiosos seg√∫n el cat√°logo Michel?",
#                     "Informaci√≥n sobre sellos de M√©xico de la d√©cada de 1960",
#                     "¬øQu√© variedades filat√©licas est√°n documentadas?",
#                 ]
#                 # Botones que rellenan el textbox
#                 for example in example_queries:
#                     gr.Button(example, variant="secondary").click(
#                         fn=(lambda ex=example: ex),
#                         inputs=None,
#                         outputs=query_input,
#                     )

#             with gr.Column(scale=1):
#                 # Filtros avanzados
#                 gr.Markdown("**üéØ Filtros Avanzados**")

#                 use_filters = gr.Checkbox(label="Usar filtros espec√≠ficos", value=False)

#                 catalog_system = gr.Dropdown(
#                     choices=["", "Scott", "Michel", "Yvert", "Stanley Gibbons", "Edifil"],
#                     label="Sistema de cat√°logo",
#                     value="",
#                 )

#                 chunk_type = gr.Dropdown(
#                     choices=["", "text", "table", "figure", "title", "header"],
#                     label="Tipo de contenido",
#                     value="",
#                 )

#                 has_varieties = gr.Checkbox(label="Solo documentos con variedades", value=False)

#                 max_results = gr.Slider(
#                     minimum=1,
#                     maximum=100,
#                     value=5,
#                     step=1,
#                     label="M√°ximo resultados",
#                 )

#         # Outputs
#         with gr.Row():
#             with gr.Column():
#                 gr.Markdown("## ü§ñ Respuesta IA")
#                 answer_output = gr.Textbox(label="Respuesta generada", lines=8, interactive=False)

#         with gr.Row():
#             with gr.Column():
#                 gr.Markdown("## üìÑ Documentos Encontrados")
#                 search_output = gr.Textbox(label="Resultados de b√∫squeda", lines=12, interactive=False)

#             with gr.Column():
#                 gr.Markdown("## üìä Metadatos")
#                 metadata_output = gr.Textbox(label="Informaci√≥n de la consulta", lines=10, interactive=False)

#         # Eventos
#         search_btn.click(
#             fn=gradio_search_and_answer,
#             inputs=[query_input, use_filters, catalog_system, chunk_type, has_varieties, max_results],
#             outputs=[answer_output, search_output, metadata_output],
#         )

#         query_input.submit(
#             fn=gradio_search_and_answer,
#             inputs=[query_input, use_filters, catalog_system, chunk_type, has_varieties, max_results],
#             outputs=[answer_output, search_output, metadata_output],
#         )

#         # Informaci√≥n del sistema
#         gr.Markdown(
#             "---\n"
#             f"**üìä Estado del Sistema:**\n"
#             f"‚Ä¢ Colecci√≥n: {collection_name}\n"
#             f"‚Ä¢ Documentos indexados: {total_docs:,}\n"
#             f"‚Ä¢ Weaviate URL: {weaviate_url}\n"
#             "‚Ä¢ Estado: ‚úÖ Operativo\n"
#         )

#     return interface


# NOTE: This whole cell is disabled -- every statement below is commented out,
# so nothing here runs as-is. If it is re-enabled it depends on names defined
# outside this cell: `rag_system`, `create_gradio_interface` and `os`.
#
# Flow of the disabled code:
#   * Only proceeds when `rag_system` is truthy and reports "success".
#   * Reads GRADIO_PORT (default 7860) and GRADIO_SHARE (default "false")
#     from the environment.
#   * With GRADIO_SHARE enabled it first launches with share=True; any
#     exception from that launch triggers a second, local-only launch.
#   * With GRADIO_SHARE disabled it launches local-only directly.
#   * Any other exception is reported together with troubleshooting hints.
#   * When the RAG system is missing or failed, it prints the reason and the
#     recovery steps instead of launching.
#
# # ---- Robust launcher with handling for public-tunnel errors ----
# if rag_system and rag_system.get("success", False):
#     print("\n" + "=" * 60)
#     print("üöÄ LANZANDO INTERFAZ GRADIO (CON MANEJO DE ERRORES)")
#     print("=" * 60)

#     gradio_app = create_gradio_interface(rag_system)

#     GRADIO_PORT = int(os.getenv("GRADIO_PORT", 7860))
#     GRADIO_SHARE = os.getenv("GRADIO_SHARE", "false").lower() == "true"  # Defaults to False because of connectivity problems

#     print(f"‚öôÔ∏è Puerto: {GRADIO_PORT}")
#     print(f"üåç URL P√∫blica: {'‚ö†Ô∏è Intentando...' if GRADIO_SHARE else '‚ùå Deshabilitada (m√°s seguro)'}")
    
#     try:
#         print("üîÑ Iniciando servidor Gradio...")
        
#         # Try the public tunnel first if it is enabled
#         if GRADIO_SHARE:
#             print("‚è≥ Intentando crear t√∫nel p√∫blico...")
#             try:
#                 demo = gradio_app.launch(
#                     server_port=GRADIO_PORT,
#                     share=True,
#                     inbrowser=False,
#                     show_error=True,
#                     prevent_thread_lock=False,
#                     quiet=False
#                 )
                
#                 print("\nüéâ ¬°√âXITO! T√∫nel p√∫blico creado")
#                 print(f"üåê URLs DISPONIBLES:")
#                 print(f"   üì± Local: http://localhost:{GRADIO_PORT}")
                
#                 if hasattr(demo, 'share_url') and demo.share_url:
#                     print(f"   üåç P√∫blica: {demo.share_url}")
#                     print(f"\nüîó **URL P√öBLICA:** {demo.share_url}")
#                 else:
#                     print(f"   üåç P√∫blica: Revisa la salida de Gradio arriba ‚òùÔ∏è")
                
#             except Exception as share_error:
#                 print(f"‚ö†Ô∏è Error creando t√∫nel p√∫blico: {share_error}")
#                 print("üîÑ Cambiando a modo local solamente...")
                
#                 # Fallback: local only
#                 demo = gradio_app.launch(
#                     server_port=GRADIO_PORT,
#                     share=False,
#                     inbrowser=True,
#                     show_error=True,
#                     prevent_thread_lock=False
#                 )
                
#                 print(f"\n‚úÖ SERVIDOR LOCAL OPERATIVO:")
#                 print(f"   üì± URL Local: http://localhost:{GRADIO_PORT}")
#                 print(f"   ‚ö†Ô∏è URL P√∫blica: No disponible (error en t√∫nel)")
                
#         else:
#             # Local mode only
#             demo = gradio_app.launch(
#                 server_port=GRADIO_PORT,
#                 share=False,
#                 inbrowser=True,
#                 show_error=True,
#                 prevent_thread_lock=False
#             )
            
#             print(f"\n‚úÖ SERVIDOR LOCAL OPERATIVO:")
#             print(f"   üì± URL Local: http://localhost:{GRADIO_PORT}")
#             print(f"   üí° Para URL p√∫blica, cambia GRADIO_SHARE=true en .env")
        
#         print(f"\nüìã INSTRUCCIONES:")
#         print(f"   ‚Ä¢ La interfaz est√° operativa y lista para consultas")
#         print(f"   ‚Ä¢ Para detenerla: gr.close_all()")
#         print(f"   ‚Ä¢ Comparte la URL local en tu red si necesitas acceso remoto")
        
#         print(f"\n{'='*60}")
#         print(f"üéØ INTERFAZ RAG LISTA - ¬°Comienza a hacer consultas!")
#         print(f"{'='*60}")
        
#     except Exception as e:
#         print(f"‚ùå Error cr√≠tico lanzando Gradio: {e}")
#         print("\nüîß SOLUCIONES SUGERIDAS:")
#         print("   1. Ejecuta: gr.close_all()")
#         print("   2. Cambia el puerto: GRADIO_PORT=7861 en .env")
#         print("   3. Verifica que no hay otros servicios en el puerto")
#         print("   4. Reinicia el notebook")
        
# else:
#     print("\n‚ö†Ô∏è  No se puede crear la interfaz Gradio:")
#     if not rag_system:
#         print("   ‚Ä¢ Sistema RAG no est√° configurado")
#     else:
#         print(f"   ‚Ä¢ Error en RAG: {rag_system.get('error', 'Error desconocido')}")
#     print("\nüîß Para solucionar:")
#     print("   1. Verifica que Weaviate est√© corriendo")
#     print("   2. Configura OPENAI_API_KEY en .env") 
#     print("   3. Ejecuta la indexaci√≥n de documentos")
#     print("   4. Reinicia este notebook")

In [None]:
# NOTE: This cell is disabled -- every statement below is commented out.
# # Close any previous Gradio instances if they exist
# import gradio as gr
# gr.close_all()
# print("üîÑ Cerrando instancias anteriores de Gradio")