In [1]:
import asyncio
import json
import math
import os
from typing import List, Dict, Tuple

from langchain_openai import ChatOpenAI
from ragas import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    Faithfulness,
    ContextRecall,
    ContextPrecision,
    NoiseSensitivity
)

# Credentials must never be stored in the notebook: the API key is expected in
# the OPENAI_API_KEY environment variable inherited by the kernel.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# and should be revoked/rotated.
if not os.environ.get("OPENAI_API_KEY"):
    print("⚠ OPENAI_API_KEY no está definida: expórtala en el entorno antes de iniciar el kernel")

# Base URL exported as OPENAI_API_BASE (presumably read by the ChatOpenAI
# client created further down — confirm against the langchain_openai docs).
os.environ["OPENAI_API_BASE"] = "https://chat.campusai.compute.dtu.dk/api"

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def _read_json(path):
    """Return the object decoded from the UTF-8 JSON file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Ground truth
gt_data = _read_json("gt.json")

# Dense retrieval results
rag_data_dense = _read_json("extra/retrieval_results_dense.json")

# Sparse retrieval results
bm25_data = _read_json("extra/retrieval_results_sparse.json")

# Chat logs
chat_data = _read_json("k_testing/k3/chat_logs.json")

print("✓ Datos cargados correctamente")
print(f"  - Total de preguntas: {len(gt_data)}")

✓ Datos cargados correctamente
  - Total de preguntas: 24


In [9]:
# List every ground-truth question next to its index
print("\n=== PREGUNTAS DISPONIBLES ===")
print("\nÍndice | Pregunta")
print("-" * 80)

for position, entry in enumerate(gt_data):
    text = entry["question"].strip()
    # Anything longer than 70 characters is cut to 67 characters plus "..."
    if len(text) > 70:
        text = text[:67] + "..."
    print(f"{position:6} | {text}")

print("\n" + "="*80)


=== PREGUNTAS DISPONIBLES ===

Índice | Pregunta
--------------------------------------------------------------------------------
     0 | Which solution is suitable for measuring room acoustics and speech ...
     1 | What product should be used for façade sound insulation testing on ...
     2 | Which sound source is recommended for calibrated speech intelligibi...
     3 | What sound source should be used for ISO 3382-compliant room acoust...
     4 | Which product supports compliance with ISO 9612 for workplace noise...
     5 | Which product is suitable for investigating environmental noise com...
     6 | Which product is designed for measuring exhaust noise in vehicles?
     7 | Which product helps verify safe noise emissions from toys and machi...
     8 | Which HBK 2255 variant is best suited for long-term environmental n...
     9 | Which HBK 2255 variant should be used for evaluating workplace nois...
    10 | What HBK 2255 variant is recommended for sound insulation testin

In [10]:
# ========================================
# EDIT THIS LINE TO CHOOSE THE QUESTION INDICES
# ========================================

# SELECTED_INDICES = [22]  # <- change these numbers
# Default: every loaded question, whatever the size of gt_data
SELECTED_INDICES = list(range(len(gt_data)))
# Examples:
# SELECTED_INDICES = [5, 10, 15, 20]  # Specific questions
# SELECTED_INDICES = list(range(0, 10))  # First 10 questions
# SELECTED_INDICES = list(range(20, 30))  # Questions 20-29

# ========================================

print(f"\n✓ Seleccionadas {len(SELECTED_INDICES)} preguntas para re-evaluación:")
print(f"  Índices: {SELECTED_INDICES}\n")

print("Preguntas seleccionadas:")
print("-" * 80)
for idx in SELECTED_INDICES:
    # Negative values are rejected as well: gt_data[-1] would display a
    # question here, but the filtering cell matches SELECTED_INDICES against
    # enumerate() positions, which are never negative.
    if 0 <= idx < len(gt_data):
        question = gt_data[idx]["question"].strip()
        display_q = question if len(question) <= 70 else question[:67] + "..."
        print(f"{idx:6} | {display_q}")
    else:
        print(f"{idx:6} | ERROR: Índice fuera de rango")


✓ Seleccionadas 24 preguntas para re-evaluación:
  Índices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]

Preguntas seleccionadas:
--------------------------------------------------------------------------------
     0 | Which solution is suitable for measuring room acoustics and speech ...
     1 | What product should be used for façade sound insulation testing on ...
     2 | Which sound source is recommended for calibrated speech intelligibi...
     3 | What sound source should be used for ISO 3382-compliant room acoust...
     4 | Which product supports compliance with ISO 9612 for workplace noise...
     5 | Which product is suitable for investigating environmental noise com...
     6 | Which product is designed for measuring exhaust noise in vehicles?
     7 | Which product helps verify safe noise emissions from toys and machi...
     8 | Which HBK 2255 variant is best suited for long-term environmental n...
     9 | Which HBK 2255 varia

In [11]:
# Positions chosen by the user; a set keeps each membership test O(1)
selected_positions = set(SELECTED_INDICES)

# Ground truth: (question, reference answer, context segments) per selected position
query_texts_pairs = [
    (
        item["question"].strip(),
        item["reference"].strip(),
        [seg.strip() for seg in item.get("context", [])]
    )
    for i, item in enumerate(gt_data)
    if i in selected_positions
]

# Dense retrieval results: (query, retrieved texts), selected by key position.
# The query is stripped like the ground-truth question so that the equality
# check performed during evaluation is not broken by stray whitespace.
all_queries_dense = list(rag_data_dense.keys())
query_retrieved_pairs_dense = [
    (
        query.strip(),
        [item["text"].strip() for item in rag_data_dense[query]]
    )
    for i, query in enumerate(all_queries_dense)
    if i in selected_positions
]

# Sparse retrieval results: (query, retrieved windows), same stripping rule
query_retrieved_pairs_sparse = [
    (
        item["query"].strip(),
        [res["window"].strip() for res in item["results"]]
    )
    for i, item in enumerate(bm25_data)
    if i in selected_positions
]

# Chat logs: entries of both modes live in one list, so each mode keeps its
# own running position, which is what gets compared with the selection.
dense_pairs = []
sparse_pairs = []

dense_idx = 0
sparse_idx = 0

for item in chat_data:
    query = item.get("query", "").strip()
    mode = item.get("mode", "").strip().lower()
    response = item.get("response", "").strip()

    if mode == "dense rag":
        if dense_idx in selected_positions:
            dense_pairs.append((query, response))
        dense_idx += 1
    elif mode == "sparse rag":
        if sparse_idx in selected_positions:
            sparse_pairs.append((query, response))
        sparse_idx += 1

print(f"✓ Datos filtrados:")
print(f"  - Ground truth pairs: {len(query_texts_pairs)}")
print(f"  - Dense retrieval pairs: {len(query_retrieved_pairs_dense)}")
print(f"  - Sparse retrieval pairs: {len(query_retrieved_pairs_sparse)}")
print(f"  - Dense response pairs: {len(dense_pairs)}")
print(f"  - Sparse response pairs: {len(sparse_pairs)}")

✓ Datos filtrados:
  - Ground truth pairs: 24
  - Dense retrieval pairs: 24
  - Sparse retrieval pairs: 24
  - Dense response pairs: 24
  - Sparse response pairs: 24


In [12]:
# LLM handed to every metric below
llm = ChatOpenAI(model="DeepSeek-R1", temperature=0, max_retries=3, request_timeout=300)
evaluator_llm = LangchainLLMWrapper(llm)

# One scorer per metric name, all built around the same evaluator LLM
metrics = {
    metric_key: metric_cls(llm=evaluator_llm)
    for metric_key, metric_cls in [
        ('faithfulness', Faithfulness),
        ('context_precision', ContextPrecision),
        ('context_recall', ContextRecall),
        ('noise_sensitivity', NoiseSensitivity),
    ]
}

print("✓ LLM y métricas inicializadas")

✓ LLM y métricas inicializadas


  evaluator_llm = LangchainLLMWrapper(llm)


In [13]:
async def evaluate_metrics(mode, query_texts_pairs, query_retrieved_pairs, response_pairs):
    """
    Score every selected question with each scorer of the module-level ``metrics`` dict.

    Args:
        mode: Label used in the printed progress output (e.g. "Sparse").
        query_texts_pairs: Sequence of ``(query, reference_answer, gt_contexts)`` tuples.
        query_retrieved_pairs: Sequence of ``(query, retrieved_contexts)`` tuples.
        response_pairs: Sequence of ``(query, response)`` tuples.

    The three sequences are consumed in parallel, so they must have the same
    length and carry the same query at every position.

    Returns:
        ``(all_scores, metric_arrays)``. Both are dicts keyed by metric name with
        one entry per question: ``all_scores`` holds ``(query, score)`` tuples and
        ``metric_arrays`` holds the score rounded to 4 decimals. A score that
        could not be obtained (scorer raised, or returned NaN) is ``None`` in both.

    Raises:
        ValueError: If the three input sequences differ in length.
        AssertionError: If the queries at one position do not match.
    """
    total = len(query_texts_pairs)
    # zip() would silently drop the tail of the longer inputs, so a length
    # mismatch is reported instead of evaluating a truncated set.
    if not (total == len(query_retrieved_pairs) == len(response_pairs)):
        raise ValueError(
            f"Input length mismatch for {mode} RAG: {total} ground-truth items, "
            f"{len(query_retrieved_pairs)} retrieval items, {len(response_pairs)} responses"
        )

    print(f"\n{'='*70}")
    print(f"=== Evaluando {mode} RAG - Preguntas Seleccionadas ===")
    print(f"{'='*70}\n")

    all_scores = {metric_name: [] for metric_name in metrics}
    metric_arrays = {metric_name: [] for metric_name in metrics}

    for idx, ((query_gt, gt_answer, gt_texts), (query_ret, ret_texts), (query_resp, response)) in enumerate(zip(
        query_texts_pairs,
        query_retrieved_pairs,
        response_pairs
    ), 1):
        # The three sources must describe the same question
        assert query_gt == query_ret == query_resp, f"Query mismatch: {query_gt} vs {query_ret} vs {query_resp}"

        print(f"\n[{mode} RAG - Pregunta {idx}/{total}]")
        print(f"Query: {query_gt[:80]}...")

        # Build the sample shared by all metrics for this question
        sample = SingleTurnSample(
            user_input=query_gt,
            response=response,
            reference=gt_answer,
            retrieved_contexts=ret_texts
        )

        # Run the scorers one after another; a failing metric does not stop the rest
        for metric_name, scorer in metrics.items():
            label = metric_name.replace('_', ' ').title()
            try:
                score = await scorer.single_turn_ascore(sample)
                # NOTE(review): scorers appear to be able to return NaN when no
                # score can be computed — confirm against the ragas docs. NaN is
                # recorded as missing so it cannot poison later averages.
                if math.isnan(score):
                    raise ValueError("metric returned NaN (no usable score)")
                rounded = round(score, 4)
                print(f"  {label}: {score:.4f}")
            except Exception as e:
                print(f"  {label}: ERROR - {str(e)}")
                score = rounded = None
            # Appended exactly once per question so both containers stay aligned
            all_scores[metric_name].append((query_gt, score))
            metric_arrays[metric_name].append(rounded)

    return all_scores, metric_arrays

## 8. Ejecutar Evaluación para Sparse RAG

In [17]:
sparse_scores, sparse_arrays = await evaluate_metrics(
    "Sparse",
    query_texts_pairs,
    query_retrieved_pairs_sparse,
    sparse_pairs
)


=== Evaluando Sparse RAG - Preguntas Seleccionadas ===


[Sparse RAG - Pregunta 1/1]
Query: Which HBK 2255 variant includes both environmental noise monitoring and a calibr...
  Faithfulness: 1.0000
  Context Precision: 0.0000
  Context Recall: 0.0000
  Noise Sensitivity: 0.5000


## 9. Ejecutar Evaluación para Dense RAG

In [None]:
dense_scores, dense_arrays = await evaluate_metrics(
    "Dense",
    query_texts_pairs,
    query_retrieved_pairs_dense,
    dense_pairs
)


=== Evaluando Dense RAG - Preguntas Seleccionadas ===


[Dense RAG - Pregunta 1/24]
Query: Which solution is suitable for measuring room acoustics and speech intelligibili...


In [22]:
# Show detailed results
print("\n" + "="*70)
print("=== RESULTADOS DETALLADOS ===")
print("="*70)

metric_names = ['faithfulness', 'context_precision', 'context_recall', 'noise_sensitivity']


def _print_metric_report(label, arrays):
    """Print, for each name in ``metric_names``, the values in ``arrays`` and their mean.

    ``arrays`` maps a metric name to a list of scores. Entries that are ``None``
    or NaN are left out of the mean; when nothing usable remains the mean is
    reported as N/A instead of being omitted.
    """
    print(f"\n--- {label} RAG ---")
    for metric_name in metric_names:
        print(f"\n{metric_name.replace('_', ' ').title()}:")
        print(f"  Valores: {arrays[metric_name]}")
        valid_scores = [
            s for s in arrays[metric_name]
            if s is not None and not math.isnan(s)
        ]
        if valid_scores:
            avg = sum(valid_scores) / len(valid_scores)
            print(f"  Promedio: {avg:.4f}")
        else:
            print("  Promedio: N/A (sin puntuaciones válidas)")


# Two separate calls: the Sparse report is still printed when the Dense
# evaluation has not been run yet and dense_arrays does not exist.
_print_metric_report("Sparse", sparse_arrays)
_print_metric_report("Dense", dense_arrays)


=== RESULTADOS DETALLADOS ===

--- Sparse RAG ---

Faithfulness:
  Valores: [0.0]
  Promedio: 0.0000

Context Precision:
  Valores: [0.0]
  Promedio: 0.0000

Context Recall:
  Valores: [0.0]
  Promedio: 0.0000

Noise Sensitivity:
  Valores: [0.0]
  Promedio: 0.0000

--- Dense RAG ---

Faithfulness:
  Valores: [1.0]
  Promedio: 1.0000

Context Precision:
  Valores: [0.5]
  Promedio: 0.5000

Context Recall:
  Valores: [1.0]
  Promedio: 1.0000

Noise Sensitivity:
  Valores: [0.75]
  Promedio: 0.7500
