### RAG LLM Evaluation Metrics

ScienceSage RAG LLM Evaluation Metrics

This notebook computes and visualizes key retrieval metrics for your RAG pipeline:
- Precision@k
- Recall@k
- Mean Reciprocal Rank (MRR)
- Normalized Discounted Cumulative Gain (nDCG)
- Contextual Recall and Sufficiency (currently a placeholder that reuses Recall@k)

In [4]:
import json
import pandas as pd
import numpy as np
import sys
import os
import matplotlib.pyplot as plt

# `from sciencesage.config import ...` resolves the top-level name `sciencesage`,
# so the directory that CONTAINS the package must be importable, not the package
# directory itself. The original entry is kept for backward compatibility.
# NOTE(review): assumes the package lives at ../sciencesage relative to the
# notebook's working directory - confirm against the repository layout.
sys.path.append("../sciencesage")
if ".." not in sys.path:
    sys.path.append("..")
from sciencesage.config import GOLDEN_DATA_FILE, EVAL_RESULTS_FILE, TOP_K, METRICS_SUMMARY_FILE

In [None]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so that non-ASCII text loads the same way
    # regardless of the platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        # Whitespace-only lines (e.g. a trailing newline) are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Load the golden dataset first, then the evaluation results; both config
# paths are prefixed with "../" before being opened.
golden, results = (
    load_jsonl("../" + relative_path)
    for relative_path in (GOLDEN_DATA_FILE, EVAL_RESULTS_FILE)
)

FileNotFoundError: [Errno 2] No such file or directory: 'data/eval/golden_dataset.jsonl'

In [None]:
# Tabular views of the two record lists loaded above.
golden_df = pd.DataFrame(data=golden)
results_df = pd.DataFrame(data=results)

In [None]:
def precision_at_k(retrieved, relevant, k):
    """Return the fraction of the top-k retrieved chunks that are relevant.

    Args:
        retrieved: Ranked sequence of retrieved chunk identifiers (hashable).
        relevant: Iterable of ground-truth chunk identifiers (hashable).
        k: Cut-off rank. The denominator is always ``k``, even when fewer
            than ``k`` chunks were retrieved.

    Returns:
        A float in [0, 1]; 0.0 when ``k`` is not positive.
    """
    # Guard against division by zero (k == 0) and against negative k, which
    # would otherwise slice from the end and yield a negative score.
    if k <= 0:
        return 0.0
    relevant_set = set(relevant)
    hits = sum(1 for chunk in retrieved[:k] if chunk in relevant_set)
    return hits / k


In [None]:
def recall_at_k(retrieved, relevant, k):
    """Return the fraction of relevant chunks found in the top-k retrieved.

    Args:
        retrieved: Ranked sequence of retrieved chunk identifiers (hashable).
        relevant: Iterable of ground-truth chunk identifiers (hashable).
        k: Cut-off rank.

    Returns:
        A float in [0, 1]; 0.0 when there are no relevant chunks or ``k`` is
        not positive.
    """
    relevant_set = set(relevant)
    if not relevant_set or k <= 0:
        return 0.0
    # Intersect as sets so a relevant chunk that appears several times in the
    # retrieved list is counted once; otherwise recall could exceed 1.0.
    found = relevant_set.intersection(retrieved[:k])
    return len(found) / len(relevant_set)

In [None]:
def reciprocal_rank(retrieved, relevant):
    """Return 1/rank of the first retrieved chunk that is in ``relevant``.

    Ranks are 1-based and the whole ``retrieved`` sequence is scanned (no
    cut-off). Returns 0.0 when no retrieved chunk is relevant.
    """
    first_hit_scores = (
        1.0 / rank
        for rank, chunk in enumerate(retrieved, 1)
        if chunk in relevant
    )
    # The generator is lazy, so scanning stops at the first relevant chunk.
    return next(first_hit_scores, 0.0)

In [None]:
def dcg(retrieved, relevant, k):
    """Binary-relevance discounted cumulative gain over the top-k chunks.

    A chunk contributes a gain of 1 when it is in ``relevant`` and 0
    otherwise; the gain at 0-based position ``i`` is divided by
    ``log2(i + 2)``.
    """
    discounted_gains = (
        (1 if chunk in relevant else 0) / np.log2(position + 2)
        for position, chunk in enumerate(retrieved[:k])
    )
    # Start from 0.0 so an empty top-k yields a plain float zero.
    return sum(discounted_gains, 0.0)

In [None]:
def ndcg_at_k(retrieved, relevant, k):
    """Return DCG@k normalised by the ideal DCG@k (binary relevance).

    Args:
        retrieved: Ranked sequence of retrieved chunk identifiers.
        relevant: Iterable of ground-truth chunk identifiers (hashable).
        k: Cut-off rank.

    Returns:
        ``dcg(retrieved, relevant, k)`` divided by the DCG of a ranking that
        places every distinct relevant chunk first; 0.0 when that ideal DCG
        is zero (no relevant chunks, or ``k`` not positive).
    """
    # Count distinct relevant chunks: a ground-truth list with repeated ids
    # would otherwise inflate the ideal DCG and understate the score.
    n_relevant = len(set(relevant))
    ideal_dcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(n_relevant, k)))
    if ideal_dcg == 0:
        return 0.0
    return dcg(retrieved, relevant, k) / ideal_dcg

In [None]:
# Contextual Recall and Sufficiency: For demo, treat as recall@k (customize as needed)
def contextual_recall_and_sufficiency(retrieved, relevant, k):
    """Stand-in metric that currently returns ``recall_at_k`` unchanged."""
    # Placeholder: in practice, this may require human or LLM judgment
    score = recall_at_k(retrieved, relevant, k)
    return score

### Compute Metrics for All Queries

In [None]:
# Golden entries and evaluation results are paired purely by position, so a
# length mismatch means zip() silently drops the unmatched tail. Surface it
# instead of producing averages over a truncated set without notice.
if len(golden) != len(results):
    print(
        f"WARNING: golden has {len(golden)} entries but results has {len(results)}; "
        f"only the first {min(len(golden), len(results))} pairs are scored."
    )

metrics = []
for g, r in zip(golden, results):
    # `or []` also covers records where the key is present but null, which
    # would otherwise fail when the metric functions slice the value.
    retrieved = r.get("retrieved_chunks") or []
    relevant = g.get("ground_truth_chunks") or []
    metrics.append({
        "query": g.get("query", ""),
        f"precision@{TOP_K}": precision_at_k(retrieved, relevant, TOP_K),
        f"recall@{TOP_K}": recall_at_k(retrieved, relevant, TOP_K),
        "MRR": reciprocal_rank(retrieved, relevant),
        "nDCG": ndcg_at_k(retrieved, relevant, TOP_K),
        "contextual_recall_sufficiency": contextual_recall_and_sufficiency(retrieved, relevant, TOP_K)
    })

# One row per scored query, one column per metric.
metrics_df = pd.DataFrame(metrics)

In [None]:
# Preview the first five per-query metric rows.
metrics_df.head(5)

In [None]:
# Column-wise mean of every numeric metric across all scored queries
# (the non-numeric "query" column is left out by numeric_only=True).
agg_metrics = metrics_df.mean(axis=0, numeric_only=True)
print("Average Metrics:")
display(agg_metrics)

In [None]:
# Bar plot of average metrics, with the y-axis fixed to the 0-1 score range
avg_ax = agg_metrics.plot.bar(
    figsize=(8, 4),
    ylim=(0, 1),
    title="Average RAG Retrieval Metrics",
)
avg_ax.set_ylabel("Score")
plt.show()

In [None]:
# Distribution Plots: overlaid histograms of the per-query metric values
metric_columns = [
    f"precision@{TOP_K}",
    f"recall@{TOP_K}",
    "MRR",
    "nDCG",
    "contextual_recall_sufficiency",
]
dist_ax = metrics_df[metric_columns].plot(
    kind="hist",
    alpha=0.7,
    bins=10,
    figsize=(10, 5),
    title="Metric Distributions",
)
dist_ax.set_xlabel("Score")
plt.show()

In [None]:
# Metrics by Topic (only the "topic" column is handled; skipped when absent)
if "topic" in golden_df.columns:
    # Rows are matched by index, i.e. by position in the golden/results lists.
    topic_column = golden_df[["topic"]]
    merged = pd.concat([metrics_df, topic_column], axis=1)
    topic_means = merged.groupby("topic").mean(numeric_only=True)
    topic_ax = topic_means.plot.bar(figsize=(12, 6), title="Metrics by Topic")
    topic_ax.set_ylabel("Score")
    plt.show()

In [None]:
# The input files are opened with a "../" prefix on their config paths, so the
# output path is resolved the same way rather than against the notebook's
# working directory. os.path.join leaves an absolute config path untouched.
# NOTE(review): assumes METRICS_SUMMARY_FILE is relative to the same root as
# the input paths - confirm against sciencesage.config.
metrics_out_path = os.path.join("..", METRICS_SUMMARY_FILE)
# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(metrics_out_path) or ".", exist_ok=True)
metrics_df.to_csv(metrics_out_path, index=False)
print(f"Saved metrics summary to {metrics_out_path}")