# **EXPERIMENT: EVALUATING HALLUCINATION DETECTION FRAMEWORKS**

**DESCRIPTION**

* Benchmarks hallucination detection across multiple frameworks on a shared QA dataset.
* Evaluates and appends per-query scores using UpTrain, Vectara, LettuceDetect, RAGAS, Cleanlab-TLM, and DeepEval.
* Visualizes prediction quality via Seaborn confusion matrices for various decision thresholds.
* Run this notebook on Google Colab.
* Before running, use the "generate hallucinated dataset" file to produce the `hallucinated_answers.json` input file.



**REQUIREMENTS**

In [None]:
%%bash
pip install -q --upgrade pip

# core numeric stack (installed unpinned; pin explicit versions here if a reproducible environment is needed)
pip install -q numpy scipy  pandas scikit-learn

# evaluation frameworks and their supporting libraries
# NOTE(review): seaborn and langchain_openai are imported by the notebook but not listed here;
# presumably they are preinstalled on Colab or pulled in transitively - confirm.
pip install -q \
  langchain openai chromadb pypdf llama-index tiktoken langchain_community \
  matplotlib transformers torch \
  lettucedetect cleanlab-tlm ragas uptrain deepeval evaluate datasets tqdm rouge-score

**IMPORTS**

In [8]:
import asyncio
import json
import os
import time
from pathlib import Path
from typing import List, Dict, Any
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix
from tqdm.auto import tqdm
from uptrain import EvalLLM, Evals
from transformers import pipeline, AutoTokenizer
from lettucedetect.models.inference import HallucinationDetector
from cleanlab_tlm import TLM
from ragas.metrics._factual_correctness import FactualCorrectness
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
from deepeval.test_case import LLMTestCase
from deepeval.metrics import HallucinationMetric
import seaborn as sns

# Input dataset: a JSON list of QA records (see load_items).
DATA_PATH = Path("hallucinated_answers.json")

# One output file per framework. main() saves the cumulative item list after
# each evaluator, so later files also contain the earlier frameworks' fields.
UPTRAIN_OUT = Path("uptrain_results.json")
VECTARA_OUT = Path("vectara_results.json")
LETTUCE_OUT = Path("lettuce_results.json")
TLM_OUT = Path("tlm_results.json")
RAGAS_OUT = Path("ragas_results.json")
DEEPEVAL_OUT = Path("deepeval_results.json")

# Prefer keys from the environment (e.g. Colab secrets) so real credentials do
# not have to be committed with the notebook; the placeholder is the fallback.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-api-key")
CLEANLAB_TLM_API_KEY = os.environ.get("CLEANLAB_TLM_API_KEY", "your-api-key")

# eval_ragas (ChatOpenAI) and eval_deepeval construct their clients without an
# explicit key, so they can only pick it up from the environment. Export the
# configured key unless the environment already provides one.
os.environ.setdefault("OPENAI_API_KEY", OPENAI_API_KEY)

**GENERAL FUNCTIONS**

In [9]:
def load_items(fp: Path) -> List[Dict[str, Any]]:
    """Load the benchmark records from a JSON file.

    Args:
        fp: Path to the JSON file. The evaluators in this notebook expect it
            to contain a list of dicts.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If ``fp`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with fp.open(encoding="utf-8") as f:
        return json.load(f)


def save_items(items: List[Dict[str, Any]], fp: Path) -> None:
    """Write ``items`` to ``fp`` as JSON indented by two spaces.

    The payload is serialized completely before the file is opened, so a
    serialization error leaves any existing file untouched.
    """
    serialized = json.dumps(items, indent=2)
    with fp.open("w") as handle:
        handle.write(serialized)


def plot_confusions(df: pd.DataFrame, score_col: str, thresholds: List[float],
                     invert: bool = False, title_prefix: str = "") -> None:
    """Draw one seaborn confusion-matrix heatmap per decision threshold.

    An answer is predicted as hallucinated when its (optionally inverted)
    score is strictly below the threshold.

    Args:
        df: Frame with a boolean-like ``is_hallucinated`` column and the
            score column named by ``score_col``.
        score_col: Name of the column holding the framework's score.
        thresholds: Cut-offs to visualise, one subplot each.
        invert: When True, ``1 - score`` is compared against the threshold.
            Use this for metrics where a higher score means "more hallucinated".
        title_prefix: Text placed before ``" < <threshold>"`` in each title.

    Rows whose score is missing or non-numeric (the evaluators store ``None``
    when a call fails) are left out of the matrices rather than being counted
    as "not hallucinated" or raising a ``TypeError``.
    """
    # Coerce None / non-numeric entries to NaN so they can be filtered out.
    scores = pd.to_numeric(df[score_col], errors="coerce")
    has_score = scores.notna()
    scores = scores[has_score]
    y_true = df.loc[has_score, "is_hallucinated"].astype(bool)
    if invert:
        scores = 1 - scores

    # squeeze=False always yields a 2-D axes array, so a single threshold
    # needs no special case.
    _, axes = plt.subplots(1, len(thresholds), figsize=(6 * len(thresholds), 5),
                           squeeze=False)

    for ax, t in zip(axes[0], thresholds):
        y_pred = scores < t
        # labels=[True, False] puts "Hallucinated" in the first row/column.
        cm = confusion_matrix(y_true, y_pred, labels=[True, False])
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False,
                    xticklabels=["Hallucinated", "Not Hallucinated"],
                    yticklabels=["Hallucinated", "Not Hallucinated"], ax=ax)
        ax.set_title(f"{title_prefix} < {t}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

    plt.tight_layout()
    plt.show()


def eval_uptrain(items: List[Dict[str, Any]]) -> None:
    """Score every item with UpTrain's factual-accuracy check, in place.

    Each item gains ``uptrain_score``, ``uptrain_explanation`` and
    ``uptrain_time`` (wall-clock seconds for that item, rounded to 3 places).
    Items are sent one at a time so that the timing is per query.
    """
    llm_judge = EvalLLM(openai_api_key=OPENAI_API_KEY)
    for record in tqdm(items, desc="UpTrain"):
        payload = {
            "question": record["question"],
            "context": record["context"],
            "response": record["hallucinated_answer"],
        }
        started = time.time()
        # A one-element batch is submitted, so only the first result is relevant.
        outcome = llm_judge.evaluate([payload], checks=[Evals.FACTUAL_ACCURACY])[0]
        record["uptrain_score"] = outcome["score_factual_accuracy"]
        record["uptrain_explanation"] = outcome["explanation_factual_accuracy"]
        elapsed = time.time() - started
        record["uptrain_time"] = round(elapsed, 3)


def eval_vectara(items: List[Dict[str, Any]]) -> None:
    """Score every item with Vectara's hallucination evaluation model, in place.

    Each item gains ``vectara_score`` (the classifier's score for the
    ``"consistent"`` label) and ``vectara_time`` (seconds spent on the
    classifier call, rounded to 3 places).
    """
    hhem = pipeline(
        "text-classification",
        model="vectara/hallucination_evaluation_model",
        tokenizer=AutoTokenizer.from_pretrained("google/flan-t5-base"),
        trust_remote_code=True,
    )
    template = "<pad> Determine if the hypothesis is true given the premise?\n\nPremise: {premise}\n\nHypothesis: {hypo}"
    for record in tqdm(items, desc="Vectara"):
        # The context passages are joined into a single premise, one per line.
        filled = template.format(premise="\n".join(record["context"]),
                                 hypo=record["hallucinated_answer"])
        started = time.time()
        # top_k=None asks for the scores of all labels; keep the "consistent" one.
        label_scores = hhem(filled, top_k=None)
        consistent = next(entry for entry in label_scores if entry["label"] == "consistent")
        record["vectara_score"] = consistent["score"]
        record["vectara_time"] = round(time.time() - started, 3)


def eval_lettuce(items: List[Dict[str, Any]]) -> None:
    """Run LettuceDetect span-level hallucination detection on every item, in place.

    Each item gains:
        ``lettuce_spans``: text of every span the detector returned.
        ``lettuce_conf``: mean ``confidence`` over those spans, ``0.0`` if none.
        ``lettuce_predicted_hallucinated``: True when at least one span was returned.
    """
    detector = HallucinationDetector(method="transformer",
                                     model_path="KRLabsOrg/lettucedect-base-modernbert-en-v1")
    for itm in tqdm(items, desc="Lettuce"):
        # The other evaluators treat ``context`` as a list of passages. Formatting
        # the list directly would put its Python repr (brackets, quotes, escapes)
        # into the prompt, so join the passages into plain text first. A plain
        # string context is used unchanged.
        context = itm["context"]
        background = context if isinstance(context, str) else "\n".join(context)
        prompt = f"Given the following background information:\n\n{background}\n\nAnswer the question: {itm['question']}"
        spans = detector.predict_prompt(prompt, answer=itm["hallucinated_answer"], output_format="spans") or []
        itm["lettuce_spans"] = [s["text"] for s in spans]
        itm["lettuce_conf"] = (sum(s["confidence"] for s in spans) / len(spans)) if spans else 0.0
        itm["lettuce_predicted_hallucinated"] = bool(spans)


def eval_tlm(items: List[Dict[str, Any]]) -> None:
    """Score every item with Cleanlab TLM in a single batched call, in place.

    Each item gains ``tlm_trust`` (the trustworthiness score) and
    ``tlm_factuality`` (the score of the custom "Factuality" criterion).
    The Cleanlab API key is exported to the environment before the client
    is created.
    """
    os.environ["CLEANLAB_TLM_API_KEY"] = CLEANLAB_TLM_API_KEY or ""

    factuality_criterion = {
        "name": "Factuality",
        "criteria": "Determine if the response is factually correct based on the context.",
    }
    tlm = TLM(options={"custom_eval_criteria": [factuality_criterion]})

    prompts = []
    for record in items:
        joined_context = " ".join(record["context"])
        prompts.append(f"Context:\n{joined_context}\n\nQuestion: {record['question']}")
    answers = [record["hallucinated_answer"] for record in items]

    results = tlm.get_trustworthiness_score(prompts, answers)
    for record, result in zip(items, results):
        record["tlm_trust"] = result["trustworthiness_score"]
        # Only one custom criterion was configured, so its score is at index 0.
        criteria_scores = result["log"]["custom_eval_criteria"]
        record["tlm_factuality"] = criteria_scores[0]["score"]


def eval_ragas(items: List[Dict[str, Any]]) -> None:
    """Append a RAGAS ``FactualCorrectness`` score (``ragas_score``) to every item.

    Items are scored sequentially. If scoring an item raises, a warning is
    emitted and ``ragas_score`` is set to ``None`` so the remaining items are
    still processed.

    NOTE(review): RAGAS documents ``FactualCorrectness`` as comparing the
    response against a ``reference``; the sample built here supplies only
    ``retrieved_contexts``. Confirm this metric is the intended one.
    """
    import warnings
    from concurrent.futures import ThreadPoolExecutor

    scorer = FactualCorrectness(llm=LangchainLLMWrapper(ChatOpenAI(model="gpt-4o")))

    async def _run() -> None:
        for itm in tqdm(items, desc="RAGAS"):
            sample = SingleTurnSample(user_input=itm["question"],
                                      response=itm["hallucinated_answer"],
                                      retrieved_contexts=itm["context"])
            try:
                score = await scorer.single_turn_ascore(sample)
            except Exception as exc:
                # Best effort: keep going, but make the failure visible instead
                # of silently producing a column of None values.
                warnings.warn(
                    f"RAGAS scoring failed for question {itm['question']!r}: {exc!r}",
                    RuntimeWarning,
                )
                score = None
            itm["ragas_score"] = score

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop in this thread (plain script): run directly.
        asyncio.run(_run())
    else:
        # Notebook kernels (Jupyter/Colab) already run an event loop, where
        # asyncio.run() raises RuntimeError. Run the coroutine on a fresh loop
        # in a worker thread; .result() waits and re-raises any error.
        with ThreadPoolExecutor(max_workers=1) as pool:
            pool.submit(asyncio.run, _run()).result()


def eval_deepeval(items: List[Dict[str, Any]]) -> None:
    """Append a DeepEval ``HallucinationMetric`` score (``deepeval_score``) to every item.

    If measuring an item raises, a warning is emitted and ``deepeval_score``
    is set to ``None`` so the remaining items are still processed.
    """
    import warnings

    metric = HallucinationMetric(threshold=0.5)
    for itm in tqdm(items, desc="DeepEval"):
        tc = LLMTestCase(input=itm["question"], actual_output=itm["hallucinated_answer"], context=itm["context"])
        try:
            metric.measure(tc)
            itm["deepeval_score"] = metric.score
        except Exception as exc:
            # Best effort: keep going, but make the failure visible instead of
            # silently producing a column of None values.
            warnings.warn(
                f"DeepEval scoring failed for question {itm['question']!r}: {exc!r}",
                RuntimeWarning,
            )
            itm["deepeval_score"] = None

**RUN**

Decision thresholds can be chosen from the ROC curve or by maximizing the F1 score. In this experiment they were tuned manually, and some were picked by combining both approaches, to see which values give the best results.

In [None]:

def main():
    """Run every hallucination detector on the dataset, save results, and plot.

    The items loaded from ``DATA_PATH`` are enriched in place by each
    evaluator. After each evaluator the cumulative list is written to that
    framework's output file. Finally, confusion matrices are drawn for each
    framework's score at a few thresholds.
    """
    items = load_items(DATA_PATH)

    # Ensure ground‑truth label is bool for later plots
    for itm in items:
        itm["is_hallucinated"] = bool(itm.get("is_hallucinated", False))

    # === Run evaluators (comment out to skip) ===
    eval_uptrain(items)
    save_items(items, UPTRAIN_OUT)

    eval_vectara(items)
    save_items(items, VECTARA_OUT)

    eval_lettuce(items)
    save_items(items, LETTUCE_OUT)

    eval_tlm(items)
    save_items(items, TLM_OUT)

    eval_ragas(items)
    save_items(items, RAGAS_OUT)

    eval_deepeval(items)
    save_items(items, DEEPEVAL_OUT)

    # === Visualise confusion matrices for a few metrics ===
    df = pd.DataFrame(items)

    # plot_confusions predicts "hallucinated" when score < threshold, i.e. it
    # assumes a LOW score means hallucinated. invert=True (which compares
    # 1 - score) is therefore needed only for metrics where a HIGH score means
    # hallucinated.

    # Higher = more factual / consistent / trustworthy: no inversion.
    plot_confusions(df, "uptrain_score", thresholds=[0.8, 0.6], invert=False, title_prefix="UpTrain")
    plot_confusions(df, "vectara_score", thresholds=[0.7, 0.8], invert=False, title_prefix="Vectara")
    plot_confusions(df, "tlm_trust", thresholds=[0.95, 0.9], invert=False, title_prefix="TLM trust")

    # lettuce_conf is the mean confidence of the detected hallucinated spans and
    # 0.0 when none were found, so higher = more hallucinated: invert. Without
    # inversion, answers with no detected span would be plotted as hallucinated.
    plot_confusions(df, "lettuce_conf", thresholds=[0.6, 0.5], invert=True, title_prefix="Lettuce")

    # RAGAS score: higher = better, which already matches the
    # "score < threshold => hallucinated" rule, so it must NOT be inverted.
    plot_confusions(df, "ragas_score", thresholds=[0.75, 0.25], invert=False, title_prefix="RAGAS")

    # DeepEval is inverted; presumably its hallucination score grows with the
    # amount of hallucination - confirm against the DeepEval docs.
    plot_confusions(df, "deepeval_score", thresholds=[0.5], invert=True, title_prefix="DeepEval")

    print("🎉 Benchmark complete — results saved next to the dataset.")


# Run the benchmark only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


**REFERENCES**

https://docs.uptrain.ai/predefined-evaluations/context-awareness/factual-accuracy

https://docs.ragas.io/en/latest/concepts/metrics/available_metrics/factual_correctness/

https://deepeval.com/docs/getting-started

https://cleanlab.ai/blog/trustworthy-language-model/

https://github.com/KRLabsOrg/LettuceDetect

https://huggingface.co/vectara/hallucination_evaluation_model