<a href="https://colab.research.google.com/github/naisofly/HalluShield/blob/main/compare_llm_hallucination_recall.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets pandas torch

In [None]:
# Import necessary libraries
import os
from datasets import load_dataset
import pandas as pd

# ----------------------------
# GPU Setup in Google Colab
# ----------------------------

# Verify GPU availability
import torch
print("PyTorch version:", torch.__version__)  # Report the installed PyTorch version
print("CUDA available:", torch.cuda.is_available())  # True when PyTorch can use a CUDA device

# ----------------------------
# Hugging Face Authentication
# ----------------------------

# Hugging Face authentication - either replace the placeholder below with your
# token, or leave it untouched and supply the token through the HF_TOKEN
# environment variable (this keeps the secret out of the notebook).
HF_TOKEN = "ADD_YOUR_HUGGINGFACE_TOKEN_HERE"  # Get from https://huggingface.co/settings/tokens

if HF_TOKEN.startswith("ADD_YOUR_"):
    # Placeholder was not edited: reuse a token already present in the
    # environment instead of clobbering it with the placeholder text.
    HF_TOKEN = os.environ.get("HF_TOKEN", HF_TOKEN)

if not HF_TOKEN.startswith("ADD_YOUR_"):
    # Export only a real token; the placeholder is not a usable credential.
    os.environ["HF_TOKEN"] = HF_TOKEN  # Set as environment variable

from transformers import pipeline

### 1. Load the MedHallu dataset

In [None]:
# The dataset contains medical questions, hallucinated answers, and ground truth answers.
# Load the "pqa_labeled" configuration of UTAustin-AIHealth/MedHallu and convert
# its 'train' split to a pandas DataFrame so it can be filtered and sliced below.
ds = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled")
df = ds['train'].to_pandas()

### 2. Filter for hard hallucinations

In [None]:
# Keep only the rows labelled 'hard': the cases where the hallucination is
# most difficult to detect.
hard_hallucinations = df.loc[df['Difficulty Level'].eq('hard')]
print(f"Number of hard hallucination entries: {len(hard_hallucinations)}")

### 3. Initialize the LLMs to be evaluated

In [None]:
# Prefer the GPU, but fall back to CPU so the cell still runs when CUDA is
# unavailable (inference will be much slower there).
_use_gpu = torch.cuda.is_available()

gemma_model = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    token=HF_TOKEN,
    device="cuda" if _use_gpu else "cpu",  # Use GPU for inference when present
    # Half precision (float16) on GPU for lower memory use and faster
    # inference; full precision (float32) on CPU.
    torch_dtype=torch.float16 if _use_gpu else torch.float32,
)

### 4. Define system and user prompts for hallucination detection

In [None]:
# ----------------------------
# Define Prompt Templates and Batch Processing Function
# ----------------------------

# Instruction text placed at the start of every prompt.
system_prompt = "You are a hallucination checker for medical questions and answers. Check for hallucinations in answers. Answer strictly with 'Yes' or 'No'."

def generate_user_prompts(batch):
    """
    Build one hallucination-check prompt per row of ``batch``.

    Args:
        batch: pandas DataFrame with the columns 'Question',
            'Hallucinated Answer' and 'Ground Truth'.

    Returns:
        A list of prompt strings, one per row, in row order. Each prompt
        starts with ``system_prompt`` followed by the task description, the
        question, the answer to evaluate and the ground-truth reference.
    """
    prompts = []
    for _, record in batch.iterrows():
        question = record['Question']
        candidate_answer = record['Hallucinated Answer']
        reference_answer = record['Ground Truth']

        # The indentation inside the template is part of the prompt text.
        prompts.append(f"""{system_prompt}
        **Task**: Check if the provided Answer contains hallucinations by comparing it to the Ground Truth.

        **Question**: {question}

        **Answer to Evaluate**:
        {candidate_answer}

        **Ground Truth Reference**:
        {reference_answer}

        **Instruction**: Compare the Answer to the Ground Truth. If the Answer contains factual inaccuracies, contradictions,
        or unsupported claims relative to the Ground Truth, respond with "Yes". If it aligns factually, respond with "No".
        """)
    return prompts

def process_batch(batch):
    """
    Classify one batch of rows with the Gemma model.

    Builds one prompt per row with ``generate_user_prompts``, runs the prompts
    through the module-level ``gemma_model`` pipeline using greedy decoding,
    and reduces each completion to "Yes" or "No".

    Args:
        batch: pandas DataFrame slice accepted by ``generate_user_prompts``.

    Returns:
        A list with one "Yes"/"No" string per prompt, in the order the
        pipeline returns its outputs. An empty batch yields an empty list.
    """
    prompts = generate_user_prompts(batch)
    if not prompts:
        # Nothing to classify; skip the model call.
        return []

    # Generate responses for all prompts in the batch
    responses = gemma_model(
        prompts,
        max_new_tokens=2,        # Strictly limit output to "Yes"/"No"
        do_sample=False,         # Disable sampling (greedy decoding)
        batch_size=8,            # Adjust based on GPU memory (T4: 8-16)
        top_k=1,                 # Optional: Explicitly enforce greedy behavior
        # Return only the newly generated tokens. By default the pipeline
        # echoes the prompt, and the prompt itself contains the word "Yes",
        # which made every response look like a "Yes".
        return_full_text=False,
    )

    # Process and validate responses
    processed_responses = []
    for prompt, response in zip(prompts, responses):
        completion = response[0]['generated_text']

        # Defensive: if the prompt is still echoed back, drop it before
        # inspecting the answer (checked before stripping, because the prompt
        # ends in whitespace).
        if completion.startswith(prompt):
            completion = completion[len(prompt):]

        # Clean and standardize response
        model_response = "Yes" if "yes" in completion.strip().lower() else "No"
        processed_responses.append(model_response)

    return processed_responses

### 5. Evaluate Model on Hard Hallucinations

In [None]:
# ----------------------------
# Evaluate Model Using Batches
# ----------------------------

batch_size = 32  # Rows sent to process_batch per call (adjust based on GPU memory)
results = []

total_rows = len(hard_hallucinations)
for batch_start in range(0, total_rows, batch_size):
    # Slice out the next window of rows and classify it
    batch = hard_hallucinations.iloc[batch_start:batch_start + batch_size]
    verdicts = process_batch(batch)

    # Pair each verdict with the row it was produced for
    for position, verdict in enumerate(verdicts):
        record = batch.iloc[position]
        results.append({
            "Question": record["Question"],
            "Hallucinated Answer": record["Hallucinated Answer"],
            "Model Response": verdict,
            # Every answer shown to the model is hallucinated, so the only
            # correct verdict is "Yes"
            "Correct Flagging": verdict == "Yes",
            "Ground Truth Annotation": record["Ground Truth"]  # For reference only
        })

### 6. Calculate Recall Scores

In [None]:

# ----------------------------
# Performance Analysis and Results Saving
# ----------------------------

# Every evaluated answer is a hallucination, so a correct flag counts as a
# true positive and a missed one as a false negative.
true_positives = 0
false_negatives = 0
for entry in results:
    if entry["Correct Flagging"]:
        true_positives += 1
    else:
        false_negatives += 1

evaluated_total = true_positives + false_negatives
if evaluated_total > 0:
    recall_score = true_positives / evaluated_total
else:
    # No results collected: report 0 rather than dividing by zero
    recall_score = 0

print(f"\nFinal Recall Score: {recall_score:.2%}")

results_df = pd.DataFrame(results)

print("\nSaving results to CSV...")
results_df.to_csv("gemma_hallucination_results.csv", index=False)
print("Results saved successfully!")

print("\nPreview of Results:")
print(results_df.head())