<a href="https://colab.research.google.com/github/naisofly/HalluShield/blob/main/compare_llm_hallucination_recall.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets pandas torch

In [None]:
# Import necessary libraries
import os
from datasets import load_dataset
import pandas as pd

# ----------------------------
# GPU Setup in Google Colab
# ----------------------------

# Report the installed PyTorch build and whether a CUDA device is visible,
# so a CPU-only runtime is noticed before any model is loaded.
import torch

print("PyTorch version:", torch.__version__)
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

# ----------------------------
# Hugging Face Authentication
# ----------------------------

# Hugging Face authentication.
# Prefer supplying the token through the HF_TOKEN environment variable so the
# secret never has to be written into (and shared with) the notebook. Pasting
# the token over the placeholder below still works and takes precedence.
_HF_TOKEN_PLACEHOLDER = "ADD_YOUR_HUGGINGFACE_TOKEN_HERE"
HF_TOKEN = "ADD_YOUR_HUGGINGFACE_TOKEN_HERE"  # Get from https://huggingface.co/settings/tokens
if HF_TOKEN == _HF_TOKEN_PLACEHOLDER:
    # Placeholder left untouched: reuse a token already present in the
    # environment instead of overwriting it with the placeholder text.
    HF_TOKEN = os.environ.get("HF_TOKEN", HF_TOKEN)
os.environ["HF_TOKEN"] = HF_TOKEN  # Export so later cells and libraries can read it

from transformers import pipeline

### 1. Load the MedHallu dataset

In [None]:
# MedHallu pairs each medical question with a ground truth answer and a
# hallucinated answer; the labeled PubMedQA configuration is loaded here and
# its train split is converted to a pandas DataFrame for the cells below.
ds = load_dataset(path="UTAustin-AIHealth/MedHallu", name="pqa_labeled")
df = ds["train"].to_pandas()

### 2. Create new test dataset from Hard Hallucinations

In [None]:
# Restrict the evaluation to rows whose 'Difficulty Level' is 'hard', i.e. the
# hallucinations that are hardest to detect.
hard_hallucinations = df.loc[df['Difficulty Level'] == 'hard']
print(f"Number of hard hallucination entries: {len(hard_hallucinations)}")

# ----------------------------
# Creating Mixed Test Dataset
# ----------------------------

# Each class gets half as many rows as there are hard entries.
num_samples = len(hard_hallucinations) // 2

# The two halves are drawn independently (different seeds) from the same pool,
# so a given question may appear once with each kind of answer.
# 'answer' holds the text the model will judge; 'label' is the expected verdict.
ground_truth_samples = (
    hard_hallucinations
    .sample(n=num_samples, random_state=42)
    .assign(answer=lambda rows: rows['Ground Truth'], label='non-hallucination')
)
hallucination_samples = (
    hard_hallucinations
    .sample(n=num_samples, random_state=84)
    .assign(answer=lambda rows: rows['Hallucinated Answer'], label='hallucination')
)

# Stack both classes, shuffle reproducibly and renumber the rows from 0.
test_df = (
    pd.concat([ground_truth_samples, hallucination_samples])
    .sample(frac=1, random_state=126)
    .reset_index(drop=True)
)

# Show the class balance of the assembled test set.
print("\nLabel counts in new dataset:")
print(test_df['label'].value_counts())

### 3. Initialize the LLMs to be evaluated

In [None]:
# Build the Gemma text-generation pipeline.
# The device is chosen at run time: a hard-coded "cuda" raises on runtimes that
# have no GPU, so fall back to the CPU (slow, but the notebook still runs).
_gemma_use_gpu = torch.cuda.is_available()
gemma_model = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    token=HF_TOKEN,
    device="cuda" if _gemma_use_gpu else "cpu",
    # Half precision halves memory use and speeds up GPU inference; on the CPU
    # fallback full precision is used because float16 CPU support is limited.
    torch_dtype=torch.float16 if _gemma_use_gpu else torch.float32,
)

### 4. Define system and user prompts for hallucination detection

In [None]:
# ----------------------------
# Define Prompt Templates and Batch Processing Function
# ----------------------------

system_prompt = """You are a hallucination checker for medical Q&A.
Check if answers contain factual inaccuracies. Respond ONLY with 'Yes' or 'No'."""


def generate_user_prompt(question, answer, context):
    """
    Build the full prompt sent to the model for one question/answer pair.

    The module-level ``system_prompt`` comes first, followed by the context,
    the question and the answer to check, each on its own labelled line.
    """
    return f"{system_prompt}\nContext: {context}\nQuestion: {question}\nAnswer: {answer}"


def evaluate_model(test_df, model, batch_size=32):
    """
    Evaluate the model on the provided test dataset in batches.

    Args:
        test_df: DataFrame with the columns 'Question', 'answer',
            'Ground Truth' and 'label'. Rows labelled 'hallucination' are
            expected to be answered 'Yes', rows labelled 'non-hallucination'
            are expected to be answered 'No'. Any index is accepted.
        model: Callable following the Hugging Face text-generation pipeline
            convention: it receives a list of prompts plus generation keyword
            arguments and returns, per prompt, a list whose first element is
            a dict with a 'generated_text' entry.
        batch_size: Number of rows sent to ``model`` per call.

    Returns:
        DataFrame with one row per input row (input order preserved) and the
        columns 'Question', 'Answer', 'Label', 'Model Response',
        'Correct Flagging' and 'Ground Truth Annotation'. The columns are
        present even when ``test_df`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    result_columns = [
        "Question",
        "Answer",
        "Label",
        "Model Response",
        "Correct Flagging",
        "Ground Truth Annotation",
    ]
    results = []
    for start in range(0, len(test_df), batch_size):
        batch = test_df.iloc[start:start + batch_size]
        rows = [row for _, row in batch.iterrows()]
        # NOTE(review): the 'Ground Truth' text is passed as the context, so
        # for rows whose 'answer' is the ground truth the answer and the
        # context are identical - confirm this is the intended setup.
        prompts = [
            generate_user_prompt(row['Question'], row['answer'], row['Ground Truth'])
            for row in rows
        ]

        # Ask for the continuation only. By default the pipeline echoes the
        # prompt, and the prompt itself contains the word 'Yes', which would
        # make every answer look like a 'Yes'. Decoding is greedy
        # (do_sample=False), so no temperature is passed.
        responses = model(prompts, max_new_tokens=2, do_sample=False, return_full_text=False)

        # Pair rows with responses by position; the DataFrame index labels are
        # not guaranteed to be 0..n-1 and must not be used as list offsets.
        for row, prompt, res in zip(rows, prompts, responses):
            generated = res[0]['generated_text']
            # Defensive: drop the prompt if the model echoed it anyway.
            if generated.startswith(prompt):
                generated = generated[len(prompt):]
            model_response = "Yes" if "yes" in generated.lower() else "No"

            # Determine if the model's response is correct
            is_correct = (
                (row['label'] == 'hallucination' and model_response == 'Yes') or
                (row['label'] == 'non-hallucination' and model_response == 'No')
            )

            results.append({
                "Question": row["Question"],
                "Answer": row["answer"],
                "Label": row["label"],
                "Model Response": model_response,
                "Correct Flagging": is_correct,
                "Ground Truth Annotation": row["Ground Truth"]  # For reference only
            })

    return pd.DataFrame(results, columns=result_columns)

### 5. Evaluate Model on Hard Hallucinations

In [None]:
# ----------------------------
# Evaluate Model Using Batches
# ----------------------------

# Score every row of the mixed test set with the Gemma pipeline, using the
# evaluator's default batch size.
results_df = evaluate_model(test_df=test_df, model=gemma_model)

### 6. Calculate Recall and Precision

In [None]:
# ----------------------------
# Performance Analysis and Results Saving
# ----------------------------

print("\n6. Analyzing performance...")

# Row masks, built once and combined below. "Positive" means the answer is a
# hallucination; a model response of 'Yes' means the model flagged it as one.
labelled_hallucination = results_df['Label'] == 'hallucination'
labelled_clean = results_df['Label'] == 'non-hallucination'
answered_yes = results_df['Model Response'] == 'Yes'
answered_no = results_df['Model Response'] == 'No'

# Confusion-matrix counts needed for recall and precision
true_positives = int((labelled_hallucination & answered_yes).sum())
false_positives = int((labelled_clean & answered_yes).sum())
false_negatives = int((labelled_hallucination & answered_no).sum())

# Guard both ratios against an empty denominator
actual_positives = true_positives + false_negatives
flagged_positives = true_positives + false_positives
recall = true_positives / actual_positives if actual_positives > 0 else 0
precision = true_positives / flagged_positives if flagged_positives > 0 else 0

print(f"Recall: {recall:.2f}")
print(f"Precision: {precision:.2f}")

# Persist the per-row results next to the notebook
print("\nSaving results to CSV...")
results_df.to_csv("gemma_hallucination_results.csv", index=False)
print("Results saved successfully!")

print("\nPreview of Results:")
print(results_df.head())