In [2]:
!pip install transformers datasets pandas torch

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [5]:
# Import necessary libraries
import os
from datasets import load_dataset
import pandas as pd

# ----------------------------
# GPU Setup in Google Colab
# ----------------------------

# Verify GPU availability
import torch
print("PyTorch version:", torch.__version__)  # Should show >=
print("CUDA available:", torch.cuda.is_available())

# ----------------------------
# Hugging Face Authentication
# ----------------------------

# Hugging Face authentication - replace with your token
HF_TOKEN = "ADD_YOUR_HUGGINGFACE_TOKEN_HERE"  # Get from https://huggingface.co/settings/tokens
os.environ["HF_TOKEN"] = HF_TOKEN  # Set as environment variable

from transformers import pipeline

PyTorch version: 2.6.0+cu124
CUDA available: True


### 1: Load the MedHallu dataset

In [6]:
# The dataset contains medical questions, hallucinated answers, and ground truth answers.
ds = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled")
df = ds['train'].to_pandas()

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

### 2. Filter for hard hallucinations

In [7]:
# Focus on challenging cases where hallucinations are harder to detect
hard_hallucinations = df[df['Difficulty Level'] == 'hard']
print(f"Number of hard hallucination entries: {len(hard_hallucinations)}")

Number of hard hallucination entries: 408


### 3: Initialize the LLMs to be evaluated

In [8]:
gemma_model = pipeline(
    "text-generation",
    model="google/gemma-2-2b-it",
    token=HF_TOKEN,
    device="cuda",  # Use GPU for inference
    torch_dtype=torch.float16  # Use mixed precision for faster inference
)

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Device set to use cuda


### 4: Define system and user prompts for hallucination detection

In [20]:
# ----------------------------
# Define Prompt Templates and Batch Processing Function
# ----------------------------

system_prompt = "You are a hallucination checker for medical questions and answers. Check for hallucinations in answers. Answer strictly with 'Yes' or 'No'."

def generate_user_prompts(batch):
    """
    Generate user prompts for a batch of questions and hallucinated answers.
    """
    return [
        f"""{system_prompt}
        **Task**: Check if the provided Answer contains hallucinations by comparing it to the Ground Truth.

        **Question**: {row['Question']}

        **Answer to Evaluate**:
        {row['Hallucinated Answer']}

        **Ground Truth Reference**:
        {row['Ground Truth']}

        **Instruction**: Compare the Answer to the Ground Truth. If the Answer contains factual inaccuracies, contradictions,
        or unsupported claims relative to the Ground Truth, respond with "Yes". If it aligns factually, respond with "No".
        """
        for _, row in batch.iterrows()
    ]

def process_batch(batch):
    """
    Process a batch of prompts through the Gemma model.
    """
    prompts = generate_user_prompts(batch)

    # Generate responses for all prompts in the batch
    responses = gemma_model(
        prompts,
        max_new_tokens=2,  # Strictly limit output to "Yes"/"No"
        do_sample=False,   # Disable sampling (greedy decoding)
        batch_size=8,      # Adjust based on GPU memory (T4: 8-16)
        top_k=1            # Optional: Explicitly enforce greedy behavior
    )

    # Process and validate responses
    processed_responses = []
    for idx, response in enumerate(responses):
        # Extract generated text
        response_text = response[0]['generated_text'].strip()

        # Clean and standardize response
        model_response = "Yes" if "yes" in response_text.lower() else "No"

        # # Print verification information
        # print(f"\nBatch Index: {idx}")
        # print(f"Prompt Preview: {prompts[idx][:100]}...")  # Show first 100 chars
        # print(f"Raw Response: {response}")
        # print(f"Processed Response: {model_response}")

        processed_responses.append(model_response)

    return processed_responses

### 5. Evaluate Model on Hard Hallucinations

In [21]:
# ----------------------------
# Evaluate Model Using Batches
# ----------------------------

batch_size = 32  # Define batch size (adjust based on GPU memory)
results = []

for start_idx in range(0, len(hard_hallucinations), batch_size):
    # Select a batch of data
    batch = hard_hallucinations.iloc[start_idx:start_idx + batch_size]

    # Process the batch through the model
    responses = process_batch(batch)

    # Validate responses and store results
    for idx, response in enumerate(responses):
        row = batch.iloc[idx]
        is_correct = response == "Yes"  # Expected response is always 'Yes' since we're always giving hallucinated answers to the model

        results.append({
            "Question": row["Question"],
            "Hallucinated Answer": row["Hallucinated Answer"],
            "Model Response": response,
            "Correct Flagging": is_correct,
            "Ground Truth Annotation": row["Ground Truth"]  # For reference only
        })




Batch Index: 0
Prompt Preview: You are a hallucination checker for medical questions and answers. Check for hallucinations in answe...
Raw Response: [{'generated_text': 'You are a hallucination checker for medical questions and answers. Check for hallucinations in answers. Answer strictly with \'Yes\' or \'No\'.\n        **Task**: Check if the provided Answer contains hallucinations by comparing it to the Ground Truth.\n        \n        **Question**: Landolt C and snellen e acuity: differences in strabismus amblyopia?\n        \n        **Answer to Evaluate**: \n        Patients with strabismus amblyopia showed a significant underestimation of visual acuity when assessed with the Landolt C chart compared to the Snellen E chart.\n        \n        **Ground Truth Reference**: \n        Using the charts described, there was only a slight overestimation of visual acuity by the Snellen E compared to the Landolt C, even in strabismus amblyopia. Small differences in the lower visual acuity 

### 6. Calculate Recall Scores

In [18]:

# ----------------------------
# Performance Analysis and Results Saving
# ----------------------------

true_positives = sum(1 for r in results if r["Correct Flagging"])
false_negatives = sum(1 for r in results if not r["Correct Flagging"])
recall_score = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0

print(f"\nFinal Recall Score: {recall_score:.2%}")

results_df = pd.DataFrame(results)

print("\nSaving results to CSV...")
results_df.to_csv("gemma_hallucination_results.csv", index=False)
print("Results saved successfully!")

print("\nPreview of Results:")
print(results_df.head())


Final Recall Score: 100.00%

Saving results to CSV...
Results saved successfully!

Preview of Results:
                                            Question  \
0  Landolt C and snellen e acuity: differences in...   
1  Syncope during bathing in infants, a pediatric...   
2  Can tailored interventions increase mammograph...   
3  Is adjustment for reporting heterogeneity nece...   
4  Do mutations causing low HDL-C promote increas...   

                                 Hallucinated Answer Model Response  \
0  Patients with strabismus amblyopia showed a si...            Yes   
1  Syncope during bathing in infants is a manifes...            Yes   
2  Tailored text messages were found to be as eff...            Yes   
3  Adjustment for reporting heterogeneity is esse...            Yes   
4  Mutations causing low HDL-C levels do promote ...            Yes   

   Correct Flagging                            Ground Truth Annotation  
0              True  Using the charts described, there was 