<a href="https://colab.research.google.com/github/naisofly/HalluShield/blob/main/OpenBioLLM_compare_llm_hallucination_recall.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Use the %pip magic (not !pip) so packages are installed into the environment
# of the running kernel rather than whatever `pip` the subshell resolves to.
%pip install transformers datasets pandas scikit-learn
# Upgrade datasets/fsspec together after the base install.
%pip install -U datasets fsspec



In [2]:
# Import necessary libraries
import os
import torch
import pandas as pd
from datasets import load_dataset
from transformers import pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# ----------------------------
# GPU Setup in Google Colab
# ----------------------------

# Verify GPU availability
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

# ----------------------------
# Hugging Face Authentication
# ----------------------------

# Prefer a token that is already present in the environment so that no
# credential has to be pasted into (and accidentally committed with) this
# notebook. Tokens are created at https://huggingface.co/settings/tokens
HF_TOKEN_PLACEHOLDER = "ADD_HF_TOKEN_HERE"
HF_TOKEN = os.environ.get("HF_TOKEN") or HF_TOKEN_PLACEHOLDER

if HF_TOKEN == HF_TOKEN_PLACEHOLDER:
    # Never export the placeholder as if it were a real credential; also clear
    # it if an earlier run of this cell left it in the environment.
    os.environ.pop("HF_TOKEN", None)
    print("HF_TOKEN is not set; continuing without Hugging Face authentication.")
else:
    os.environ["HF_TOKEN"] = HF_TOKEN  # Set as environment variable


PyTorch version: 2.6.0+cu124
CUDA available: False


### 1. Load & Prepare the MedHallu Dataset

In [3]:
# The dataset contains medical questions, hallucinated answers, and ground truth answers.
def prepare_test_data():
    """Build a label-balanced evaluation frame from the 'hard' MedHallu rows.

    Loads the ``pqa_labeled`` configuration of ``UTAustin-AIHealth/MedHallu``
    and keeps only rows whose 'Difficulty Level' is 'hard'. Every such row is
    emitted twice: once with its 'Ground Truth' copied into ``answer``
    (label 'non-hallucination') and once with its 'Hallucinated Answer'
    copied into ``answer`` (label 'hallucination').

    Returns:
        pandas.DataFrame: the two halves concatenated, shuffled with a fixed
        seed, and re-indexed from 0.
    """
    dataset = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled")
    all_rows = dataset['train'].to_pandas()

    is_hard = all_rows['Difficulty Level'] == 'hard'
    hard_rows = all_rows[is_hard]
    n_hard = len(hard_rows)

    # Sampling every row amounts to a seeded shuffle of the hard subset.
    truthful = hard_rows.sample(n_hard, random_state=42)
    truthful = truthful.assign(
        answer=truthful['Ground Truth'],
        label='non-hallucination',
    )

    fabricated = hard_rows.sample(n_hard, random_state=84)
    fabricated = fabricated.assign(
        answer=fabricated['Hallucinated Answer'],
        label='hallucination',
    )

    combined = pd.concat([truthful, fabricated])
    shuffled = combined.sample(frac=1, random_state=126)
    return shuffled.reset_index(drop=True)

# Materialise the evaluation set once and confirm both labels are equally represented.
test_df = prepare_test_data()
label_counts = test_df['label'].value_counts()
print("Test dataset labels:\n", label_counts)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Test dataset labels:
 label
hallucination        408
non-hallucination    408
Name: count, dtype: int64


### 2. Create new test dataset from Hard Hallucinations (superseded — now done by `prepare_test_data()` in section 1)

In [None]:
# # Focus on challenging cases where hallucinations are harder to detect
# hard_hallucinations = df[df['Difficulty Level'] == 'hard']
# print(f"Number of hard hallucination entries: {len(hard_hallucinations)}")

# # Create balanced test set
# num_samples = len(hard_hallucinations)
# test_df = pd.concat([
#     hard_hallucinations.sample(n=num_samples, random_state=42)
#     .assign(answer=lambda x: x['Ground Truth'], label='non-hallucination'),
#     hard_hallucinations.sample(n=num_samples, random_state=84)
#     .assign(answer=lambda x: x['Hallucinated Answer'], label='hallucination')
# ]).sample(frac=1, random_state=126).reset_index(drop=True)

# print("\nLabel counts in new dataset:")
# print(test_df['label'].value_counts())

### 3. Initialize the LLM to be evaluated

In [4]:
def init_medical_model(model_name="aaditya/OpenBioLLM-Llama3-8B", offload_folder="./offload"):
    """Build a Hugging Face ``text-generation`` pipeline for the model under test.

    No chat template is applied here; callers are responsible for formatting
    the prompts they pass to the returned pipeline.

    Args:
        model_name: Hub model ID (or local path) to load. Defaults to
            OpenBioLLM-Llama3-8B so existing zero-argument calls are unchanged.
        offload_folder: Directory forwarded to the model loader as
            ``offload_folder``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    return pipeline(
        "text-generation",
        model=model_name,
        model_kwargs={
            "torch_dtype": torch.bfloat16,
            "offload_folder": offload_folder,
        },
        # Leave device placement to the library.
        device_map="auto",
    )

# Instantiate the pipeline once and reuse it for every batch. The recorded
# output of this cell shows four checkpoint shards (roughly 16 GB in total)
# being fetched, so avoid re-running this cell unnecessarily.
medical_model = init_medical_model()

config.json:   0%|          | 0.00/704 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

pytorch_model-00003-of-00004.bin:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

pytorch_model-00002-of-00004.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

pytorch_model-00001-of-00004.bin:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

pytorch_model-00004-of-00004.bin:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

Device set to use cpu


### 4. Define system and user prompts for hallucination detection

In [12]:

# ----------------------------
# Prompt Engineering
# ----------------------------
# Single source of truth for the system instruction used by format_prompt.
SYSTEM_PROMPT = """You are OpenBioLLM, a medical expert developed by Saama AI Labs.
Evaluate answers for factual accuracy using medical terminology and guidelines.
Respond ONLY with 'Yes' or 'No'."""

def format_prompt(row, system_prompt=SYSTEM_PROMPT):
    """Render one evaluation example as a single prompt string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'Ground Truth', 'Question' and 'answer'.
        system_prompt: System instruction to embed. Line breaks are collapsed
            to single spaces so the system turn is one line. Defaults to
            ``SYSTEM_PROMPT``.

    Returns:
        str: system turn, user turn, and an open assistant turn, delimited by
        the ``<|system|>`` / ``<|user|>`` / ``<|assistant|>`` / ``<|end|>``
        markers.
    """
    # NOTE(review): these turn markers are hand-written, and batch_evaluate
    # splits the model output on "<|assistant|>", so the two must stay in sync.
    # Confirm the markers match the chat format the loaded model was trained
    # with (Llama-3-based checkpoints normally ship their own template).
    system_message = " ".join(
        line.strip() for line in system_prompt.splitlines() if line.strip()
    )
    # NOTE(review): the context shown to the model is the row's 'Ground Truth',
    # which prepare_test_data also copies verbatim into 'answer' for the
    # 'non-hallucination' rows — confirm this is the intended setup.
    user_message = (
        f"Medical Context: {row['Ground Truth']}\n"
        f"Question: {row['Question']}\n"
        f"Answer: {row['answer']}\n\n"
        "Does this answer contain factual inaccuracies? Respond ONLY with 'Yes' or 'No'."
    )
    return (
        "<|system|>\n" + system_message + "\n<|end|>\n"
        "<|user|>\n" + user_message + "\n<|end|>\n"
        "<|assistant|>\n"
    )


In [13]:

# ----------------------------
# Evaluation Logic
# ----------------------------
def parse_response(response):
    """Normalise a raw model completion to 'yes', 'no' or 'invalid'.

    The verdict is the first alphabetic token of the completion, compared
    case-insensitively. Surrounding quotes, markdown emphasis or punctuation
    (e.g. "'Yes'." or "**No**") are ignored, and longer words that merely
    start with the same letters (e.g. "Yesterday") are not treated as a
    verdict.

    Args:
        response: Text produced by the model; may be empty or None.

    Returns:
        str: 'yes' or 'no' when the first word is exactly that verdict,
        otherwise 'invalid'.
    """
    import re  # local import keeps this cell self-contained

    if not response:
        return "invalid"

    first_word = re.search(r"[a-z]+", response.lower())
    if first_word is None:
        return "invalid"

    verdict = first_word.group(0)
    return verdict if verdict in ("yes", "no") else "invalid"

def batch_evaluate(df, model, batch_size=4):  # Reduced for GPU safety
    """Run the model over ``df`` in chunks and score each Yes/No verdict.

    Each row is rendered with ``format_prompt`` and the chunk's prompts are
    passed to ``model`` as a list with greedy decoding and at most 15 new
    tokens. Only the newly generated text is requested
    (``return_full_text=False``), so the verdict does not depend on locating
    a prompt marker inside the output.

    A chunk that raises is reported with ``print`` and skipped as a whole, so
    the returned frame can contain fewer rows than ``df``.

    Args:
        df: DataFrame with the columns read by ``format_prompt`` plus
            'Question' and 'label'.
        model: Callable text-generation pipeline exposing
            ``tokenizer.eos_token_id``.
        batch_size: Number of rows handed to the model per call.

    Returns:
        pandas.DataFrame with columns 'question', 'label', 'prediction'
        ('Yes'/'No'), 'correct' (bool) and 'raw_response' (the generated text,
        kept so unparseable outputs can be audited). The columns are present
        even when no row was evaluated.
    """
    columns = ["question", "label", "prediction", "correct", "raw_response"]
    results = []
    for start in range(0, len(df), batch_size):
        try:
            rows = [row for _, row in df.iloc[start:start + batch_size].iterrows()]
            prompts = [format_prompt(row) for row in rows]

            outputs = model(
                prompts,
                max_new_tokens=15,
                do_sample=False,
                return_full_text=False,
                pad_token_id=model.tokenizer.eos_token_id
            )
            if len(outputs) != len(rows):
                raise ValueError(
                    f"expected {len(rows)} outputs, got {len(outputs)}"
                )

            # Collect the chunk separately so a failure part-way through
            # does not leave a partial chunk in the results.
            batch_results = []
            for row, output in zip(rows, outputs):
                raw_text = output[0]['generated_text']
                # Anything that is not an explicit "yes" is scored as "No".
                prediction = "Yes" if "yes" in parse_response(raw_text) else "No"

                batch_results.append({
                    "question": row["Question"],
                    "label": row["label"],
                    "prediction": prediction,
                    "correct": (prediction == "Yes") == (row["label"] == "hallucination"),
                    "raw_response": raw_text,
                })
            results.extend(batch_results)

        except Exception as e:
            # Best-effort: report and continue with the next chunk.
            print(f"Batch {start//batch_size} failed: {str(e)}")

    return pd.DataFrame(results, columns=columns)


In [8]:
# # ----------------------------
# # Define Prompt Templates and Batch Processing Function
# # ----------------------------

# system_prompt = """You are a medical hallucination detector.
# Check if answers contain factual inaccuracies. Respond EXCLUSIVELY with 'Yes' or 'No'."""

# def generate_prompt(row):
#     messages = [
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": f"""
#             Medical Context: {row['Ground Truth']}
#             Question: {row['Question']}
#             Answer to Evaluate: {row['answer']}

#             Does the Answer contain any factual inaccuracies? Respond ONLY with 'Yes' or 'No'."""}
#     ]
#     return messages

# def parse_response(response):
#     """Strict parser with enhanced validation"""
#     response = response.lower().strip()

#     # Handle empty responses
#     if not response:
#         print("Warning: Empty response detected")
#         return "invalid_response"

#     # Extract first meaningful word
#     first_word = response.split()[0] if response.split() else ""

#     # Strict validation
#     if first_word == "yes":
#         return "yes"
#     elif first_word == "no":
#         return "no"

#     # Debug unexpected responses
#     print(f"Unexpected response: {response[:50]}")
#     return f"invalid_{response[:20].replace(' ', '_')}"

# def evaluate_model(test_df, model, batch_size=8):  # Reduced batch size for T4 GPU
#     results = []
#     for i in range(0, len(test_df), batch_size):
#         batch = test_df.iloc[i:i + batch_size]
#         prompts = [generate_prompt(row) for _, row in batch.iterrows()]

#         try:
#             responses = model(
#                 prompts,
#                 do_sample=False,
#                 temperature=0.0,
#                 return_full_text=False,
#                 pad_token_id=model.tokenizer.eos_token_id
#             )

#             for idx, (_, row) in enumerate(batch.iterrows()):
#                 raw_response = responses[idx][0]['generated_text'].strip()
#                 parsed = parse_response(raw_response)

#                 # Debug output for analysis
#                 debug_info = {
#                     "Expected": row['label'],
#                     "Raw Response": raw_response,
#                     "Parsed": parsed
#                 }
#                 print(f"Debug: {debug_info}") if parsed not in ['yes', 'no'] else None

#                 model_response = 'Yes' if parsed == 'yes' else 'No'
#                 is_correct = (model_response == 'Yes') == (row['label'] == 'hallucination')

#                 results.append({
#                     "Question": row["Question"],
#                     "Answer": row["answer"],
#                     "Label": row["label"],
#                     "Model Response": model_response,
#                     "Raw Response": raw_response,
#                     "Correct": is_correct
#                 })

#         except Exception as e:
#             print(f"Error processing batch {i//batch_size}: {str(e)}")

#     return pd.DataFrame(results)

### 5. Evaluate Model on Hard Hallucinations

In [None]:
# ----------------------------
# Evaluate Model Using Batches
# ----------------------------

# Runs every row of test_df through the model. batch_evaluate prints and skips
# any batch that raises, so `results` may hold fewer rows than test_df.
results = batch_evaluate(test_df, medical_model)

### 6. Calculate Precision and Recall

In [None]:
# ----------------------------
# Performance Analysis and Results Saving
# ----------------------------

print("\n6. Calculating metrics...")

# The evaluation cell stores its output in `results`, and batch_evaluate emits
# lower-case column names ('label', 'prediction'). Keep the `results_df` name
# as an alias for the rest of this cell.
results_df = results
if results_df.empty:
    raise ValueError("No evaluation results to score - every batch failed or the test set is empty.")

true_labels = results_df['label'].map({'hallucination': 1, 'non-hallucination': 0})
predicted_labels = results_df['prediction'].map({'Yes': 1, 'No': 0})

# Pin the label order so the matrix is always 2x2 (row/col 0 = non-hallucination,
# row/col 1 = hallucination), even if one class never occurs.
cm = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

disp = ConfusionMatrixDisplay(cm, display_labels=['Non-Hallucination', 'Hallucination'])
disp.plot(cmap='Blues', values_format='d')
disp.ax_.set_title("Confusion Matrix\n(1=Hallucination, 0=Non-Hallucination)")

print("\nConfusion Matrix Breakdown:")
print(f"True Positives (TP): {tp}")  # Correctly identified hallucinations
print(f"False Positives (FP): {fp}") # Non-hallucinations flagged as hallucinations
print(f"False Negatives (FN): {fn}") # Missed hallucinations
print(f"True Negatives (TN): {tn}")  # Correctly identified non-hallucinations

# Guard against division by zero when a class is never predicted / never present.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"\nPrecision: {precision:.2f} (How many flagged hallucinations were correct)")
print(f"Recall: {recall:.2f} (How many actual hallucinations were detected)")

# Save results
results_df.to_csv("hallucination_evaluation_results.csv", index=False)
print("\nResults saved to 'hallucination_evaluation_results.csv'")