In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

from datasets import load_dataset

In [2]:
from datasets import load_dataset
ds = load_dataset("UTAustin-AIHealth/MedHallu", "pqa_labeled")

In [3]:
ds.shape

{'train': (1000, 6)}

### MedHallu dataset (human-labeled subset)

In [4]:
df = ds['train'].to_pandas()
print(df.shape)
print(df.columns)
df.head()

(1000, 6)
Index(['Question', 'Knowledge', 'Ground Truth', 'Difficulty Level',
       'Hallucinated Answer', 'Category of Hallucination'],
      dtype='object')


Unnamed: 0,Question,Knowledge,Ground Truth,Difficulty Level,Hallucinated Answer,Category of Hallucination
0,Do mitochondria play a role in remodelling lac...,[Programmed cell death (PCD) is the regulated ...,Results depicted mitochondrial dynamics in viv...,medium,Mitochondria regulate the formation of perfora...,Mechanism and Pathway Misattribution
1,Landolt C and snellen e acuity: differences in...,[Assessment of visual acuity depends on the op...,"Using the charts described, there was only a s...",hard,Patients with strabismus amblyopia showed a si...,Incomplete Information
2,"Syncope during bathing in infants, a pediatric...",[Apparent life-threatening events in infants a...,"""Aquagenic maladies"" could be a pediatric form...",hard,Syncope during bathing in infants is a manifes...,Misinterpretation of #Question#
3,Are the long-term results of the transanal pul...,[The transanal endorectal pull-through (TERPT)...,Our long-term study showed significantly bette...,easy,Both transanal and transabdominal pull-through...,Misinterpretation of #Question#
4,Can tailored interventions increase mammograph...,[Telephone counseling and tailored print commu...,The effects of the intervention were most pron...,hard,Tailored text messages were found to be as eff...,Incomplete Information


In [5]:
df['Category of Hallucination'].value_counts()

Category of Hallucination
Misinterpretation of #Question#            752
Incomplete Information                     212
Mechanism and Pathway Misattribution        33
Methodological and Evidence Fabrication      3
Name: count, dtype: int64

In [6]:
df['Difficulty Level'].value_counts()

Difficulty Level
hard      408
medium    318
easy      274
Name: count, dtype: int64

In [7]:
import goodfire

# The key is read from the environment (load_dotenv() is called in the first
# cell), so it never has to be written into the notebook itself.
GOODFIRE_API_KEY = os.getenv("GOODFIRE_API_KEY")
if not GOODFIRE_API_KEY:
    # os.getenv returns None when the variable is missing; stop here with a
    # clear message instead of handing None to the client.
    raise RuntimeError(
        "GOODFIRE_API_KEY is not set; define it in the environment or in a .env file"
    )

client = goodfire.Client(api_key=GOODFIRE_API_KEY)
# Model variant whose features are contrasted and inspected below.
variant = goodfire.Variant("meta-llama/Meta-Llama-3.1-8B-Instruct")

In [8]:
shuffled_df = df.sample(frac=1, random_state=42)

In [9]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(shuffled_df, test_size=0.3, random_state=42)
print(f"Training set size: {len(train_df)}, Test set size: {len(test_df)}")

Training set size: 700, Test set size: 300


In [10]:
FEATURE_COMPUTE_SIZE = 60

### Using goodfire to get contrast features for hallucinated and non-hallucinated answers

In [11]:
# Contrast ground-truth answers (dataset_1) against hallucinated answers
# (dataset_2) for the same prompts. Both sides share an identical user turn, so
# the assistant answer is the only thing that differs between them.
# Only the first FEATURE_COMPUTE_SIZE training rows are used here.
non_hallucinated_features, hallucinated_features = client.features.contrast(
    dataset_1=[
        [
            {
                "role": "user",
                "content": f"Here is some relevant knowledge: {row['Knowledge']}\n\nQuestion: {row['Question']}",
            },
            {"role": "assistant", "content": row['Ground Truth']},
        ]
        for _, row in train_df[0:FEATURE_COMPUTE_SIZE].iterrows()
    ],
    dataset_2=[
        [
            {
                "role": "user",
                "content": f"Here is some relevant knowledge: {row['Knowledge']}\n\nQuestion: {row['Question']}",
            },
            {
                "role": "assistant",
                # Pass the answer text unmodified: a stray trailing "]" would
                # tag every hallucinated sample with an artificial token that
                # the ground-truth side never contains.
                "content": row['Hallucinated Answer'],
            },
        ]
        for _, row in train_df[0:FEATURE_COMPUTE_SIZE].iterrows()
    ],
    model=variant,
    top_k=100,
)

### Reranking features to get the most important ones based on query


In [63]:
# Natural-language descriptions used to rerank each contrast set.
non_hallucination_query = "accurate and factual medical information with proper evidence"
hallucination_query = "medical misinformation and factual errors in healthcare answers"

# Options shared by both rerank calls: same model, keep the five best matches.
rerank_options = {"model": variant, "top_k": 5}

# NOTE: both names are rebound to their reranked (smaller) groups, so running
# this cell a second time reranks the already-reduced groups.
non_hallucinated_features = client.features.rerank(
    features=non_hallucinated_features,
    query=non_hallucination_query,
    **rerank_options,
)
hallucinated_features = client.features.rerank(
    features=hallucinated_features,
    query=hallucination_query,
    **rerank_options,
)

In [64]:
non_hallucinated_features

FeatureGroup([
   0: "Academic or technical results and findings being presented",
   1: "The assistant should provide careful qualifications and disclaimers",
   2: "Scientific methodology and analytical terminology",
   3: "Evidence-based implementation and evaluation strategies",
   4: "Scientific uncertainty and calls for more research"
])

In [65]:
hallucinated_features

FeatureGroup([
   0: "Statistical changes in risk levels in medical literature",
   1: "Medical diagnostic relationships between diseases and symptoms",
   2: "Potential hazards or negative outcomes in technical writing",
   3: "The assistant is making incorrect statements with high confidence",
   4: "Major medical journal names and citation formats"
])

In [66]:
features_to_look_at = non_hallucinated_features | hallucinated_features
features_to_look_at

FeatureGroup([
   0: "Academic or technical results and findings being presented",
   1: "The assistant should provide careful qualifications and disclaimers",
   2: "Scientific methodology and analytical terminology",
   3: "Evidence-based implementation and evaluation strategies",
   4: "Scientific uncertainty and calls for more research",
   5: "Statistical changes in risk levels in medical literature",
   6: "Medical diagnostic relationships between diseases and symptoms",
   7: "Potential hazards or negative outcomes in technical writing",
   8: "The assistant is making incorrect statements with high confidence",
   9: "Major medical journal names and citation formats"
])

In [67]:
async_client = goodfire.AsyncClient(api_key=GOODFIRE_API_KEY)

In [None]:
import pandas as pd
import asyncio
from tqdm.asyncio import tqdm_asyncio

CLASSIFIER_FULL_SET_SIZE = 200

async def _get_feature_acts_for_sample_class(
    sample_class: pd.DataFrame,
    features_to_use_for_classification: goodfire.FeatureGroup,
    is_positive_class: bool,
    k=100,
    batch_size=10
):
    """Inspect feature activations for one answer class over a set of rows.

    For each row a two-turn conversation is built (knowledge + question as the
    user turn, an answer as the assistant turn) and sent to
    ``async_client.features.inspect`` restricted to the given features.

    Args:
        sample_class: Rows with 'Knowledge', 'Question', 'Ground Truth' and
            'Hallucinated Answer' columns. Only the first
            ``CLASSIFIER_FULL_SET_SIZE`` rows are processed.
        features_to_use_for_classification: Features passed to ``inspect``.
        is_positive_class: If True the assistant turn is the ground-truth
            answer; otherwise it is the hallucinated answer.
        k: Number of top activations requested from each inspection result.
        batch_size: Number of rows inspected concurrently per batch.

    Returns:
        A list with one ``context.top(k=k)`` result per processed row, in row
        order within each batch.

    Raises:
        ValueError: If ``k`` is smaller than the number of features.
    """
    if k < len(features_to_use_for_classification):
        # k == len(features) is accepted, so the message says "at least".
        raise ValueError(
            "k must be at least the number of features to use for classification"
        )

    # Both classes must be built from the raw column text only, so that nothing
    # but the answer content differs between them.
    answer_column = 'Ground Truth' if is_positive_class else 'Hallucinated Answer'

    samples = []
    all_samples = sample_class[0:CLASSIFIER_FULL_SET_SIZE]

    # Process in batches so only batch_size requests are in flight at a time.
    for i in range(0, len(all_samples), batch_size):
        batch = all_samples[i:i + batch_size]
        tasks = []

        for _, row in batch.iterrows():
            tasks.append(
                async_client.features.inspect(
                    [
                        {
                            "role": "user",
                            "content": f"Here is some relevant knowledge: {row['Knowledge']}\n\nQuestion: {row['Question']}",
                        },
                        # No trailing "]" on hallucinated answers: it would act
                        # as an artificial class marker.
                        {"role": "assistant", "content": row[answer_column]},
                    ],
                    model=variant,
                    features=features_to_use_for_classification,
                )
            )

        # Wait for the whole batch (one progress bar per batch).
        batch_results = await tqdm_asyncio.gather(*tasks)
        for context in batch_results:
            samples.append(context.top(k=k))

    return samples

async def process_all_classes(dataset, start=0, end=None):
    """Collect feature activations for both answer classes on ``dataset[start:end]``.

    The ground-truth (positive) class is inspected first, then the hallucinated
    (negative) class, both against the notebook-level ``features_to_look_at``.

    Returns:
        Tuple ``(non_hallucinated_activations, hallucinated_activations)``.
    """
    selected_rows = dataset[start:end]

    activations_per_class = []
    for positive in (True, False):
        class_activations = await _get_feature_acts_for_sample_class(
            selected_rows, features_to_look_at, k=100, is_positive_class=positive
        )
        activations_per_class.append(class_activations)

    truthful_activations, hallucinated_activations = activations_per_class
    return truthful_activations, hallucinated_activations


non_hallucinated_class_features, hallucinated_class_features = await process_all_classes(train_df,start=FEATURE_COMPUTE_SIZE , end=FEATURE_COMPUTE_SIZE+CLASSIFIER_FULL_SET_SIZE)

100%|██████████| 10/10 [00:09<00:00,  1.11it/s]
100%|██████████| 10/10 [00:09<00:00,  1.01it/s]
100%|██████████| 10/10 [00:08<00:00,  1.16it/s]
100%|██████████| 10/10 [00:08<00:00,  1.23it/s]
100%|██████████| 10/10 [00:10<00:00,  1.03s/it]
100%|██████████| 10/10 [00:16<00:00,  1.64s/it]
100%|██████████| 10/10 [00:08<00:00,  1.15it/s]
100%|██████████| 10/10 [00:09<00:00,  1.04it/s]
100%|██████████| 10/10 [00:07<00:00,  1.36it/s]
100%|██████████| 10/10 [00:08<00:00,  1.14it/s]
100%|██████████| 10/10 [00:08<00:00,  1.22it/s]
100%|██████████| 10/10 [00:11<00:00,  1.18s/it]
100%|██████████| 10/10 [00:11<00:00,  1.12s/it]
100%|██████████| 10/10 [00:10<00:00,  1.02s/it]
100%|██████████| 10/10 [00:10<00:00,  1.09s/it]
100%|██████████| 10/10 [00:08<00:00,  1.16it/s]
100%|██████████| 10/10 [00:09<00:00,  1.10it/s]
100%|██████████| 10/10 [00:07<00:00,  1.37it/s]
100%|██████████| 10/10 [00:08<00:00,  1.19it/s]
100%|██████████| 10/10 [00:07<00:00,  1.27it/s]
100%|██████████| 10/10 [00:09<00:00,  1.

In [None]:
TEST_CLASSIFIER_FULL_SET_SIZE =50

non_hallucinated_test_class_features, hallucinated_test_class_features = await process_all_classes(test_df,start=0 , end=TEST_CLASSIFIER_FULL_SET_SIZE)

In [50]:
from itertools import combinations


class FeatureSearch:
    """Enumerates fixed-size subsets of a feature group for exhaustive evaluation."""

    def __init__(self, feature_group):
        # Any iterable of features; kept as given.
        self.feature_group = feature_group

    def grid(self, k_features_per_combo: int = 2):
        """Return every combination of ``k_features_per_combo`` features.

        Args:
            k_features_per_combo (int): Size of each combination.

        Returns:
            list: Tuples of features, one per k-sized combination, in the order
            produced by ``itertools.combinations``.
        """
        subsets = combinations(self.feature_group, k_features_per_combo)
        return [subset for subset in subsets]

In [None]:
import tqdm
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn import tree
# Now add a function to evaluate on test set
def evaluate_on_test_set(model, combo):
    """Score a fitted classifier on the held-out test activations.

    Reads the notebook-level ``hallucinated_test_class_features`` (label -1)
    and ``non_hallucinated_test_class_features`` (label +1).

    Args:
        model: Fitted estimator exposing ``predict``.
        combo: Features the model was trained on, in training column order.

    Returns:
        dict with 'accuracy', 'f1_score', 'balanced_accuracy', 'precision',
        'recall' (F1/precision/recall are weighted averages) and the
        confusion-matrix counts 'tn', 'fp', 'fn', 'tp', where the positive
        class is the non-hallucinated label (+1).
    """
    def _select_feature_acts(combo, row):
        # One activation per feature in combo order. A feature absent from the
        # row is recorded as 0.0 so every vector keeps the same length and
        # column meaning instead of silently shrinking.
        return [
            next(
                (
                    feature_act.activation
                    for feature_act in row
                    if feature_act.feature.uuid == feature.uuid
                ),
                0.0,
            )
            for feature in combo
        ]

    # Get test features
    x_test_negative = [
        _select_feature_acts(combo, row) for row in hallucinated_test_class_features
    ]
    x_test_positive = [
        _select_feature_acts(combo, row) for row in non_hallucinated_test_class_features
    ]
    y_test_negative = [-1] * len(x_test_negative)
    y_test_positive = [1] * len(x_test_positive)

    X_test = x_test_negative + x_test_positive
    y_test = y_test_negative + y_test_positive

    # Make predictions
    test_preds = model.predict(X_test)

    # Calculate basic metrics
    test_accuracy = accuracy_score(y_test, test_preds)
    test_f1 = f1_score(y_test, test_preds, average='weighted')
    test_balanced_acc = balanced_accuracy_score(y_test, test_preds)
    test_precision = precision_score(y_test, test_preds, average='weighted')
    test_recall = recall_score(y_test, test_preds, average='weighted')

    # Pin the label order so the matrix is always 2x2 and the unpacking below
    # cannot fail or change meaning when a class is absent.
    tn, fp, fn, tp = confusion_matrix(y_test, test_preds, labels=[-1, 1]).ravel()

    return {
        'accuracy': test_accuracy,
        'f1_score': test_f1,
        'balanced_accuracy': test_balanced_acc,
        'precision': test_precision,
        'recall': test_recall,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp
    }

# Update the find_best_combo function to use train/test split properly
def find_best_combo(features, k_features_per_combo=2):
    """Grid-search feature combinations with a decision tree.

    For every combination of ``k_features_per_combo`` features, a
    ``DecisionTreeClassifier`` is fitted on the notebook-level training
    activations (``hallucinated_class_features`` labelled -1,
    ``non_hallucinated_class_features`` labelled +1) and scored with
    ``evaluate_on_test_set``.

    NOTE(review): the winning combination is chosen by its test-set F1, so the
    returned score is optimistically biased. An unbiased estimate would need a
    separate validation split for the selection.

    Args:
        features: Feature group to draw combinations from.
        k_features_per_combo: Number of features per combination.

    Returns:
        Tuple ``(best_combo, best_score, best_model)``. If no combination
        scores above 0 this is ``(None, 0, None)``.
    """
    def _select_feature_acts(combo, row):
        # One activation per feature in combo order; a feature absent from the
        # row becomes 0.0 so all vectors stay aligned and equally long.
        return [
            next(
                (
                    feature_act.activation
                    for feature_act in row
                    if feature_act.feature.uuid == feature.uuid
                ),
                0.0,
            )
            for feature in combo
        ]

    combos = FeatureSearch(features).grid(k_features_per_combo=k_features_per_combo)

    # Labels and the leaf-size limit do not depend on the combination.
    y_train = [-1] * len(hallucinated_class_features) + [1] * len(
        non_hallucinated_class_features
    )
    # min_samples_leaf must be >= 1; guard tiny training sets where // 10 is 0.
    min_samples_leaf = max(1, len(y_train) // 10)

    best_combo = None
    best_model = None
    best_score = 0
    best_test_metrics = None

    for combo in tqdm.tqdm(combos):
        # Negative (hallucinated) rows first, matching the order of y_train.
        X_train = [
            _select_feature_acts(combo, row) for row in hallucinated_class_features
        ] + [
            _select_feature_acts(combo, row) for row in non_hallucinated_class_features
        ]

        model = tree.DecisionTreeClassifier(
            max_depth=len(combo),
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
        model.fit(X_train, y_train)

        # Evaluate on test set and keep the combination with the best F1.
        test_metrics = evaluate_on_test_set(model, combo)
        test_f1 = test_metrics['f1_score']

        if test_f1 > best_score:
            best_score = test_f1
            best_combo = combo
            best_model = model
            best_test_metrics = test_metrics

    print(f"Best combo test metrics: {best_test_metrics}")
    return best_combo, best_score, best_model



# Run the decision-tree search for combinations of 1, 2 and 3 features and
# remember the winner for each size.
best_combo_at_k = {}
for i in range(3):
    combo_size = i + 1
    best_combo, best_score, best_model = find_best_combo(
        features_to_look_at, k_features_per_combo=combo_size
    )
    print(combo_size, best_combo, best_score, best_model)
    best_combo_at_k[combo_size] = (best_combo, best_score, best_model)

In [None]:
# ... existing code ...
import tqdm
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import numpy as np

# The evaluate_on_test_set function can remain the same
# ... existing code ...

# Update the find_best_combo function to use RandomForest instead of Decision Tree
def find_best_combo_random_forest(features, k_features_per_combo=2):
    """Grid-search feature combinations with a Random Forest.

    For every combination of ``k_features_per_combo`` features, a
    ``RandomForestClassifier`` (100 trees, unlimited depth) is fitted on the
    notebook-level training activations (``hallucinated_class_features``
    labelled -1, ``non_hallucinated_class_features`` labelled +1) and scored
    with ``evaluate_on_test_set``.

    NOTE(review): the winning combination is chosen by its test-set F1, so the
    returned score is optimistically biased. An unbiased estimate would need a
    separate validation split for the selection.

    Args:
        features: Feature group to draw combinations from.
        k_features_per_combo: Number of features per combination.

    Returns:
        Tuple ``(best_combo, best_score, best_model)``. If no combination
        scores above 0 this is ``(None, 0, None)``.
    """
    def _select_feature_acts(combo, row):
        # One activation per feature in combo order; a feature absent from the
        # row becomes 0.0 so all vectors stay aligned and equally long.
        return [
            next(
                (
                    feature_act.activation
                    for feature_act in row
                    if feature_act.feature.uuid == feature.uuid
                ),
                0.0,
            )
            for feature in combo
        ]

    combos = FeatureSearch(features).grid(k_features_per_combo=k_features_per_combo)

    # Labels and the leaf-size limit do not depend on the combination.
    y_train = [-1] * len(hallucinated_class_features) + [1] * len(
        non_hallucinated_class_features
    )
    # min_samples_leaf must be >= 1; guard tiny training sets where // 20 is 0.
    min_samples_leaf = max(1, len(y_train) // 20)

    best_combo = None
    best_model = None
    best_score = 0
    best_test_metrics = None

    for combo in tqdm.tqdm(combos):
        # Negative (hallucinated) rows first, matching the order of y_train.
        X_train = [
            _select_feature_acts(combo, row) for row in hallucinated_class_features
        ] + [
            _select_feature_acts(combo, row) for row in non_hallucinated_class_features
        ]

        model = RandomForestClassifier(
            n_estimators=100,
            max_depth=None,  # Allow trees to grow fully
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
        model.fit(X_train, y_train)

        # Evaluate on test set and keep the combination with the best F1.
        test_metrics = evaluate_on_test_set(model, combo)
        test_f1 = test_metrics['f1_score']

        if test_f1 > best_score:
            best_score = test_f1
            best_combo = combo
            best_model = model
            best_test_metrics = test_metrics

    print(f"Best combo test metrics: {best_test_metrics}")
    return best_combo, best_score, best_model

# Function to visualize feature importance
def plot_feature_importance(model, feature_combo):
    """Plot and print the fitted model's importance for each feature in the combo.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_combo: Features (objects with a ``label``) in the same order as
            the model's input columns.
    """
    labels = [feature.label for feature in feature_combo]
    scores = model.feature_importances_

    # Ascending order of importance.
    order = np.argsort(scores)
    ordered_labels = [labels[pos] for pos in order]
    bar_positions = range(len(order))

    plt.figure(figsize=(10, len(feature_combo) * 0.5))
    plt.title('Feature Importances')
    plt.barh(bar_positions, scores[order], color='b', align='center')
    plt.yticks(bar_positions, ordered_labels)
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    plt.show()

    # Same values as text, in the same ascending order as the bars.
    for pos in order:
        print(f"{labels[pos]}: {scores[pos]:.4f}")

# Now run the Random Forest search with different numbers of features
best_combo_at_k = {}
for i in range(3):
    print(f"\nRunning search with {i+1} features per combo...")
    # Use the Random Forest search defined in this cell, not the decision-tree
    # find_best_combo from the earlier cell.
    best_combo, best_score, best_model = find_best_combo_random_forest(
        features_to_look_at, k_features_per_combo=i + 1
    )
    best_combo_at_k[i + 1] = (best_combo, best_score, best_model)

    if best_combo is None:
        # The search returns (None, 0, None) when no combination scores above 0;
        # there is nothing to report or plot in that case.
        print(f"No combination of {i+1} features scored above 0")
        continue

    print(f"Best combo ({i+1} features): {[f.label for f in best_combo]}")
    print(f"Best test F1 score: {best_score}")

    # Plot feature importance for this model
    print("\nFeature importance:")
    plot_feature_importance(best_model, best_combo)
# ... existing code ...

In [None]:
def find_best_combo_rf(features, k_features_per_combo=3):
    """Find the best combination of features using Random Forest.

    Thin alias for ``find_best_combo_random_forest`` (defined in the previous
    cell), which runs the same search: same model settings, same training and
    test data, same selection by test F1. Delegating keeps a single
    implementation so the two cannot drift apart. Only the default
    combination size (3) differs.

    Args:
        features: Feature group to draw combinations from.
        k_features_per_combo: Number of features per combination.

    Returns:
        Tuple ``(best_combo, best_score, best_model)``.
    """
    return find_best_combo_random_forest(
        features, k_features_per_combo=k_features_per_combo
    )

# Repeat the search with the Random Forest variant for 1, 2 and 3 features.
best_combo_at_k = {}
for i in range(3):
    n_features = i + 1
    best_combo, best_score, best_model = find_best_combo_rf(
        features_to_look_at,
        k_features_per_combo=n_features,
    )
    print(n_features, best_combo, best_score, best_model)
    best_combo_at_k[n_features] = (best_combo, best_score, best_model)
