# Baseline Heuristic

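Simple heuristic baselines to compare model accuracy against:
- Random guessing (uniform choice between 'yes' and 'no', fixed seed)
- Majority class (always predict the most frequent label)
- Effect-size threshold (predict 'yes' when |effect_size| > 0.5)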

In [None]:
import json
import numpy as np
from collections import Counter

## Load Data

In [None]:
# Load the evaluation subset: one JSON object per line (JSONL).
# The explicit UTF-8 encoding avoids depending on the platform default, and
# blank lines (e.g. a stray trailing newline) are skipped instead of making
# json.loads raise.
with open('../data/splits/openai_subsets/p3sa_500.jsonl', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f if line.strip()]

print(f"Total samples: {len(samples)}")

## Baseline 1: Random Guessing

In [None]:
# Seed NumPy's global RNG so the random baseline is reproducible.
np.random.seed(42)

# Reference labels, reused by the later baselines.
ground_truth = [sample['binary_answer'] for sample in samples]

# One uniformly random 'yes'/'no' guess per sample.
random_preds = np.random.choice(['yes', 'no'], size=len(samples))

random_hits = [pred == truth for pred, truth in zip(random_preds, ground_truth)]
random_acc = np.mean(random_hits)
print(f"Random baseline accuracy: {random_acc:.1%}")

## Baseline 2: Majority Class

In [None]:
# Always predicting the most frequent label gives the majority baseline;
# its accuracy is that label's share of the samples.
label_counts = Counter(ground_truth)
majority_class, majority_count = label_counts.most_common(1)[0]

print(f"Label distribution: {dict(label_counts)}")
print(f"Majority class: '{majority_class}'")

majority_acc = majority_count / len(samples)
print(f"Majority baseline accuracy: {majority_acc:.1%}")

## Baseline 3: Effect Size Threshold

In [None]:
def predict_by_effect_size(sample: dict, threshold: float = 0.5) -> str:
    """Predict 'yes' if |effect_size| > threshold, otherwise 'no'.

    Args:
        sample: Mapping for one example; its optional 'effect_size' entry
            is read.
        threshold: Cutoff on the absolute effect size. The comparison is
            strict, so a value exactly equal to the threshold yields 'no'.

    Returns:
        'yes' when the absolute effect size exceeds ``threshold``, else 'no'.
        A missing or null 'effect_size' is treated as 0.
    """
    effect = sample.get('effect_size')
    if effect is None:
        # JSON null decodes to None; handle it like a missing key rather
        # than letting abs(None) raise TypeError.
        effect = 0
    return 'yes' if abs(effect) > threshold else 'no'

# Apply the threshold rule (default cutoff) to every sample and score it
# against the reference labels.
effect_preds = list(map(predict_by_effect_size, samples))
effect_hits = [pred == truth for pred, truth in zip(effect_preds, ground_truth)]
effect_acc = np.mean(effect_hits)

print(f"Effect-size threshold accuracy: {effect_acc:.1%}")

## Summary

In [None]:
# Assemble the side-by-side report of the three baseline accuracies and
# emit it in a single write.
summary_lines = [
    "\n=== Baseline Comparison ===",
    f"Random:       {random_acc:.1%}",
    f"Majority:     {majority_acc:.1%}",
    f"Effect-size:  {effect_acc:.1%}",
    "\nLLM models should significantly outperform these baselines.",
]
print("\n".join(summary_lines))