# Quick Evaluation with GPT-4o mini

Demonstrates how to evaluate an LLM on the Causal-Supreme dataset.

In [None]:
import json
import os
from openai import OpenAI

# The API key is taken from the OPENAI_API_KEY environment variable,
# so it never has to be written into the notebook itself.
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

## Load Test Samples

In [None]:
# Evaluation subset in JSON Lines format (one JSON object per line).
SAMPLES_PATH = '../data/splits/openai_subsets/p3sa_500.jsonl'
# Keep the demo small so the (paid) evaluation below stays cheap.
DEMO_SAMPLE_COUNT = 10

samples = []
# Be explicit about UTF-8 so the platform's default encoding cannot
# mis-decode non-ASCII characters in the prompts.
with open(SAMPLES_PATH, encoding='utf-8') as f:
    for line in f:
        if len(samples) >= DEMO_SAMPLE_COUNT:
            break  # stop reading early instead of parsing the whole file
        if line.strip():  # tolerate blank lines, e.g. a trailing newline
            samples.append(json.loads(line))

print(f"Loaded {len(samples)} samples for evaluation")

## Evaluate Model

In [None]:
SYSTEM_PROMPT = """You are an expert in causal reasoning. Answer the following question about causal relationships.
Respond with ONLY 'yes' or 'no'."""


def normalize_answer(text):
    """Reduce a raw model reply to a bare 'yes'/'no' label when possible.

    Args:
        text: The reply text, or None when the model returned no content.

    Returns:
        'yes' or 'no' if the first word of the reply is one of those
        (ignoring case and surrounding punctuation/quotes), the stripped,
        lower-cased reply otherwise, or '' for an empty/missing reply.
    """
    cleaned = (text or '').strip().lower()
    if not cleaned:
        return ''
    # Judge only the leading word so replies such as "Yes." or
    # "No, because ..." are not scored as wrong by exact string comparison.
    leading = cleaned.split()[0].strip('.,;:!?\'"*`()[]')
    return leading if leading in ('yes', 'no') else cleaned


def evaluate_sample(sample):
    """Ask the model one yes/no causal question and score its reply.

    Args:
        sample: A dict providing 'nl_prompt' (the question text),
            'binary_answer' (the expected answer string) and
            'query' -> 'rung'.

    Returns:
        A dict with keys 'rung', 'prediction' (normalized model reply),
        'ground_truth' (lower-cased expected answer) and 'correct' (bool).
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": sample['nl_prompt']},
        ],
        max_tokens=10,
        temperature=0,
    )

    # `content` may be None (no text returned); normalize_answer maps that
    # to '' so the sample is scored as incorrect instead of raising.
    answer = normalize_answer(response.choices[0].message.content)
    ground_truth = sample['binary_answer'].lower()

    return {
        'rung': sample['query']['rung'],
        'prediction': answer,
        'ground_truth': ground_truth,
        'correct': answer == ground_truth,
    }

In [None]:
# Run evaluation (uncomment to execute - costs API credits)
# results = [evaluate_sample(s) for s in samples]
# accuracy = sum(r['correct'] for r in results) / len(results)
# print(f"Accuracy: {accuracy:.1%}")

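## Example Output

The output below is illustrative. The cell above prints only the overall accuracy line; a per-rung breakdown requires grouping `results` by their `rung` field. With only 10 samples, accuracy can only be a multiple of 10%.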

```
Accuracy: 65.0%
Per-rung:
  Rung 1 (Association): 80%
  Rung 2 (Intervention): 60%
  Rung 3 (Counterfactual): 55%
```