# Compare LLM Responses Across Models

A/B test prompts across multiple AI models side-by-side.


In [None]:
%pip install -qU pixeltable anthropic openai


In [None]:
import getpass
import os

# Prompt for each provider key only when the environment doesn't already supply it.
for env_name, prompt_text in (
    ('OPENAI_API_KEY', 'OpenAI API Key:'),
    ('ANTHROPIC_API_KEY', 'Anthropic API Key:'),
):
    if env_name in os.environ:
        continue
    os.environ[env_name] = getpass.getpass(prompt_text)


In [None]:
import pixeltable as pxt
from pixeltable.functions import openai, anthropic


In [None]:
# Step 1: Create comparison table
# One string column holds the prompt; model outputs are added as columns later.
prompt_schema = {'prompt': pxt.String}

# if_exists='ignore' on both calls lets this cell be re-run against an existing dir/table.
pxt.create_dir('compare', if_exists='ignore')
prompts = pxt.create_table(
    'compare.prompts',
    prompt_schema,
    if_exists='ignore',
)


In [None]:
# Step 2: Add responses from multiple models
# The same single-turn user message, built from the prompt column, goes to every model.
msgs = [{'role': 'user', 'content': prompts.prompt}]

# OpenAI: keep only the text of the first choice.
gpt4_reply = openai.chat_completions(messages=msgs, model='gpt-4o-mini')
gpt4_text = gpt4_reply.choices[0].message.content
prompts.add_computed_column(gpt4=gpt4_text, if_exists='ignore')

# Anthropic: keep only the text of the first content block (max_tokens=300).
claude_reply = anthropic.messages(
    messages=msgs,
    model='claude-3-haiku-20240307',
    max_tokens=300,
)
claude_text = claude_reply.content[0].text
prompts.add_computed_column(claude=claude_text, if_exists='ignore')


In [None]:
# Step 3: Insert a prompt so the gpt4 and claude columns get a response to compare
new_rows = [{'prompt': 'Explain quantum computing in one sentence'}]
prompts.insert(new_rows)


In [None]:
# View results side-by-side
# One row per prompt, with each model's answer in its own column.
comparison = prompts.select(prompts.prompt, prompts.gpt4, prompts.claude)
comparison.head()


**What's Happening:**
- Each inserted prompt is sent to every model
- Each model's response is stored in its own computed column
- Easy to compare quality/style/cost
- Add more models as columns

**Variation:** Add model scoring:
```python
@pxt.udf
def rate_response(text: str) -> float:
    """Score a response by its length: one point per 100 characters."""
    # Custom scoring logic
    return len(text) / 100  # Simple length score

# if_exists='ignore' matches the gpt4/claude columns, so re-running this doesn't fail
prompts.add_computed_column(if_exists='ignore', gpt4_score=rate_response(prompts.gpt4))
```

**Next:** `analyze-financial-data-with-llms.ipynb` • `classify-customer-support-tickets.ipynb`
