# 04: Evaluation - Measuring Performance

Now let's measure how well our BS detector works.

In [3]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

from modules.m1_baseline import check_claim
from modules.m3_langgraph import check_claim_with_graph
from config.llm_factory import LLMFactory
import json

# Load the labelled test dataset.
# The encoding is stated explicitly: JSON text is UTF-8, and relying on the
# platform default can fail on systems whose default encoding is not UTF-8.
with open('../data/aviation_claims_dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print(f"📊 Test Dataset: {len(dataset['claims'])} aviation claims")

📊 Test Dataset: 30 aviation claims


## Simple Accuracy Test

Let's test our detectors on some claims:

In [4]:
# Pick out the claims labelled 'easy'
easy_claims = list(
    filter(lambda entry: entry['difficulty'] == 'easy', dataset['claims'])
)

# LLM client that evaluate_detector passes to detectors needing one
llm = LLMFactory.create_llm()

def evaluate_detector(detector_func, claims, name, needs_llm=False):
    """Score a detector against labelled claims, printing one mark per claim.

    Args:
        detector_func: Callable that receives the claim text (and, when
            ``needs_llm`` is true, the notebook-level ``llm`` as a second
            argument) and returns a mapping with a ``'verdict'`` entry.
        claims: Sized sequence of dicts carrying ``'claim'`` and ``'verdict'``.
        name: Label used in the printed summary line.
        needs_llm: Whether ``detector_func`` must be handed the global ``llm``.

    Returns:
        Accuracy as a percentage (0-100). Returns 0.0 when ``claims`` is
        empty instead of dividing by zero.
    """
    total = len(claims)
    if total == 0:
        # Nothing to score: report it rather than raising ZeroDivisionError.
        print(f"\n{name} Accuracy: n/a (no claims to evaluate)")
        return 0.0

    correct = 0
    for claim_data in claims:
        if needs_llm:
            result = detector_func(claim_data['claim'], llm)
        else:
            result = detector_func(claim_data['claim'])
        if result['verdict'] == claim_data['verdict']:
            correct += 1
            print("✅", end="")
        else:
            print("❌", end="")

    accuracy = correct / total * 100
    print(f"\n{name} Accuracy: {accuracy:.1f}%")
    return accuracy

# Score both detectors on the same first four easy claims
print("Testing Baseline detector:")
baseline_acc = evaluate_detector(
    check_claim,
    easy_claims[:4],
    "Baseline",
    needs_llm=True,
)

print("\nTesting LangGraph detector:")
langgraph_acc = evaluate_detector(
    check_claim_with_graph,
    easy_claims[:4],
    "LangGraph",
)

Testing Baseline detector:
✅✅✅❌
Baseline Accuracy: 75.0%

Testing LangGraph detector:
✅✅✅❌
LangGraph Accuracy: 75.0%


## Test by Difficulty Level

In [5]:
# Group claims by difficulty
by_difficulty = {'easy': [], 'medium': [], 'hard': []}
for claim in dataset['claims']:
    # setdefault tolerates a difficulty label outside the three expected ones
    by_difficulty.setdefault(claim['difficulty'], []).append(claim)

# Only a small sample per level is scored; every claim costs a detector call.
SAMPLE_PER_LEVEL = 3

# Test on each difficulty
print("📈 Performance by Difficulty:\n")
for level, claims in by_difficulty.items():
    sample = claims[:SAMPLE_PER_LEVEL]
    if not sample:
        # An empty group has no accuracy to report
        print(f"{level.upper()}: no claims, skipping\n")
        continue
    # Report the number actually tested, not just the size of the group
    print(f"{level.upper()} (testing {len(sample)} of {len(claims)} claims):")
    acc = evaluate_detector(check_claim_with_graph, sample, level)
    print()

📈 Performance by Difficulty:

EASY (4 claims):
✅✅✅
easy Accuracy: 100.0%

MEDIUM (11 claims):
✅✅✅
medium Accuracy: 100.0%

HARD (15 claims):
✅✅✅
hard Accuracy: 100.0%



## Production Evaluation (No Ground Truth)

In production, we don't have labels. Let's evaluate using quality metrics:

In [6]:
from pydantic import BaseModel

class QualityMetrics(BaseModel):
    """Metrics we can measure without ground truth.

    Bundles the label-free quality signals computed for a single
    detector response.
    """
    # Number of characters in the detector's reasoning text
    reasoning_length: int
    # False when the reported confidence conflicts with the wording of the reasoning
    confidence_matches_language: bool
    # Elapsed time measured for the response, in seconds
    response_time: float

def evaluate_quality(claim: str, llm=None) -> QualityMetrics:
    """Compute label-free quality metrics for the baseline detector on one claim.

    Args:
        claim: The claim text to send to ``check_claim``.
        llm: Optional LLM client to reuse. When omitted, a new one is created
            with ``LLMFactory.create_llm()``.

    Returns:
        QualityMetrics with the reasoning length in characters, whether the
        reported confidence agrees with the wording of the reasoning, and the
        time spent inside the detector call in seconds.
    """
    import re
    import time

    # Build the client before starting the clock so response_time measures
    # only the detector call, not client construction.
    if llm is None:
        llm = LLMFactory.create_llm()

    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = check_claim(claim, llm)
    duration = time.perf_counter() - start

    # Check if confidence matches the language
    high_confidence_words = ['definitely', 'certainly', 'clearly']
    low_confidence_words = ['possibly', 'might', 'could be']

    # `or` guards against an explicit None as well as a missing key
    reasoning = (result.get('reasoning') or '').lower()
    # NOTE(review): the 80/60 thresholds assume confidence is on a 0-100 scale; confirm against check_claim
    confidence = result.get('confidence') or 0

    def mentions_any(phrases):
        # Whole-word match so e.g. 'might' does not fire on 'almighty'
        return any(
            re.search(rf"\b{re.escape(phrase)}\b", reasoning)
            for phrase in phrases
        )

    matches = True
    if confidence > 80 and mentions_any(low_confidence_words):
        matches = False
    if confidence < 60 and mentions_any(high_confidence_words):
        matches = False

    return QualityMetrics(
        reasoning_length=len(reasoning),
        confidence_matches_language=matches,
        response_time=duration
    )

# Run the label-free quality check on a single sample claim
test_claim = "The Boeing 747 can fly backwards using reverse thrust"
metrics = evaluate_quality(test_claim)

# Assemble the report lines first, then emit them in one go
language_mark = '✅' if metrics.confidence_matches_language else '❌'
report_lines = [
    f"📊 Quality Metrics for: '{test_claim}'",
    f"  Reasoning length: {metrics.reasoning_length} chars",
    f"  Confidence matches language: {language_mark}",
    f"  Response time: {metrics.response_time:.2f}s",
]
print("\n".join(report_lines))

📊 Quality Metrics for: 'The Boeing 747 can fly backwards using reverse thrust'
  Reasoning length: 282 chars
  Confidence matches language: ✅
  Response time: 3.13s


## Key Takeaways

1. **Test with labeled data** during development
2. **Monitor quality metrics** in production
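3. **Performance may drop** on harder claims — the 3-claim samples above are too small to show it, so run the full set per difficulty level to measure the gap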
4. **LangGraph's retry logic** helps with errors

Next: Let's add tools to improve accuracy on hard claims!