In [2]:
import yaml
import json
from pathlib import Path

# Input locations, relative to the notebook's working directory.
TESTS_PATH = Path('suites/basic/sentiment_tests.yaml')
CONFIG_PATH = Path('configs/xlm_roberta_baseline.yaml')

# Load test cases (all documents in the multi-document YAML stream).
# The encoding is explicit because the suite contains non-ASCII (Czech) text;
# without it, open() falls back to the platform locale encoding.
# Empty documents (e.g. after a trailing '---') load as None and are skipped
# so that every entry in `tests` is a real test case.
with open(TESTS_PATH, encoding='utf-8') as f:
    tests = [doc for doc in yaml.safe_load_all(f) if doc is not None]

# Load the (single-document) model config.
with open(CONFIG_PATH, encoding='utf-8') as f:
    config = yaml.safe_load(f)

print(f"Loaded {len(tests)} test cases")
print(f"Model: {config['model']}")
print(f"Provider: {config['provider']}")

Loaded 10 test cases
Model: xlm-roberta-base
Provider: huggingface


In [3]:
# Peek at the first loaded test case to see the record layout.
first_case = tests[0]
print("=== First Test Case ===")
print(first_case)
print("\n")

# Dump every config entry, one "key: value" pair per line.
print("=== Config Details ===")
for setting, setting_value in config.items():
    print(f"{setting}: {setting_value}")

=== First Test Case ===
{'id': 'sentiment_001', 'task': 'Classify the sentiment of this Czech review as POSITIVE, NEGATIVE, or NEUTRAL.', 'input': 'Výborný produkt! Velmi spokojený s kvalitou a dodáním.', 'expected': 'POSITIVE', 'rubric': {'accuracy': {'weight': 0.8, 'rule': 'exact_match'}, 'confidence': {'weight': 0.2, 'rule': 'score_above_0.85'}}, 'tags': ['domain:sentiment', 'language:czech', 'difficulty:easy', 'risk_level:low']}


=== Config Details ===
name: xlm-roberta-baseline
description: XLM-RoBERTa base model - no fine-tuning
provider: huggingface
model: xlm-roberta-base
task_type: text-classification
num_labels: 3
temperature: 0.3
max_tokens: 50
batch_size: 8
seed: 42
fine_tuned_weights: None


In [4]:
# One line per test case: 1-based position, test id, and expected label.
for position, case in enumerate(tests, start=1):
    print(f"Test {position}: {case['id']} - {case['expected']}")

Test 1: sentiment_001 - POSITIVE
Test 2: sentiment_002 - NEGATIVE
Test 3: sentiment_003 - NEUTRAL
Test 4: sentiment_004 - NEGATIVE
Test 5: sentiment_005 - POSITIVE
Test 6: sentiment_006 - POSITIVE
Test 7: sentiment_007 - NEGATIVE
Test 8: sentiment_008 - NEGATIVE
Test 9: sentiment_009 - POSITIVE
Test 10: sentiment_010 - NEGATIVE


In [6]:
class EvaluationRunner:
    """Run test cases against a model config and score the results.

    Results accumulate in ``self.results`` across calls to ``run_test`` and
    ``run_suite``; create a new runner to start from an empty result list.
    """

    # Minimum weighted score for a test to count as passed, unless overridden.
    DEFAULT_PASS_THRESHOLD = 0.8
    # Slack applied to the pass comparison. Weighted sums of floats can land
    # just below the threshold (0.7 + 0.1 == 0.7999999999999999), which would
    # otherwise fail a test whose nominal score equals the threshold.
    SCORE_TOLERANCE = 1e-9

    def __init__(self, config, pass_threshold=DEFAULT_PASS_THRESHOLD):
        """
        config: mapping of model settings; ``summary`` reads ``config['model']``.
        pass_threshold: minimum score for a result to be marked as passed.
        """
        self.config = config
        self.pass_threshold = pass_threshold
        self.results = []

    def run_test(self, test, mock_output=None):
        """
        Run a single test case, record the result, and return it.

        test: dict with 'id', 'input', 'expected' and 'rubric' keys.
        mock_output: stand-in for the model output. When None, the test's
            expected value is used, so the test always matches.

        Returns the result dict (also appended to ``self.results``) with keys
        'id', 'input', 'expected', 'actual', 'score' and 'passed'.
        """
        if mock_output is None:
            # Placeholder: later this will call the actual model
            mock_output = test['expected']  # Mock: always returns expected

        # Score the test
        score = self.score_test(test, mock_output)

        result = {
            'id': test['id'],
            'input': test['input'],
            'expected': test['expected'],
            'actual': mock_output,
            'score': score,
            # Tolerant comparison: see SCORE_TOLERANCE.
            'passed': score >= self.pass_threshold - self.SCORE_TOLERANCE,
        }
        self.results.append(result)
        return result

    def score_test(self, test, actual_output):
        """
        Score an output against the test's rubric.

        Each rubric criterion contributes ``criterion_score * weight``. Only
        the 'exact_match' rule is evaluated (1.0 when actual_output equals
        test['expected'], else 0.0); every other rule is a placeholder that
        currently awards full credit.

        Returns the weighted sum over all criteria (0 for an empty rubric).
        """
        rubric = test['rubric']
        total_score = 0

        for criterion_name, criterion in rubric.items():
            weight = criterion['weight']
            rule = criterion['rule']

            # For now: exact_match rule
            if rule == 'exact_match':
                criterion_score = 1.0 if actual_output == test['expected'] else 0.0
            else:
                criterion_score = 1.0  # Placeholder for other rules

            total_score += criterion_score * weight

        return total_score

    def run_suite(self, tests, mock_outputs=None):
        """
        Run all tests in a suite and return the accumulated results list.

        tests: iterable of test dicts (see ``run_test``).
        mock_outputs: optional sequence of outputs, indexed by test position.
            When empty or None, every test uses the default mock.

        Raises IndexError if mock_outputs has fewer entries than tests. The
        check happens before any test runs, so a failed call leaves
        ``self.results`` untouched instead of partially filled.
        """
        # Materialize so generators can be measured and then iterated.
        tests = list(tests)

        if mock_outputs and len(mock_outputs) < len(tests):
            raise IndexError(
                f"mock_outputs has {len(mock_outputs)} entries "
                f"but the suite has {len(tests)} tests"
            )

        for i, test in enumerate(tests):
            mock_output = mock_outputs[i] if mock_outputs else None
            self.run_test(test, mock_output)

        return self.results

    def summary(self):
        """Print the model name, pass count, average score, and one line per result."""
        total = len(self.results)
        passed = sum(1 for r in self.results if r['passed'])
        # Guard against division by zero when no tests have been run.
        avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0

        print("\n=== Evaluation Summary ===")
        print(f"Model: {self.config['model']}")
        print(f"Tests run: {total}")
        print(f"Passed: {passed}/{total}")
        print(f"Average score: {avg_score:.2%}")
        print()

        for result in self.results:
            status = "✓" if result['passed'] else "✗"
            print(f"{status} {result['id']}: {result['actual']} (expected: {result['expected']}, score: {result['score']:.2f})")

In [7]:
# Build a runner bound to the loaded model config.
runner = EvaluationRunner(config=config)

# Execute the whole suite; with no mock outputs supplied, each test
# falls back to the runner's built-in mock.
results = runner.run_suite(tests=tests, mock_outputs=None)

# Report aggregate and per-test outcomes.
runner.summary()


=== Evaluation Summary ===
Model: xlm-roberta-base
Tests run: 10
Passed: 10/10
Average score: 100.00%

✓ sentiment_001: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_002: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_003: NEUTRAL (expected: NEUTRAL, score: 1.00)
✓ sentiment_004: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_005: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_006: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_007: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_008: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_009: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_010: NEGATIVE (expected: NEGATIVE, score: 1.00)
