In [1]:
import yaml
import json
from pathlib import Path
from datetime import datetime

In [2]:
# Load test cases. The suite file is read with yaml.safe_load_all, i.e. as a
# stream of YAML documents. Open it as UTF-8 explicitly: the inputs contain
# Czech diacritics and open() otherwise uses the platform's default encoding,
# which is not UTF-8 everywhere (e.g. Windows).
with open('suites/basic/sentiment_tests.yaml', encoding='utf-8') as f:
    # Empty documents (e.g. after a trailing `---`) load as None; skip them so
    # later cells can index every entry as a dict.
    tests = [doc for doc in yaml.safe_load_all(f) if doc is not None]

# Load config (a single YAML document)
with open('configs/xlm_roberta_baseline.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print(f"Loaded {len(tests)} test cases")
print(f"Model: {config['model']}")
print(f"Provider: {config['provider']}")

Loaded 10 test cases
Model: bert-base-multilingual-cased
Provider: huggingface


In [3]:
import yaml
import json
from pathlib import Path

# Load test cases (all documents in the YAML stream).
# UTF-8 is requested explicitly because the test inputs contain Czech
# diacritics and open() would otherwise fall back to the platform default.
with open('suites/basic/sentiment_tests.yaml', encoding='utf-8') as f:
    # safe_load_all yields None for empty documents; drop those so that every
    # entry in `tests` is a usable test dict.
    tests = [doc for doc in yaml.safe_load_all(f) if doc is not None]

# Load config
with open('configs/xlm_roberta_baseline.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print(f"Loaded {len(tests)} test cases")
print(f"Model: {config['model']}")
print(f"Provider: {config['provider']}")

Loaded 10 test cases
Model: bert-base-multilingual-cased
Provider: huggingface


In [4]:
# Peek at the first test case; the trailing "\n" argument reproduces the two
# blank lines that separate it from the config dump.
print("=== First Test Case ===", tests[0], "\n", sep="\n")

# Dump every config entry as "key: value"
print("=== Config Details ===")
for key, value in config.items():
    print(key, value, sep=": ")

=== First Test Case ===
{'id': 'sentiment_001', 'task': 'Classify the sentiment of this Czech review as POSITIVE, NEGATIVE, or NEUTRAL.', 'input': 'Výborný produkt! Velmi spokojený s kvalitou a dodáním.', 'expected': 'POSITIVE', 'rubric': {'accuracy': {'weight': 0.8, 'rule': 'exact_match'}, 'confidence': {'weight': 0.2, 'rule': 'score_above_0.85'}}, 'tags': ['domain:sentiment', 'language:czech', 'difficulty:easy', 'risk_level:low']}


=== Config Details ===
name: bert-multilingual-baseline
description: BERT multilingual model - works with Czech/Slovak
provider: huggingface
model: bert-base-multilingual-cased
task_type: text-classification
num_labels: 3
temperature: 0.3
max_tokens: 50
batch_size: 8
seed: 42
fine_tuned_weights: None


In [5]:
# One line per test: 1-based position, test id and expected label
for i, test in enumerate(tests):
    print("Test {}: {} - {}".format(i + 1, test['id'], test['expected']))

Test 1: sentiment_001 - POSITIVE
Test 2: sentiment_002 - NEGATIVE
Test 3: sentiment_003 - NEUTRAL
Test 4: sentiment_004 - NEGATIVE
Test 5: sentiment_005 - POSITIVE
Test 6: sentiment_006 - POSITIVE
Test 7: sentiment_007 - NEGATIVE
Test 8: sentiment_008 - NEGATIVE
Test 9: sentiment_009 - POSITIVE
Test 10: sentiment_010 - NEGATIVE


In [6]:
class EvaluationRunner:
    """Runs tests against a model config and scores results.

    Each test is a dict with the keys ``id``, ``input``, ``expected`` and
    ``rubric``; the rubric maps a criterion name to a dict holding a
    ``weight`` and a ``rule``. Results accumulate in ``self.results`` across
    calls to ``run_test`` / ``run_suite``.
    """

    # Absolute slack used when comparing a score with the pass threshold.
    # Weighted float sums are inexact (0.7 + 0.1 == 0.7999999999999999), so a
    # bare `>=` would fail a test that nominally reaches the threshold.
    _SCORE_TOLERANCE = 1e-9

    def __init__(self, config, pass_threshold=0.8):
        """
        config: model configuration mapping (``summary`` reads ``config['model']``)
        pass_threshold: minimum weighted score for a test to count as passed
        """
        self.config = config
        self.pass_threshold = pass_threshold
        self.results = []

    def run_test(self, test, mock_output=None):
        """
        Run a single test case, record the result and return it.

        mock_output: For now, we mock the model output. When it is None the
        expected label is used, so the test trivially matches.

        Returns a dict with the keys id, input, expected, actual, score, passed.
        """
        if mock_output is None:
            # Placeholder: later this will call the actual model
            mock_output = test['expected']  # Mock: always returns expected

        # Score the test
        score = self.score_test(test, mock_output)

        result = {
            'id': test['id'],
            'input': test['input'],
            'expected': test['expected'],
            'actual': mock_output,
            'score': score,
            # Tolerant comparison: see _SCORE_TOLERANCE
            'passed': score >= self.pass_threshold - self._SCORE_TOLERANCE,
        }
        self.results.append(result)
        return result

    def score_test(self, test, actual_output):
        """
        Score an output against the test's rubric.

        Returns the sum of ``criterion_score * weight`` over all criteria.
        Only the 'exact_match' rule is evaluated; every other rule is a
        placeholder that currently always earns full credit.
        """
        total_score = 0

        for criterion in test['rubric'].values():
            if criterion['rule'] == 'exact_match':
                criterion_score = 1.0 if actual_output == test['expected'] else 0.0
            else:
                criterion_score = 1.0  # Placeholder for other rules

            total_score += criterion_score * criterion['weight']

        return total_score

    def run_suite(self, tests, mock_outputs=None):
        """
        Run all tests in a suite and return the accumulated results list.

        mock_outputs: optional sequence indexed in step with ``tests``; when it
        is None or empty every test falls back to its expected value.
        """
        for i, test in enumerate(tests):
            mock_output = mock_outputs[i] if mock_outputs else None
            self.run_test(test, mock_output)

        return self.results

    def summary(self):
        """Print totals, pass count, average score and one line per result."""
        total = len(self.results)
        passed = sum(1 for r in self.results if r['passed'])
        avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0

        print("\n=== Evaluation Summary ===")
        print(f"Model: {self.config['model']}")
        print(f"Tests run: {total}")
        print(f"Passed: {passed}/{total}")
        print(f"Average score: {avg_score:.2%}")
        print()

        for result in self.results:
            status = "✓" if result['passed'] else "✗"
            print(f"{status} {result['id']}: {result['actual']} (expected: {result['expected']}, score: {result['score']:.2f})")

In [7]:
# Build a runner bound to the loaded config
runner = EvaluationRunner(config=config)

# Execute every test; no mock outputs are supplied, so each test is scored
# against its own expected value
results = runner.run_suite(tests=tests)

# Report the outcome
runner.summary()


=== Evaluation Summary ===
Model: bert-base-multilingual-cased
Tests run: 10
Passed: 10/10
Average score: 100.00%

✓ sentiment_001: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_002: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_003: NEUTRAL (expected: NEUTRAL, score: 1.00)
✓ sentiment_004: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_005: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_006: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_007: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_008: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_009: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_010: NEGATIVE (expected: NEGATIVE, score: 1.00)


In [8]:
import yaml
import json
from pathlib import Path
from datetime import datetime

In [9]:
import os
import subprocess

# Show where the kernel is running, then list the suite directory via `ls`
print("Current working directory: {}".format(os.getcwd()))
print("\nFiles in suites/basic/:")
result = subprocess.run(
    ['ls', '-la', 'suites/basic/'],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,
)
# Captured stdout first, then captured stderr, each followed by a newline
print(result.stdout, result.stderr, sep="\n")

Current working directory: /Users/macski/claude/ai-evaluator-project

Files in suites/basic/:
total 32
drwxr-xr-x  5 macski  staff   160 12 Dec 18:57 .
drwxr-xr-x  4 macski  staff   128 12 Dec 16:34 ..
-rw-r--r--@ 1 macski  staff  6148 12 Dec 18:12 .DS_Store
drwxr-xr-x  3 macski  staff    96 12 Dec 18:57 .ipynb_checkpoints
-rw-------  1 macski  staff  4468 12 Dec 15:11 sentiment_tests.yaml




In [10]:
# Load test cases. Read as UTF-8 explicitly: the inputs contain Czech
# diacritics and open() otherwise uses the platform's default encoding.
with open('suites/basic/sentiment_tests.yaml', encoding='utf-8') as f:
    # Drop empty YAML documents, which safe_load_all yields as None
    tests = [doc for doc in yaml.safe_load_all(f) if doc is not None]

# Load config
with open('configs/xlm_roberta_baseline.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print(f"Loaded {len(tests)} test cases")

Loaded 10 test cases


In [11]:
# NOTE: this cell re-defines EvaluationRunner, which an earlier cell already
# defines; once this cell runs, this definition is the one the kernel uses.
class EvaluationRunner:
    """Runs tests against a model config and scores results.

    A test is a dict with ``id``, ``input``, ``expected`` and ``rubric`` keys;
    each rubric entry carries a ``weight`` and a ``rule``. Every executed test
    appends one result dict to ``self.results``.
    """

    # Slack for the pass/fail comparison. Summing weighted floats is inexact
    # (0.7 + 0.1 == 0.7999999999999999), so comparing with a bare `>=` would
    # fail a test whose score nominally equals the threshold.
    _SCORE_TOLERANCE = 1e-9

    def __init__(self, config, pass_threshold=0.8):
        """
        config: model configuration mapping (``summary`` reads ``config['model']``)
        pass_threshold: minimum weighted score for a test to count as passed
        """
        self.config = config
        self.pass_threshold = pass_threshold
        self.results = []

    def run_test(self, test, mock_output=None):
        """
        Score one test, append the result to ``self.results`` and return it.

        mock_output: stand-in for the model output; None means "use the
        expected label", which makes the test match trivially.
        """
        if mock_output is None:
            mock_output = test['expected']

        score = self.score_test(test, mock_output)

        result = {
            'id': test['id'],
            'input': test['input'],
            'expected': test['expected'],
            'actual': mock_output,
            'score': score,
            # Tolerant comparison: see _SCORE_TOLERANCE
            'passed': score >= self.pass_threshold - self._SCORE_TOLERANCE,
        }
        self.results.append(result)
        return result

    def score_test(self, test, actual_output):
        """
        Return the weighted rubric score for ``actual_output``.

        'exact_match' criteria earn 1.0 on equality with ``test['expected']``
        and 0.0 otherwise; any other rule currently always earns 1.0.
        """
        total_score = 0

        for criterion in test['rubric'].values():
            if criterion['rule'] == 'exact_match':
                criterion_score = 1.0 if actual_output == test['expected'] else 0.0
            else:
                criterion_score = 1.0

            total_score += criterion_score * criterion['weight']

        return total_score

    def run_suite(self, tests, mock_outputs=None):
        """
        Run every test and return the accumulated results list.

        mock_outputs: optional sequence indexed in step with ``tests``; None or
        an empty sequence makes every test use its expected value.
        """
        for i, test in enumerate(tests):
            mock_output = mock_outputs[i] if mock_outputs else None
            self.run_test(test, mock_output)

        return self.results

    def summary(self):
        """Print totals, pass count, average score and one line per result."""
        total = len(self.results)
        passed = sum(1 for r in self.results if r['passed'])
        avg_score = sum(r['score'] for r in self.results) / total if total > 0 else 0

        print("\n=== Evaluation Summary ===")
        print(f"Model: {self.config['model']}")
        print(f"Tests run: {total}")
        print(f"Passed: {passed}/{total}")
        print(f"Average score: {avg_score:.2%}")
        print()

        for result in self.results:
            status = "✓" if result['passed'] else "✗"
            print(f"{status} {result['id']}: {result['actual']} (expected: {result['expected']}, score: {result['score']:.2f})")

In [12]:
# Fresh runner for the reloaded config
runner = EvaluationRunner(config=config)

# Score the whole suite (no mock outputs supplied)
results = runner.run_suite(tests=tests)

# Show the report
runner.summary()


=== Evaluation Summary ===
Model: bert-base-multilingual-cased
Tests run: 10
Passed: 10/10
Average score: 100.00%

✓ sentiment_001: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_002: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_003: NEUTRAL (expected: NEUTRAL, score: 1.00)
✓ sentiment_004: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_005: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_006: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_007: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_008: NEGATIVE (expected: NEGATIVE, score: 1.00)
✓ sentiment_009: POSITIVE (expected: POSITIVE, score: 1.00)
✓ sentiment_010: NEGATIVE (expected: NEGATIVE, score: 1.00)


In [13]:
import json
from datetime import datetime
from pathlib import Path

def save_results(runner, suite_name, runs_dir="runs"):
    """Save evaluation results to a timestamped folder under ``runs_dir``.

    runner: object exposing ``config`` (a mapping with a 'name' key) and
        ``results`` (a list of result dicts with 'passed' and 'score' keys).
    suite_name: label stored under the 'suite' key of the output.
    runs_dir: parent directory for run folders; defaults to "runs".

    Writes ``<runs_dir>/<timestamp>_<config name>/results.json`` and returns
    the run folder as a Path.

    NOTE: the timestamp has one-second resolution and the folder is created
    with exist_ok=True, so two saves for the same config within one second
    write to the same results.json.
    """
    # Create run folder with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_id = f"{timestamp}_{runner.config['name']}"
    run_folder = Path(runs_dir) / run_id
    run_folder.mkdir(parents=True, exist_ok=True)

    results = runner.results
    total = len(results)

    # Save results as JSON
    results_json = {
        'run_id': run_id,
        'timestamp': timestamp,
        'config': runner.config,
        'suite': suite_name,
        'summary': {
            'total': total,
            'passed': sum(1 for r in results if r['passed']),
            'avg_score': sum(r['score'] for r in results) / total if results else 0
        },
        'results': results
    }

    # UTF-8 + ensure_ascii=False keeps non-ASCII inputs readable in the file
    # instead of \uXXXX escapes; default=str stringifies values json cannot
    # encode natively (e.g. dates that YAML parsed out of the config) rather
    # than aborting the save with a TypeError.
    with open(run_folder / 'results.json', 'w', encoding='utf-8') as f:
        json.dump(results_json, f, indent=2, ensure_ascii=False, default=str)

    print(f"✅ Results saved to: {run_folder}")
    return run_folder

# Try it out: persist the current runner's results under the 'basic' suite name
run_folder = save_results(runner=runner, suite_name='basic')

✅ Results saved to: runs/20251213_191427_bert-multilingual-baseline


In [14]:
# Re-read the model config. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's default encoding.
with open('configs/xlm_roberta_baseline.yaml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print("Config loaded")
print(f"Model: {config['model']}")
print(f"Provider: {config['provider']}")

Config loaded
Model: bert-base-multilingual-cased
Provider: huggingface


In [17]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load real model
model_name = config['model']  # 'bert-base-multilingual-cased'

# The checkpoint has no trained classification head: the loader reports
# classifier.weight / classifier.bias as newly initialized. Seed torch from the
# config before loading so that this random initialisation is intended to be
# the same on every run (NOTE(review): confirm with your transformers version).
if config.get('seed') is not None:
    torch.manual_seed(config['seed'])

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Take the number of labels from the config instead of hard-coding it; fall
# back to the previous value of 3 when the key is absent.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=config.get('num_labels', 3),
)

print(f"✅ Loaded model: {model_name}")
print(f"Model type: {type(model)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Loaded model: bert-base-multilingual-cased
Model type: <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'>


In [18]:
# Try the model on the first test case
test = tests[0]
input_text = test['input']

# Tokenize into PyTorch tensors
inputs = tokenizer(
    input_text,
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=128,
)

# Forward pass with gradient tracking disabled
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    prediction = logits.argmax(dim=1).item()

# Map the predicted class index to a label name
label_map = dict(enumerate(['POSITIVE', 'NEGATIVE', 'NEUTRAL']))
predicted_label = label_map[prediction]

# Report input, expected label, prediction and whether they agree
print(
    f"Input: {input_text}",
    f"Expected: {test['expected']}",
    f"Predicted: {predicted_label}",
    f"Correct: {predicted_label == test['expected']}",
    sep="\n",
)

Input: Výborný produkt! Velmi spokojený s kvalitou a dodáním.
Expected: POSITIVE
Predicted: NEGATIVE
Correct: False
