In [2]:
# baseline_evaluation_v1.ipynb

# Cell 1: Setup
from main import GSM8KRLTrainer
from tqdm import tqdm
import json
import os

print("Imports successful")

Imports successful


In [3]:
# Cell 2: Initialize
# Build the trainer from an explicit keyword set so the run configuration is
# visible in one place, then report how many examples each split holds.
trainer_kwargs = dict(
    model_name="Qwen/Qwen2.5-0.5B",
    reward_type='meta_only',
    data_dir='data',
)
trainer = GSM8KRLTrainer(**trainer_kwargs)

# Measure each split once and reuse the counts for the grand total.
train_count = len(trainer.train_dataset)
valid_count = len(trainer.valid_dataset)
test_count = len(trainer.test_dataset)

print(f"Train: {train_count} examples")
print(f"Valid: {valid_count} examples")
print(f"Test: {test_count} examples")
print(f"TOTAL: {train_count + valid_count + test_count} examples")

Loading Qwen/Qwen2.5-0.5B...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading train split from data/train.jsonl...
Loaded train split: 6352 examples
Loading valid split from data/valid.jsonl...
Loaded valid split: 1121 examples
Loading test split from data/test.jsonl...
Loaded test split: 1319 examples
Reward type: meta_only
Alpha (local): 0.3, Beta (meta): 0.7
Train: 6352 examples
Valid: 1121 examples
Test: 1319 examples
TOTAL: 8792 examples


In [4]:
# Cell 3: Setup output file
output_file = 'baseline_results_full.jsonl'
summary_file = 'baseline_summary.json'

# Make sure the results file exists. An existing file is left untouched
# (it is never cleared here), so the evaluation cells keep appending to it.
if os.path.exists(output_file):
    print(f"Will append to existing {output_file}")
else:
    # Opening in write mode and closing right away creates an empty file.
    with open(output_file, 'w'):
        pass
    print(f"Created {output_file}")

Will append to existing baseline_results_full.jsonl


In [10]:
# Cell 4: Evaluate Train Set (run multiple times for batches)
# Evaluates the train examples in [start_idx, end_idx) and appends one JSON
# record per example to output_file.
# NOTE: records are appended, so re-running this cell with the same start_idx
# writes duplicate records that the summary cell will count again.
dataset = trainer.train_dataset
split_name = 'train'
start_idx = 6000  # CHANGE THIS for each batch: 0, 1000, 2000, etc.
batch_size = 1000 #10 #1000

# Clamp the end so the final (short) batch stops at the end of the split.
end_idx = min(start_idx + batch_size, len(dataset))
eval_data = dataset.data[start_idx:end_idx]

print(f"\n{'='*80}")
print(f"Evaluating {split_name.upper()}: {start_idx} to {end_idx}")
print(f"{'='*80}\n")

correct = 0
total = 0

# Open the results file once for the whole batch instead of once per example;
# flushing after every record keeps partial progress if the run is interrupted.
with open(output_file, 'a') as f:
    for item in tqdm(eval_data, desc=f"{split_name} {start_idx}-{end_idx}"):
        question = item['question']
        gt_answer = dataset.extract_final_answer(item['answer'])

        # Generate
        response = trainer.generate_response(question)
        model_answer = trainer.reward_computer.extract_model_answer(response)

        # Check: an answer counts as correct only when both values are present
        # and numerically equal within a small tolerance. The gt_answer guard
        # avoids a TypeError should the extractor yield None (whether it can
        # is not visible from this notebook).
        is_correct = (
            model_answer is not None
            and gt_answer is not None
            and abs(model_answer - gt_answer) < 1e-6
        )
        if is_correct:
            correct += 1
        total += 1

        # Append to file immediately
        result = {
            'split': split_name,
            'question': question,
            'ground_truth_answer': gt_answer,
            'model_response': response,
            'extracted_answer': model_answer,
            'correct': 1 if is_correct else 0
        }
        f.write(json.dumps(result) + '\n')
        f.flush()

# An out-of-range start_idx yields an empty batch; avoid dividing by zero.
if total > 0:
    print(f"\nBatch accuracy: {correct}/{total} = {correct/total:.2%}")
else:
    print(f"\nNo examples in range {start_idx}-{end_idx}; nothing evaluated.")


Evaluating TRAIN: 6000 to 6352



train 6000-6352: 100%|██████████| 352/352 [1:19:39<00:00, 13.58s/it]


Batch accuracy: 156/352 = 44.32%





In [12]:
# Cell 5: Evaluate Valid Set (same pattern)
# Evaluates the validation examples in [start_idx, end_idx) and appends one
# JSON record per example to output_file.
# NOTE: records are appended, so re-running this cell with the same start_idx
# writes duplicate records that the summary cell will count again.
dataset = trainer.valid_dataset
split_name = 'valid'
start_idx = 1000  # CHANGE THIS for batches
batch_size = 1000

# Clamp the end so the final (short) batch stops at the end of the split.
end_idx = min(start_idx + batch_size, len(dataset))
eval_data = dataset.data[start_idx:end_idx]

print(f"\n{'='*80}")
print(f"Evaluating {split_name.upper()}: {start_idx} to {end_idx}")
print(f"{'='*80}\n")

correct = 0
total = 0

# Open the results file once for the whole batch instead of once per example;
# flushing after every record keeps partial progress if the run is interrupted.
with open(output_file, 'a') as f:
    for item in tqdm(eval_data, desc=f"{split_name} {start_idx}-{end_idx}"):
        question = item['question']
        gt_answer = dataset.extract_final_answer(item['answer'])

        response = trainer.generate_response(question)
        model_answer = trainer.reward_computer.extract_model_answer(response)

        # Correct only when both values are present and numerically equal
        # within a small tolerance. The gt_answer guard avoids a TypeError
        # should the extractor yield None (whether it can is not visible from
        # this notebook).
        is_correct = (
            model_answer is not None
            and gt_answer is not None
            and abs(model_answer - gt_answer) < 1e-6
        )
        if is_correct:
            correct += 1
        total += 1

        result = {
            'split': split_name,
            'question': question,
            'ground_truth_answer': gt_answer,
            'model_response': response,
            'extracted_answer': model_answer,
            'correct': 1 if is_correct else 0
        }
        f.write(json.dumps(result) + '\n')
        f.flush()

# An out-of-range start_idx yields an empty batch; avoid dividing by zero.
if total > 0:
    print(f"\nBatch accuracy: {correct}/{total} = {correct/total:.2%}")
else:
    print(f"\nNo examples in range {start_idx}-{end_idx}; nothing evaluated.")



Evaluating VALID: 1000 to 1121



valid 1000-1121: 100%|██████████| 121/121 [27:27<00:00, 13.62s/it]


Batch accuracy: 67/121 = 55.37%





In [14]:
# Cell 6: Evaluate Test Set (same pattern)
# Evaluates the test examples in [start_idx, end_idx) and appends one JSON
# record per example to output_file.
# NOTE: records are appended, so re-running this cell with the same start_idx
# writes duplicate records that the summary cell will count again.
dataset = trainer.test_dataset
split_name = 'test'
start_idx = 1000  # CHANGE THIS for batches
batch_size = 1000

# Clamp the end so the final (short) batch stops at the end of the split.
end_idx = min(start_idx + batch_size, len(dataset))
eval_data = dataset.data[start_idx:end_idx]

print(f"\n{'='*80}")
print(f"Evaluating {split_name.upper()}: {start_idx} to {end_idx}")
print(f"{'='*80}\n")

correct = 0
total = 0

# Open the results file once for the whole batch instead of once per example;
# flushing after every record keeps partial progress if the run is interrupted.
with open(output_file, 'a') as f:
    for item in tqdm(eval_data, desc=f"{split_name} {start_idx}-{end_idx}"):
        question = item['question']
        gt_answer = dataset.extract_final_answer(item['answer'])

        response = trainer.generate_response(question)
        model_answer = trainer.reward_computer.extract_model_answer(response)

        # Correct only when both values are present and numerically equal
        # within a small tolerance. The gt_answer guard avoids a TypeError
        # should the extractor yield None (whether it can is not visible from
        # this notebook).
        is_correct = (
            model_answer is not None
            and gt_answer is not None
            and abs(model_answer - gt_answer) < 1e-6
        )
        if is_correct:
            correct += 1
        total += 1

        result = {
            'split': split_name,
            'question': question,
            'ground_truth_answer': gt_answer,
            'model_response': response,
            'extracted_answer': model_answer,
            'correct': 1 if is_correct else 0
        }
        f.write(json.dumps(result) + '\n')
        f.flush()

# An out-of-range start_idx yields an empty batch; avoid dividing by zero.
if total > 0:
    print(f"\nBatch accuracy: {correct}/{total} = {correct/total:.2%}")
else:
    print(f"\nNo examples in range {start_idx}-{end_idx}; nothing evaluated.")


Evaluating TEST: 1000 to 1319



test 1000-1319: 100%|██████████| 319/319 [1:10:46<00:00, 13.31s/it]


Batch accuracy: 125/319 = 39.18%





In [5]:
# Cell 7: Calculate FULL dataset accuracy
# Re-reads every record in output_file, tallies correct/total per split and
# overall, prints the figures, and writes them to summary_file as JSON.
# NOTE: every line in the file is counted, so records duplicated by re-running
# an evaluation batch are counted more than once.
print("\n" + "="*80)
print("CALCULATING OVERALL ACCURACY FROM ALL RESULTS")
print("="*80 + "\n")


def _accuracy(n_correct, n_total):
    """Return n_correct / n_total, or 0.0 when n_total is zero."""
    return n_correct / n_total if n_total > 0 else 0.0


total_correct = 0
total_samples = 0
split_stats = {'train': {'correct': 0, 'total': 0},
               'valid': {'correct': 0, 'total': 0},
               'test': {'correct': 0, 'total': 0}}

with open(output_file, 'r') as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline) instead of letting
        # json.loads raise on them.
        if not line.strip():
            continue
        result = json.loads(line)
        split = result['split']

        split_stats[split]['total'] += 1
        split_stats[split]['correct'] += result['correct']

        total_correct += result['correct']
        total_samples += 1

# Overall accuracy
overall_accuracy = _accuracy(total_correct, total_samples)

# Labels are pre-padded so the three rows line up. _accuracy keeps a split
# with no results yet from raising ZeroDivisionError here.
for label, split_key in (("Train:  ", 'train'), ("Valid:  ", 'valid'), ("Test:   ", 'test')):
    stats = split_stats[split_key]
    print(f"{label}{stats['correct']}/{stats['total']} = {_accuracy(stats['correct'], stats['total']):.2%}")
print(f"\n{'='*80}")
print(f"OVERALL ACCURACY: {total_correct}/{total_samples} = {overall_accuracy:.2%}")
print(f"{'='*80}\n")

# Save summary
summary = {
    'overall': {
        'correct': total_correct,
        'total': total_samples,
        'accuracy': overall_accuracy
    },
    'by_split': {
        split_key: {
            'correct': split_stats[split_key]['correct'],
            'total': split_stats[split_key]['total'],
            'accuracy': _accuracy(split_stats[split_key]['correct'],
                                  split_stats[split_key]['total'])
        }
        for split_key in ('train', 'valid', 'test')
    }
}

with open(summary_file, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to {summary_file}")


CALCULATING OVERALL ACCURACY FROM ALL RESULTS

Train:  3010/6352 = 47.39%
Valid:  537/1121 = 47.90%
Test:   437/1319 = 33.13%

OVERALL ACCURACY: 3984/8792 = 45.31%

Summary saved to baseline_summary.json
