# LateBench: Adversarial Error Generation Examples

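This notebook demonstrates how to use the LateBench system to generate adversarial examples with late reasoning errors. The error-injection cells call the OpenAI API and may take a moment to run; the setup cell loads environment variables from `../.env`.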

In [None]:
# Setup
# Make the project's source directory importable. The path is relative, so
# this assumes the kernel's working directory is the notebook's own folder
# — TODO confirm when launching Jupyter from elsewhere.
import sys
sys.path.append('../src')

import os
import json
# Project-local modules; these imports rely on the sys.path entry added above,
# so they must stay below the sys.path.append call.
from data_loader import NuminaMathDataLoader
from error_injector import AdversarialErrorInjector
from error_types import MATH_ERROR_TAXONOMY
from visualization import VISUALIZER

# Load environment variables
# NOTE(review): presumably this provides the API credentials needed by the
# error injector — confirm against error_injector.
from dotenv import load_dotenv
load_dotenv('../.env')

print("Setup complete!")

## 1. Load and Explore Dataset

In [None]:
# Initialize data loader
loader = NuminaMathDataLoader(cache_dir="../data")

# Load dataset (will use cached version if available)
dataset = loader.download_dataset()
# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("Dataset loaded successfully!")

In [None]:
# Get sample examples for experimentation
sample_examples = loader.get_sample_examples(n=5, min_steps=8)

print(f"Got {len(sample_examples)} sample examples")
print(f"Step counts: {[ex['num_steps'] for ex in sample_examples]}")

# Show first example (skipped when the sample list is empty)
if sample_examples:
    example = sample_examples[0]

    # Preview at most 200 characters of the problem statement, and only add
    # an ellipsis when the text was actually cut short.
    problem_text = example.get('problem', 'No problem')
    preview = problem_text[:200] + ('...' if len(problem_text) > 200 else '')

    print("\nFirst example:")
    print(f"Problem: {preview}")
    print(f"Steps: {example['num_steps']}")
    print(f"Answer: {example.get('answer', 'No answer')}")

## 2. Explore Error Taxonomy

In [None]:
# Explore available error types: first the statistics reported by the taxonomy
error_stats = MATH_ERROR_TAXONOMY.get_error_statistics()
print("Error Taxonomy Statistics:")
for key, value in error_stats.items():
    print(f"  {key}: {value}")

# Then one line per error name with its description.
# Plain string literal here: nothing is interpolated, so no f-prefix is needed.
print("\nAvailable error types:")
for name in MATH_ERROR_TAXONOMY.get_all_error_names():
    error_type = MATH_ERROR_TAXONOMY.get_error_by_name(name)
    print(f"  - {name}: {error_type.description}")

## 3. Generate Single Adversarial Example

In [None]:
# Create the injector that the later cells reuse
injector = AdversarialErrorInjector()
print("Error injector initialized")

# Choose the first sampled problem as the working example.
# With no samples, test_problem is intentionally left unbound: the following
# cell tests for its existence before using it.
if not sample_examples:
    print("No sample examples available")
else:
    test_problem = sample_examples[0]
    print(f"Using problem with {test_problem['num_steps']} steps")

In [None]:
# Inject one error into the selected problem (this will call OpenAI API)
if 'test_problem' not in locals():
    # The previous cell did not bind a problem to work on.
    print("No test problem available")
else:
    print("Injecting error... (this may take a moment)")

    # Ask for a specific error type through error_type_preference
    result = injector.inject_error(
        test_problem,
        error_type_preference="invalid_generalization",
    )

    # Report the outcome; failure is handled first, success details after.
    if not result.success:
        print(f"✗ Error injection failed: {result.error_message}")
    else:
        print("✓ Error injection successful!")
        print(f"Error type: {result.error_analysis.get('error_type', 'Unknown')}")
        print(f"Error step: {result.error_analysis.get('selected_error_step', 'Unknown')}")

## 4. Visualize the Result

In [None]:
# Create text visualization (only when the injection cell produced a
# successful result)
if 'result' in locals() and result.success:
    visualization = VISUALIZER.create_example_visualization(result)
    print(visualization[:2000])  # Show first 2000 characters

    # Save full visualization.
    # The encoding is set explicitly: without it, open() uses the platform
    # default (e.g. cp1252 on Windows), which can raise UnicodeEncodeError if
    # the text contains non-ASCII characters.
    with open('../data/example_visualization.md', 'w', encoding='utf-8') as f:
        f.write(visualization)
    print("\nFull visualization saved to ../data/example_visualization.md")
else:
    print("No successful result to visualize")

## 5. Generate Multiple Examples

In [None]:
# Generate a small batch for demonstration.
# Single source of truth for the demo batch size: it drives the availability
# check, the slice passed to the injector, and both status messages.
DEMO_BATCH_SIZE = 3

if len(sample_examples) >= DEMO_BATCH_SIZE:
    print(f"Generating batch of {DEMO_BATCH_SIZE} examples...")

    # Use different error types for variety
    error_distribution = {
        "invalid_generalization": 0.4,
        "theorem_misapplication": 0.3,
        "circular_reasoning": 0.3,
    }

    batch_results = injector.batch_inject_errors(
        sample_examples[:DEMO_BATCH_SIZE],
        error_distribution=error_distribution,
        save_checkpoints=True,
    )

    # Plain string literal: nothing is interpolated, so no f-prefix is needed.
    print("\nBatch complete!")
    successful = sum(1 for r in batch_results if r.success)
    print(f"Success rate: {successful}/{len(batch_results)}")

    # Save results
    injector.save_results(batch_results, '../data/demo_batch_results.json')

else:
    print(f"Need at least {DEMO_BATCH_SIZE} sample examples for batch demo")

## 6. Analysis and Visualization

In [None]:
# Analyze batch results if available
if 'batch_results' not in locals():
    # The batch cell did not bind any results.
    print("No batch results available for analysis")
else:
    # Quality metrics, printed as indented JSON
    metrics = VISUALIZER.create_quality_metrics_report(batch_results)

    print("Quality Metrics:")
    print(json.dumps(metrics, indent=2))

    # Statistical plots: a failure here is reported and the cell carries on
    # to the HTML report instead of stopping.
    try:
        fig = VISUALIZER.create_batch_statistics_plot(
            batch_results,
            save_path='../data/demo_statistics.png',
        )
        print("\nStatistics plot created")
    except Exception as e:
        print(f"Error creating plot: {e}")

    # HTML report
    VISUALIZER.save_html_report(batch_results, '../data/demo_report.html', max_examples=3)
    print("HTML report generated")

## 7. Next Steps

This notebook demonstrated:
1. Loading the NuminaMath dataset
2. Exploring the error taxonomy
3. Generating adversarial examples with late reasoning errors
4. Visualizing and analyzing the results

For production use:
- Use `loader.filter_long_solutions()` to get all suitable problems
- Run `injector.batch_inject_errors()` on larger datasets
- Experiment with different error type distributions
- Use the generated examples to test reasoning critics and process reward models