# Aesop Agent Statistics

Analysis of theorem-proving experiments across different models and strategies.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import defaultdict

# Notebook-wide plotting defaults: seaborn's "whitegrid" style plus a
# 12x6 default figure size (cells that pass figsize= explicitly override it).
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

: 

## Load Data

In [None]:
# Load theorem registry
# Paths are relative to the notebook's working directory.
registry_path = Path("../data/full_defs_new_aesop_registry.json")

# Decode the JSON as UTF-8 explicitly instead of relying on the platform's
# default encoding, which differs between machines and can fail on
# non-ASCII content.
with open(registry_path, 'r', encoding='utf-8') as f:
    registry_data = json.load(f)

# Load run logs
# Sorting by filename gives the runs a deterministic order; later cells
# number the runs in exactly this order.
logs_dir = Path("../logs")
run_files = sorted(logs_dir.glob("run_*.json"))

print(f"Found {len(run_files)} run logs")
print(f"Registry contains {len(registry_data['theorems'])} theorems")

## Theorem Composition Analysis

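Analyze the 124 theorems by category (the `@simp` count is taken from the source file independently of proof status, so it overlaps the other three categories):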
- Has @simp annotation
- Proven by naive aesop
- Proven by LLM-assisted aesop
- Not yet proven

In [None]:
# Load source file to check simp annotations
import re

# Read the Lean source as UTF-8 explicitly; the platform default encoding
# (for example cp1252 on Windows) can fail on non-ASCII characters.
source_file = Path("../data/full_defs_new.lean")
with open(source_file, 'r', encoding='utf-8') as f:
    source_content = f.read()

# Count theorems by category
# NOTE(review): the total is hard-coded rather than derived from the source
# file -- confirm it still matches the data set.
total_theorems = 124
theorems_in_registry = registry_data['theorems']

# Categorize
# Each registry entry must carry a 'method' key (a missing key raises
# KeyError). Entries whose method is neither 'naive' nor 'llm' are counted
# in neither bucket.
proof_methods = [thm_data['method'] for thm_data in theorems_in_registry.values()]
naive_success = proof_methods.count('naive')
llm_success = proof_methods.count('llm')

# Count @simp annotations in source
# Only the exact attribute text `@[simp]` is matched; combined attribute
# lists such as `@[simp, ...]` are not counted by this pattern.
simp_pattern = r'@\[simp\]'
has_simp = len(re.findall(simp_pattern, source_content))

# Every theorem that has no registry entry is treated as not yet proven.
not_proven = total_theorems - len(theorems_in_registry)

print(f"Total theorems: {total_theorems}")
print(f"Has @simp annotation: {has_simp}")
print(f"Proven by naive aesop: {naive_success}")
print(f"Proven by LLM-assisted: {llm_success}")
print(f"Not yet proven: {not_proven}")

In [None]:
# Create composition visualization
# Left panel: pie chart of proof status. Right panel: bar chart of the same
# counts plus the @simp annotation count. The figure is also saved as
# theorem_composition.png in the working directory.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Pie chart for theorem status
status_data = {
    'Naive Aesop': naive_success,
    'LLM-Assisted': llm_success,
    'Not Proven': not_proven
}

colors = ['#2ecc71', '#3498db', '#e74c3c']
# Hand matplotlib concrete lists: NumPy does not treat dict views as
# sequences, so passing .values()/.keys() directly is fragile.
ax1.pie(list(status_data.values()), labels=list(status_data.keys()), autopct='%1.1f%%',
        colors=colors, startangle=90)
# Take the total from total_theorems so the title cannot drift from the data.
ax1.set_title(f'Theorem Proving Status\n({total_theorems} theorems total)',
              fontsize=14, fontweight='bold')

# Bar chart with simp annotation overlay
categories = ['Has @simp\nAnnotation', 'Naive Aesop\nSuccess', 'LLM-Assisted\nSuccess', 'Not Yet\nProven']
counts = [has_simp, naive_success, llm_success, not_proven]
colors_bar = ['#9b59b6', '#2ecc71', '#3498db', '#e74c3c']

bars = ax2.bar(categories, counts, color=colors_bar, alpha=0.7, edgecolor='black')
ax2.set_ylabel('Number of Theorems', fontsize=12)
ax2.set_title('Theorem Composition', fontsize=14, fontweight='bold')
# 15% headroom so the count labels fit above the tallest bar; the floor of 1
# avoids a degenerate (0, 0) axis range when every count is zero.
ax2.set_ylim(0, max(max(counts), 1) * 1.15)

# Add count labels on bars
for bar in bars:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('theorem_composition.png', dpi=300, bbox_inches='tight')
plt.show()

## LLM Performance Over Runs

Track how many theorems each model proved across different runs.

In [None]:
# Parse run logs
# Build one row per run log. The column list is declared up front so the
# table keeps its expected columns even when no run logs were found;
# otherwise the empty frame has no columns and the selection in the final
# print raises KeyError.
run_columns = ['run_id', 'timestamp', 'model', 'naive_success',
               'llm_success', 'total_proven', 'failed']
run_data = []

for run_file in run_files:
    # Decode explicitly as UTF-8 rather than the platform default encoding.
    with open(run_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    stats = data['stats']
    run_data.append({
        'run_id': data['run_id'],
        'timestamp': data['timestamp'],
        'model': data['config']['model'],
        'naive_success': stats['naive_aesop_success'],
        'llm_success': stats['llm_aesop_success'],
        'total_proven': stats['total_aesop_success'],
        'failed': stats['aesop_failed']
    })

df_runs = pd.DataFrame(run_data, columns=run_columns)
# Runs are numbered 1..N in the order of run_files, not by the 'timestamp' field.
df_runs['run_number'] = range(1, len(df_runs) + 1)

print(df_runs[['run_number', 'model', 'naive_success', 'llm_success', 'total_proven']])

In [None]:
# Two side-by-side line charts with one line per model: LLM-assisted
# successes per run (left) and total theorems proven per run (right).
# The figure is also saved as llm_performance_over_runs.png.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Distinct model names from the run table
models = df_runs['model'].unique()
model_colors = {
    'Qwen/Qwen3-235B-A22B-Thinking-2507': '#e74c3c',
    'Qwen/Qwen3-Coder-480B-A35B-Instruct': '#3498db',
    'Qwen/Qwen3-Coder-30B-A3B-Instruct': '#2ecc71',
    'Qwen/Qwen3-30B-A3B-Thinking-2507': '#f39c12'
}

# One entry per panel: (axis, column plotted, marker, y-axis label, title).
# NOTE(review): the right-hand title says "Rate" but the panel plots raw counts.
panels = [
    (ax1, 'llm_success', 'o',
     'LLM-Assisted Theorems Proven', 'LLM Performance Across Runs'),
    (ax2, 'total_proven', 's',
     'Total Theorems Proven', 'Total Success Rate Across Runs'),
]

for axis, column, marker_shape, y_label, panel_title in panels:
    for model in models:
        runs_for_model = df_runs.loc[df_runs['model'] == model]
        # Drop the organisation prefix and cap the length for the legend
        legend_name = model.rsplit('/', 1)[-1][:30]

        axis.plot(
            runs_for_model['run_number'],
            runs_for_model[column],
            marker=marker_shape,
            linewidth=2,
            markersize=8,
            label=legend_name,
            # Models missing from model_colors all share one grey
            color=model_colors.get(model, '#95a5a6'),
        )

    axis.set_xlabel('Run Number', fontsize=12, fontweight='bold')
    axis.set_ylabel(y_label, fontsize=12, fontweight='bold')
    axis.set_title(panel_title, fontsize=14, fontweight='bold')
    axis.legend(loc='best', fontsize=9)
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('llm_performance_over_runs.png', dpi=300, bbox_inches='tight')
plt.show()

## Summary Statistics by Model

In [None]:
# Per-model summary of the run table: mean/max/min/std of LLM-assisted
# successes, mean/max/min of total proven, and the number of runs.
stat_spec = {
    'llm_success': ['mean', 'max', 'min', 'std'],
    'total_proven': ['mean', 'max', 'min'],
    'run_number': 'count',
}
model_stats = df_runs.groupby('model').agg(stat_spec).round(2)

# agg() returns two-level (column, statistic) labels; collapse each pair
# into a single "column_statistic" name.
flat_names = [f"{metric}_{stat}".strip() for metric, stat in model_stats.columns]
model_stats.columns = flat_names
model_stats = model_stats.rename(columns={'run_number_count': 'num_runs'})

print("\n=== Model Performance Summary ===")
print(model_stats)

In [None]:
# Grouped bar chart of the mean naive and LLM-assisted success counts per
# model. Skipped when the run table contains only one model. The figure is
# also saved as model_comparison.png.
if len(models) > 1:
    fig, ax = plt.subplots(figsize=(12, 6))

    model_avg = df_runs.groupby('model')[['naive_success', 'llm_success']].mean()
    # Tick labels: drop the organisation prefix and cap the length
    model_avg['model_short'] = [name.split('/')[-1][:35] for name in model_avg.index]

    x = range(len(model_avg))
    width = 0.35

    # (column, horizontal offset from the tick, legend label, colour)
    bar_series = [
        ('naive_success', -width/2, 'Naive Aesop', '#2ecc71'),
        ('llm_success', width/2, 'LLM-Assisted', '#3498db'),
    ]
    for column, offset, legend_label, bar_color in bar_series:
        positions = [tick + offset for tick in x]
        ax.bar(positions, model_avg[column],
               width, label=legend_label, color=bar_color, alpha=0.8)

    ax.set_xlabel('Model', fontsize=12, fontweight='bold')
    ax.set_ylabel('Average Theorems Proven', fontsize=12, fontweight='bold')
    ax.set_title('Average Performance by Model', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(model_avg['model_short'], rotation=15, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()