# Setup

### Load libraries

In [30]:
import os
import json
import pandas as pd
import numpy as np
import jinja2
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
pd.reset_option("display.width")
pd.set_option('display.max_colwidth', 100)

### Load data

In [1]:
import os
import json
import pandas as pd

root_dirs = [
    "experiments/some_models_all_ex_defs_context"
]

data_list = []

for root_dir in root_dirs:
    for subdir, _, files in os.walk(root_dir):
        for file in files:
            if file.endswith('.json'):
                json_path = os.path.join(subdir, file)
                try:
                    with open(json_path, 'r') as f:
                        data = json.load(f)
                    
                    # Extract experiment name: first part after "experiments/"
                    experiment_name = os.path.normpath(subdir).split(os.sep)
                    experiment_idx = experiment_name.index("experiments") + 1
                    experiment = experiment_name[experiment_idx]
                    
                    # Extract timestamp: immediate parent folder of JSON file
                    timestamp = os.path.basename(subdir)
                    
                    # Add to data
                    data["experiment"] = experiment
                    data["timestamp"] = timestamp
                    
                    data_list.append(data)
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON: {json_path}")

# Create DataFrame
df = pd.DataFrame(data_list)

# Optional: filter out unwanted models
exclude_models = ['openai/o4-mini-high', 'openai/gpt-4o-mini']
df = df[~df['model'].isin(exclude_models)]

# Define a composite run identifier
df['full_run_id'] = df['experiment'] + ' | ' + df['timestamp'] + ' | ' + df['run_id']

len(df)


3103

# Case-by-case data analysis

### Inspect example data entry

In [44]:
pd.set_option('display.max_colwidth', 100)
df.iloc[0].to_frame()

Unnamed: 0,0
model,x-ai/grok-3-beta
exercise,11_not_open
prompt,"You are asked to write a proof in the syntax of Waterproof, a theorem proving educational softwa..."
tutorial,# Waterproof Tutorial\n\n## 1. We conclude that\n\n### Example:\n\nLemma example_reflexivity :\n...
full_input,"You are asked to write a proof in the syntax of Waterproof, a theorem proving educational softwa..."
exercise_content,"Lemma not_open : ¬ ([0,1) is _open_).\nProof.\nWe need to show that (¬ (∀ a, [0, 1) a ⇨ a is an ..."
output,"Proof.\nWe need to show that (¬ (∀ a, [0, 1) a ⇨ a is an _interior point_ of [0, 1))).\nWe need ..."
token_count,4010
input_tokens,3702
thinking_tokens,0


### See number of runs per model

In [2]:
# Count number of unique runs per model
runs_per_model = df[['model', 'full_run_id']].drop_duplicates().groupby('model').size().reset_index(name='num_runs')

# Optional: sort by num_runs descending
runs_per_model = runs_per_model.sort_values(by='num_runs', ascending=False)

# Display
runs_per_model

Unnamed: 0,model,num_runs
0,anthropic/claude-sonnet-4,27
1,google/gemini-2.5-flash-preview-05-20,27
2,google/gemini-2.5-flash-preview-05-20:thinking,27
3,openai/gpt-4.1,27
4,openai/o4-mini,27
5,x-ai/grok-3-beta,27
6,x-ai/grok-3-mini-beta,27


### Inspect runs per model

In [3]:
model_to_inspect = 'x-ai/grok-3-beta'

# Filter to the model of interest
df_model = df[df['model'] == model_to_inspect].copy()

# Get unique runs
unique_full_run_ids = df_model['full_run_id'].unique()

print(f"Number of unique runs for model '{model_to_inspect}': {len(unique_full_run_ids)}")

# Optionally show the run IDs for inspection
for i, run in enumerate(unique_full_run_ids):
    print(f"{i}: {run}")


Number of unique runs for model 'x-ai/grok-3-beta': 27
0: some_models_all_ex_defs_context | 2025-06-10_03-10-52 | x-ai/grok-3-beta::10_7_6::1
1: some_models_all_ex_defs_context | 2025-06-10_03-10-27 | x-ai/grok-3-beta::10_7_3::1
2: some_models_all_ex_defs_context | 2025-06-10_03-06-19 | x-ai/grok-3-beta::6_8_2::1
3: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::2_triangle_inequality_2::1
4: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::2_positive_2::1
5: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::2_positive_1::1
6: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::2_reflexive_1::1
7: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::6_8_1::1
8: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::2_triangle_inequality_1::1
9: some_models_all_ex_defs_context | 2025-06-10_00-12-06 | x-ai/grok-3-beta::2_non_degenerate_1::1
10: some_mod

### Inspect errors in a particular run

In [54]:
# Select the Nth run (e.g. first one)
run_index = 1 # change this to pick different run
selected_full_run_id = unique_full_run_ids[run_index]

# Filter rows corresponding to this run
df_run = df_model[df_model['full_run_id'] == selected_full_run_id].sort_values(by='attempt').copy()

# Clean up error message
df_run['error_message'] = df_run['errors'].apply(
    lambda x: x.split('\n', 1)[1] if isinstance(x, str) and '\n' in x else ''
)

# Show trace table
df_run[['model', 'exercise', 'attempt', 'error_message', 'line_with_error']]


Unnamed: 0,model,exercise,attempt,error_message,line_with_error
148,x-ai/grok-3-beta,4_9_1,1,,


### Display diff between proof revisions

In [36]:
from IPython.display import display, HTML
import difflib
import html

def show_proof_diff(df_run, attempt_number_1, attempt_number_2):
    # Extract proofs
    proof_1 = df_run[df_run['attempt'] == attempt_number_1]['output'].iloc[0]
    proof_2 = df_run[df_run['attempt'] == attempt_number_2]['output'].iloc[0]

    # Split into lines and compute diff
    proof_1_lines = proof_1.splitlines()
    proof_2_lines = proof_2.splitlines()

    diff = list(difflib.ndiff(proof_1_lines, proof_2_lines))

    # Process diff for left and right columns
    left_col = []
    right_col = []

    for line in diff:
        tag = line[:2]
        content = html.escape(line[2:])  # escape <, >, &, etc.

        if tag == '  ':
            left_col.append(f"<div>{content}</div>")
            right_col.append(f"<div>{content}</div>")
        elif tag == '- ':
            left_col.append(f"<div style='background-color:#441111;'>{content}</div>")
            right_col.append(f"<div style='background-color:#222222;'></div>")
        elif tag == '+ ':
            left_col.append(f"<div style='background-color:#222222;'></div>")
            right_col.append(f"<div style='background-color:#114411;'>{content}</div>")
        elif tag == '? ':
            # ignore helper lines like ^^^^^^ in diffs
            continue

    # Combine columns into HTML
    html_table = f"""
    <div style="display: flex; gap: 10px; font-family: monospace; background-color: #1e1e1e; color: white; padding: 10px;">
        <div style="flex: 1;">
            <div style="font-weight: bold; padding-bottom: 5px;">Attempt {attempt_number_1}</div>
            {''.join(left_col)}
        </div>
        <div style="flex: 1;">
            <div style="font-weight: bold; padding-bottom: 5px;">Attempt {attempt_number_2}</div>
            {''.join(right_col)}
        </div>
    </div>
    """
    display(HTML(html_table))


In [37]:
show_proof_diff(df_run, 1, 2)

### Total cost estimation

In [9]:
total_cost = df['cost'].sum()
total_cost

np.float64(24.586999)

# Aggregate data analysis

### Success rate per model

In [4]:
# Group by model and run, compute per-run success (1 if any attempt succeeded)
per_run_success = df.groupby(['model', 'full_run_id'])['success'].max().reset_index()

# Aggregate per model: success_rate, total_runs, successful_runs
agg_success_rate = per_run_success.groupby('model')['success'].agg(['mean', 'count', 'sum']).reset_index()

# Rename columns for clarity
agg_success_rate.rename(columns={
    'mean': 'success_rate',
    'count': 'total_runs',
    'sum': 'successful_runs'
}, inplace=True)

# Convert success_rate to % for display
agg_success_rate['success_rate'] = agg_success_rate['success_rate'] * 100

# Optional: round to 1 decimal place
agg_success_rate['success_rate'] = agg_success_rate['success_rate'].round(1)

# Reorder columns: successful_runs, total_runs, success_rate, sort by success_rate descending
agg_success_rate = agg_success_rate[['model', 'successful_runs', 'total_runs', 'success_rate']].sort_values(by='success_rate', ascending=False)

# Display the table
agg_success_rate

Unnamed: 0,model,successful_runs,total_runs,success_rate
0,anthropic/claude-sonnet-4,12,27,44.4
4,openai/o4-mini,10,27,37.0
3,openai/gpt-4.1,9,27,33.3
5,x-ai/grok-3-beta,7,27,25.9
2,google/gemini-2.5-flash-preview-05-20:thinking,7,27,25.9
1,google/gemini-2.5-flash-preview-05-20,6,27,22.2
6,x-ai/grok-3-mini-beta,5,27,18.5


### Success rate per exercise

In [5]:
# Group by exercise and run, compute per-run success
per_run_success_ex = df.groupby(['exercise', 'full_run_id'])['success'].max().reset_index()

# Aggregate per exercise
agg_success_rate_ex = per_run_success_ex.groupby('exercise')['success'].agg(['mean', 'count', 'sum']).reset_index()

# Rename columns for clarity
agg_success_rate_ex.rename(columns={
    'mean': 'success_rate',
    'count': 'total_runs',
    'sum': 'successful_runs'
}, inplace=True)

# Convert success_rate to % and round
agg_success_rate_ex['success_rate'] = agg_success_rate_ex['success_rate'] * 100
agg_success_rate_ex['success_rate'] = agg_success_rate_ex['success_rate'].round(1)

# Reorder columns: successful_runs, total_runs, success_rate, sort by success_rate descending
agg_success_rate_ex = agg_success_rate_ex[['exercise', 'successful_runs', 'total_runs', 'success_rate']].sort_values(by='success_rate', ascending=False)

# Display the table
agg_success_rate_ex


Unnamed: 0,exercise,successful_runs,total_runs,success_rate
8,2_non_degenerate_2,7,7,100.0
17,3_11_1,7,7,100.0
12,2_reflexive_2,7,7,100.0
2,10_7_intermediate,6,7,85.7
18,3_11_2,6,7,85.7
20,4_9_1,5,7,71.4
10,2_positive_2,5,7,71.4
21,4_9_2,3,7,42.9
19,3_11_4,3,7,42.9
9,2_positive_1,2,7,28.6


### Success rate per model and exercise

In [6]:
# Group by model, exercise, run — compute per-run success
per_run_success_grid = df.groupby(['model', 'exercise', 'full_run_id'])['success'].max().reset_index()

# Now pivot: model as rows, exercise as columns, value = mean per run (in %)
grid = per_run_success_grid.pivot_table(
    index='model',
    columns='exercise',
    values='success',
    aggfunc='mean'
) * 100

# Round nicely
grid = grid.round(1)

# Display the grid
grid


exercise,10_7_3,10_7_6,10_7_intermediate,11_not_closed,11_not_open,13_11_2,13_11_3,2_non_degenerate_1,2_non_degenerate_2,2_positive_1,...,3_11_1,3_11_2,3_11_4,4_9_1,4_9_2,4_9_3,5_9_1,5_9_2,6_8_1,6_8_2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
anthropic/claude-sonnet-4,0.0,0.0,100.0,0.0,0.0,0.0,100.0,0.0,100.0,100.0,...,100.0,100.0,100.0,100.0,0.0,100.0,0.0,0.0,100.0,0.0
google/gemini-2.5-flash-preview-05-20,0.0,0.0,100.0,0.0,0.0,100.0,0.0,0.0,100.0,0.0,...,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
google/gemini-2.5-flash-preview-05-20:thinking,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,100.0,100.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
openai/gpt-4.1,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,100.0,100.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0
openai/o4-mini,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,100.0,...,100.0,100.0,0.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0
x-ai/grok-3-beta,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,100.0,100.0,100.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
x-ai/grok-3-mini-beta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0,...,100.0,100.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0


### Average output counts per model

In [7]:
# Average output token counts per model
output_tokens_avg = df.groupby('model')['output_tokens'].mean().reset_index()
output_tokens_avg.rename(columns={'output_tokens': 'Avg Output Tokens'}, inplace=True)

# Average thinking token counts per model
thinking_tokens_avg = df.groupby('model')['thinking_tokens'].mean().reset_index()
thinking_tokens_avg.rename(columns={'thinking_tokens': 'Avg Thinking Tokens'}, inplace=True)

# Get the thinking_mode per model
thinking_mode_per_model = df[['model', 'thinking_mode']].drop_duplicates()

# Merge into the result
output_tokens_avg = output_tokens_avg.merge(thinking_mode_per_model, on='model')
output_tokens_avg = output_tokens_avg.merge(thinking_tokens_avg, on='model')

output_tokens_avg.sort_values("Avg Thinking Tokens")

Unnamed: 0,model,Avg Output Tokens,thinking_mode,Avg Thinking Tokens
0,anthropic/claude-sonnet-4,375.567568,False,0.0
1,google/gemini-2.5-flash-preview-05-20,397.655993,False,0.0
3,openai/gpt-4.1,348.038622,False,0.0
5,x-ai/grok-3-beta,272.413333,False,0.0
6,x-ai/grok-3-mini-beta,344.552632,True,5.671053
4,openai/o4-mini,322.145729,True,3174.623116
2,google/gemini-2.5-flash-preview-05-20:thinking,402.553488,True,6476.427907


### Average cost per model

In [8]:
# Define cost per attempt: group by model → mean cost
cost_per_attempt = df.groupby('model')['cost'].mean().reset_index()

# Rename column for clarity
cost_per_attempt.rename(columns={'cost': 'avg cost per attempt ($)'}, inplace=True)

# Optional: add company column
# cost_per_attempt['company'] = cost_per_attempt['model'].str.split('/').str[0]

# Display the table
cost_per_attempt


Unnamed: 0,model,avg cost per attempt ($)
0,anthropic/claude-sonnet-4,0.031504
1,google/gemini-2.5-flash-preview-05-20,0.003693
2,google/gemini-2.5-flash-preview-05-20:thinking,0.005301
3,openai/gpt-4.1,0.003345
4,openai/o4-mini,0.022931
5,x-ai/grok-3-beta,0.023582
6,x-ai/grok-3-mini-beta,0.002233


### (WIP) Success rates with different prompt lengths

In [10]:
# Success rate per model compared across a few levels of tutorial verbosity
df['tutorial_len'] = df['tutorial'].apply(lambda x: len(x) if isinstance(x, str) else 0)

tutorial_success = df.groupby(['model', 'tutorial_len'])['success'].mean().reset_index()
tutorial_success['success'] = (tutorial_success['success'] * 100).round(2)
tutorial_success.rename(columns={'success': 'Success Rate (%)', 'tutorial_len': 'Tutorial Length'}, inplace=True)

tutorial_success

Unnamed: 0,model,Tutorial Length,Success Rate (%)
0,anthropic/claude-sonnet-4,8665,6.49
1,google/gemini-2.5-flash-preview-05-20,8665,0.55
2,google/gemini-2.5-flash-preview-05-20:thinking,8665,3.26
3,openai/gpt-4.1,8665,0.94
4,openai/o4-mini,8665,5.03
5,x-ai/grok-3-beta,8665,3.11
6,x-ai/grok-3-mini-beta,8665,2.19


### (WIP) Success rates with different tutorial lengths

In [84]:
# Success rate per model compared across a few levels of prompt verbosity
df['prompt_len'] = df['prompt'].apply(lambda x: len(x) if isinstance(x, str) else 0)

prompt_success = df.groupby(['model', 'prompt_len'])['success'].mean().reset_index()
prompt_success['success'] = (prompt_success['success'] * 100).round(2)
prompt_success.rename(columns={'success': 'Success Rate (%)', 'prompt_len': 'Prompt Length'}, inplace=True)

prompt_success

Unnamed: 0,model,Prompt Length,Success Rate (%)
0,anthropic/claude-3.7-sonnet:thinking,1422,0.0
1,anthropic/claude-sonnet-4,1422,13.64
2,deepseek/deepseek-chat-v3-0324,1422,0.0
3,deepseek/deepseek-r1-0528,1422,0.0
4,google/gemini-2.5-flash-preview-05-20,1422,0.66
5,google/gemini-2.5-flash-preview-05-20:thinking,1422,6.9
6,openai/gpt-4.1,1422,0.65
7,openai/o3-mini,1422,0.0
8,openai/o4-mini,1422,5.26
9,x-ai/grok-3-mini-beta,1422,4.76


### Aggregate pass@k metrics for each model

In [11]:
# List of (full_run_id, model, exercise, sorted list of success flags per attempt)
run_attempts = df.sort_values(['full_run_id', 'attempt']).groupby('full_run_id').agg({
    'model': 'first',
    'exercise': 'first',
    'success': lambda x: list(x),  # get list of success flags
    'attempt': 'max'  # store max_attempts per run (for correct "not used k" logic)
}).reset_index()

# Determine global max k (max number of attempts used in any run)
max_k = df['max_attempts'].max()

# Compute pass@k columns
for k in range(1, max_k + 1):
    def pass_at_k(row):
        success_flags = row['success']
        used_attempts = len(success_flags)
        success_in_k = any(success_flags[:k])
        not_used_k = used_attempts < k
        return success_in_k or not_used_k
    
    run_attempts[f'pass@{k}'] = run_attempts.apply(pass_at_k, axis=1)

# Now compute per-model pass@k mean
passk_cols = [f'pass@{k}' for k in range(1, max_k + 1)]
passk_summary = run_attempts.groupby('model')[passk_cols].mean().reset_index()

# Convert to % and round
for col in passk_cols:
    passk_summary[col] = (passk_summary[col] * 100).round(2)

# Optional: rename columns to Pass@k (%) style
passk_summary.rename(columns={col: f'Pass@{col.split("@")[1]} (%)' for col in passk_cols}, inplace=True)

# Display
passk_summary


Unnamed: 0,model,Pass@1 (%),Pass@2 (%),Pass@3 (%),Pass@4 (%),Pass@5 (%),Pass@6 (%),Pass@7 (%),Pass@8 (%),Pass@9 (%),...,Pass@41 (%),Pass@42 (%),Pass@43 (%),Pass@44 (%),Pass@45 (%),Pass@46 (%),Pass@47 (%),Pass@48 (%),Pass@49 (%),Pass@50 (%)
0,anthropic/claude-sonnet-4,11.11,33.33,33.33,37.04,37.04,37.04,40.74,40.74,44.44,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
1,google/gemini-2.5-flash-preview-05-20,14.81,18.52,18.52,18.52,18.52,18.52,18.52,18.52,18.52,...,22.22,22.22,22.22,22.22,22.22,22.22,22.22,22.22,22.22,22.22
2,google/gemini-2.5-flash-preview-05-20:thinking,7.41,18.52,22.22,25.93,25.93,25.93,25.93,25.93,25.93,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
3,openai/gpt-4.1,11.11,11.11,18.52,18.52,22.22,22.22,22.22,22.22,22.22,...,33.33,33.33,33.33,33.33,33.33,33.33,33.33,33.33,33.33,33.33
4,openai/o4-mini,7.41,18.52,22.22,25.93,33.33,33.33,37.04,40.74,40.74,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
5,x-ai/grok-3-beta,7.41,11.11,18.52,18.52,22.22,22.22,22.22,22.22,22.22,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
6,x-ai/grok-3-mini-beta,11.11,14.81,18.52,18.52,18.52,18.52,18.52,18.52,18.52,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


# TODOs

In [24]:
# TODO:
# thinking vs no thinking models (all)
# thinking vs no thinking on models that support both (fair)
# definition expanding comparison