In [2]:
import os
import json
import pandas as pd
import numpy as np
import jinja2
# Restore pandas' default row / column / width display limits, then allow
# wider cell text than pandas' default of 50 characters.
for _display_option in ("display.max_rows", "display.max_columns", "display.width"):
    pd.reset_option(_display_option)
pd.set_option("display.max_colwidth", 100)

In [33]:
# Load every JSON result file found (recursively) under `root_dirs` into one
# DataFrame `df`, one row per file, and drop the models that are excluded
# from the analysis.

# Alternative directory set used for the cost evaluation:
#root_dirs = [
#    "experiments/1_cost_assessment_per_model_no_defs/results/2025-06-08_20-28-18",
#    "experiments/1_cost_assessment_per_model_no_defs/results/2025-06-08_21-04-01",
#    "experiments/1_cost_assessment_per_model/results/2025-06-08_20-13-16",
#    "experiments/1_cost_assessment_per_model/results/2025-06-08_20-58-45"
#]

root_dirs = [
    "experiments/5_tvn_less_models_defs_context",
    "experiments/4_tvn_defs_no_context"
]

# One parsed JSON document per result file
data_list = []

for root_dir in root_dirs:
    # os.walk() yields nothing for a missing directory, so a mistyped path
    # would otherwise just produce fewer rows without any hint.
    if not os.path.isdir(root_dir):
        print(f"Directory not found, skipping: {root_dir}")
        continue

    for subdir, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # row order of `df` does not depend on the filesystem.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            json_path = os.path.join(subdir, file)
            # JSON is UTF-8 by specification; the platform default encoding
            # may be something else and would garble or reject non-ASCII text.
            with open(json_path, 'r', encoding='utf-8') as f:
                try:
                    data = json.load(f)
                except (json.JSONDecodeError, UnicodeDecodeError):
                    print(f"Failed to decode JSON: {json_path}")
                    continue
            data_list.append(data)

# Convert to a DataFrame
df = pd.DataFrame(data_list)

# Models left out of every analysis below
exclude_models = ['openai/o4-mini-high', 'openai/gpt-4o-mini']

# Drop rows of excluded models; .copy() gives an independent frame so later
# cells can add columns to `df` without writing into a slice.
df = df[~df['model'].isin(exclude_models)].copy()

# Number of rows that remain
len(df)


483

In [4]:
# Model whose attempts are listed below
model_to_inspect = 'anthropic/claude-sonnet-4'  # replace with your model

# Rows of that model only, ordered by run and then by attempt number
is_selected_model = df['model'] == model_to_inspect
df_model = df[is_selected_model].sort_values(by=['run_id', 'attempt']).copy()


def _drop_first_line(errors):
    """Return the text after the first newline of `errors`.

    Yields '' when `errors` is not a string or contains no newline.
    """
    if not isinstance(errors, str):
        return ''
    # partition() leaves the tail empty when there is no newline
    return errors.partition('\n')[2]


df_model['error_message'] = df_model['errors'].apply(_drop_first_line)

# Per attempt: the trimmed error text and the proof line it refers to
df_model[['attempt', 'error_message', 'line_with_error']]

Unnamed: 0,attempt,error_message,line_with_error
52,1,Error: Wrong assumption specified.\n\n,"Assume that (∀ a, [0, 1) a ⇨ ∃ r > 0, ∀ x ∈ B(a, r), x ∈ [0, 1))."
41,2,Error: Wrong assumption specified.\n\n,"Assume that ([0,1) is _open_)."
54,3,"Error: Syntax error: ',' or 'in' '(' expected (in [ltac2_expr]).\n\n",Use a := (0) in this statement.
61,4,"Error: You cannot do this right now, follow the advice in the goal window.\n\n","* Indeed, ([0, 1) 0)."
60,5,"Error: Could not verify that ((B(0, r)) (- r / 2)).\n\n","It holds that (B(0, r) (-r/2))."
28,6,Error: Syntax error: [term level 200] expected after '(' (in [term]).\n\n,It holds that (|-r/2 - 0| < r).
40,7,,


In [24]:
# Attempt whose generated proof is printed
attempt_number = 5

# Rows of the inspected model with that attempt number. If several runs
# match, the first one in df_model's row order is printed; .iloc[0] raises
# IndexError when no row matches.
matches_attempt = df_model['attempt'] == attempt_number
df_proof = df_model.loc[matches_attempt].copy()
proof = df_proof['output'].iloc[0]

print(proof)

Proof.
We need to show that (¬ ([0,1) is _open_)).
We argue by contradiction.
Assume that (¬ (¬ ([0,1) is _open_))).
It holds that ([0,1) is _open_).
It holds that (∀ a, [0, 1) a ⇨ a is an _interior point_ of [0, 1)) (i).
By (i) it holds that ([0, 1) 0 ⇨ 0 is an _interior point_ of [0, 1)).
It holds that ([0, 1) 0).
It holds that (0 is an _interior point_ of [0, 1)).
It holds that (∃ r > 0, ∀ x ∈ B(0, r), x ∈ [0, 1)).
Obtain such an r.
It holds that (∀ x ∈ B(0, r), x ∈ [0, 1)) (ii).
By (ii) it holds that (B(0, r) (-r/2) ⇨ (-r/2) ∈ [0, 1)).
It holds that (B(0, r) (-r/2)).
It holds that ((-r/2) ∈ [0, 1)).
It holds that (-r/2 ≥ 0).
It holds that (-r/2 < 0).
Contradiction.
Qed.


In [29]:
# Sum of the `cost` column over every loaded row
total_cost = df.loc[:, 'cost'].sum()
total_cost

np.float64(3.8525817499999997)

In [None]:
# Success rate per model: mean of the `success` column over all rows of the
# model, expressed as a percentage with two decimals.
success_by_model = df.groupby('model')['success'].mean()
summary = (
    (success_by_model * 100)
    .round(2)
    .rename('Success Rate (%)')
    .reset_index()
)

# Render the table as LaTeX source and print it for copy-pasting
latex_table = summary.to_latex(
    index=False,
    column_format="ll",
    caption="Success rate per model",
    label="tab:success_rate",
)
print(latex_table)

In [None]:
# Success rate per exercise: mean of the `success` column over all rows of
# the exercise (every row counts equally, regardless of model or run),
# expressed as a percentage with two decimals.
summary = df.groupby('exercise')['success'].mean().reset_index()
summary['success'] = (summary['success'] * 100).round(2)  # Convert to percentage
summary.rename(columns={'success': 'Success Rate (%)'}, inplace=True)

# Export to LaTeX. The label is specific to this table: reusing
# "tab:success_rate" from the per-model table would define the same label
# twice when both tables go into one document.
latex_table = summary.to_latex(
    index=False,
    column_format="ll",
    caption="Success rate per exercise",
    label="tab:success_rate_per_exercise",
)
print(latex_table)

In [31]:
# Model x exercise grid of success rates in percent (needs several runs of
# every model to be meaningful). Combinations without any row show as NaN.
mean_success = df.pivot_table(
    index='model',
    columns='exercise',
    values='success',
    aggfunc='mean',
)
grid = (mean_success * 100).round(2)
grid

exercise,11_not_open,3_11_2,4_9_1
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
anthropic/claude-sonnet-4,5.88,100.0,50.0
google/gemini-2.5-flash-preview-05-20,0.0,50.0,0.0
google/gemini-2.5-flash-preview-05-20:thinking,0.0,50.0,66.67
openai/gpt-4.1,0.0,25.0,0.0
openai/o3-mini,0.0,0.0,0.0
openai/o4-mini,0.0,100.0,10.0
x-ai/grok-3-beta,0.0,,
x-ai/grok-3-mini-beta,0.0,100.0,8.33


In [26]:
# Mean output / thinking token counts per model, next to the model's
# thinking_mode flag.
tokens_by_model = df.groupby('model')

# Average thinking token counts per model
# NOTE(review): the saved output shows a negative average for one model, so
# `thinking_tokens` may contain negative placeholder values — confirm.
thinking_tokens_avg = (
    tokens_by_model['thinking_tokens']
    .mean()
    .reset_index()
    .rename(columns={'thinking_tokens': 'Avg Thinking Tokens'})
)

# Distinct (model, thinking_mode) pairs; a model that occurs with more than
# one thinking_mode value contributes one row per value.
thinking_mode_per_model = df[['model', 'thinking_mode']].drop_duplicates()

# Average output token counts per model, then attach the two other tables
output_tokens_avg = (
    tokens_by_model['output_tokens']
    .mean()
    .reset_index()
    .rename(columns={'output_tokens': 'Avg Output Tokens'})
    .merge(thinking_mode_per_model, on='model')
    .merge(thinking_tokens_avg, on='model')
)

output_tokens_avg

Unnamed: 0,model,Avg Output Tokens,thinking_mode,Avg Thinking Tokens
0,anthropic/claude-sonnet-4,370.857143,False,0.0
1,google/gemini-2.5-flash-preview-05-20:thinking,424.8,True,6018.9
2,openai/o4-mini,273.7,True,6786.6
3,x-ai/grok-3-beta,262.0,False,0.0
4,x-ai/grok-3-mini-beta,246.6,True,-0.3


In [32]:
# Mean `cost` per row for each model; `cost_avg` itself stays in model
# order, only the displayed copy is sorted from cheapest to most expensive.
cost_avg = (
    df.groupby('model')['cost']
    .mean()
    .reset_index()
    .rename(columns={'cost': 'Avg Cost ($)'})
)
cost_avg.sort_values("Avg Cost ($)")

Unnamed: 0,model,Avg Cost ($)
7,x-ai/grok-3-mini-beta,0.002066
3,openai/gpt-4.1,0.003471
1,google/gemini-2.5-flash-preview-05-20,0.003739
2,google/gemini-2.5-flash-preview-05-20:thinking,0.004954
4,openai/o3-mini,0.016139
5,openai/o4-mini,0.026379
6,x-ai/grok-3-beta,0.028501
0,anthropic/claude-sonnet-4,0.029369


In [89]:
# Company = the part of the model id before the first '/'
cost_avg['company'] = cost_avg['model'].str.split('/').str[0]

# For each company with at least two models: how many times more expensive
# its costliest model is than its cheapest one (by average cost).
ratios = []

for vendor, vendor_models in cost_avg.groupby('company'):
    # A ratio needs two models; companies with a single model are skipped
    if len(vendor_models) < 2:
        continue

    by_cost = vendor_models.sort_values('Avg Cost ($)')
    cheapest, priciest = by_cost.iloc[0], by_cost.iloc[-1]

    ratios.append({
        'Company': vendor,
        'Model Pair': f"{priciest['model']} vs {cheapest['model']}",
        'Cost Ratio': priciest['Avg Cost ($)'] / cheapest['Avg Cost ($)'],
    })

# One row per company that had a pair to compare
ratios_df = pd.DataFrame(ratios)
ratios_df


Unnamed: 0,Company,Model Pair,Cost Ratio
0,anthropic,anthropic/claude-3.7-sonnet:thinking vs anthropic/claude-sonnet-4,6.525463
1,deepseek,deepseek/deepseek-r1-0528 vs deepseek/deepseek-chat-v3-0324,2.290332
2,google,google/gemini-2.5-flash-preview-05-20:thinking vs google/gemini-2.5-flash-preview-05-20,1.325095
3,openai,openai/o4-mini vs openai/gpt-4.1,7.498808


In [66]:
# Mean cost for every (model, attempt number) pair. `thinking_mode` is
# averaged as well, which is why it shows up as a float in the result.
attempt_means = df.groupby(['model', 'attempt'])[['cost', 'thinking_mode']].mean()
cost_avg_attempt = attempt_means.reset_index().rename(columns={'cost': 'Avg Cost ($)'})

# Show non-thinking models first, then by model and attempt
cost_avg_attempt.sort_values(['thinking_mode', 'model', 'attempt'])

Unnamed: 0,model,attempt,Avg Cost ($),thinking_mode
2,anthropic/claude-sonnet-4,1,0.023674,0.0
3,anthropic/claude-sonnet-4,2,0.025472,0.0
4,deepseek/deepseek-chat-v3-0324,1,0.001475,0.0
5,deepseek/deepseek-chat-v3-0324,2,0.001724,0.0
8,google/gemini-2.5-flash-preview-05-20,1,0.001005,0.0
9,google/gemini-2.5-flash-preview-05-20,2,0.001116,0.0
12,openai/gpt-4.1,1,0.000862,0.0
13,openai/gpt-4.1,2,0.000985,0.0
0,anthropic/claude-3.7-sonnet:thinking,1,0.209818,1.0
1,anthropic/claude-3.7-sonnet:thinking,2,0.09328,1.0


In [90]:
# Per-model summary for a single exercise: how many runs succeeded and how
# many attempts were used out of the attempts that were available.
exercise_id = '3_11_4'
df_3114 = df[df['exercise'] == exercise_id]

# Each (model, run_id) pair is one run
runs = df_3114.groupby(['model', 'run_id'])

# Total runs per model = number of unique run_ids per model
total_runs = df_3114.groupby('model')['run_id'].nunique()

# A run counts as successful if any of its rows has success == True
success_per_run = runs['success'].max().reset_index()
successful_runs = success_per_run.groupby('model')['success'].sum()

# Attempts used by a run = highest attempt number recorded for it
attempts_per_run = runs['attempt'].max().reset_index()
total_attempts_used = attempts_per_run.groupby('model')['attempt'].sum()

# Attempt budget per run, inferred as the highest attempt number seen in the
# data for this exercise.
# NOTE(review): this underestimates the budget if no run used all of its
# attempts — confirm against the configured maximum.
max_attempts_per_run = df_3114['attempt'].max()
total_possible_attempts = total_runs * max_attempts_per_run

# Combine into one DataFrame (indexed by model)
summary_df = pd.DataFrame({
    'Success?': successful_runs,
    'Total Runs': total_runs,
    'Attempts Used': total_attempts_used,
    'Possible Attempts': total_possible_attempts
})

# Sort by the numeric success rate, best first. This has to happen before
# the columns become 'x/y' strings: sorting those strings is lexicographic
# and would, for example, rank '2/10' above '10/10'.
summary_df['Success Rate'] = summary_df['Success?'].astype(float) / summary_df['Total Runs']
summary_df = summary_df.sort_values(by='Success Rate', ascending=False, kind='stable')

# Format as "<successful>/<total>" and "<used>/<possible>"
summary_df['Success?'] = summary_df['Success?'].astype(int).astype(str) + '/' + summary_df['Total Runs'].astype(int).astype(str)
summary_df['Attempts Used'] = summary_df['Attempts Used'].astype(int).astype(str) + '/' + summary_df['Possible Attempts'].astype(int).astype(str)

# Final columns
summary_df = summary_df[['Success?', 'Attempts Used']]

# Display
summary_df


Unnamed: 0_level_0,Success?,Attempts Used
model,Unnamed: 1_level_1,Unnamed: 2_level_1
anthropic/claude-sonnet-4,1/1,10/50
anthropic/claude-3.7-sonnet:thinking,0/1,3/50
deepseek/deepseek-chat-v3-0324,0/1,32/50
deepseek/deepseek-r1-0528,0/1,10/50
google/gemini-2.5-flash-preview-05-20,0/1,50/50
google/gemini-2.5-flash-preview-05-20:thinking,0/1,15/50
openai/gpt-4.1,0/1,50/50
openai/o4-mini,0/1,5/50


In [67]:
def _tutorial_length(text):
    """Number of characters in `text`; 0 for any non-string value."""
    return len(text) if isinstance(text, str) else 0


# Tutorial length in characters serves as the measure of tutorial verbosity
df['tutorial_len'] = df['tutorial'].apply(_tutorial_length)

# Success rate (percent, two decimals) per model and tutorial length
tutorial_success = (
    df.groupby(['model', 'tutorial_len'])['success']
    .mean()
    .reset_index()
)
tutorial_success['success'] = (tutorial_success['success'] * 100).round(2)
tutorial_success = tutorial_success.rename(
    columns={'success': 'Success Rate (%)', 'tutorial_len': 'Tutorial Length'}
)

tutorial_success

Unnamed: 0,model,Tutorial Length,Success Rate (%)
0,anthropic/claude-3.7-sonnet:thinking,8665,0.0
1,anthropic/claude-sonnet-4,8665,0.0
2,deepseek/deepseek-chat-v3-0324,8665,0.0
3,deepseek/deepseek-r1-0528,8665,0.0
4,google/gemini-2.5-flash-preview-05-20,8665,0.0
5,google/gemini-2.5-flash-preview-05-20:thinking,8665,0.0
6,openai/gpt-4.1,8665,0.0
7,openai/o4-mini,8665,0.0


In [80]:
def _prompt_length(text):
    """Number of characters in `text`; 0 for any non-string value."""
    return len(text) if isinstance(text, str) else 0


# Prompt length in characters serves as the measure of prompt verbosity
df['prompt_len'] = df['prompt'].apply(_prompt_length)

# Success rate (percent, two decimals) per model and prompt length
prompt_success = (
    df.groupby(['model', 'prompt_len'])['success']
    .mean()
    .reset_index()
)
prompt_success['success'] = (prompt_success['success'] * 100).round(2)
prompt_success = prompt_success.rename(
    columns={'success': 'Success Rate (%)', 'prompt_len': 'Prompt Length'}
)

prompt_success

Unnamed: 0,model,Prompt Length,Success Rate (%)
0,openai/o4-mini,1294,18.18
1,x-ai/grok-3-mini-beta,1294,16.67


In [81]:
# Rows of a single exercise, shown with their complete error text.
is_exercise = df['exercise'] == '6_8_1'
df_filtered = df[is_exercise]

# Lift the column-width limit so long messages are not truncated. This is a
# global pandas option and stays in effect for the cells that follow.
pd.set_option('display.max_colwidth', None)

df_filtered[['errors']]

Unnamed: 0,errors


In [56]:
# The proof line each error refers to, for the same filtered rows
df_filtered[['line_with_error']]

Unnamed: 0,line_with_error


In [82]:
# pass@k per model: percentage of runs that count as passed within the first
# k attempts, for k = 1 .. largest `max_attempts` value in the data.

# Order rows by attempt inside each run so that position i of a run's
# success list really is attempt i + 1; the loaded row order gives no such
# guarantee.
runs_in_attempt_order = df.sort_values(['run_id', 'attempt'])

# One row per run: its model, the per-attempt success flags, the number of
# attempts used and the attempt budget.
run_grouped = runs_in_attempt_order.groupby('run_id').agg({
    'model': 'first',
    'success': list,
    'attempt': 'max',
    'max_attempts': 'first'
}).reset_index()

# int() so that range() also works when the column was loaded as float
max_k = int(df['max_attempts'].max())


def _passed_within(successes, attempts_used, k):
    """Return True if a run counts as passed within its first `k` attempts.

    A run counts when any of its first `k` attempts succeeded, or when it
    used fewer than `k` attempts in total.
    """
    # NOTE(review): the second condition also counts runs that stopped early
    # without any success — confirm that runs only stop early on success.
    return bool(any(successes[:k]) or attempts_used < k)


passk_cols = [f'pass@{k}' for k in range(1, max_k + 1)]

for k, col in enumerate(passk_cols, start=1):
    run_grouped[col] = [
        _passed_within(successes, attempts_used, k)
        for successes, attempts_used in zip(run_grouped['success'], run_grouped['attempt'])
    ]

# Share of passed runs per model for each k
passk_summary = run_grouped.groupby('model')[passk_cols].mean().reset_index()

# Convert to percentage
for col in passk_cols:
    passk_summary[col] = (passk_summary[col] * 100).round(2)

# Rename columns for display. The label is built from k itself: taking the
# last character of the column name would turn pass@10, pass@20, ... all
# into "Pass@0 (%)".
passk_summary.rename(
    columns={f'pass@{k}': f'Pass@{k} (%)' for k in range(1, max_k + 1)},
    inplace=True,
)

passk_summary


Unnamed: 0,model,Pass@1 (%),Pass@2 (%),Pass@3 (%)
0,openai/o4-mini,25.0,50.0,50.0
1,x-ai/grok-3-mini-beta,50.0,50.0,50.0


In [None]:
# TODO:
# thinking vs no thinking models (all)
# thinking vs no thinking on models that support both (fair)
# definition expanding comparison