In [11]:
import os
import json
import pandas as pd

In [25]:
# Set the root directory containing the subdirectories and JSON files
root_dir = 'responses'

# Prepare a list to collect all the JSON data
data_list = []

# Walk through the directory tree
for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.json'):
            json_path = os.path.join(subdir, file)
            with open(json_path, 'r') as f:
                try:
                    data = json.load(f)
                    data_list.append(data)
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON: {json_path}")

# Convert to a DataFrame
df = pd.DataFrame(data_list)

# Show the first few rows
df.head()


Unnamed: 0,model,exercise,prompt,tutorial,exercise_content,output,token_count,input_tokens,output_tokens,cost,thinking_mode,errors,line_with_error,success
0,openai/o1-mini,10_7_intermediate,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,Section exercise_10_7_3.\n\nVariable (X : Metr...,"Proof.\nWe need to show that (∀ k ∈ ℕ, n(k) < ...",7114,4304,2810,0.017098,True,"File ""/tmp/E10_7_intermediate_7dek2n62.v"", lin...","By the definition of n, n(k) = 2 * k and n(k +...",False
1,anthropic/claude-3.7-sonnet:thinking,10_7_intermediate,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,Section exercise_10_7_3.\n\nVariable (X : Metr...,Proof.\nTake k ∈ ℕ.\nWe need to show that (n(k...,7374,5337,2037,0.046566,True,,,True
2,meta-llama/llama-4-maverick,10_7_intermediate,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,Section exercise_10_7_3.\n\nVariable (X : Metr...,Take k ∈ ℕ.\nWe need to show that (n(k) < n(k ...,3841,3718,123,0.000669,False,,,True
3,qwen/qwq-32b:free,10_7_intermediate,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,Section exercise_10_7_3.\n\nVariable (X : Metr...,Proof.\nTake k ∈ ℕ.\nWe need to show that (2 *...,4903,2609,2294,0.00085,True,,,True
4,qwen/qwen3-235b-a22b,10_7_intermediate,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,Section exercise_10_7_3.\n\nVariable (X : Metr...,Proof.\nTake k ∈ ℕ.\nWe need to show that (n(k...,8451,3727,4724,0.003356,True,"File ""/tmp/E10_7_intermediate_ry_2bot8.v"", lin...",We conclude that (2 * k < 2 * (k + 1)).,False


In [26]:
total_cost = df['cost'].sum()
total_cost

np.float64(0.14488195)

In [27]:
# Total success rate per model
summary = df.groupby('model')['success'].mean().reset_index()
summary['success'] = (summary['success'] * 100).round(2)  # Convert to percentage
summary.rename(columns={'success': 'Success Rate (%)'}, inplace=True)

# Export to LaTeX
latex_table = summary.to_latex(index=False, column_format="ll", caption="Success rate per model", label="tab:success_rate")
print(latex_table)

\begin{table}
\caption{Success rate per model}
\label{tab:success_rate}
\begin{tabular}{ll}
\toprule
model & Success Rate (%) \\
\midrule
anthropic/claude-3.7-sonnet & 100.000000 \\
anthropic/claude-3.7-sonnet:thinking & 100.000000 \\
anthropic/claude-sonnet-4 & 0.000000 \\
deepseek/deepseek-chat-v3-0324:free & 0.000000 \\
deepseek/deepseek-prover-v2:free & 0.000000 \\
deepseek/deepseek-r1:free & 0.000000 \\
google/gemini-2.5-flash-preview-05-20 & 0.000000 \\
google/gemini-2.5-flash-preview-05-20:thinking & 0.000000 \\
meta-llama/llama-3.3-70b-instruct & 0.000000 \\
meta-llama/llama-4-maverick & 100.000000 \\
nousresearch/hermes-3-llama-3.1-405b & 0.000000 \\
nousresearch/hermes-3-llama-3.1-70b & 0.000000 \\
openai/o1-mini & 0.000000 \\
openai/o3-mini & 0.000000 \\
openai/o4-mini & 100.000000 \\
qwen/qwen-2.5-72b-instruct & 0.000000 \\
qwen/qwen3-235b-a22b & 0.000000 \\
qwen/qwq-32b:free & 100.000000 \\
x-ai/grok-3-mini-beta & 0.000000 \\
\bottomrule
\end{tabular}
\end{table}



In [28]:
# Total success rate per exercise (check if this code is correct)
summary = df.groupby('exercise')['success'].mean().reset_index()
summary['success'] = (summary['success'] * 100).round(2)  # Convert to percentage
summary.rename(columns={'success': 'Success Rate (%)'}, inplace=True)

# Export to LaTeX
latex_table = summary.to_latex(index=False, column_format="ll", caption="Success rate per exercise", label="tab:success_rate")
print(latex_table)

\begin{table}
\caption{Success rate per exercise}
\label{tab:success_rate}
\begin{tabular}{ll}
\toprule
exercise & Success Rate (%) \\
\midrule
10_7_intermediate & 26.320000 \\
\bottomrule
\end{tabular}
\end{table}



In [29]:
# Grid of success rates per model and per exercise (requires multiple runs of all models)
grid = df.pivot_table(index='model', columns='exercise', values='success', aggfunc='mean') * 100
grid = grid.round(2)
grid

exercise,10_7_intermediate
model,Unnamed: 1_level_1
anthropic/claude-3.7-sonnet,100.0
anthropic/claude-3.7-sonnet:thinking,100.0
anthropic/claude-sonnet-4,0.0
deepseek/deepseek-chat-v3-0324:free,0.0
deepseek/deepseek-prover-v2:free,0.0
deepseek/deepseek-r1:free,0.0
google/gemini-2.5-flash-preview-05-20,0.0
google/gemini-2.5-flash-preview-05-20:thinking,0.0
meta-llama/llama-3.3-70b-instruct,0.0
meta-llama/llama-4-maverick,100.0


In [30]:
# Average output token counts per model
output_tokens_avg = df.groupby('model')['output_tokens'].mean().reset_index()
output_tokens_avg.rename(columns={'output_tokens': 'Avg Output Tokens'}, inplace=True)

# Get the thinking_mode per model
thinking_mode_per_model = df[['model', 'thinking_mode']].drop_duplicates()

# Merge into the result
output_tokens_avg = output_tokens_avg.merge(thinking_mode_per_model, on='model')

output_tokens_avg

Unnamed: 0,model,Avg Output Tokens,thinking_mode
0,anthropic/claude-3.7-sonnet,181.0,False
1,anthropic/claude-3.7-sonnet:thinking,2037.0,True
2,anthropic/claude-sonnet-4,109.0,False
3,deepseek/deepseek-chat-v3-0324:free,207.0,False
4,deepseek/deepseek-prover-v2:free,159.0,False
5,deepseek/deepseek-r1:free,3461.0,True
6,google/gemini-2.5-flash-preview-05-20,118.0,False
7,google/gemini-2.5-flash-preview-05-20:thinking,485.0,True
8,meta-llama/llama-3.3-70b-instruct,93.0,False
9,meta-llama/llama-4-maverick,123.0,False


In [31]:
# Average cost per model
cost_avg = df.groupby('model')['cost'].mean().reset_index()
cost_avg.rename(columns={'cost': 'Avg Cost ($)'}, inplace=True)
cost_avg

Unnamed: 0,model,Avg Cost ($)
0,anthropic/claude-3.7-sonnet,0.018642
1,anthropic/claude-3.7-sonnet:thinking,0.046566
2,anthropic/claude-sonnet-4,0.017562
3,deepseek/deepseek-chat-v3-0324:free,0.001309
4,deepseek/deepseek-prover-v2:free,0.002225
5,deepseek/deepseek-r1:free,0.009424
6,google/gemini-2.5-flash-preview-05-20,0.000698
7,google/gemini-2.5-flash-preview-05-20:thinking,0.000918
8,meta-llama/llama-3.3-70b-instruct,0.000283
9,meta-llama/llama-4-maverick,0.000669


In [32]:
# Success rate per model compared across a few levels of tutorial verbosity
df['tutorial_len'] = df['tutorial'].apply(lambda x: len(x) if isinstance(x, str) else 0)

tutorial_success = df.groupby(['model', 'tutorial_len'])['success'].mean().reset_index()
tutorial_success['success'] = (tutorial_success['success'] * 100).round(2)
tutorial_success.rename(columns={'success': 'Success Rate (%)', 'tutorial_len': 'Tutorial Length'}, inplace=True)

tutorial_success

Unnamed: 0,model,Tutorial Length,Success Rate (%)
0,anthropic/claude-3.7-sonnet,8665,100.0
1,anthropic/claude-3.7-sonnet:thinking,8665,100.0
2,anthropic/claude-sonnet-4,8665,0.0
3,deepseek/deepseek-chat-v3-0324:free,8665,0.0
4,deepseek/deepseek-prover-v2:free,8665,0.0
5,deepseek/deepseek-r1:free,8665,0.0
6,google/gemini-2.5-flash-preview-05-20,8665,0.0
7,google/gemini-2.5-flash-preview-05-20:thinking,8665,0.0
8,meta-llama/llama-3.3-70b-instruct,8665,0.0
9,meta-llama/llama-4-maverick,8665,100.0


In [33]:
# Success rate per model compared across a few levels of prompt verbosity
df['prompt_len'] = df['prompt'].apply(lambda x: len(x) if isinstance(x, str) else 0)

prompt_success = df.groupby(['model', 'prompt_len'])['success'].mean().reset_index()
prompt_success['success'] = (prompt_success['success'] * 100).round(2)
prompt_success.rename(columns={'success': 'Success Rate (%)', 'prompt_len': 'Prompt Length'}, inplace=True)

prompt_success

Unnamed: 0,model,Prompt Length,Success Rate (%)
0,anthropic/claude-3.7-sonnet,1294,100.0
1,anthropic/claude-3.7-sonnet:thinking,1294,100.0
2,anthropic/claude-sonnet-4,1294,0.0
3,deepseek/deepseek-chat-v3-0324:free,1294,0.0
4,deepseek/deepseek-prover-v2:free,1294,0.0
5,deepseek/deepseek-r1:free,1294,0.0
6,google/gemini-2.5-flash-preview-05-20,1294,0.0
7,google/gemini-2.5-flash-preview-05-20:thinking,1294,0.0
8,meta-llama/llama-3.3-70b-instruct,1294,0.0
9,meta-llama/llama-4-maverick,1294,100.0


In [None]:
# Pass@k success rate per model; can mention if a model consistently did not use all k attempts
df['pass@k'] = df['success'] | (df['used_attempts'] < df['max_attempts'])
passk_summary = df.groupby('model')['pass@k'].mean().reset_index()
passk_summary['pass@k'] = (passk_summary['pass@k'] * 100).round(2)
passk_summary.rename(columns={'pass@k': 'Pass@k Rate (%)'}, inplace=True)
passk_summary

In [None]:
# TODO:
# thinking vs no thinking models (all)
# thinking vs no thinking on models that support both (fair)
# definition expanding comparison