In [39]:
import os
import json
import pandas as pd
import numpy as np
pd.reset_option("display.max_rows")
pd.reset_option("display.max_columns")
pd.reset_option("display.width")

In [18]:
# Set the root directory containing the subdirectories and JSON files
root_dir = 'responses'

# Prepare a list to collect all the JSON data
data_list = []

# Walk through the directory tree
for subdir, _, files in os.walk(root_dir):
    for file in files:
        if file.endswith('.json'):
            json_path = os.path.join(subdir, file)
            with open(json_path, 'r') as f:
                try:
                    data = json.load(f)
                    data_list.append(data)
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON: {json_path}")

# Convert to a DataFrame
df = pd.DataFrame(data_list)

# Show the first few rows
df.head()


Unnamed: 0,model,exercise,prompt,tutorial,exercise_content,output,token_count,input_tokens,output_tokens,cost,thinking_mode,errors,line_with_error,success,run_id,attempt,max_attempts
0,openai/o4-mini,11_not_closed,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,"Lemma not_closed : ¬ ([0,1) is _closed_).\nProof.",Proof.\nRequire Import Coquelicot.Coquelicot.\...,6376,280,252,0.02713,True,"File ""/tmp/E11_not_closed_sxc1gooi.v"", line 34...",Require Import Coquelicot.Coquelicot.,False,openai/o4-mini::11_not_closed::1,3,3
1,x-ai/grok-3-mini-beta,11_not_closed,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,"Lemma not_closed : ¬ ([0,1) is _closed_).\nProof.",Proof.\nWe argue by contradiction.\nAssume tha...,3667,3619,48,0.00111,True,"File ""/tmp/E11_not_closed_c1ak_nxs.v"", line 35...","Assume that ([0,1) is _closed_).",False,x-ai/grok-3-mini-beta::11_not_closed::1,1,3
2,x-ai/grok-3-mini-beta,11_not_closed,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,"Lemma not_closed : ¬ ([0,1) is _closed_).\nProof.","Proof.\n\nWe need to show that ¬( [0,1) is clo...",368,246,123,0.000135,True,"File ""/tmp/E11_not_closed_2ckg7fcm.v"", line 34...","We need to show that ¬( [0,1) is closed ).",False,x-ai/grok-3-mini-beta::11_not_closed::2,2,3
3,x-ai/grok-3-mini-beta,11_not_closed,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,"Lemma not_closed : ¬ ([0,1) is _closed_).\nProof.","Proof.\n\nWe will prove that the interval [0,1...",493,262,232,0.000194,True,"File ""/tmp/E11_not_closed_9_vfar9d.v"", line 34...","We will prove that the interval [0,1) is not c...",False,x-ai/grok-3-mini-beta::11_not_closed::2,3,3
4,x-ai/grok-3-mini-beta,11_not_closed,You are asked to write a proof in the syntax o...,# Waterproof Tutorial\n\n## 1. We conclude tha...,"Lemma not_closed : ¬ ([0,1) is _closed_).\nProof.",Proof.\nWe argue by contradiction.\nAssume tha...,3684,3619,65,0.001118,True,"File ""/tmp/E11_not_closed_a4j3i0sx.v"", line 35...","Assume that ([0,1) is closed). (i)",False,x-ai/grok-3-mini-beta::11_not_closed::2,1,3


In [19]:
total_cost = df['cost'].sum()
total_cost

np.float64(0.10816060000000001)

In [None]:
# Dummy code to make up pass@k values; delete this later
new_success = []

for run_id, group in df.groupby('run_id'):
    attempts = sorted(group['attempt'].unique())

    # Assume max_attempt is the max of current group
    max_attempt = max(attempts)
    last_attempt = attempts[-1]

    a1 = np.random.rand() < 0.3
    a2 = a1 or (np.random.rand() < 0.3)
    success_map = {
        1: a1,
        2: a2,
        last_attempt: True  # Always succeeds on last attempt
    }

    # Fill in intermediate attempts if needed
    for attempt in attempts:
        if attempt not in success_map:
            # If more than 3 attempts: fallback logic, extend success from previous
            success_map[attempt] = success_map.get(attempt - 1, False)

    for attempt in attempts:
        new_success.append(success_map[attempt])

# Overwrite df['success']
df['success'] = new_success

In [20]:
# Total success rate per model
summary = df.groupby('model')['success'].mean().reset_index()
summary['success'] = (summary['success'] * 100).round(2)  # Convert to percentage
summary.rename(columns={'success': 'Success Rate (%)'}, inplace=True)

# Export to LaTeX
latex_table = summary.to_latex(index=False, column_format="ll", caption="Success rate per model", label="tab:success_rate")
print(latex_table)

\begin{table}
\caption{Success rate per model}
\label{tab:success_rate}
\begin{tabular}{ll}
\toprule
model & Success Rate (%) \\
\midrule
openai/o4-mini & 0.000000 \\
x-ai/grok-3-mini-beta & 0.000000 \\
\bottomrule
\end{tabular}
\end{table}



In [21]:
# Total success rate per exercise (check if this code is correct)
summary = df.groupby('exercise')['success'].mean().reset_index()
summary['success'] = (summary['success'] * 100).round(2)  # Convert to percentage
summary.rename(columns={'success': 'Success Rate (%)'}, inplace=True)

# Export to LaTeX
latex_table = summary.to_latex(index=False, column_format="ll", caption="Success rate per exercise", label="tab:success_rate")
print(latex_table)

\begin{table}
\caption{Success rate per exercise}
\label{tab:success_rate}
\begin{tabular}{ll}
\toprule
exercise & Success Rate (%) \\
\midrule
11_not_closed & 0.000000 \\
\bottomrule
\end{tabular}
\end{table}



In [22]:
# Grid of success rates per model and per exercise (requires multiple runs of all models)
grid = df.pivot_table(index='model', columns='exercise', values='success', aggfunc='mean') * 100
grid = grid.round(2)
grid

exercise,11_not_closed
model,Unnamed: 1_level_1
openai/o4-mini,0.0
x-ai/grok-3-mini-beta,0.0


In [23]:
# Average output token counts per model
output_tokens_avg = df.groupby('model')['output_tokens'].mean().reset_index()
output_tokens_avg.rename(columns={'output_tokens': 'Avg Output Tokens'}, inplace=True)

# Get the thinking_mode per model
thinking_mode_per_model = df[['model', 'thinking_mode']].drop_duplicates()

# Merge into the result
output_tokens_avg = output_tokens_avg.merge(thinking_mode_per_model, on='model')

output_tokens_avg

Unnamed: 0,model,Avg Output Tokens,thinking_mode
0,openai/o4-mini,305.666667,True
1,x-ai/grok-3-mini-beta,168.666667,True


In [24]:
# Average cost per model
cost_avg = df.groupby('model')['cost'].mean().reset_index()
cost_avg.rename(columns={'cost': 'Avg Cost ($)'}, inplace=True)
cost_avg

Unnamed: 0,model,Avg Cost ($)
0,openai/o4-mini,0.01753
1,x-ai/grok-3-mini-beta,0.000497


In [25]:
# Success rate per model compared across a few levels of tutorial verbosity
df['tutorial_len'] = df['tutorial'].apply(lambda x: len(x) if isinstance(x, str) else 0)

tutorial_success = df.groupby(['model', 'tutorial_len'])['success'].mean().reset_index()
tutorial_success['success'] = (tutorial_success['success'] * 100).round(2)
tutorial_success.rename(columns={'success': 'Success Rate (%)', 'tutorial_len': 'Tutorial Length'}, inplace=True)

tutorial_success

Unnamed: 0,model,Tutorial Length,Success Rate (%)
0,openai/o4-mini,8665,0.0
1,x-ai/grok-3-mini-beta,8665,0.0


In [26]:
# Success rate per model compared across a few levels of prompt verbosity
df['prompt_len'] = df['prompt'].apply(lambda x: len(x) if isinstance(x, str) else 0)

prompt_success = df.groupby(['model', 'prompt_len'])['success'].mean().reset_index()
prompt_success['success'] = (prompt_success['success'] * 100).round(2)
prompt_success.rename(columns={'success': 'Success Rate (%)', 'prompt_len': 'Prompt Length'}, inplace=True)

prompt_success

Unnamed: 0,model,Prompt Length,Success Rate (%)
0,openai/o4-mini,1294,0.0
1,x-ai/grok-3-mini-beta,1294,0.0


In [27]:
df_filtered = df[df['exercise'] == '6_8_1']
pd.set_option('display.max_colwidth', None)
df_filtered['errors'].to_frame()

Unnamed: 0,errors


In [28]:
df_filtered['line_with_error'].to_frame()

Unnamed: 0,line_with_error


In [44]:
import pandas as pd

# Group by run_id to get per-run summary
run_grouped = df.groupby('run_id').agg({
    'model': 'first',
    'success': list,
    'attempt': 'max',
    'max_attempts': 'first'
}).reset_index()

# Expand success list into per-k success map
max_k = df['max_attempts'].max()

for k in range(1, max_k + 1):
    def pass_at_k(row):
        successes = row['success']
        used_attempts = row['attempt']
        # Success in any of the first k attempts
        success_in_k = any(successes[:k])
        # Or the model didn't even use k attempts
        not_used_k = used_attempts < k
        return success_in_k or not_used_k

    run_grouped[f'pass@{k}'] = run_grouped.apply(pass_at_k, axis=1)

# Now compute per-model mean for each pass@k
passk_cols = [f'pass@{k}' for k in range(1, max_k + 1)]
passk_summary = run_grouped.groupby('model')[passk_cols].mean().reset_index()

# Convert to percentage
for col in passk_cols:
    passk_summary[col] = (passk_summary[col] * 100).round(2)

# Rename columns for display
passk_summary.rename(columns={col: f'Pass@{col[-1]} (%)' for col in passk_cols}, inplace=True)

passk_summary


Unnamed: 0,model,Pass@1 (%),Pass@2 (%),Pass@3 (%)
0,openai/o4-mini,0.0,100.0,100.0
1,x-ai/grok-3-mini-beta,50.0,50.0,100.0


In [None]:
# TODO:
# thinking vs no thinking models (all)
# thinking vs no thinking on models that support both (fair)
# definition expanding comparison