In [None]:
import os
import pandas as pd
from inspect_ai.log import read_eval_log

# Root folder scanned by the aggregation cell for `eval_*` sub-directories.
root_dir = "../outputs"
# NOTE(review): not referenced by any cell visible in this notebook — confirm
# whether it is still needed before removing it.
candidat_results = []

# Load pricing data. The aggregation cell reads the columns `model_name`,
# `input_price` and `output_price`, and treats prices as USD-agnostic amounts
# per 1M tokens (it divides token counts by 1_000_000 before multiplying).
pricing_path = "../data/models_pricing.csv"
pricing_df = pd.read_csv(pricing_path)

# Display the pricing table as the cell output.
pricing_df

In [None]:
def clean_model_name(model_name: str) -> str:
    """Normalize a provider-qualified model identifier to a bare model name.

    Rules, checked in order:
      * identifiers containing ``"google/"`` keep only their last path segment;
      * identifiers containing ``"openai/"`` whose name contains ``"gpt"`` or
        ``"/o3"`` keep only their last path segment;
      * any other identifier containing ``"openai/"`` keeps everything after
        the last ``"openai/"`` (so a nested ``org/model`` name survives);
      * everything else is returned as is.

    Surrounding whitespace is stripped on every path.

    Args:
        model_name: Identifier such as ``"openai/gpt-5"``.

    Returns:
        The cleaned model name.
    """
    if "google/" in model_name:
        return model_name.split("/")[-1].strip()

    if "openai/" in model_name:
        if "gpt" in model_name or "/o3" in model_name:
            return model_name.split("/")[-1].strip()
        # Previously the only branch that did not strip whitespace; a stray
        # space here would make an exact-match lookup on the name fail.
        return model_name.split("openai/")[-1].strip()

    return model_name.strip()

In [None]:
import numpy as np


# Model key of the grader in the usage dicts; its usage is excluded from solver cost.
GRADER_MODEL = "google/gemini-3-flash-preview"
# Scorer whose metadata carries the grader's per-sample token usage.
GRADER_SCORER = "model_graded_qa_with_reasoning_stripped"
# Score values counted as a correct answer.
CORRECT_VALUES = (1, True, "C")


def _is_correct(sample) -> bool:
    """Return True when the sample has a score whose value marks it as correct."""
    return sample.score is not None and sample.score.value in CORRECT_VALUES


def _grader_usage(samples) -> tuple[int, int, int]:
    """Sum the grader's (input, output, reasoning) tokens found in score metadata.

    Samples that carry no grader score or no usage metadata (for instance
    because they failed before being graded) contribute nothing instead of
    raising.
    """
    in_tks = out_tks = reas_tks = 0
    for sample in samples:
        score = (sample.scores or {}).get(GRADER_SCORER)
        usage = (score.metadata or {}).get("usage") if score is not None else None
        if not usage:
            continue
        in_tks += usage["input_tokens"]
        out_tks += usage["output_tokens"]
        reas_tks += usage.get("reasoning_tokens") or 0
    return in_tks, out_tks, reas_tks


def _price_per_million(model_name: str) -> tuple[float, float]:
    """Return (input_price, output_price) for `model_name` from `pricing_df`.

    Raises:
        KeyError: if the pricing table has no row for `model_name`.
    """
    row = pricing_df[pricing_df["model_name"] == model_name]
    if row.empty:
        raise KeyError(f"No pricing entry for model '{model_name}' in {pricing_path}")
    return row["input_price"].values[0], row["output_price"].values[0]


# One record per (eval_dir, model_dir) run. Directory listings are sorted so
# that the order of the records does not depend on the filesystem.
results = []
for eval_dir in sorted(os.listdir(root_dir)):
    eval_path = os.path.join(root_dir, eval_dir)
    if not (eval_dir.startswith("eval_") and os.path.isdir(eval_path)):
        continue

    for model_dir in sorted(os.listdir(eval_path)):
        model_path = os.path.join(eval_path, model_dir)

        try:
            eval_files = sorted(f for f in os.listdir(model_path) if f.endswith(".eval"))
            log_file = os.path.join(model_path, eval_files[0])
            log = read_eval_log(log_file)
        except Exception as e:
            # Report model_path, not log_file: log_file is unbound (or stale from
            # the previous iteration) when listing or picking the file fails.
            print(f"Error reading {model_path}: {e}")
            continue

        model_usage = log.stats.model_usage
        if not model_usage:
            print(f"Warning: No model usage found in {log_file}")
            continue

        samples = log.samples or []
        if not samples:
            # Without samples every ratio below would divide by zero.
            print(f"Warning: No samples found in {log_file}")
            continue

        if len(model_usage) == 1:
            # Edge case: the same model is used for solving and grading, so the
            # aggregate usage mixes both. Subtract the grading share to obtain
            # the solving usage.
            solver_key, usage = next(iter(model_usage.items()))
            model_name = clean_model_name(solver_key)

            gr_in_tks, gr_out_tks, gr_reas_tks = _grader_usage(samples)
            in_tks = usage.input_tokens - gr_in_tks
            out_tks = usage.output_tokens - gr_out_tks
            # reasoning_tokens may be None; default it before subtracting.
            reas_tks = (usage.reasoning_tokens or 0) - gr_reas_tks
        else:
            # Drop the grader entry; the first remaining key is taken as the solver.
            solver_usage = {k: v for k, v in model_usage.items() if k != GRADER_MODEL}
            solver_key = next(iter(solver_usage))
            model_name = clean_model_name(solver_key)

            if "gpt-5" in model_name.lower() or "o3" in model_name.lower():
                # Sum per-sample usage, skipping errored samples. Reasoning tokens
                # stay at 0 for these models.
                # NOTE(review): presumably because their output_tokens already
                # include reasoning tokens — confirm.
                in_tks = out_tks = reas_tks = 0
                for sample in samples:
                    if sample.error:
                        continue
                    usage = sample.model_usage[solver_key]
                    in_tks += usage.input_tokens
                    out_tks += usage.output_tokens
            else:
                usage = solver_usage[solver_key]
                in_tks, out_tks = usage.input_tokens, usage.output_tokens
                reas_tks = usage.reasoning_tokens or 0

        # Cost: prices are per 1M tokens; reasoning tokens are billed at the output price.
        input_price_per_1M, output_price_per_1M = _price_per_million(model_name)
        input_cost = (in_tks / 1_000_000) * input_price_per_1M
        output_cost = ((out_tks + reas_tks) / 1_000_000) * output_price_per_1M
        total_cost = input_cost + output_cost

        # Sample counts: "valid" means scored and without error.
        correct_flags = [_is_correct(sample) for sample in samples]
        num_valid = sum(1 for sample in samples if sample.score is not None and sample.error is None)
        num_correct = sum(correct_flags)
        num_total_samples = len(samples)

        # Group the 0/1 outcomes of repeated attempts by sample id.
        scores_by_id = {}
        for sample, correct in zip(samples, correct_flags):
            scores_by_id.setdefault(sample.id, []).append(int(correct))
        # pass@2: share of ids with at least one success among their first two attempts.
        pass_at_2 = sum(1 for scores in scores_by_id.values() if sum(scores[:2]) >= 1) / len(scores_by_id)
        # Mean over ids of the per-id standard deviation of outcomes.
        std = np.mean([np.std(scores) for scores in scores_by_id.values()])

        results.append({
            "eval_dir": eval_dir,
            "model_name": model_name,
            "accuracy": num_correct / num_total_samples,    # accuracy over all samples (invalid counted as incorrect)
            "pass_at_2": pass_at_2,
            "accuracy_std": std,
            "num_samples": num_total_samples,
            "num_valid_samples": num_valid,
            "num_correct_samples": num_correct,
            "input_tokens": in_tks,
            "output_tokens": out_tks,
            "reasoning_tokens": reas_tks,
            "input_cost": input_cost,
            "output_cost": output_cost,
            "total_cost": total_cost,
            "avg_price_per_sample": total_cost / num_valid if num_valid > 0 else 0,  # average only over valid samples
            "avg_out_tokens_per_sample": (out_tks + reas_tks) / num_valid if num_valid > 0 else 0,
        })



In [None]:
# Build one table from the records collected in `results` (one row per
# processed model directory) and display it as the cell output.
df = pd.DataFrame(results)
df

In [None]:
# Total solver spend, then a split by provider inferred from substrings of the
# model name; anything that is neither OpenAI nor Google counts as an open model.
name_matches = df["model_name"].str.contains
is_openai = name_matches("gpt") | name_matches("o3")
is_google = name_matches("gemini")
is_open_model = ~(is_openai | is_google)

print("Solver's cost: ${:.2f}".format(df["total_cost"].sum()))
print("\t- openai cost: ${:.2f}".format(df[is_openai]["total_cost"].sum()))
print("\t- google cost: ${:.2f}".format(df[is_google]["total_cost"].sum()))
print("\t- Open models: ${:.2f}".format(df[is_open_model]["total_cost"].sum()))

In [None]:
# Update README.md: render one markdown leaderboard per evaluation directory
# and splice the result between the LEADERBOARD markers of the README.
README_PATH = "../README.md"

# Section titles per evaluation directory. The emoji are written as escape
# sequences so the literals cannot be corrupted by a re-encoding of this file.
EVAL_TITLES = {
    "eval_text": "\U0001F4DD\u2192\U0001F4DD Text-only",  # memo -> memo
    "eval_mm": "\U0001F5BC\uFE0F\u2192\U0001F4DD Multimodal-to-text",  # framed picture -> memo
}
LEADERBOARD_COLUMNS = {
    "model_name": "Model",
    "accuracy": "Pass@1",
    "pass_at_2": "Pass@2",
    "avg_price_per_sample": "Price / 100 Sample",
    "avg_out_tokens_per_sample": "Output Tks / Sample",
}

readme_sections = []
for eval_dir in df["eval_dir"].unique():
    leaderboard = (
        df.loc[df["eval_dir"] == eval_dir, list(LEADERBOARD_COLUMNS)]
        .sort_values(by="accuracy", ascending=False)
    )
    # Format every column as display text.
    leaderboard["accuracy"] = leaderboard["accuracy"].apply(lambda x: f"{100 * x:.2f}%")
    leaderboard["pass_at_2"] = leaderboard["pass_at_2"].apply(lambda x: f"{100 * x:.2f}%")
    leaderboard["model_name"] = leaderboard["model_name"].apply(lambda x: x.split("/")[-1])
    # Price is reported per 100 samples, hence the factor 100.
    leaderboard["avg_price_per_sample"] = (leaderboard["avg_price_per_sample"] * 100).apply(lambda x: f"${x:.3f}")
    leaderboard["avg_out_tokens_per_sample"] = leaderboard["avg_out_tokens_per_sample"].apply(lambda x: f"{x:.1f}")
    leaderboard = leaderboard.rename(columns=LEADERBOARD_COLUMNS)

    # Unknown evaluation directories fall back to their directory name.
    title = EVAL_TITLES.get(eval_dir, eval_dir)
    section_header = f"#### {title} Evaluation\n"
    readme_sections.append(section_header + leaderboard.to_markdown(index=False) + "\n\n")

str_for_readme = "".join(readme_sections)

# Open the README, find the markers and replace the content in between.
# The encoding is explicit because the content contains non-ASCII characters
# and the platform default encoding is not guaranteed to be UTF-8.
with open(README_PATH, "r", encoding="utf-8") as f:
    readme_content = f.read()
start_marker = "<!-- LEADERBOARD-START -->"
end_marker = "<!-- LEADERBOARD-END -->"
# str.index raises ValueError when a marker is missing, which aborts before
# anything is written. The end marker is searched after the start marker.
start_index = readme_content.index(start_marker) + len(start_marker)
end_index = readme_content.index(end_marker, start_index)
new_readme_content = (
    readme_content[:start_index] + "\n\n" + str_for_readme + readme_content[end_index:]
)
with open(README_PATH, "w", encoding="utf-8") as f:
    f.write(new_readme_content)