In [16]:
import os
from inspect_ai.log import read_eval_log
import pandas as pd

In [17]:
# Per-model token prices; the cost report below reads the "model_name",
# "input_price" and "output_price" columns (prices per 1M tokens).
pricing_csv_path = "../data/models_pricing.csv"
pricing_df = pd.read_csv(filepath_or_buffer=pricing_csv_path)

In [18]:
# Display the loaded price table so the rows and prices can be checked by eye.
pricing_df

Unnamed: 0,model_name,model_type,input_price,output_price
0,openai/gpt-oss-120b,LM,0.0381,0.1143
1,MiniMaxAI/MiniMax-M2,LM,0.1721,0.5162
2,deepseek-ai/DeepSeek-V3.2,LM,0.6147,1.844
3,Qwen/Qwen3-VL-235B-A22B-Thinking,VLM,0.2428,0.7285
4,Qwen/Qwen3-VL-235B-A22B-Instruct,VLM,0.2428,0.7285
5,mistralai/Mistral-Large-3-675B-Instruct-2512,VLM,0.4882,1.4646
6,gemini-2.5-flash,VLM,0.3,2.5
7,gemini-2.5-pro,VLM,1.25,10.0
8,gemini-3-flash-preview,VLM,0.5,3.0
9,gemini-3-pro-preview,VLM,2.0,12.0


In [30]:
root_dir = "../outputs"


def strip_provider_prefix(model_name):
	"""Return ``model_name`` without its leading ``<provider>/`` segment.

	Only the first segment is dropped, so a name that itself contains a slash
	(e.g. a Hugging Face ``org/model`` id) keeps the rest intact, even when
	the provider and the organisation share the same name. A name without
	any slash is returned unchanged.
	"""
	return model_name.split("/", 1)[-1]


def lookup_prices(pricing_df, model_name):
	"""Return ``(input_price, output_price)`` per 1M tokens for ``model_name``.

	Args:
		pricing_df: DataFrame with ``model_name``, ``input_price`` and
			``output_price`` columns.
		model_name: Model name exactly as it appears in ``pricing_df``.

	Raises:
		KeyError: if ``model_name`` has no row in ``pricing_df``; a cost
			cannot be computed without a price, so this fails loudly.
	"""
	rows = pricing_df.loc[pricing_df["model_name"] == model_name, ["input_price", "output_price"]]
	if rows.empty:
		raise KeyError(f"No pricing entry for model {model_name!r} in pricing_df")
	# The first match wins if the table contains duplicate model names.
	first = rows.iloc[0]
	return float(first["input_price"]), float(first["output_price"])


def usage_cost(input_tokens, output_tokens, reasoning_tokens, input_price, output_price):
	"""Return the dollar cost of one model's token usage.

	Prices are per 1M tokens. Reasoning tokens are billed at the output price.
	A token count of ``None`` (not reported by the provider) counts as 0.

	NOTE(review): some providers may already include reasoning tokens in
	``output_tokens``, in which case adding them double-counts - confirm per
	provider.
	"""
	billed_output_tokens = (output_tokens or 0) + (reasoning_tokens or 0)
	return (input_tokens or 0) / 1_000_000 * input_price + billed_output_tokens / 1_000_000 * output_price


def find_eval_log(run_dir):
	"""Return the path of the ``.eval`` log inside ``run_dir``, or ``None``.

	When several logs are present the alphabetically first one is used, so the
	choice does not depend on the arbitrary order returned by ``os.listdir``.
	"""
	eval_files = sorted(f for f in os.listdir(run_dir) if f.endswith(".eval"))
	if not eval_files:
		return None
	return os.path.join(run_dir, eval_files[0])


def list_run_dirs(root_dir):
	"""Return sorted ``(eval_dir, model_dir, run_dir)`` tuples under ``root_dir``.

	Only sub-directories of ``eval_*`` directories are returned; stray files
	(e.g. ``.DS_Store``) are ignored. A missing ``root_dir`` yields an empty list.
	"""
	if not os.path.isdir(root_dir):
		return []
	run_dirs = []
	for eval_dir in sorted(os.listdir(root_dir)):
		eval_path = os.path.join(root_dir, eval_dir)
		if not (eval_dir.startswith("eval_") and os.path.isdir(eval_path)):
			continue
		for model_dir in sorted(os.listdir(eval_path)):
			run_dir = os.path.join(eval_path, model_dir)
			if os.path.isdir(run_dir):
				run_dirs.append((eval_dir, model_dir, run_dir))
	return run_dirs


if not os.path.isdir(root_dir):
	print(f"Outputs directory {root_dir!r} not found; nothing to evaluate.")

for eval_dir, model_dir, run_dir in list_run_dirs(root_dir):
	print(f"Evaluating {model_dir}")

	log_file = find_eval_log(run_dir)
	if log_file is None:
		print("\t- No .eval log found, skipping\n")
		continue

	# Read the inspect.ai evaluation log. Only the usage statistics are needed,
	# so load the header and skip the (potentially very large) samples.
	log = read_eval_log(log_file, header_only=True)

	total_cost = 0.0
	for model_name, usage in log.stats.model_usage.items():
		# Drop the provider segment; the pricing table stores names without it.
		cleaned_model_name = strip_provider_prefix(model_name)
		input_cost_per_1m_tokens, output_cost_per_1m_tokens = lookup_prices(pricing_df, cleaned_model_name)
		# reasoning_tokens is optional in the log and may be None.
		reasoning_tokens = usage.reasoning_tokens or 0
		cost = usage_cost(
			usage.input_tokens,
			usage.output_tokens,
			reasoning_tokens,
			input_cost_per_1m_tokens,
			output_cost_per_1m_tokens,
		)
		total_cost += cost

		print(f"\t- {model_name} -> Total cost: ${cost:.4f}\n\t  (Input tokens: {usage.input_tokens}, Output tokens: {usage.output_tokens}, Reasoning tokens: {reasoning_tokens})")
	print(f"Total: ${total_cost:.4f}\n")


Evaluating gemini-2.5-flash_2026-01-19_17-25-16
	- google/gemini-2.5-flash -> Total cost: $1.0914
	  (Input tokens: 9639, Output tokens: 66684, Reasoning tokens: 368709)
	- google/gemini-3-flash-preview -> Total cost: $0.2072
	  (Input tokens: 115245, Output tokens: 20583, Reasoning tokens: 29260)
Total: $1.2985

