In [None]:
import os
from inspect_ai.log import read_eval_log
import pandas as pd
from datasets import load_dataset

In [None]:
# Fetch the "test" split of the benchmark dataset and expose it as a pandas
# DataFrame for interactive inspection in the notebook.
ds = load_dataset(path="matsant01/blind-spots-bench", split="test")
df = pd.DataFrame(data=ds)

In [None]:
import os
import pandas as pd
from inspect_ai.log import read_eval_log
from termcolor import colored, cprint

# Directory that is scanned for `eval_*` run folders, and the accumulator that
# the scanning loop fills with one summary dict per evaluation log.
root_dir = "../outputs"
candidat_results = []

# Pricing table used to turn token usage into cost. When the CSV is missing,
# warn and fall back to an empty table that still has the expected columns so
# later lookups simply find no match instead of failing.
pricing_path = "../data/models_pricing.csv"
if not os.path.exists(pricing_path):
    print("Warning: pricing file not found")
    pricing_df = pd.DataFrame(columns=["model_name", "input_price", "output_price"])
else:
    pricing_df = pd.read_csv(pricing_path)


# Scan <root_dir>/eval_*/<model_dir>/ for `.eval` logs and append one summary
# dict per model directory to `candidat_results`:
#   Model, Correct, n_samples, Total Cost, Total Output Tokens, log_file.
# Cost is computed from the usage recorded in the log and the prices in
# `pricing_df` (prices are applied per 1,000,000 tokens). Usage attributed to
# the grader model is excluded so that only the solver is billed.
for eval_dir in os.listdir(root_dir):
    eval_path = os.path.join(root_dir, eval_dir)
    if not (eval_dir.startswith("eval_") and os.path.isdir(eval_path)):
        continue

    for model_dir in os.listdir(eval_path):
        model_path = os.path.join(eval_path, model_dir)
        if not os.path.isdir(model_path):
            continue

        # os.listdir() returns entries in arbitrary order; sort so that the
        # log picked from a directory with several `.eval` files is reproducible.
        eval_files = sorted(f for f in os.listdir(model_path) if f.endswith(".eval"))
        if not eval_files:
            continue

        # NOTE(review): the lexicographically last file is used. Inspect log
        # file names normally start with a timestamp, so this is presumably the
        # most recent run -- confirm this is the intended choice.
        log_file = os.path.join(model_path, eval_files[-1])

        try:
            log = read_eval_log(log_file)
        except Exception as e:
            # Best effort: an unreadable log is reported and skipped.
            print(f"Error reading {log_file}: {e}")
            continue

        # Identify Solver and Grader.
        # We want to exclude grader cost. Grader is 'gemini-3-flash-preview'.
        solver_cost = 0.0
        solver_total_output_tokens = 0

        # The solver's model name as configured in the eval.
        solver_model_config = log.eval.model

        for model_name, usage in log.stats.model_usage.items():
            # Heuristic: usage for 'gemini-3-flash-preview' belongs to the
            # grader unless the solver itself is that model.
            if "gemini-3-flash-preview" in model_name and "gemini-3-flash-preview" not in solver_model_config:
                continue

            # Names to look up in the pricing table, in priority order:
            #   1. the exact name,
            #   2. the name without its first "<prefix>/" component,
            #   3. the part after the last "/".
            # split("/", 1) drops only the leading prefix; str.replace would
            # also delete later occurrences of the same "<prefix>/" text.
            candidate_names = [model_name]
            if "/" in model_name:
                candidate_names.append(model_name.split("/", 1)[1])
            candidate_names.append(model_name.split("/")[-1])

            price_row = None
            for candidate in candidate_names:
                hits = pricing_df[pricing_df["model_name"] == candidate]
                if not hits.empty:
                    price_row = hits.iloc[0]
                    break

            if price_row is None:
                # Don't warn about the grader if the skip heuristic above failed.
                if "gemini-3-flash-preview" not in model_name:
                    print(f"Warning: No pricing found for {model_name} (Solver: {solver_model_config})")
                continue

            in_p = price_row["input_price"]
            out_p = price_row["output_price"]

            # Cost = Input + Output + Reasoning. The attribute may be absent
            # or None, in which case it counts as zero.
            r_tok = getattr(usage, "reasoning_tokens", 0) or 0

            # For GPT-5 models, output_tokens already includes
            # reasoning_tokens, so adding them again would double count.
            if "gpt-5" in model_name.lower():
                total_output_toks_for_pricing = usage.output_tokens
            else:
                total_output_toks_for_pricing = usage.output_tokens + r_tok

            solver_cost += (
                (usage.input_tokens / 1_000_000) * in_p
                + (total_output_toks_for_pricing / 1_000_000) * out_p
            )
            solver_total_output_tokens += total_output_toks_for_pricing

        # A sample counts as correct when its score value is 'C', 1 or True.
        samples = log.samples or []
        correct_count = sum(
            1
            for s in samples
            if s.score and (s.score.value == "C" or s.score.value == 1 or s.score.value is True)
        )

        candidat_results.append({
            "Model": solver_model_config,
            "Correct": correct_count,
            "n_samples": len(samples),
            "Total Cost": solver_cost,
            "Total Output Tokens": solver_total_output_tokens,
            "log_file": log_file,
        })

# Filter and finalize: turn the per-log summaries in `candidat_results` into a
# leaderboard DataFrame (`df_results`), print it as a Markdown table, save it
# to ../data/text_only_leaderboard.md and splice it into ../README.md between
# the LEADERBOARD-START / LEADERBOARD-END markers.
df_results = pd.DataFrame()

if candidat_results:
    # The largest sample count seen is treated as the standard benchmark size;
    # logs with fewer samples are considered incomplete and left out.
    max_samples = max(r["n_samples"] for r in candidat_results)

    final_results = []

    for r in candidat_results:
        n_samples = r["n_samples"]
        log_name = os.path.basename(r["log_file"])

        if n_samples < max_samples:
            cprint(f"Skipping {r['Model']} ({log_name}): {n_samples} samples (expected {max_samples})", "yellow")
            continue

        if n_samples == 0:
            # Only reachable when every log is empty (max_samples == 0).
            # Skipping avoids a ZeroDivisionError in the ratios below and keeps
            # a meaningless all-zero table out of the README.
            cprint(f"Skipping {r['Model']} ({log_name}): no samples", "yellow")
            continue

        final_results.append({
            # Clean model name (everything after the last "/").
            "Model": r["Model"].split("/")[-1],
            # Accuracy as a percentage.
            "Accuracy": r["Correct"] / n_samples * 100,
            # Cost scaled to 100 samples.
            "Cost/100 Samples ($)": r["Total Cost"] / n_samples * 100,
            # Average output tokens per sample (output + reasoning).
            "Output Toks/Sample": r["Total Output Tokens"] / n_samples,
        })

    # Create DataFrame
    df_results = pd.DataFrame(final_results)
    if not df_results.empty:
        # Best accuracy first.
        df_results = df_results.sort_values(by="Accuracy", ascending=False)

        # --- Generate Markdown Table ---
        # floatfmt=".2f" renders every float column with 2 decimal places.
        md_table = df_results.to_markdown(index=False, floatfmt=".2f")

        print("Generated Markdown Leaderboard:")
        print(md_table)

        # Save Markdown to file (instead of HTML). The encoding is explicit so
        # the result does not depend on the platform's locale default.
        table_path = "../data/text_only_leaderboard.md"
        with open(table_path, "w", encoding="utf-8") as f:
            f.write(md_table)
        print(f"Saved Markdown to '{table_path}'")

        # --- Update README.md ---
        readme_path = "../README.md"
        if os.path.exists(readme_path):
            with open(readme_path, "r", encoding="utf-8") as f:
                readme_content = f.read()

            start_marker = "<!-- LEADERBOARD-START -->"
            end_marker = "<!-- LEADERBOARD-END -->"

            if start_marker in readme_content and end_marker in readme_content:
                # start_idx points just past the start marker, end_idx at the
                # beginning of the end marker, so both markers are preserved.
                start_idx = readme_content.find(start_marker) + len(start_marker)
                end_idx = readme_content.find(end_marker)

                # Check if markers are in correct order
                if start_idx < end_idx:
                    # Replace only what lies between the markers; newlines keep
                    # the table separated from the marker comments.
                    new_content = readme_content[:start_idx] + "\n" + md_table + "\n" + readme_content[end_idx:]

                    with open(readme_path, "w", encoding="utf-8") as f:
                        f.write(new_content)
                    print("Updated README.md with new leaderboard table.")
                else:
                    print("Error: README markers are malformed (start after end).")
            else:
                print("Warning: Leaderboard markers not found in README.md.")
    else:
        print("No valid results remaining after filtering.")
else:
    print("No results found.")

# Last expression of the cell: displays the leaderboard in the notebook.
df_results