## Ace Attorney Ablation Study

In [1]:
import sys, pathlib; sys.path.append(str(pathlib.Path().resolve().parent))
import os
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import requests
import json

import base64

from tools.serving.api_providers import (
    openai_text_reasoning_completion,
    anthropic_text_completion,
    gemini_text_completion,
    together_ai_completion,
    xai_grok_completion
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Prompt sent to every model, once per level. The placeholder is literally
# named "level name" (with a space), so it cannot be supplied to str.format
# as an ordinary keyword argument; it is filled below by unpacking a dict.
prompt_template = """
You are an expert on *Phoenix Wright: Ace Attorney*.

LEVEL = “{level name}”

Return **exactly** the three sections below, in the order shown, with *nothing* else before or after them.

1) Detailed Narrative Description:
<Write one coherent paragraph in strict chronological order, including:
  • exact dates/times (use the game’s timeline),
  • locations,
  • all relevant characters’ motives,
  • any off-camera investigation Phoenix, Mia, or their allies perform.>

2) Complete Evidence List:
- "Exact In-Game Item Name" (Type) Relevance: one-sentence explanation of why it helps the defense.
- (repeat bullet for every distinct item)

3) Cross-Examination Breakdown:
- **<Witness Name>, Round X**
  - Statement: <key line from testimony>
  - Present: <evidence name>  (or “Press” if simply pressing)
  - Contradiction Exposed: <logical inconsistency uncovered>
  - Impact: <how this advances Phoenix’s case>
- (add bullets for every round / witness)

**Formatting rules (must follow strictly):**
- Use the section headings *exactly* as written (including numbers, parentheses, and colons).
- Leave exactly one blank line between sections.
- Section 1 is plain prose (no lists).
- Sections 2 & 3 must be bullet lists that match the patterns shown.
- Do **not** add extra commentary, headings, or markdown.
- If a detail is unknown in-game, omit it rather than inventing filler.
"""

# Define the level names.
# These exact strings are also used as lookup keys into the ground-truth
# transcript file and into the cached generations, so they must match both.
level_names = ["The First Turnabout", "Turnabout Sisters - (Part 1 to Part 4)"]

# Create the actual prompts: one rendered prompt per level, in level_names order
prompts = [prompt_template.format(**{"level name": level_name}) for level_name in level_names]

# Step 1: Define prompts and load performance data
def setup_study_data():
    """Load the leaderboard results and reference transcripts for the study.

    Reads ``./rank_data_03_25_2025.json`` and
    ``./ace_attorney_ground_truth.json`` relative to the working directory.

    Returns:
        A 4-tuple ``(prompts, level_names, model_performance, transcripts)``:
        the module-level ``prompts`` and ``level_names`` passed through
        unchanged, a dict mapping each model name to its ``rank``,
        ``levels_cracked`` and ``score``, and the ground-truth transcript of
        every level in ``level_names`` order.
    """
    with open("./rank_data_03_25_2025.json", "r") as rank_file:
        leaderboard = json.load(rank_file)

    # Only the "Ace Attorney" section of the leaderboard is used here.
    model_performance = {
        entry["model"]: {
            "rank": entry["rank"],
            "levels_cracked": entry["levels_cracked"],
            "score": entry["score"],
        }
        for entry in leaderboard["Ace Attorney"]["results"]
    }

    with open("./ace_attorney_ground_truth.json", "r") as truth_file:
        reference = json.load(truth_file)

    # Transcripts are looked up by level name so their order follows level_names.
    transcripts = [reference[name] for name in level_names]

    return prompts, level_names, model_performance, transcripts

# Step 2: Generate Text Descriptions
def _complete_with_provider(model_name, system_prompt, prompt):
    """Send one prompt to the completion helper that matches *model_name*.

    The provider is selected by substring match on the model name, tested in
    the order written below.

    Raises:
        ValueError: if the model name matches none of the known providers.
            Without this, the caller would reuse (or fail to find) the text
            produced for a previous model.
    """
    if any(tag in model_name for tag in ("o1-", "o3-", "o4-", "gpt-")):
        return openai_text_reasoning_completion(
            system_prompt=system_prompt,
            model_name=model_name,
            prompt=prompt,
            temperature=0
        )

    # Must be tested before the generic "claude" branch: the API call uses the
    # model name without the "(thinking)" suffix and enables thinking instead.
    if "claude-3-7-sonnet-20250219(thinking)" in model_name:
        return anthropic_text_completion(
            system_prompt=system_prompt,
            model_name="claude-3-7-sonnet-20250219",
            prompt=prompt,
            thinking=True
        )

    if "claude" in model_name:
        return anthropic_text_completion(
            system_prompt=system_prompt,
            model_name=model_name,
            prompt=prompt,
            thinking=False
        )

    if "gemini" in model_name:
        return gemini_text_completion(
            system_prompt=system_prompt,
            model_name=model_name,
            prompt=prompt
        )

    if "llama-4-maverick" in model_name.lower():
        return together_ai_completion(
            system_prompt=system_prompt,
            model_name=model_name,
            prompt=prompt,
            temperature=0
        )

    if "grok-3" in model_name:
        return xai_grok_completion(
            system_prompt=system_prompt,
            model_name=model_name,
            prompt=prompt,
            temperature=0
        )

    raise ValueError(f"No API provider configured for model '{model_name}'")


def generate_descriptions(models, prompts, level_names):
    """Generate (or reuse cached) level descriptions for every model.

    Results are cached in ``generated_texts.json`` in the working directory
    as ``{model_name: {level_name: text}}``. Only levels missing from the
    cache are requested, so a model whose earlier run failed part-way is
    retried for the missing levels instead of being skipped forever.

    Args:
        models: model names to query.
        prompts: rendered prompts, aligned with ``level_names``.
        level_names: level identifiers used as keys in the result.

    Returns:
        The full cache dict, including entries loaded from disk for models
        that are not in ``models``.
    """
    cache_path = "generated_texts.json"

    # First try to load existing results
    try:
        with open(cache_path, "r") as f:
            results = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        results = {}

    system_prompt = "You are an Ace Attorney expert."

    for model_name in models:
        model_results = results.setdefault(model_name, {})
        pending = [
            (prompt, level_name)
            for prompt, level_name in zip(prompts, level_names)
            if level_name not in model_results
        ]

        if not pending:
            print(f"Skipping {model_name} - already exists in results")
            continue

        for prompt, level_name in pending:
            print(f"Generating for {model_name} - {level_name}")

            try:
                model_results[level_name] = _complete_with_provider(
                    model_name, system_prompt, prompt
                )
            except Exception as e:
                # Best effort: report the failure and move on to the next level.
                print(f"Error generating for {model_name} - {level_name}: {str(e)}")

            # Persist after every attempt so partial progress survives a crash.
            with open(cache_path, "w") as f:
                json.dump(results, f, indent=4)

    return results

# Step 3: Compute semantic similarity between generations and ground truth
def compute_similarity(generated_texts, transcripts):
    """Score each model's generated text against the reference transcripts.

    Texts are embedded with the ``all-MiniLM-L6-v2`` SentenceTransformer and
    compared by cosine similarity.

    Args:
        generated_texts: ``{model_name: {level_name: text}}``.
        transcripts: reference transcripts, indexed in the same order as the
            module-level ``level_names`` (which this function reads directly).

    Returns:
        ``{model_name: [score, ...]}`` with one score per level in
        ``level_names`` order. A level with no generated text scores 0.0.
        If an error occurs while processing a model, that model keeps only
        the scores computed before the error, so its list may be shorter.
    """
    # Load SBERT model
    model = SentenceTransformer("all-MiniLM-L6-v2")

    # The reference transcripts are the same for every model, so embed each
    # one once up front instead of once per model.
    transcript_embeddings = [
        model.encode(transcript, show_progress_bar=False)
        for transcript in transcripts
    ]

    similarity_scores = {}

    for model_name, texts in generated_texts.items():
        similarity_scores[model_name] = []

        # Debug print
        print(f"Processing model: {model_name}")
        print(f"Available texts: {list(texts.keys())}")

        try:
            for i, level_name in enumerate(level_names):
                if level_name in texts:
                    # Embed the generated text and compare with the reference
                    emb_gen = model.encode(texts[level_name], show_progress_bar=False)
                    emb_trans = transcript_embeddings[i]

                    # Calculate cosine similarity
                    sim_score = cosine_similarity([emb_gen], [emb_trans])[0][0]
                    similarity_scores[model_name].append(sim_score)

                    print(f"Computed similarity for {level_name}: {sim_score:.4f}")
                else:
                    print(f"Warning: Missing text for {level_name} in {model_name}")
                    similarity_scores[model_name].append(0.0)  # or np.nan if you prefer

        except KeyError as e:
            print(f"Error processing {model_name}: Missing key {e}")
            print(f"Available keys: {list(texts.keys())}")
            continue
        except Exception as e:
            print(f"Error processing {model_name}: {str(e)}")
            continue

    return similarity_scores

# Step 4: Calculate total similarity and correlation with performance
def analyze_correlation(similarity_scores, model_performance):
    """Relate each model's summed similarity to its leaderboard standing.

    Args:
        similarity_scores: ``{model_name: [per-level similarity, ...]}``.
        model_performance: ``{model_name: {"rank": ..., "score": ...}}``.

    Returns:
        A 3-tuple ``(results_df, correlation_results, total_similarity)``:
        a DataFrame of the models present in both inputs sorted by total
        similarity (descending), a dict with the Spearman coefficient and
        p-value against rank and against game score, and the summed
        similarity of every model in ``similarity_scores``.
    """
    total_similarity = {
        name: sum(per_level) for name, per_level in similarity_scores.items()
    }

    # Only models that also appear on the leaderboard enter the correlation.
    matched = [
        (name, total, model_performance[name]["rank"], model_performance[name]["score"])
        for name, total in total_similarity.items()
        if name in model_performance
    ]
    models = [row[0] for row in matched]
    sim_scores = [row[1] for row in matched]
    ranks = [row[2] for row in matched]
    game_scores = [row[3] for row in matched]

    # Spearman rank correlation of similarity with rank, then with game score
    rho_rank, p_rank = spearmanr(sim_scores, ranks)
    rho_score, p_score = spearmanr(sim_scores, game_scores)

    summary = pd.DataFrame({
        "Model": models,
        "Total Similarity": sim_scores,
        "Performance Rank": ranks,
        "Game Score": game_scores
    })
    results_df = summary.sort_values("Total Similarity", ascending=False)

    correlation_results = {
        "rank_correlation": rho_rank,
        "rank_p_value": p_rank,
        "score_correlation": rho_score,
        "score_p_value": p_score
    }

    return results_df, correlation_results, total_similarity

# Step 5: Main Function
def run_ablation_study():
    """Run the whole Ace Attorney study: generate, score, correlate, tabulate.

    Returns:
        A 5-tuple ``(results_df, correlation_results, detailed_df,
        generated_texts, similarity_scores)``. ``detailed_df`` has one row per
        model with generated text, its per-level and total similarity, and its
        leaderboard rank/score (NaN when the model is not on the leaderboard).
    """
    # Setup data
    prompts, level_names, model_performance, transcripts = setup_study_data()

    # Define models to test as specified by the user
    models = [
        "o1-2024-12-17",
        "o3-2025-04-16",
        "gemini-2.5-pro-exp-03-25",
        "claude-3-7-sonnet-20250219(thinking)",
        "gpt-4.1-2025-04-14",
        "claude-3-5-sonnet-20241022",
        "gemini-2.0-flash-thinking-exp-1219",
        "gemini-2.5-flash-preview-04-17",
        "o4-mini-2025-04-16",
        "Llama-4-Maverick-17B-128E-Instruct-FP8",
        "grok-3-mini-beta"
    ]

    # Generate descriptions
    print("Generating model descriptions...")
    generated_texts = generate_descriptions(models, prompts, level_names)

    # Compute similarities
    print("Computing semantic similarities...")
    similarity_scores = compute_similarity(generated_texts, transcripts)

    # Analyze correlations
    print("Analyzing correlations...")
    results_df, correlation_results, total_similarity = analyze_correlation(similarity_scores, model_performance)

    # Display results
    print("\nCorrelation Results:")
    print(f"Correlation with rank: {correlation_results['rank_correlation']:.4f} (p-value = {correlation_results['rank_p_value']:.4f})")
    print(f"Correlation with game score: {correlation_results['score_correlation']:.4f} (p-value = {correlation_results['score_p_value']:.4f})")

    def score_at(scores, position):
        # compute_similarity stops early for a model when it hits an error,
        # leaving a shorter list; report the missing level as NaN instead of
        # raising IndexError while building the table.
        return scores[position] if position < len(scores) else np.nan

    # Create a detailed DataFrame showing similarity scores for each prompt
    model_names = list(similarity_scores.keys())
    detailed_df = pd.DataFrame({
        "Model": model_names,
        "First Turnabout Similarity": [score_at(similarity_scores[m], 0) for m in model_names],
        "Sister Turnabout Similarity": [score_at(similarity_scores[m], 1) for m in model_names],
        "Total Similarity": [total_similarity[m] for m in model_names]
    })

    # Add performance data where available; models absent from the
    # leaderboard get NaN in both columns.
    detailed_df["Performance Rank"] = [
        model_performance[m]["rank"] if m in model_performance else np.nan
        for m in detailed_df["Model"]
    ]
    detailed_df["Game Score"] = [
        model_performance[m]["score"] if m in model_performance else np.nan
        for m in detailed_df["Model"]
    ]

    # Sort by total similarity
    detailed_df = detailed_df.sort_values("Total Similarity", ascending=False)

    return results_df, correlation_results, detailed_df, generated_texts, similarity_scores

# Run the study if executed directly.
# The names bound here (detailed_df, generated_texts, ...) are module-level
# and are reused by the cross-similarity and plotting cells further down.
if __name__ == "__main__":
    results_df, correlation_results, detailed_df, generated_texts, similarity_scores = run_ablation_study()
    print("\nDetailed Results:")
    print(detailed_df)

Generating model descriptions...
Skipping o1-2024-12-17 - already exists in results
Skipping o3-2025-04-16 - already exists in results
Skipping gemini-2.5-pro-exp-03-25 - already exists in results
Skipping claude-3-7-sonnet-20250219(thinking) - already exists in results
Skipping gpt-4.1-2025-04-14 - already exists in results
Skipping claude-3-5-sonnet-20241022 - already exists in results
Skipping gemini-2.0-flash-thinking-exp-1219 - already exists in results
Skipping gemini-2.5-flash-preview-04-17 - already exists in results
Skipping o4-mini-2025-04-16 - already exists in results
Skipping Llama-4-Maverick-17B-128E-Instruct-FP8 - already exists in results
Skipping grok-3-beta - already exists in results
Generating for grok-3-mini-beta - The First Turnabout
current reasoning effort: high
Generating for grok-3-mini-beta - Turnabout Sisters - (Part 1 to Part 4)
current reasoning effort: high
Computing semantic similarities...
Processing model: o1-2024-12-17
Available texts: ['The First Tur

In [None]:
import numpy as np
from scipy import stats
from sentence_transformers import SentenceTransformer
def add_embedding_cross_correlation(detailed_df, generated_texts):
    """Add each model's similarity between its two generated case descriptions.

    For every model in ``detailed_df['Model']`` the texts generated for the
    two cases are embedded with ``all-MiniLM-L6-v2`` and compared by cosine
    similarity. Models missing from ``generated_texts``, or missing either
    text, get NaN.

    The scores are written to a new ``'Cross Similarity Scores'`` column on
    ``detailed_df`` itself (the frame is modified in place), the mean of the
    non-NaN scores is printed, and the same frame is returned.
    """
    first_case = "The First Turnabout"
    sister_case = "Turnabout Sisters - (Part 1 to Part 4)"

    encoder = SentenceTransformer("all-MiniLM-L6-v2")

    def cross_similarity(model_name):
        # NaN marks a model for which the comparison cannot be made.
        if model_name not in generated_texts:
            return np.nan
        texts = generated_texts[model_name]
        if first_case not in texts or sister_case not in texts:
            return np.nan

        first_vec = encoder.encode(texts[first_case], show_progress_bar=False)
        sister_vec = encoder.encode(texts[sister_case], show_progress_bar=False)

        # cosine_similarity expects 2D inputs, hence the single-row lists
        return cosine_similarity([first_vec], [sister_vec])[0][0]

    cross_correlations = [cross_similarity(name) for name in detailed_df['Model']]
    detailed_df['Cross Similarity Scores'] = cross_correlations

    # Average over the models for which a score could be computed
    usable = [value for value in cross_correlations if not np.isnan(value)]
    avg_correlation = np.mean(usable)
    print(f"\nAverage embedding cosine similarity between cases: {avg_correlation:.4f}")

    return detailed_df

# To use this function, you would add this after getting your detailed_df:
# (detailed_df and generated_texts come from the run_ablation_study() cell;
# the function adds its column to detailed_df in place and returns the same frame)
detailed_df = add_embedding_cross_correlation(detailed_df, generated_texts)

# Display the updated DataFrame with the new column
# (the bare expression on the last line is what the notebook renders)
print("\nUpdated Results with Embedding Cross Correlation Scores:")
detailed_df


Average embedding cosine similarity between cases: 0.6925

Updated Results with Embedding Cross Correlation Scores:


Unnamed: 0,Model,First Turnabout Similarity,Sister Turnabout Similarity,Total Similarity,Performance Rank,Game Score,Embedding Cross Correlation
2,gemini-2.5-pro-exp-03-25,0.903199,0.90252,1.805719,3.0,20.0,0.600194
3,claude-3-7-sonnet-20250219(thinking),0.854651,0.908786,1.763437,4.0,8.0,0.652369
6,gemini-2.0-flash-thinking-exp-1219,0.865594,0.892488,1.758082,9.0,4.0,0.686458
0,o1-2024-12-17,0.892339,0.838106,1.730446,1.0,26.0,0.611148
10,grok-3-beta,0.902313,0.823115,1.725428,,,0.655004
1,o3-2025-04-16,0.823373,0.898478,1.721851,2.0,23.0,0.630078
4,gpt-4.1-2025-04-14,0.846685,0.839394,1.686079,7.0,6.0,0.649394
5,claude-3-5-sonnet-20241022,0.804925,0.87302,1.677945,6.0,6.0,0.672999
11,grok-3-mini-beta,0.822153,0.823807,1.645959,5.0,7.0,0.769331
7,gemini-2.5-flash-preview-04-17,0.910507,0.691844,1.60235,8.0,4.0,0.876161


In [5]:
# Calculate ground truth cross-correlation using SBERT embeddings
# Baseline for the per-model cross-similarity column: how similar the two
# reference transcripts are to each other under the same embedding model.
# Relies on json, cosine_similarity and level_names from earlier cells.
model = SentenceTransformer("all-MiniLM-L6-v2")
with open("./ace_attorney_ground_truth.json", "r") as f:
    ground_truth = json.load(f)
    
# Extract transcripts in the same order as level_names
transcripts = [ground_truth[level_name] for level_name in level_names]

# Get embeddings for both transcripts
first_turnabout_embedding = model.encode(transcripts[0], show_progress_bar=False)
sister_turnabout_embedding = model.encode(transcripts[1], show_progress_bar=False)

# Calculate cosine similarity between embedding vectors
# (inputs are wrapped in lists because cosine_similarity works on 2D arrays)
sim_score = cosine_similarity([first_turnabout_embedding], [sister_turnabout_embedding])[0][0]
print(f"\nGround truth cross-correlation: {sim_score:.4f}")


Ground truth cross-correlation: 0.6803


In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.preprocessing import StandardScaler

def _significance_stars(p_value):
    """Return '***', '**' or '*' for p < 0.001 / 0.01 / 0.05, otherwise 'ns'.

    A NaN p-value compares False against every threshold and so maps to 'ns'.
    """
    if p_value < 0.001:
        return "***"
    if p_value < 0.01:
        return "**"
    if p_value < 0.05:
        return "*"
    return "ns"


def _pearson_ignoring_nan(x, y):
    """Pearson r and p-value over the positions where both inputs are present.

    Returns ``(nan, nan)`` when fewer than two complete pairs remain.
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    complete = ~(np.isnan(x_values) | np.isnan(y_values))
    if complete.sum() < 2:
        return np.nan, np.nan
    corr, p_value = stats.pearsonr(x_values[complete], y_values[complete])
    return corr, p_value


def _plot_regression_panel(ax, df, y_column):
    """Draw the Total Similarity vs *y_column* regression and annotate its Pearson stats."""
    sns.regplot(
        data=df,
        x='Total Similarity',
        y=y_column,
        ax=ax,
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'}
    )
    ax.set_title(f'Linear Correlation: Similarity vs {y_column}\n(Standardized Values)')
    ax.set_xlabel('Total Similarity Score (Standardized)')
    ax.set_ylabel(f'{y_column} (Standardized)')

    corr, p_value = _pearson_ignoring_nan(df['Total Similarity'], df[y_column])
    ax.text(0.05, 0.95,
            f'Correlation: {corr:.3f} {_significance_stars(p_value)}\np-value: {p_value:.3e}',
            transform=ax.transAxes,
            bbox=dict(facecolor='white', alpha=0.8))


def plot_correlation_analysis(detailed_df):
    """Save a three-panel correlation figure to ``correlation_analysis.png``.

    Panels: regression of Game Score on Total Similarity, regression of
    Performance Rank on Total Similarity, and a Pearson correlation heatmap
    with significance stars.

    ``detailed_df`` is not modified. It may contain NaN in 'Game Score' and
    'Performance Rank' (models without leaderboard data); every coefficient
    and p-value is computed on the rows where both variables are present, so
    the annotated statistics are not NaN just because one model lacks data.

    Args:
        detailed_df: frame with the columns 'First Turnabout Similarity',
            'Sister Turnabout Similarity', 'Total Similarity', 'Game Score'
            and 'Performance Rank'.
    """
    # Work on a copy so the caller's frame keeps its raw values
    df_standardized = detailed_df.copy()

    # Standardize the plotted columns. Pearson correlation is unaffected by
    # this rescaling; it only puts the regression axes on a common scale.
    scaled_columns = ['Total Similarity', 'Game Score', 'Performance Rank']
    df_standardized[scaled_columns] = StandardScaler().fit_transform(
        df_standardized[scaled_columns]
    )

    # Create figure with three subplots
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

    # Plots 1 and 2: regression panels
    _plot_regression_panel(ax1, df_standardized, 'Game Score')
    _plot_regression_panel(ax2, df_standardized, 'Performance Rank')

    # Plot 3: Heatmap
    corr_columns = ['First Turnabout Similarity',
                    'Sister Turnabout Similarity',
                    'Total Similarity',
                    'Game Score',
                    'Performance Rank']

    # DataFrame.corr() already excludes NaN pairwise; the p-values use the
    # same pairwise exclusion so stars and coefficients describe the same rows.
    corr_matrix = df_standardized[corr_columns].corr()
    p_values = pd.DataFrame(
        [[_pearson_ignoring_nan(df_standardized[i], df_standardized[j])[1]
          for j in corr_columns] for i in corr_columns],
        columns=corr_columns,
        index=corr_columns
    )

    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        ax=ax3,
        fmt='.2f',
        square=True
    )
    ax3.set_title('Correlation Heatmap\n(Standardized Values)')

    # Add significance stars to heatmap. The numeric annotation sits at the
    # cell centre, so the stars go below it instead of on top of it.
    for i in range(len(corr_columns)):
        for j in range(len(corr_columns)):
            stars = _significance_stars(p_values.iloc[i, j])
            if stars != "ns":
                ax3.text(j + 0.5, i + 0.8, stars,
                         ha='center', va='center', color='black')

    # Rotate the heatmap's tick labels for readability (explicitly on ax3
    # rather than on whichever axes happens to be current)
    plt.setp(ax3.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax3.get_yticklabels(), rotation=0)

    # Adjust layout to prevent label cutoff
    fig.tight_layout()

    # Save the figure, then release it
    fig.savefig('correlation_analysis.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

def plot_model_performance(detailed_df):
    """Save a bar chart of game score per model to ``model_performance.png``.

    Bars show the standardized 'Game Score' of each model, sorted from
    highest to lowest; red markers show the standardized 'Total Similarity'
    of the same model. ``detailed_df`` is not modified.

    Models without a game score (NaN) get no bar label, and labels of
    negative bars are placed below the bar end instead of inside the bar.
    """
    # Create a copy and standardize values so both series share one axis
    df_standardized = detailed_df.copy()
    scaler = StandardScaler()
    df_standardized[['Total Similarity', 'Game Score']] = scaler.fit_transform(
        df_standardized[['Total Similarity', 'Game Score']]
    )

    # Create a new figure
    fig = plt.figure(figsize=(12, 6))

    # Sort dataframe by Game Score
    sorted_df = df_standardized.sort_values('Game Score', ascending=False)

    # Create bar plot
    bars = plt.bar(sorted_df['Model'], sorted_df['Game Score'])

    # Add Total Similarity as scatter points, one per bar position
    plt.scatter(range(len(sorted_df)),
                sorted_df['Total Similarity'],
                color='red',
                s=100,
                label='Total Similarity')

    # Customize the plot
    plt.title('Model Performance vs Similarity Scores\n(Standardized Values)')
    plt.xlabel('Models')
    plt.ylabel('Standardized Score')
    plt.xticks(rotation=45, ha='right')
    plt.legend()

    # Add value labels at the end of each bar
    for bar in bars:
        height = bar.get_height()
        if not np.isfinite(height):
            # No game score for this model: there is no bar to label.
            continue
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{height:.2f}',
                 ha='center', va='bottom' if height >= 0 else 'top')

    # Adjust layout
    plt.tight_layout()

    # Save the figure, then release it
    plt.savefig('model_performance.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

def generate_correlation_plots(detailed_df):
    """Render both figures for ``detailed_df`` and report the output files.

    Calls plot_correlation_analysis and then plot_model_performance, each of
    which saves its own PNG, and prints the two file names.
    """
    for render in (plot_correlation_analysis, plot_model_performance):
        render(detailed_df)

    print("Correlation plots have been generated and saved as:")
    print("1. correlation_analysis.png")
    print("2. model_performance.png")

# Example usage:
# After running your ablation study and getting detailed_df
# (the module-level frame produced by the run_ablation_study() cell):
generate_correlation_plots(detailed_df)

Correlation plots have been generated and saved as:
1. correlation_analysis.png
2. model_performance.png


## Super Mario Ablation Study

In [None]:
import sys, pathlib; sys.path.append(str(pathlib.Path().resolve().parent))
import os
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import requests
import json
from pathlib import Path

import base64

from tools.serving.api_providers import (
    openai_text_reasoning_completion,
    anthropic_text_completion,
    gemini_text_completion,
    together_ai_completion,
    xai_grok_completion,
    openai_completion,
    anthropic_completion,
    gemini_completion,
    together_ai_completion,
)


# Models queried by analyze_single_image. That function picks the API
# provider by substring match on these names.
MODEL_NAMES = [
    "o4-mini-2025-04-16",
    "o3-2025-04-16",
    "gemini-2.5-pro-exp-03-25",
    "claude-3-7-sonnet-20250219",
    "Llama-4-Maverick-17B-128E-Instruct-FP8",
    "claude-3-5-sonnet-20241022"]

# Prompt template for vision models.
# Sent verbatim together with the screenshot; it fixes a three-line answer
# format and a closed vocabulary for the "upcoming" field.
PROMPT = """You are given ONE RGB frame from an NES game.
Respond with EXACTLY these three lines—no extra text, no blank lines:

GameName: <full title or UNKNOWN>
LevelNumber: <world-stage, e.g. 1-1, or UNKNOWN>
LevelDetails: <semi-colon–separated list in this template>
              area=<area_type>;
              onscreen=<comma-separated facts>;
              upcoming=<comma-separated fine-grained events for rest of level>

Formatting rules
• <area_type> one of: overworld, underground, water, castle, bonus
• <onscreen> list ONLY objects & terrain entirely visible NOW
• <upcoming> list EVERY key event that will happen later in THIS level,
  expressed as lowercase snake-case tokens, in left→right order
  (see examples). If nothing, write [].
• No synonyms, plurals, or re-ordering: use the exact vocabulary below.
  ── allowed upcoming tokens ──
  six_block_triangle_q_bricks_mushroom, triple_pipes_goombas_between,
  bonus_pipe_19_coins_skip, hidden_1up_block, pit_after_pipes,
  question_block_item, falling_goombas_block_row, ten_coin_brick,
  starman_brick, second_q_triangle_fireflower, koopa_troopa,
  extra_goombas, brick_question_row, pyramid_hard_blocks_gap,
  double_pyramid_hard_blocks_pit, exit_pipe_from_bonus,
  two_goombas_four_block_row, inaccessible_pipe_end, staircase, flagpole
• If unsure of a field, write UNKNOWN (not empty).
• Do NOT break the three-line structure under any circumstance."""

# Load the reference answers and display the entry for level 1-1
# (the bare expression on the last line is what the notebook renders)
with open("super_mario_bros_ground_truth.json", "r") as f:
  ground_truth = json.load(f)
ground_truth["1-1"]


  from .autonotebook import tqdm as notebook_tqdm


{'1-1': 'GameName: Super Mario Bros.\nLevelNumber: 1-1\nLevelDetails: area=overworld;\n              onscreen=flat_path,bushes,little_goomba,question_block;\n              upcoming=six_block_triangle_q_bricks_mushroom,triple_pipes_goombas_between,\n                       bonus_pipe_19_coins_skip,hidden_1up_block,pit_after_pipes,\n                       question_block_item,falling_goombas_block_row,ten_coin_brick,\n                       starman_brick,second_q_triangle_fireflower,koopa_troopa,\n                       extra_goombas,brick_question_row,pyramid_hard_blocks_gap,\n                       double_pyramid_hard_blocks_pit,exit_pipe_from_bonus,\n                       two_goombas_four_block_row,inaccessible_pipe_end,staircase,flagpole'}


In [None]:
def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    # b64encode yields bytes; decode so the result can be embedded in JSON/text
    return base64.b64encode(raw_bytes).decode('utf-8')

def analyze_single_image(image_path, level_id="1-1"):
    """Ask every model in MODEL_NAMES to describe one Mario screenshot.

    Existing results are loaded from ``super_mario_bros_generated_text.json``
    (if present and valid), each model's answer is stored under
    ``results[model_name][level_id]``, and the file is rewritten after every
    successful call. A failure for one model is printed and does not stop
    the remaining models.

    Args:
        image_path: path of the screenshot to send.
        level_id: key under which each model's answer is stored.

    Returns:
        The accumulated ``{model_name: {level_id: response}}`` dict.
    """
    # Result file
    result_file = "super_mario_bros_generated_text.json"

    # Load existing results if file exists
    try:
        with open(result_file, "r") as f:
            results = json.load(f)
        print(f"Loaded existing results from {result_file}")
    except (FileNotFoundError, json.JSONDecodeError):
        results = {}

    # System prompt
    system_prompt = "You are an expert at identifying NES game content."

    # Encode the image once; the same payload is sent to every model
    base64_image = encode_image(image_path)

    # Process each model
    for model_name in MODEL_NAMES:
        print(f"\nProcessing model: {model_name}")

        # Initialize model's results if not already present
        results.setdefault(model_name, {})

        try:
            # Call the appropriate API based on model type
            if any(tag in model_name for tag in ("o1-", "o3-", "o4-", "gpt-")):
                response = openai_completion(
                    system_prompt=system_prompt,
                    model_name=model_name,
                    base64_image=base64_image,
                    prompt=PROMPT,
                    temperature=0
                )

            elif "claude" in model_name:
                response = anthropic_completion(
                    system_prompt=system_prompt,
                    model_name=model_name,
                    base64_image=base64_image,
                    prompt=PROMPT,
                    thinking=False
                )

            elif "gemini" in model_name:
                response = gemini_completion(
                    system_prompt=system_prompt,
                    model_name=model_name,
                    base64_image=base64_image,
                    prompt=PROMPT
                )

            elif "llama-4-maverick" in model_name.lower():
                response = together_ai_completion(
                    system_prompt=system_prompt,
                    model_name=model_name,
                    base64_image=base64_image,
                    prompt=PROMPT
                )

            elif "grok" in model_name.lower():
                # Note: grok may not support images directly
                print(f"Skipping {model_name} - image support not available")
                continue

            else:
                # Without this branch `response` would be unbound, or still
                # hold the previous model's answer, which would then be
                # stored under this model's name.
                raise ValueError(f"No vision API provider configured for model '{model_name}'")

            # Store the result
            results[model_name][level_id] = response
            print(f"Result for {model_name}:\n{response}\n")

            # Save after each successful generation
            with open(result_file, "w") as f:
                json.dump(results, f, indent=4)

        except Exception as e:
            # Best effort: report and continue with the next model
            print(f"Error with {model_name}: {str(e)}")

    return results

# Single-image driver: query every model in MODEL_NAMES about one screenshot.
if __name__ == "__main__":
    # Path to your Mario screenshot
    image_path = "mario_1-1.png"  # Replace with your actual image path
    
    # Level ID (derived from filename or specified manually)
    level_id = "1-1"
    
    # Analyze the image
    # (analyze_single_image itself writes the results file after each
    # successful model call; nothing is saved here)
    results = analyze_single_image(image_path, level_id)
    
    print("\nAnalysis complete!")
    print(f"Results saved to super_mario_bros_generated_text.json")

In [None]:

# Directory driver: run the analysis over a folder of screenshots.
# NOTE(review): `main` is not defined anywhere in the visible part of this
# notebook - confirm it is defined in another cell, otherwise the else-branch
# raises NameError.
if __name__ == "__main__":
    # Example usage:
    # Replace with your image directory path
    image_dir = "mario_screenshots"
    
    # Check if directory exists
    if not os.path.exists(image_dir):
        print(f"Directory {image_dir} does not exist. Please create it and add screenshot images.")
        print("Each image filename should match its level ID in the ground truth file.")
    else:
        main(image_dir) 