# Setup Evaluations for Multiple Games

In [1]:
import os
import sys
import json
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import requests
import pandas as pd

from pandas import DataFrame, json_normalize
from typing import List, Dict, Any, Tuple, Union, Optional

# Root directory of the experiment logs (relative to the working directory);
# each experiment's "summary.json" is looked up in a sub-folder of this path.
LOGS_PATH: str = "../expt-logs/"

In [2]:
import dotenv
# Load environment variables from a .env file, if one can be found.
dotenv.load_dotenv()

True

In [3]:
# Put the parent directory on the import path before importing `utils`
# (presumably utils lives one level above this notebook — confirm against the repo layout).
sys.path.append("..")

from utils import load_agent_logs_df, read_jsonl_as_json, load_game_summary

In [4]:
# Log folder names of the experiments to compare.
# Order matters: entries are paired by position with DESCRIPTIONS.
EXPT_NAMES: List[str] = [
    "2025-01-25_phi_llama_100_games",
    "2025-01-27_llama_phi_100_games",
    "2025-01-28_phi_phi_100_games",
    "2025-01-28_llama_llama_100_games",
]

In [10]:
# Human-readable label for each experiment, used as the 'Models' column value.
# Order matters: entries are paired by position with EXPT_NAMES.
DESCRIPTIONS: List[str] = [
    "Crewmate: Phi 4, Impostor: Llama 3.3",
    "Crewmate: Llama 3.3, Impostor: Phi 4",
    "Crewmate: Phi 4, Impostor: Phi 4",
    "Crewmate: Llama 3.3, Impostor: Llama 3.3",
]

In [11]:
# Build the path to each experiment's summary file, in EXPT_NAMES order.
summary_logs_paths: List[str] = []
for expt_name in EXPT_NAMES:
    summary_file = os.path.join(LOGS_PATH, expt_name, "summary.json")
    summary_logs_paths.append(summary_file)

In [12]:
# Load every summary file via load_game_summary, preserving the order of
# summary_logs_paths so results can be matched to DESCRIPTIONS by position.
summary_dfs: List[DataFrame] = list(map(load_game_summary, summary_logs_paths))

In [13]:
# Human-readable outcome labels; a game's 'Winner' code is the 1-based
# position of its label in this list.
reasons = [
    "Impostors Win! (outnumbered C.)",
    "Crewmates Win! (tasks done)",
    "Crewmates Win! (voted out I.)",
    "Impostors Win! (time up)",
]

# Explicit code -> label lookup. Indexing the list with `code - 1` would let a
# code of 0 wrap around to the last label instead of failing.
reason_by_code = dict(enumerate(reasons, start=1))

for i, df in enumerate(summary_dfs):
    # Fail loudly on any code we have no label for rather than mislabelling it.
    unknown_codes = set(df['Winner'].unique()) - set(reason_by_code)
    if unknown_codes:
        raise ValueError(
            f"Unknown Winner code(s) {sorted(unknown_codes, key=str)} in experiment "
            f"{DESCRIPTIONS[i]!r}; expected one of {sorted(reason_by_code)}"
        )
    df['Winner Reason'] = df['Winner'].map(reason_by_code)
    # Tag every game with the model pairing it was played with.
    df['Models'] = DESCRIPTIONS[i]

In [14]:
# Stack all experiments into one frame; the 'Models' column tells them apart.
# NOTE: the index is not reset, so each experiment keeps its own row labels
# and index values repeat across experiments.
combined_df: DataFrame = pd.concat(summary_dfs)

In [15]:
# Display the combined frame as the cell output.
combined_df

Unnamed: 0,Game,Winner,Winner Reason,Models
0,Game 1,1,Impostors Win! (outnumbered C.),"Crewmate: Phi 4, Impostor: Llama 3.3"
1,Game 2,3,Crewmates Win! (voted out I.),"Crewmate: Phi 4, Impostor: Llama 3.3"
2,Game 3,1,Impostors Win! (outnumbered C.),"Crewmate: Phi 4, Impostor: Llama 3.3"
3,Game 4,1,Impostors Win! (outnumbered C.),"Crewmate: Phi 4, Impostor: Llama 3.3"
4,Game 5,1,Impostors Win! (outnumbered C.),"Crewmate: Phi 4, Impostor: Llama 3.3"
...,...,...,...,...
95,Game 65,2,Crewmates Win! (tasks done),"Crewmate: Llama 3.3, Impostor: Llama 3.3"
96,Game 67,3,Crewmates Win! (voted out I.),"Crewmate: Llama 3.3, Impostor: Llama 3.3"
97,Game 71,1,Impostors Win! (outnumbered C.),"Crewmate: Llama 3.3, Impostor: Llama 3.3"
98,Game 55,1,Impostors Win! (outnumbered C.),"Crewmate: Llama 3.3, Impostor: Llama 3.3"


In [16]:
import numpy as np
from scipy import stats

# Function to calculate bootstrap confidence intervals
def calculate_bootstrap_ci(data, n_bootstrap=1000, ci=0.90, seed=None):
    """
    Calculate a percentile-bootstrap confidence interval for the number of
    ones in a binary dataset.

    The data is resampled with replacement ``n_bootstrap`` times; each
    resample is summed, and the interval is read off the percentiles of those
    sums. The bounds are therefore *counts* (on the same scale as
    ``sum(data)``), not proportions.

    Args:
        data: Binary (0/1) array-like of observations
        n_bootstrap: Number of bootstrap samples
        ci: Confidence level (0-1)
        seed: Optional seed for reproducible resampling. When None (the
            default) NumPy's global random state is used, as before.

    Returns:
        tuple: (lower bound, upper bound) of the bootstrapped count.
        ``(0.0, 0.0)`` when ``data`` is empty.

    Raises:
        ValueError: If ``ci`` is not within [0, 1].
    """
    if not 0 <= ci <= 1:
        raise ValueError(f"ci must be between 0 and 1, got {ci!r}")

    n_obs = len(data)
    if n_obs == 0:
        # Nothing to resample: the count of an empty dataset is always zero.
        return 0.0, 0.0

    # A dedicated generator keeps seeded calls from touching the global state.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One row per bootstrap replicate, each the same size as the original data.
    bootstrap_samples = rng.choice(data, size=(n_bootstrap, n_obs), replace=True)
    bootstrap_counts = np.sum(bootstrap_samples, axis=1)

    # Split the excluded probability mass equally between the two tails.
    lower_percentile = (1 - ci) / 2 * 100
    upper_percentile = (1 + ci) / 2 * 100
    lower = np.percentile(bootstrap_counts, lower_percentile)
    upper = np.percentile(bootstrap_counts, upper_percentile)
    return lower, upper

In [35]:
def plot_winner_reasons_with_ci(combined_df, n_bootstrap=1000, ci=0.90):
    """
    Plot winner reasons with bootstrap confidence intervals.

    Draws one group of bars per distinct 'Winner Reason' and, inside each
    group, one bar per distinct 'Models' value. Bar height is the number of
    games with that reason for that model. Error bars span the bootstrap
    confidence interval returned by ``calculate_bootstrap_ci`` and are drawn
    asymmetrically, so their ends sit on the interval's lower and upper bounds.

    Args:
        combined_df: DataFrame with game results; must contain the columns
            'Winner Reason' and 'Models'
        n_bootstrap: Number of bootstrap samples
        ci: Confidence interval (0-1)

    Returns:
        plotly.graph_objects.Figure: The figure object
    """
    # Softer color theme: (fill color, outline / error-bar color) per model
    custom_colors = [
        ("#b05a44", "#8b4513"),  # More reddish terracotta/sienna
        ("#4a5859", "#22333b"),  # Deep slate/charcoal
        ("#567d46", "#2c5f2d"),  # Mid-tone forest green
        ("#8e7dbe", "#5e548e"),  # Royal lavender/plum
    ]
    # Get unique winner reasons and models (in order of first appearance)
    winner_reasons = combined_df['Winner Reason'].unique()
    models = combined_df['Models'].unique()

    # Create wrapped text labels for x-ticks
    wrapped_labels = []
    for reason in winner_reasons:
        # Break labels of more than three words roughly in half, using <br>
        # because Plotly renders tick text as HTML-like markup.
        words = reason.split()
        if len(words) > 3:
            mid = len(words) // 2
            wrapped_labels.append(' '.join(words[:mid]) + '<br>' + ' '.join(words[mid:]))
        else:
            wrapped_labels.append(reason)

    # Create a figure
    fig = go.Figure()

    # Each reason group is centred on an integer x position and occupies 0.8
    # of the unit spacing, shared equally between the models.
    num_models = len(models)
    bar_width = 0.8 / num_models if num_models else 0.8

    # Tallest bar seen so far; used below to size the y-axis.
    max_count = 0

    # For each model, add a bar trace with error bars
    for j, model in enumerate(models):
        model_data = combined_df[combined_df['Models'] == model]
        # Horizontal shift of this model's bar relative to the group centre.
        offset = (j - num_models / 2 + 0.5) * bar_width

        counts = []
        error_plus = []
        error_minus = []
        positions = []

        for i, reason in enumerate(winner_reasons):
            # 1 for every game of this model that ended with this reason
            reason_data = (model_data['Winner Reason'] == reason).astype(int)
            count = reason_data.sum()
            counts.append(count)
            positions.append(i + offset)

            # Bootstrap interval on the count. It need not be symmetric around
            # the observed count, so keep the two arms separate; clamp at zero
            # in case a bound falls on the wrong side of the count.
            lower, upper = calculate_bootstrap_ci(reason_data.values, n_bootstrap, ci)
            error_plus.append(max(upper - count, 0))
            error_minus.append(max(count - lower, 0))

        max_count = max([max_count, *counts])

        # Cycle through the palette if there are more models than colors
        color_idx = j % len(custom_colors)
        fig.add_trace(go.Bar(
            x=positions,  # Use custom positions
            y=counts,
            name=model,
            marker_color=custom_colors[color_idx][0],
            marker_line_color=custom_colors[color_idx][1],
            marker_line_width=1.5,
            error_y=dict(
                type='data',
                symmetric=False,
                array=error_plus,
                arrayminus=error_minus,
                visible=True,
                color=custom_colors[color_idx][1]
            ),
            width=bar_width * 0.9  # Slightly narrower than calculated width
        ))

    # Update layout
    fig.update_layout(
        title="",
        xaxis_title="Winner (Reason)",
        yaxis_title="Number of Games",
        barmode="group",
        showlegend=True,
        plot_bgcolor='white'
    )

    # Set custom tick positions and labels
    fig.update_xaxes(
        tickmode='array',
        tickvals=list(range(len(winner_reasons))),  # Center positions for each group
        ticktext=wrapped_labels,
        tickangle=0
    )

    # Ensure y-axis starts at zero, with some headroom for error bars.
    # With no data there is nothing to scale to, so leave the axis automatic.
    if max_count > 0:
        fig.update_yaxes(range=[0, max_count * 1.4])

    # Legend settings
    fig.update_layout(legend=dict(
        orientation="v",
        x=0.4,
        y=0.85,
        traceorder="normal",
        bgcolor="white",
        bordercolor="black",
        borderwidth=1
    ))

    # Grid lines
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

    # Width and height
    fig.update_layout(width=800, height=450)

    # Font settings for research paper
    fig.update_layout(font=dict(family='serif', size=15, color='black'))
    fig.update_xaxes(title_font=dict(family='serif', size=18, color='black'))
    fig.update_yaxes(title_font=dict(family='serif', size=18, color='black'))
    fig.update_xaxes(tickfont=dict(family='serif', size=18, color='black'))
    fig.update_yaxes(tickfont=dict(family='serif', size=18, color='black'))
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=False)

    return fig

In [36]:
# Create the figure with the default settings (1000 bootstrap samples, 90% CI)
# and display it
fig = plot_winner_reasons_with_ci(combined_df)
fig.show()

In [37]:
# save the figure in high-res pdf
output_path = "plots/winner_reasons_with_ci.pdf"
# Make sure the target directory exists so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.write_image(output_path, format="pdf")