# Tournament Results Visualization

This notebook reads tournament analysis CSV files and creates visualizations of agent performance.

In [3]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

## Load Tournament Results

In [4]:
# Location of the tournament analysis export; change this one constant to load a different file.
RESULTS_CSV = './tournament_results.csv'

# Columns that the later cells of this notebook read from the results table.
REQUIRED_COLUMNS = [
    'Rank', 'Agent', 'ELO', 'CI_Low', 'CI_High',
    'Games', 'Win_Percentage', 'Error_Percentage',
]

# Load the CSV file
df = pd.read_csv(RESULTS_CSV)

# Fail here with a clear message instead of a KeyError in a later cell.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"{RESULTS_CSV} is missing expected columns: {missing_columns}")

# Display the data
print(f"Tournament Results: {len(df)} agents")
df.head()

Tournament Results: 12 agents


Unnamed: 0,Rank,Agent,ELO,CI_Low,CI_High,Games,Wins,Losses,Draws,Win_Percentage,Error_Percentage
0,1,LLMAgent_x-ai/grok-3-mini,1743.0,1652.0,1782.0,33,29,4,0,87.9,0.0
1,2,LLMAgent_openai/gpt-5-mini,1679.0,1606.0,1743.0,32,26,6,0,81.2,0.0
2,3,LLMAgent_openai/gpt-4.1-mini,1673.0,1614.0,1754.0,33,27,6,0,81.8,0.0
3,4,LLMAgent_openai/gpt-5-nano,1649.0,1565.0,1689.0,33,24,9,0,72.7,0.0
4,5,LLMAgent_google/gemini-2.5-flash-lite,1572.0,1514.0,1648.0,34,22,12,0,64.7,14.7


## Prepare ELO Ratings and Confidence Intervals for Plotting

In [5]:
# Calculate error bar values (distance from center to bounds)
df['error_lower'] = df['ELO'] - df['CI_Low']
df['error_upper'] = df['CI_High'] - df['ELO']

# Clean up agent names for better display.
# Only a *leading* "LLMAgent_" is stripped (anchored with ^), so the same text appearing
# later in a name is left alone. `regex=` is passed explicitly because its default
# differs between pandas versions.
df['Agent_Clean'] = (
    df['Agent']
    .str.replace(r'^LLMAgent_', '', regex=True)
    .str.replace('/', '_', regex=False)
)

# Ensure data is sorted by ELO (should already be from analysis script).
# mergesort is stable, so agents with equal ELO keep the order they arrived in.
df = df.sort_values('ELO', ascending=False, kind='mergesort')

print("Data prepared for visualization")
df[['Agent_Clean', 'ELO', 'CI_Low', 'CI_High']].head()

Data prepared for visualization


Unnamed: 0,Agent_Clean,ELO,CI_Low,CI_High
0,x-ai_grok-3-mini,1743.0,1652.0,1782.0
1,openai_gpt-5-mini,1679.0,1606.0,1743.0
2,openai_gpt-4.1-mini,1673.0,1614.0,1754.0
3,openai_gpt-5-nano,1649.0,1565.0,1689.0
4,google_gemini-2.5-flash-lite,1572.0,1514.0,1648.0


## ELO Ratings Bar Chart

In [6]:
# Columns exposed to the hover tooltip, in the order the template indexes them
# (customdata[0] .. customdata[4]).
hover_fields = ['CI_Low', 'CI_High', 'Games', 'Win_Percentage', 'Error_Percentage']
hover_text = "".join([
    "<b>%{x}</b><br>",
    "ELO: %{y:.0f}<br>",
    "90% CI: [%{customdata[0]:.0f} - %{customdata[1]:.0f}]<br>",
    "Games: %{customdata[2]}<br>",
    "Win Rate: %{customdata[3]:.1f}%<br>",
    "Error Rate: %{customdata[4]:.1f}%",
    "<extra></extra>",
])

# Asymmetric error bars: separate upward and downward extents per agent
ci_error_bars = {
    'type': 'data',
    'symmetric': False,
    'array': df['error_upper'],
    'arrayminus': df['error_lower'],
    'visible': True,
    'color': 'rgba(0,0,0,0.5)',
    'thickness': 2,
    'width': 3,
}

# Bars are colored by their own ELO value and outlined in black
bar_style = {
    'color': df['ELO'],
    'colorscale': 'RdYlBu_r',
    'colorbar': {'title': "ELO Rating"},
    'line': {'color': 'black', 'width': 1},
}

# Whole-number rating label drawn above each bar
elo_labels = ["{:.0f}".format(elo) for elo in df['ELO']]

# Assemble the figure: one bar trace carrying the error bars and hover data
fig = go.Figure()
fig.add_trace(go.Bar(
    x=df['Agent_Clean'],
    y=df['ELO'],
    error_y=ci_error_bars,
    marker=bar_style,
    text=elo_labels,
    textposition='outside',
    hovertemplate=hover_text,
    customdata=df[hover_fields].values,
))

# Titles, axes and canvas size
fig.update_layout(
    title=dict(
        text='Tournament Results: ELO Ratings with 90% Confidence Intervals',
        x=0.5,
        font=dict(size=16),
    ),
    xaxis={'title': 'Agent', 'tickangle': 45, 'tickfont': {'size': 10}},
    yaxis={'title': 'ELO Rating', 'gridcolor': 'lightgray'},
    plot_bgcolor='white',
    height=600,
    width=1000,
    margin={'b': 120},  # Extra bottom margin for rotated labels
)

# Dashed horizontal reference line at the 1500 baseline
fig.add_hline(
    y=1500,
    line_dash="dash",
    line_color="gray",
    annotation_text="Starting ELO (1500)",
    annotation_position="top right",
)

fig.show()

## Summary Statistics

In [7]:
# Headline figures for the whole tournament
agent_count = len(df)
# Summed per-agent game counts, halved with integer division
# (presumably each game is counted once for each of its two participants — verify against the analysis script)
game_count = df['Games'].sum() // 2
lowest_elo = df['ELO'].min()
highest_elo = df['ELO'].max()
mean_elo = df['ELO'].mean()

print("Tournament Summary Statistics:")
print(f"Total Agents: {agent_count}")
print(f"Total Games: {game_count}")
print(f"ELO Range: {lowest_elo:.0f} - {highest_elo:.0f}")
print(f"Average ELO: {mean_elo:.0f}")

# First three rows of the table as it is currently ordered
print("\nTop 3 Performers:")
podium = df.head(3)
for rank, name, elo, win_pct in zip(
    podium['Rank'], podium['Agent_Clean'], podium['ELO'], podium['Win_Percentage']
):
    print(f"  {rank}. {name}: {elo:.0f} ELO ({win_pct:.1f}% wins)")

# Agents whose error percentage is strictly above 10
print("\nAgents with Error Issues (>10% error rate):")
error_agents = df.loc[df['Error_Percentage'].gt(10)]
for name, err_pct in zip(error_agents['Agent_Clean'], error_agents['Error_Percentage']):
    print(f"  {name}: {err_pct:.1f}% error rate")

Tournament Summary Statistics:
Total Agents: 12
Total Games: 199
ELO Range: 1225 - 1743
Average ELO: 1500

Top 3 Performers:
  1. x-ai_grok-3-mini: 1743 ELO (87.9% wins)
  2. openai_gpt-5-mini: 1679 ELO (81.2% wins)
  3. openai_gpt-4.1-mini: 1673 ELO (81.8% wins)

Agents with Error Issues (>10% error rate):
  google_gemini-2.5-flash-lite: 14.7% error rate
  openai_gpt-oss-120b: 84.8% error rate
  anthropic_claude-3.5-haiku-20241022: 90.9% error rate
  openai_gpt-oss-20b: 93.9% error rate
