# LLM Poker Arena - Deep Analysis

Comprehensive analysis of AI poker players across multiple LLM providers.

In [1]:
# Setup & Imports
import sqlite3
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:
# Load Data from SQLite (filtered for completed 1000-hand 2-player games)
DB_PATH = '../data/games.db'
conn = sqlite3.connect(DB_PATH)

# Filter criteria matching analyze_results.py exactly
GAME_FILTER = """
    WHERE g.status = 'completed'
      AND g.num_hands = 1000
      AND g.num_players = 2
"""

# Load games (filtered)
games = pd.read_sql(f'''
    SELECT * FROM games g {GAME_FILTER}
''', conn)

# Load game_players with game metadata for BB/100 calculation
game_players = pd.read_sql(f'''
    SELECT gp.*, g.num_hands, g.big_blind, g.starting_stack
    FROM game_players gp
    JOIN games g ON gp.game_id = g.id
    {GAME_FILTER}
''', conn)

# Load actions (filtered)
actions = pd.read_sql(f'''
    SELECT a.*, g.big_blind, g.num_hands
    FROM actions a
    JOIN games g ON a.game_id = g.id
    {GAME_FILTER}
''', conn)

# Load hands (filtered)
hands = pd.read_sql(f'''
    SELECT h.* FROM hands h
    JOIN games g ON h.game_id = g.id
    {GAME_FILTER}
''', conn)

# Load showdowns (filtered)
showdowns = pd.read_sql(f'''
    SELECT s.*, g.big_blind FROM showdowns s
    JOIN games g ON s.game_id = g.id
    {GAME_FILTER}
''', conn)

# Load api_usage (no filter needed - will aggregate by model)
api_usage = pd.read_sql('SELECT * FROM api_usage', conn)

# Load opponent_profiles (filtered)
opponent_profiles = pd.read_sql(f'''
    SELECT op.* FROM opponent_profiles op
    JOIN games g ON op.game_id = g.id
    {GAME_FILTER}
''', conn)

conn.close()

print(f"Loaded {len(actions):,} actions across {len(hands):,} hands in {len(games):,} games")
print(f"Filter: completed games with 1000 hands, 2 players")
print(f"Opponent profiles: {len(opponent_profiles):,} observations")

Loaded 63,851 actions across 15,000 hands in 15 games
Filter: completed games with 1000 hands, 2 players
Opponent profiles: 30 observations


In [3]:
# Create model name mapping (shorter display names)
MODEL_NAMES = {
    'claude-opus-4-5-20251101': 'Opus',
    'claude-sonnet-4-5-20250929': 'Sonnet', 
    'claude-haiku-4-5-20251001': 'Haiku',
    'gpt-5.2': 'GPT-5',
    'gpt-5-mini': 'GPT-5 Mini',
    'deepseek-chat': 'DeepSeek',
    'mistral-large-latest': 'Mistral',
    'mistral-small-latest': 'Mistral Small',
    'grok-4-1-fast-reasoning': 'Grok',
    'grok-4-1-fast-non-reasoning': 'Grok (No Reason)',
    'gemini-3-pro-preview': 'Gemini'
}

# Model colors (matching analyze_results.py)
MODEL_COLORS = {
    "Opus": "#B8860B", "Sonnet": "#CC785C", "Haiku": "#D4A574",
    "GPT-5": "#10A37F", "GPT-5 Mini": "#74AA9C",
    "DeepSeek": "#4A90D9",
    "Mistral": "#FF7000", "Mistral Small": "#FFB366",
    "Grok": "#1DA1F2", "Grok (No Reason)": "#657786",
    "Gemini": "#4285F4",
}

# Add short names to dataframes
actions['model'] = actions['model_id'].map(MODEL_NAMES).fillna(actions['model_id'])
game_players['model'] = game_players['model_id'].map(MODEL_NAMES).fillna(game_players['model_id'])
api_usage['model'] = api_usage['model_id'].map(MODEL_NAMES).fillna(api_usage['model_id'])
showdowns['model'] = showdowns['player_name']  # Use player name for showdowns

# Map opponent profiles
if len(opponent_profiles) > 0:
    opponent_profiles['observer'] = opponent_profiles['observer_model'].map(MODEL_NAMES).fillna(opponent_profiles['observer_model'])
    opponent_profiles['observed'] = opponent_profiles['observed_model'].map(MODEL_NAMES).fillna(opponent_profiles['observed_model'])

# Calculate BB/100 metric (matches analyze_results.py)
game_players['total_invested'] = game_players['starting_chips'] + (game_players['rebuys'] * game_players['starting_stack'])
game_players['profit'] = game_players['final_chips'] - game_players['total_invested']
game_players['bb_won'] = game_players['profit'] / game_players['big_blind']
game_players['bb_per_100'] = (game_players['bb_won'] / game_players['num_hands']) * 100
game_players['roi'] = game_players['profit'] / game_players['total_invested']

print(f"BB/100 calculated for {len(game_players)} player-game records")

BB/100 calculated for 30 player-game records


---
## 1. Executive Summary

In [4]:
# Summary statistics with BB/100 as primary metric (matches analyze_results.py)
summary = game_players.groupby('model').agg({
    'bb_per_100': 'mean',          # PRIMARY METRIC
    'profit': 'sum',
    'roi': 'mean',
    'hands_won': 'sum',
    'num_hands': 'sum',
    'rebuys': 'sum',
    'game_id': 'count'
}).round(2)

summary.columns = ['Avg BB/100', 'Total Profit', 'Avg ROI', 'Hands Won', 'Total Hands', 'Total Rebuys', 'Games Played']

# Calculate win rate
summary['Win Rate'] = (summary['Hands Won'] / summary['Total Hands'] * 100).round(1)

# Reorder columns with BB/100 first
summary = summary[['Avg BB/100', 'Total Profit', 'Win Rate', 'Avg ROI', 'Hands Won', 'Total Hands', 'Games Played', 'Total Rebuys']]
summary = summary.sort_values('Avg BB/100', ascending=False)

print("=" * 80)
print("MODEL LEADERBOARD (Ranked by BB/100)")
print("=" * 80)
print("BB/100 = Big Blinds won per 100 hands (standard poker performance metric)")
print("-" * 80)
display(summary)

MODEL LEADERBOARD (Ranked by BB/100)
BB/100 = Big Blinds won per 100 hands (standard poker performance metric)
--------------------------------------------------------------------------------


Unnamed: 0_level_0,Avg BB/100,Total Profit,Win Rate,Avg ROI,Hands Won,Total Hands,Games Played,Total Rebuys
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
GPT-5,143.33,71663,54.3,4.94,2715,5000,5,12
Sonnet,89.66,44831,49.9,12.16,2496,5000,5,49
gemini-2.5-flash,87.09,43547,38.6,2.08,1932,5000,5,43
Grok,40.36,20180,61.2,2.32,3061,5000,5,46
Mistral,-42.48,-21242,35.7,2.55,1783,5000,5,54
DeepSeek,-317.96,-158979,63.9,-0.94,3196,5000,5,164


In [5]:
# Key metrics cards
total_hands = len(hands)
total_games = len(games)
total_actions = len(actions)
total_cost = api_usage['estimated_cost'].sum()
showdown_rate = (hands['went_to_showdown'].sum() / len(hands) * 100)
total_rebuys = game_players['rebuys'].sum()

# Get starting stack from games table
starting_stack = games['starting_stack'].iloc[0] if len(games) > 0 else 1000
total_capital = game_players['starting_chips'].sum() + total_rebuys * starting_stack

print(f"""
KEY METRICS
-----------
Total Games:       {total_games:,}
Total Hands:       {total_hands:,}
Total Actions:     {total_actions:,}
Total Rebuys:      {total_rebuys:,}
Total Capital:     ${total_capital:,}
Showdown Rate:     {showdown_rate:.1f}%
Total API Cost:    ${total_cost:.2f}
Cost per Hand:     ${total_cost/total_hands:.4f}
""")


KEY METRICS
-----------
Total Games:       15
Total Hands:       15,000
Total Actions:     63,851
Total Rebuys:      368
Total Capital:     $398,000
Showdown Rate:     11.1%
Total API Cost:    $269.66
Cost per Hand:     $0.0180



---
## 2. Model Performance Deep Dive

In [6]:
# BB/100 by model (primary performance metric)
bb100_by_model = game_players.groupby('model')['bb_per_100'].mean().sort_values(ascending=True)
colors = [MODEL_COLORS.get(m, '#888888') for m in bb100_by_model.index]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=bb100_by_model.values,
    y=bb100_by_model.index,
    orientation='h',
    marker_color=colors,
    text=[f"{v:.2f}" for v in bb100_by_model.values],
    textposition='outside',
))

fig.update_layout(
    title='Model Rankings by BB/100',
    xaxis_title='BB/100 (Big Blinds per 100 hands)',
    yaxis_title='Model',
    template='plotly_white',
    height=450,
    margin=dict(l=100, r=100),
    xaxis=dict(range=[min(bb100_by_model.min() * 1.3, bb100_by_model.min() - 5),
                      max(bb100_by_model.max() * 1.3, bb100_by_model.max() + 5)]),
)
fig.show()

In [7]:
# Profit distribution (box plot)
fig = px.box(
    game_players,
    x='model',
    y='total_profit',
    title='Profit Distribution by Model (per game)',
    labels={'total_profit': 'Profit ($)', 'model': 'Model'},
    color='model'
)
fig.update_layout(showlegend=False, height=400)
fig.show()

In [8]:
# Cumulative win rate tracking by hand (hand-based progression)
import json
from collections import defaultdict

def parse_winners(winners_str):
    try:
        return json.loads(winners_str)
    except:
        return []

hands['winners_list'] = hands['winners'].apply(parse_winners)

# Build hand-by-hand win tracking
hand_records = []
for _, hand in hands.iterrows():
    winners = hand['winners_list']
    game_id = hand['game_id']
    hand_num = hand['hand_number']
    
    # Get player models for this game
    game_model_map = game_players[game_players['game_id'] == game_id][['player_name', 'model']].set_index('player_name')['model'].to_dict()
    
    for player, model in game_model_map.items():
        hand_records.append({
            'game_id': game_id,
            'hand_number': hand_num,
            'model': model,
            'player_name': player,
            'won': 1 if player in winners else 0
        })

if hand_records:
    hand_df = pd.DataFrame(hand_records)
    
    # Sort by game_id and hand_number for proper cumulative tracking
    hand_df = hand_df.sort_values(['game_id', 'hand_number'])
    
    # Calculate cumulative hands played and wins per model
    hand_df['cumulative_hand'] = hand_df.groupby('model').cumcount() + 1
    hand_df['cumulative_wins'] = hand_df.groupby('model')['won'].cumsum()
    hand_df['win_rate'] = (hand_df['cumulative_wins'] / hand_df['cumulative_hand'] * 100)
    
    # Plot cumulative win rate over hands
    fig = px.line(
        hand_df,
        x='cumulative_hand',
        y='win_rate',
        color='model',
        color_discrete_map=MODEL_COLORS,
        title='Cumulative Win Rate Over Hands Played',
        labels={'cumulative_hand': 'Hands Played', 'win_rate': 'Win Rate (%)'}
    )
    
    fig.add_hline(y=50, line_dash="dash", line_color="gray", opacity=0.5,
                  annotation_text="50% (breakeven)", annotation_position="bottom right")
    fig.update_layout(template='plotly_white', height=500)
    fig.show()
    
    # Show final stats
    final_stats = hand_df.groupby('model').last()[['cumulative_hand', 'cumulative_wins', 'win_rate']]
    final_stats.columns = ['Total Hands', 'Total Wins', 'Final Win Rate %']
    final_stats = final_stats.sort_values('Final Win Rate %', ascending=False)
    print("\nFinal Win Rate by Model:")
    display(final_stats.round(2))
else:
    print("Not enough data for cumulative chart yet")


Final Win Rate by Model:


Unnamed: 0_level_0,Total Hands,Total Wins,Final Win Rate %
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DeepSeek,5000,3196,63.92
Grok,5000,3061,61.22
GPT-5,5000,2715,54.3
Sonnet,5000,2496,49.92
gemini-2.5-flash,5000,1932,38.64
Mistral,5000,1783,35.66


---
## 3. Playing Style Analysis

In [9]:
# Calculate VPIP (Voluntarily Put In Pot) and PFR (Preflop Raise) per model
preflop = actions[actions['betting_round'] == 'preflop'].copy()

# VPIP: % of hands where player voluntarily put money in (call or raise, not just blinds)
# PFR: % of hands where player raised preflop

style_stats = preflop.groupby('model').apply(lambda x: pd.Series({
    'vpip': ((x['action_type'].isin(['call', 'raise', 'all_in'])).sum() / len(x) * 100) if len(x) > 0 else 0,
    'pfr': ((x['action_type'].isin(['raise', 'all_in'])).sum() / len(x) * 100) if len(x) > 0 else 0,
    'hands': len(x)
})).reset_index()

# VPIP vs PFR scatter (playing style quadrant)
fig = px.scatter(
    style_stats,
    x='vpip',
    y='pfr',
    text='model',
    size='hands',
    title='Playing Style: VPIP vs PFR',
    labels={'vpip': 'VPIP %', 'pfr': 'PFR %'}
)

# Add quadrant lines
fig.add_hline(y=20, line_dash="dash", line_color="gray", opacity=0.5)
fig.add_vline(x=25, line_dash="dash", line_color="gray", opacity=0.5)

# Add quadrant labels
fig.add_annotation(x=12, y=35, text="LAG", showarrow=False, font=dict(size=14, color="gray"))
fig.add_annotation(x=40, y=35, text="Maniac", showarrow=False, font=dict(size=14, color="gray"))
fig.add_annotation(x=12, y=5, text="Nit", showarrow=False, font=dict(size=14, color="gray"))
fig.add_annotation(x=40, y=5, text="Calling Station", showarrow=False, font=dict(size=14, color="gray"))

fig.update_traces(textposition='top center')
fig.update_layout(height=500)
fig.show()

In [10]:
# Action distribution by model
action_dist = actions.groupby(['model', 'action_type']).size().unstack(fill_value=0)
action_pct = action_dist.div(action_dist.sum(axis=1), axis=0) * 100

fig = px.bar(
    action_pct.reset_index().melt(id_vars='model'),
    x='model',
    y='value',
    color='action_type',
    title='Action Distribution by Model',
    labels={'value': 'Percentage (%)', 'model': 'Model', 'action_type': 'Action'},
    barmode='stack'
)
fig.update_layout(height=400)
fig.show()

In [11]:
# Aggression by street
def calc_aggression(df):
    aggressive = df['action_type'].isin(['raise', 'all_in']).sum()
    passive = df['action_type'].isin(['call', 'check']).sum()
    return aggressive / passive if passive > 0 else aggressive

aggression_by_street = actions.groupby(['model', 'betting_round']).apply(calc_aggression).unstack(fill_value=0)

# Reorder streets
street_order = ['preflop', 'flop', 'turn', 'river']
aggression_by_street = aggression_by_street[[c for c in street_order if c in aggression_by_street.columns]]

fig = px.bar(
    aggression_by_street.reset_index().melt(id_vars='model'),
    x='model',
    y='value',
    color='betting_round',
    barmode='group',
    title='Aggression Factor by Street',
    labels={'value': 'Aggression Factor', 'model': 'Model', 'betting_round': 'Street'}
)
fig.update_layout(height=400)
fig.show()

In [12]:
# Position-based analysis
if 'position' in actions.columns and actions['position'].notna().any():
    position_actions = actions[actions['position'].notna()].copy()
    
    # Define position order for proper display
    pos_order = ['UTG', 'HJ', 'CO', 'BTN', 'SB', 'BB']
    position_actions['position'] = pd.Categorical(position_actions['position'], categories=pos_order, ordered=True)
    
    # 1. Raise % by model and position (heatmap)
    raise_pct = position_actions.groupby(['model', 'position']).apply(
        lambda x: (x['action_type'].isin(['raise', 'all_in']).sum() / len(x) * 100) if len(x) > 0 else 0
    ).unstack(fill_value=0)
    
    fig = px.imshow(
        raise_pct,
        title='Raise % by Model and Position',
        labels={'x': 'Position', 'y': 'Model', 'color': 'Raise %'},
        color_continuous_scale='RdYlGn',
        text_auto='.0f'
    )
    fig.update_layout(height=400)
    fig.show()
    
    # 2. Fold % by position
    fold_pct = position_actions.groupby(['model', 'position']).apply(
        lambda x: (x['action_type'] == 'fold').sum() / len(x) * 100 if len(x) > 0 else 0
    ).unstack(fill_value=0)
    
    fig = px.imshow(
        fold_pct,
        title='Fold % by Model and Position',
        labels={'x': 'Position', 'y': 'Model', 'color': 'Fold %'},
        color_continuous_scale='Reds',
        text_auto='.0f'
    )
    fig.update_layout(height=400)
    fig.show()
    
    # 3. Position profitability analysis (requires linking to hand outcomes)
    # Calculate action count by position across all models
    pos_summary = position_actions.groupby('position').agg({
        'action_type': 'count',
        'amount': 'sum'
    }).reset_index()
    pos_summary.columns = ['Position', 'Total Actions', 'Total Amount']
    
    # Calculate action distribution by position
    pos_action_dist = position_actions.groupby(['position', 'action_type']).size().unstack(fill_value=0)
    pos_action_pct = pos_action_dist.div(pos_action_dist.sum(axis=1), axis=0) * 100
    
    fig = px.bar(
        pos_action_pct.reset_index().melt(id_vars='position'),
        x='position',
        y='value',
        color='action_type',
        title='Action Distribution by Position (All Models)',
        labels={'value': 'Percentage (%)', 'position': 'Position', 'action_type': 'Action'},
        barmode='stack',
        category_orders={'position': pos_order}
    )
    fig.update_layout(height=400)
    fig.show()
    
    # Position stats table
    print("\nPosition Statistics:")
    pos_stats = position_actions.groupby('position').apply(lambda x: pd.Series({
        'Actions': len(x),
        'Raise %': (x['action_type'].isin(['raise', 'all_in']).sum() / len(x) * 100),
        'Fold %': ((x['action_type'] == 'fold').sum() / len(x) * 100),
        'Call %': ((x['action_type'] == 'call').sum() / len(x) * 100),
        'Avg Bet': x[x['amount'] > 0]['amount'].mean() if (x['amount'] > 0).any() else 0
    })).round(1)
    display(pos_stats)
else:
    print("Position data not available yet (run more games with updated code)")


Position Statistics:


Unnamed: 0_level_0,Actions,Raise %,Fold %,Call %,Avg Bet
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
UTG,0.0,,,,0.0
HJ,0.0,,,,0.0
CO,0.0,,,,0.0
BTN,0.0,,,,0.0
SB,0.0,,,,0.0
BB,31005.0,36.0,20.5,18.7,404.0


---
## 4. Decision Quality Metrics

In [13]:
# Confidence analysis (if available)
if 'confidence' in actions.columns and actions['confidence'].notna().any():
    conf_data = actions[actions['confidence'].notna()].copy()
    
    fig = px.box(
        conf_data,
        x='model',
        y='confidence',
        color='action_type',
        title='Confidence Distribution by Model and Action',
        labels={'confidence': 'Confidence (0-1)', 'model': 'Model'}
    )
    fig.update_layout(height=400)
    fig.show()
else:
    print("Confidence data not available yet (run more games with updated code)")

In [14]:
# Confidence Calibration Analysis
# Does high confidence correlate with winning hands?
if 'confidence' in actions.columns and actions['confidence'].notna().any():
    conf_data = actions[actions['confidence'].notna()].copy()
    
    # Link actions to hand outcomes
    # Get winning hands per player
    winning_actions = []
    for _, hand in hands.iterrows():
        winners = parse_winners(hand['winners'])
        hand_actions_subset = conf_data[(conf_data['game_id'] == hand['game_id']) & 
                                        (conf_data['hand_number'] == hand['hand_number'])]
        for _, action in hand_actions_subset.iterrows():
            winning_actions.append({
                'model': action['model'],
                'confidence': action['confidence'],
                'won_hand': 1 if action['player_name'] in winners else 0,
                'action_type': action['action_type'],
                'hand_strength': action.get('hand_strength')
            })
    
    if winning_actions:
        calibration_df = pd.DataFrame(winning_actions)
        
        # Bin confidence into ranges
        calibration_df['conf_bin'] = pd.cut(calibration_df['confidence'], 
                                            bins=[0, 0.3, 0.5, 0.7, 0.9, 1.0],
                                            labels=['0-30%', '30-50%', '50-70%', '70-90%', '90-100%'])
        
        # Calculate actual win rate by confidence bin
        calibration_summary = calibration_df.groupby('conf_bin').agg({
            'won_hand': ['mean', 'count']
        }).round(3)
        calibration_summary.columns = ['Actual Win Rate', 'Sample Size']
        calibration_summary['Actual Win Rate'] = (calibration_summary['Actual Win Rate'] * 100).round(1)
        
        print("Confidence Calibration (Does Confidence Predict Winning?):")
        display(calibration_summary)
        
        # Calibration by model
        calibration_by_model = calibration_df.groupby('model').agg({
            'confidence': 'mean',
            'won_hand': 'mean'
        }).round(3)
        calibration_by_model.columns = ['Avg Confidence', 'Actual Win Rate']
        calibration_by_model['Win Rate %'] = (calibration_by_model['Actual Win Rate'] * 100).round(1)
        calibration_by_model = calibration_by_model.sort_values('Avg Confidence', ascending=False)
        
        # Scatter plot: Confidence vs Win Rate by Model
        fig = px.scatter(
            calibration_by_model.reset_index(),
            x='Avg Confidence',
            y='Win Rate %',
            text='model',
            size=[50] * len(calibration_by_model),  # Uniform size
            title='Confidence Calibration: Average Confidence vs Actual Win Rate',
            labels={'Avg Confidence': 'Average Confidence', 'Win Rate %': 'Actual Win Rate (%)'}
        )
        
        # Add diagonal line for perfect calibration
        fig.add_shape(
            type='line',
            x0=0, y0=0, x1=1, y1=100,
            line=dict(color='gray', dash='dash'),
        )
        fig.add_annotation(x=0.7, y=85, text="Perfect Calibration", showarrow=False, 
                          font=dict(size=10, color='gray'))
        
        fig.update_traces(textposition='top center')
        fig.update_layout(height=450)
        fig.show()
        
        print("\nModel Confidence Calibration:")
        display(calibration_by_model[['Avg Confidence', 'Win Rate %']])
else:
    print("Confidence data not available for calibration analysis")

Confidence Calibration (Does Confidence Predict Winning?):


Unnamed: 0_level_0,Actual Win Rate,Sample Size
conf_bin,Unnamed: 1_level_1,Unnamed: 2_level_1
0-30%,,0
30-50%,45.5,11
50-70%,45.5,7362
70-90%,49.0,44415
90-100%,28.5,12033



Model Confidence Calibration:


Unnamed: 0_level_0,Avg Confidence,Win Rate %
model,Unnamed: 1_level_1,Unnamed: 2_level_1
gemini-2.5-flash,0.901,37.5
Mistral,0.877,38.6
Grok,0.862,48.3
DeepSeek,0.8,54.7
Sonnet,0.769,42.7
GPT-5,0.726,44.8


In [15]:
# Hand strength at showdown
if 'hand_strength' in actions.columns and actions['hand_strength'].notna().any():
    # Get actions that led to showdown
    showdown_hand_ids = showdowns['hand_id'].unique()
    showdown_actions = actions[actions['hand_id'].isin(showdown_hand_ids) & actions['hand_strength'].notna()]
    
    fig = px.violin(
        showdown_actions,
        x='model',
        y='hand_strength',
        color='model',
        title='Hand Strength at Showdown by Model',
        labels={'hand_strength': 'Hand Strength (0-1)', 'model': 'Model'}
    )
    fig.update_layout(showlegend=False, height=400)
    fig.show()
else:
    print("Hand strength data not available yet (run more games with updated code)")

In [16]:
# Pot odds analysis
if 'pot_odds' in actions.columns and actions['pot_odds'].notna().any():
    pot_odds_data = actions[actions['pot_odds'].notna() & (actions['action_type'].isin(['call', 'fold']))].copy()
    
    fig = px.scatter(
        pot_odds_data,
        x='pot_odds',
        y='hand_strength',
        color='action_type',
        facet_col='model',
        facet_col_wrap=3,
        title='Pot Odds vs Hand Strength (Call/Fold Decisions)',
        labels={'pot_odds': 'Pot Odds', 'hand_strength': 'Hand Strength'},
        opacity=0.6
    )
    fig.update_layout(height=600)
    fig.show()
else:
    print("Pot odds data not available yet (run more games with updated code)")

In [17]:
# SPR (Stack-to-Pot Ratio) analysis
if 'spr' in actions.columns and actions['spr'].notna().any():
    spr_data = actions[actions['spr'].notna() & (actions['spr'] < 50)].copy()  # Filter outliers
    
    fig = px.histogram(
        spr_data,
        x='spr',
        color='model',
        marginal='box',
        title='Stack-to-Pot Ratio Distribution',
        labels={'spr': 'SPR'},
        nbins=30
    )
    fig.update_layout(height=400)
    fig.show()
else:
    print("SPR data not available yet (run more games with updated code)")

---
## 5. Timing & Cost Analysis

In [18]:
# Latency by model
if 'latency_ms' in actions.columns and actions['latency_ms'].notna().any():
    latency_data = actions[actions['latency_ms'].notna() & (actions['latency_ms'] > 0)].copy()
    
    fig = px.box(
        latency_data,
        x='model',
        y='latency_ms',
        color='model',
        title='Response Latency by Model',
        labels={'latency_ms': 'Latency (ms)', 'model': 'Model'}
    )
    fig.update_layout(showlegend=False, height=400)
    y_cap = latency_data['latency_ms'].quantile(0.99)
    fig.update_yaxes(range=[0, y_cap])
    fig.show()
    
    # Summary stats
    latency_summary = latency_data.groupby('model')['latency_ms'].agg(['mean', 'median', 'std']).round(0)
    latency_summary.columns = ['Mean (ms)', 'Median (ms)', 'Std Dev']
    print("\nLatency Summary:")
    display(latency_summary)
else:
    print("Latency data not available yet (run more games with updated code)")


Latency Summary:


Unnamed: 0_level_0,Mean (ms),Median (ms),Std Dev
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DeepSeek,12174.0,11614.0,3124.0
GPT-5,9568.0,9173.0,2531.0
Grok,22077.0,19175.0,12222.0
Mistral,19430.0,19041.0,4263.0
Sonnet,23472.0,22884.0,4509.0
gemini-2.5-flash,18775.0,15848.0,9947.0


In [19]:
# API cost analysis
cost_summary = api_usage.groupby('model').agg({
    'total_calls': 'sum',
    'input_tokens': 'sum',
    'output_tokens': 'sum',
    'estimated_cost': 'sum'
}).round(4)

cost_summary['cost_per_call'] = (cost_summary['estimated_cost'] / cost_summary['total_calls']).round(6)
cost_summary = cost_summary.sort_values('estimated_cost', ascending=False)

fig = px.bar(
    cost_summary.reset_index(),
    x='model',
    y='estimated_cost',
    title='Total API Cost by Model',
    labels={'estimated_cost': 'Cost ($)', 'model': 'Model'},
    color='estimated_cost',
    color_continuous_scale='reds'
)
fig.update_layout(showlegend=False, height=400)
fig.show()

print("\nCost Summary:")
display(cost_summary)


Cost Summary:


Unnamed: 0_level_0,total_calls,input_tokens,output_tokens,estimated_cost,cost_per_call
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Sonnet,10608,5812481,6736184,118.4802,0.011169
GPT-5,14450,7940640,5186071,86.5011,0.005986
Mistral,10513,5718732,7610428,57.1,0.005431
DeepSeek,11399,6687804,3504910,3.3446,0.000293
Grok,11739,6329367,3472865,3.0023,0.000256
gemini-2.5-flash,8907,4802535,2913061,1.2341,0.000139


In [20]:
# Profit vs Cost (ROI)
profit_by_model = game_players.groupby('model')['total_profit'].sum()
cost_by_model = api_usage.groupby('model')['estimated_cost'].sum()

roi_df = pd.DataFrame({
    'profit': profit_by_model,
    'cost': cost_by_model
}).dropna()
roi_df['roi'] = roi_df['profit'] / roi_df['cost']

fig = px.scatter(
    roi_df.reset_index(),
    x='cost',
    y='profit',
    text='model',
    size=abs(roi_df['profit']) + 100,
    color='roi',
    title='Profit vs API Cost',
    labels={'cost': 'API Cost ($)', 'profit': 'Profit ($)', 'roi': 'ROI'},
    color_continuous_scale='RdYlGn'
)
fig.update_traces(textposition='top center')
fig.add_hline(y=0, line_dash="dash", line_color="gray")
fig.update_layout(height=500)
fig.show()

---
## 6. Head-to-Head Matchups

In [21]:
# Head-to-head based on hands_won (matches analyze_results.py approach)
from collections import defaultdict

h2h_hands = defaultdict(lambda: defaultdict(lambda: {'won': 0, 'total': 0}))

# Group by game to get matchups with hands_won
games_grouped = game_players.groupby('game_id').apply(
    lambda x: x[['model', 'hands_won']].to_dict('records'),
    include_groups=False
).reset_index(name='players')

for _, row in games_grouped.iterrows():
    players = row['players']
    if len(players) == 2:
        m1, m2 = players[0]['model'], players[1]['model']
        h1, h2 = players[0]['hands_won'], players[1]['hands_won']
        total = h1 + h2
        
        # Accumulate hands won across all games against each opponent
        h2h_hands[m1][m2]['won'] += h1
        h2h_hands[m1][m2]['total'] += total
        h2h_hands[m2][m1]['won'] += h2
        h2h_hands[m2][m1]['total'] += total

# Convert to matrix with win rates
models = sorted(set(h2h_hands.keys()))
matrix_data = []

for m1 in models:
    row = {'model': m1}
    for m2 in models:
        if m1 == m2:
            row[m2] = None
        elif m2 in h2h_hands[m1] and h2h_hands[m1][m2]['total'] > 0:
            row[m2] = h2h_hands[m1][m2]['won'] / h2h_hands[m1][m2]['total'] * 100
        else:
            row[m2] = None
    matrix_data.append(row)

h2h_df = pd.DataFrame(matrix_data).set_index('model')

if len(h2h_df) > 0:
    # Create heatmap
    fig = go.Figure(data=go.Heatmap(
        z=h2h_df.values,
        x=h2h_df.columns,
        y=h2h_df.index,
        colorscale=[[0, '#ff6b6b'], [0.5, '#ffffff'], [1, '#51cf66']],
        zmid=50,
        text=[[f"{v:.0f}%" if pd.notna(v) else "" for v in row] for row in h2h_df.values],
        texttemplate="%{text}",
        hovertemplate="Row %{y} vs Col %{x}: %{z:.1f}%<extra></extra>",
    ))
    
    fig.update_layout(
        title='Head-to-Head Win Rates (Row vs Column)',
        xaxis_title='Opponent',
        yaxis_title='Model',
        template='plotly_white',
        height=500,
        width=700,
    )
    fig.show()
else:
    print("Not enough data for head-to-head analysis")

---
## 6.5 Opponent Profiling Analysis
How do models perceive and read each other during gameplay?

In [22]:
# Opponent Profiling Analysis
if len(opponent_profiles) > 0:
    # Aggregate profiles by observer-observed pair
    profile_agg = opponent_profiles.groupby(['observer', 'observed']).agg({
        'vpip': 'mean',
        'pfr': 'mean',
        'aggression_factor': 'mean',
        'cbet_frequency': 'mean',
        'fold_to_raise_rate': 'mean',
        'wtsd': 'mean',
        'estimated_style': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown',
        'hands_played': 'sum'
    }).reset_index()
    
    # How each model perceives others (VPIP estimates)
    vpip_matrix = profile_agg.pivot(index='observer', columns='observed', values='vpip').fillna(0)
    
    fig = px.imshow(
        vpip_matrix,
        title='Perceived VPIP by Model (How Each Model Reads Opponents)',
        labels={'x': 'Observed Player', 'y': 'Observer', 'color': 'VPIP %'},
        color_continuous_scale='Blues',
        text_auto='.1f'
    )
    fig.update_layout(height=450)
    fig.show()
    
    # Aggression perception matrix
    agg_matrix = profile_agg.pivot(index='observer', columns='observed', values='aggression_factor').fillna(0)
    
    fig = px.imshow(
        agg_matrix,
        title='Perceived Aggression Factor (How Each Model Reads Opponents)',
        labels={'x': 'Observed Player', 'y': 'Observer', 'color': 'Aggression'},
        color_continuous_scale='Reds',
        text_auto='.2f'
    )
    fig.update_layout(height=450)
    fig.show()
    
    # Style classification summary
    style_counts = opponent_profiles.groupby(['observed', 'estimated_style']).size().unstack(fill_value=0)
    
    fig = px.bar(
        style_counts.reset_index().melt(id_vars='observed'),
        x='observed',
        y='value',
        color='estimated_style',
        title='How Models Are Classified by Opponents',
        labels={'value': 'Times Classified', 'observed': 'Model', 'estimated_style': 'Style'},
        barmode='stack'
    )
    fig.update_layout(height=400)
    fig.show()
    
    # Summary table
    print("\nOpponent Profile Summary (averaged across all observers):")
    obs_summary = opponent_profiles.groupby('observed').agg({
        'vpip': 'mean',
        'pfr': 'mean', 
        'aggression_factor': 'mean',
        'fold_to_raise_rate': 'mean',
        'wtsd': 'mean'
    }).round(2)
    obs_summary.columns = ['Avg VPIP', 'Avg PFR', 'Avg Aggression', 'Fold to Raise', 'WTSD']
    display(obs_summary.sort_values('Avg VPIP', ascending=False))
else:
    print("No opponent profile data available yet")


Opponent Profile Summary (averaged across all observers):


Unnamed: 0_level_0,Avg VPIP,Avg PFR,Avg Aggression,Fold to Raise,WTSD
observed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DeepSeek,0.78,0.67,7.16,0.31,0.15
Grok,0.78,0.75,3.85,0.36,0.14
Sonnet,0.71,0.61,5.59,0.49,0.1
GPT-5,0.66,0.45,0.85,0.46,0.16
gemini-2.5-flash,0.53,0.47,7.19,0.62,0.07
Mistral,0.48,0.37,2.0,0.62,0.09


---
## 7. Hand Category Analysis

In [23]:
# Showdown hand distribution
if len(showdowns) > 0:
    hand_dist = showdowns.groupby('hand_name').size().sort_values(ascending=False)
    
    fig = px.pie(
        values=hand_dist.values,
        names=hand_dist.index,
        title='Distribution of Hands at Showdown'
    )
    fig.update_layout(height=400)
    fig.show()
else:
    print("No showdown data available yet")

No showdown data available yet


In [24]:
# Hand name analysis (if available from actions)
if 'hand_name' in actions.columns and actions['hand_name'].notna().any():
    hand_name_data = actions[actions['hand_name'].notna()].copy()
    
    # Count by model and hand type
    hand_counts = hand_name_data.groupby(['model', 'hand_name']).size().unstack(fill_value=0)
    
    fig = px.bar(
        hand_counts.reset_index().melt(id_vars='model'),
        x='model',
        y='value',
        color='hand_name',
        title='Made Hands by Model (Post-flop)',
        labels={'value': 'Count', 'model': 'Model', 'hand_name': 'Hand'},
        barmode='stack'
    )
    fig.update_layout(height=400)
    fig.show()
else:
    print("Hand name data not available yet")

In [25]:
# Hand name analysis (if available from actions)
if 'hand_name' in actions.columns and actions['hand_name'].notna().any():
    hand_name_data = actions[actions['hand_name'].notna()].copy()
    
    # Count by model and hand type
    hand_counts = hand_name_data.groupby(['model', 'hand_name']).size().unstack(fill_value=0)
    
    fig = px.bar(
        hand_counts.reset_index().melt(id_vars='model'),
        x='model',
        y='value',
        color='hand_name',
        title='Made Hands by Model (Post-flop) - Normalized',
        labels={'value': 'Percentage', 'model': 'Model', 'hand_name': 'Hand'},
        barmode='stack'
    )
    fig.update_layout(height=400, barnorm='percent')
    fig.show()
else:
    print("Hand name data not available yet")

---
## 8. Interesting Hands Deep Dive

In [26]:
# Biggest pots with winner breakdown (accounts for side pots)
biggest_pots = hands.nlargest(5, 'pot')[['game_id', 'hand_number', 'pot', 'winners', 'board', 'went_to_showdown']]

print("BIGGEST POTS (with actual winnings breakdown)")
print("=" * 70)

for idx, row in biggest_pots.iterrows():
    print(f"\nGame #{row['game_id']}, Hand #{row['hand_number']} - Total Pot: ${row['pot']:,.0f}")
    print(f"Board: {row['board']}")
    
    # Get actual winnings from showdowns table
    hand_showdowns = showdowns[(showdowns['game_id'] == row['game_id']) & 
                               (showdowns['hand_number'] == row['hand_number'])]
    
    if len(hand_showdowns) > 0:
        winners = hand_showdowns[hand_showdowns['amount_won'] > 0]
        if len(winners) > 0:
            print("Winners:")
            for _, winner in winners.iterrows():
                hand_name = winner['hand_name'] if pd.notna(winner['hand_name']) else 'Unknown'
                print(f"  - {winner['player_name']}: Won ${winner['amount_won']:,.0f} with {hand_name}")
            
            # Check for side pot situation
            total_won = winners['amount_won'].sum()
            if len(winners) > 1 and total_won == row['pot']:
                # Multiple winners - could be split pot or side pots
                win_amounts = winners['amount_won'].unique()
                if len(win_amounts) > 1:
                    print("  (Side pots detected - different amounts won)")
                else:
                    print("  (Split pot - equal amounts)")
        else:
            print(f"  Winners: {row['winners']} (no showdown amount data)")
    else:
        # No showdown data - hand won without showdown
        print(f"  Winner(s): {row['winners']} (won without showdown)")
    
    print("-" * 70)

BIGGEST POTS (with actual winnings breakdown)

Game #8, Hand #989 - Total Pot: $57,000
Board: T♥ 8♣ 9♥ 6♠ 3♥
  Winner(s): ["deepseek", "sonnet"] (won without showdown)
----------------------------------------------------------------------

Game #8, Hand #990 - Total Pot: $57,000
Board: 3♥ 8♠ Q♦ 5♥ 5♣
  Winner(s): ["deepseek", "sonnet"] (won without showdown)
----------------------------------------------------------------------

Game #8, Hand #971 - Total Pot: $56,000
Board: A♣ 5♦ 5♠ 7♠ 8♣
  Winner(s): ["sonnet"] (won without showdown)
----------------------------------------------------------------------

Game #8, Hand #967 - Total Pot: $55,080
Board: T♥ T♦ 7♣
  Winner(s): ["sonnet"] (won without showdown)
----------------------------------------------------------------------

Game #8, Hand #963 - Total Pot: $55,000
Board: 4♦ K♠ T♦ 8♥ 4♣
  Winner(s): ["sonnet"] (won without showdown)
----------------------------------------------------------------------


In [27]:
# Hand replay function with showdown breakdown
def replay_hand(game_id, hand_number):
    """Display all actions for a specific hand with actual winnings breakdown"""
    hand_actions = actions[(actions['game_id'] == game_id) & (actions['hand_number'] == hand_number)]
    
    print(f"\n{'='*60}")
    print(f"HAND REPLAY: Game #{game_id}, Hand #{hand_number}")
    print(f"{'='*60}")
    
    for _, action in hand_actions.iterrows():
        amount_str = f" ${action['amount']}" if action['amount'] > 0 else ""
        print(f"[{action['betting_round'].upper():8}] {action['player_name']:15} {action['action_type']:6}{amount_str}")
        if action.get('reasoning'):
            reason_preview = action['reasoning'][:500] + "..." if len(str(action['reasoning'])) > 500 else action['reasoning']
            print(f"           -> {reason_preview}")
    
    # Show result
    hand_result = hands[(hands['game_id'] == game_id) & (hands['hand_number'] == hand_number)].iloc[0]
    print(f"\nBoard: {hand_result['board']}")
    print(f"Total Pot: ${hand_result['pot']}")
    
    # Show actual showdown results with amounts won
    hand_showdowns = showdowns[(showdowns['game_id'] == game_id) & 
                               (showdowns['hand_number'] == hand_number)]
    
    if len(hand_showdowns) > 0:
        print("\n--- SHOWDOWN RESULTS ---")
        for _, sd in hand_showdowns.iterrows():
            hand_name = sd['hand_name'] if pd.notna(sd['hand_name']) else 'Unknown'
            hole_cards = sd['hole_cards'] if pd.notna(sd['hole_cards']) else '??'
            if sd['amount_won'] > 0:
                print(f"  {sd['player_name']}: {hole_cards} -> {hand_name} - WON ${sd['amount_won']:,.0f}")
            else:
                print(f"  {sd['player_name']}: {hole_cards} -> {hand_name} - Lost")
        
        # Detect side pots
        winners = hand_showdowns[hand_showdowns['amount_won'] > 0]
        if len(winners) > 1:
            win_amounts = winners['amount_won'].unique()
            if len(win_amounts) > 1:
                print("\n  * Side pots were in play (different amounts won)")
    else:
        print(f"\nWinner(s): {hand_result['winners']} (won without showdown)")

# Replay the biggest pot
if len(biggest_pots) > 0:
    top_hand = biggest_pots.iloc[0]
    replay_hand(top_hand['game_id'], top_hand['hand_number'])


HAND REPLAY: Game #8, Hand #989
[PREFLOP ] sonnet          call  
           -> 
## Situation Analysis

**Hand Strength:** K♥3♣ is a weak hand, but not terrible in heads-up play. The King gives us some high-card value, though the 3 kicker is poor.

**Position:** I'm on the BTN/SB, which in heads-up means I act first preflop but will have position postflop. This is a significant advantage.

**Pot Odds:** Need to call $5 to win $15, giving me 3:1 pot odds (25% equity needed).

**Stack Depth:** Both stacks are deep relative to blinds. I have $55,740 and opponent has $1,245. Th...
[PREFLOP ] deepseek        raise  $45
           -> I have pocket Aces, the strongest starting hand in heads-up NLHE. The opponent (sonnet) is weak-tight with a high fold-to-raise frequency (69%) and has just limped from the SB. Their VPIP of 55% and PFR of 34% indicate they limp with many weaker hands. Given their tendency to drastically overvalue top pair/strong kicker and pocket pairs preflop (as shown in rec

---
## 8.5 Critical Hand Detector
Flag statistically unusual decisions that tell interesting stories about model behavior.

In [28]:
# Critical Hand Detector - Flag unusual decisions
def flag_critical_hands(actions_df):
    """Flag unusual/interesting decisions."""
    flags = []
    
    for _, row in actions_df.iterrows():
        pot_odds = row.get('pot_odds')
        hand_strength = row.get('hand_strength')
        confidence = row.get('confidence')
        action = row.get('action_type')
        
        # Skip if missing data
        if pd.isna(pot_odds) or pd.isna(hand_strength):
            continue
            
        # Bad fold: folded a strong hand with good odds
        if action == 'fold' and pot_odds < 0.3 and hand_strength > 0.5:
            flags.append({
                'game_id': row['game_id'],
                'hand_number': row['hand_number'],
                'player': row['player_name'],
                'model': row.get('model', row['model_id']),
                'flag': '🎯 Bad Fold',
                'details': f"Folded {hand_strength:.0%} hand with {pot_odds:.0%} pot odds",
                'street': row['betting_round']
            })
        
        # Bad call: called with trash and terrible odds
        if action == 'call' and pot_odds > 0.4 and hand_strength < 0.25:
            flags.append({
                'game_id': row['game_id'],
                'hand_number': row['hand_number'],
                'player': row['player_name'],
                'model': row.get('model', row['model_id']),
                'flag': '💸 Bad Call',
                'details': f"Called with {hand_strength:.0%} hand, {pot_odds:.0%} pot odds",
                'street': row['betting_round']
            })
        
        # Naked bluff: shoved with air
        if action == 'all_in' and hand_strength < 0.3:
            flags.append({
                'game_id': row['game_id'],
                'hand_number': row['hand_number'],
                'player': row['player_name'],
                'model': row.get('model', row['model_id']),
                'flag': '🔥 Naked Bluff',
                'details': f"All-in with only {hand_strength:.0%} hand strength",
                'street': row['betting_round']
            })
        
        # Hero call: called a big bet with a marginal hand (and it worked)
        if action == 'call' and hand_strength < 0.4 and hand_strength > 0.2:
            flags.append({
                'game_id': row['game_id'],
                'hand_number': row['hand_number'],
                'player': row['player_name'],
                'model': row.get('model', row['model_id']),
                'flag': '🦸 Marginal Call',
                'details': f"Called with {hand_strength:.0%} hand",
                'street': row['betting_round']
            })
    
    return pd.DataFrame(flags)

# Run the detector
critical_hands = flag_critical_hands(actions)

if len(critical_hands) > 0:
    print(f"Found {len(critical_hands)} critical decisions:\n")
    
    # Group by flag type
    for flag_type in critical_hands['flag'].unique():
        subset = critical_hands[critical_hands['flag'] == flag_type]
        print(f"\n{flag_type} ({len(subset)} instances)")
        print("-" * 50)
        for _, row in subset.iterrows():
            print(f"  Game #{row['game_id']}, Hand #{row['hand_number']}: {row['player']}")
            print(f"    {row['details']} ({row['street']})")
else:
    print("No critical hands detected (need more data with pot_odds and hand_strength)")

Found 1763 critical decisions:


🎯 Bad Fold (334 instances)
--------------------------------------------------
  Game #8, Hand #1: deepseek
    Folded 52% hand with 28% pot odds (turn)
  Game #3, Hand #1: deepseek
    Folded 65% hand with 22% pot odds (turn)
  Game #6, Hand #5: deepseek
    Folded 60% hand with 29% pot odds (turn)
  Game #10, Hand #10: gemini
    Folded 68% hand with 25% pot odds (turn)
  Game #3, Hand #21: deepseek
    Folded 55% hand with 29% pot odds (river)
  Game #1, Hand #40: sonnet
    Folded 55% hand with 25% pot odds (turn)
  Game #3, Hand #31: mistral
    Folded 55% hand with 12% pot odds (turn)
  Game #8, Hand #35: deepseek
    Folded 52% hand with 27% pot odds (flop)
  Game #1, Hand #55: sonnet
    Folded 59% hand with 27% pot odds (flop)
  Game #10, Hand #83: deepseek
    Folded 61% hand with 29% pot odds (turn)
  Game #8, Hand #59: deepseek
    Folded 58% hand with 23% pot odds (flop)
  Game #1, Hand #82: mistral
    Folded 54% hand with 29% pot odds (tur

---
## 9. Summary Dashboard

In [29]:
# Create a summary dashboard with subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Total Profit by Model', 'Action Distribution', 'API Cost', 'Win Rate'),
    specs=[[{"type": "bar"}, {"type": "pie"}],
           [{"type": "bar"}, {"type": "bar"}]]
)

# Profit
profit_data = game_players.groupby('model')['total_profit'].sum().sort_values()
fig.add_trace(
    go.Bar(x=profit_data.values, y=profit_data.index, orientation='h', name='Profit',
           marker_color=['red' if x < 0 else 'green' for x in profit_data.values]),
    row=1, col=1
)

# Action distribution (pie)
action_counts = actions['action_type'].value_counts()
fig.add_trace(
    go.Pie(labels=action_counts.index, values=action_counts.values, name='Actions'),
    row=1, col=2
)

# API Cost
cost_data = api_usage.groupby('model')['estimated_cost'].sum().sort_values()
fig.add_trace(
    go.Bar(x=cost_data.values, y=cost_data.index, orientation='h', name='Cost',
           marker_color='orange'),
    row=2, col=1
)

# Win rate
wins = game_players.groupby('model')['hands_won'].sum()
total = actions.groupby('model')['hand_number'].nunique()
win_rate = (wins / total * 100).sort_values()
fig.add_trace(
    go.Bar(x=win_rate.values, y=win_rate.index, orientation='h', name='Win Rate',
           marker_color='blue'),
    row=2, col=2
)

fig.update_layout(height=700, title_text="LLM Poker Arena - Summary Dashboard", showlegend=False)
fig.show()