## Basic Elo simulations

In [3]:
# Rating table: every player starts from the same 1500 baseline.
# "Z" is the common opponent in the game lists below.
players = {name: 1500 for name in ("A", "B", "C", "D", "E", "Z")}

# Define the list of games.
# Each game is a tuple: (player1, player2, winner)
# Here, the winner is specified as the name of the winning player.
# games = [
#     ("A", "B", "A"),
#     ("C", "D", "D"),
#     ("B", "E", "E"),
#     ("A", "C", "A"),
#     ("D", "E", "D"),
#     ("A", "E", "A"),
#     ("B", "C", "C")
# ]

# all vs one
# games = [
#     ("A", "Z", "A"),
#     ("B", "Z", "B"),
#     ("C", "Z", "Z"),
#     ("D", "Z", "D"),
#     ("E", "Z", "Z"),
#     ("A", "Z", "A"),
#     ("B", "Z", "Z"),
#     ("C", "Z", "C"),
#     ("D", "Z", "Z"),
#     ("E", "Z", "E"),
#     ("A", "Z", "Z"),
#     ("B", "Z", "B"),
#     ("C", "Z", "Z"),
#     ("D", "Z", "D"),
#     ("E", "Z", "Z"),
#     ("A", "Z", "A"),
# ]


# 2 vs 1: order matters - swapping the B and A games changes the outcome.
# The number of wins matters: more wins boost the rating more.
# Elo rating is hence temporal; there is state.
# For LLM chess the suggested approach: take the win rates and multiply by 100 (i.e. 100 games of LLM vs random),
# then have those games sequenced one after another (e.g. A vs Z, B vs Z, ...), evenly distributing games.
# We will get 100 game pairings of LLM vs random, calculate the winrate.
# In this case we will have the same ELO after each run; con - the model that happens to be first in
# every pass will be at a disadvantage.
# Alternatively we can shuffle the LLMs in every pass, yet this will produce a different ELO after each run.

# Schedule of (player1, player2, winner): B beats Z twice, then A beats Z six times.
games = [("B", "Z", "B")] * 2 + [("A", "Z", "A")] * 6


K = 32  # Elo K-factor: scales how far a single game can move a rating

def exp_score(rating_a, rating_b):
    """Return the expected score of a player rated `rating_a` against `rating_b`.

    Elo logistic curve: 0.5 for equal ratings; a 400-point lead gives 10/11.
    """
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_ratings(p1, p2, winner, ratings=None, k=None):
    """Apply the result of one game to a rating table, in place.

    Args:
        p1: Key of the first player in the rating table.
        p2: Key of the second player in the rating table.
        winner: `p1` or `p2` for a decisive game. Any other value
            (e.g. None) is scored as a draw.
        ratings: Dict mapping player -> rating to update. Defaults to the
            module-level `players` table, which keeps the original
            three-argument call working unchanged.
        k: K-factor to use. Defaults to the module-level `K`.

    Raises:
        KeyError: If `p1` or `p2` is not present in the rating table.
    """
    # Resolve defaults at call time so that re-assigning `players` or `K`
    # in a later cell is still honoured.
    table = players if ratings is None else ratings
    k_factor = K if k is None else k

    r1 = table[p1]
    r2 = table[p2]
    exp1 = exp_score(r1, r2)
    exp2 = exp_score(r2, r1)

    if winner == p1:
        s1, s2 = 1, 0
    elif winner == p2:
        s1, s2 = 0, 1
    else:
        s1, s2 = 0.5, 0.5  # For a draw (if needed)

    # Both updates use the pre-game ratings captured above.
    table[p1] = r1 + k_factor * (s1 - exp1)
    table[p2] = r2 + k_factor * (s2 - exp2)

# Replay the schedule in order; each update starts from the ratings left by
# the previous game, so the sequence of `games` affects the final numbers.
for game in games:
    update_ratings(*game)
    p1, p2, winner = game
    print(f"After {p1} vs {p2} (winner: {winner}), ratings: {players}")

print("\nFinal Elo ratings:")
for player in players:
    rating = players[player]
    print(f"Player {player}: {rating:.1f}")


After B vs Z (winner: B), ratings: {'A': 1500, 'B': 1516.0, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1484.0}
After B vs Z (winner: B), ratings: {'A': 1500, 'B': 1530.5304984710244, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1469.4695015289756}
After A vs Z (winner: A), ratings: {'A': 1514.597626351594, 'B': 1530.5304984710244, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1454.8718751773815}
After A vs Z (winner: A), ratings: {'A': 1527.8739306203415, 'B': 1530.5304984710244, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1441.595570908634}
After A vs Z (winner: A), ratings: {'A': 1539.980373303762, 'B': 1530.5304984710244, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1429.4891282252136}
After A vs Z (winner: A), ratings: {'A': 1551.0569344862079, 'B': 1530.5304984710244, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1418.4125670427677}
After A vs Z (winner: A), ratings: {'A': 1561.2288813638672, 'B': 1530.5304984710244, 'C': 1500, 'D': 1500, 'E': 1500, 'Z': 1408.2406201651083}
After A vs Z (winner: A), ratings: {'A': 1570.6

## LLM Chess Logs

In [7]:
import os
import pandas as pd
import numpy as np
from aggregate_logs_to_csv import aggregate_models_to_csv

file = "../_temp/aggregate.csv"

# Create the output directory if needed. exist_ok=True avoids the
# check-then-create race, and deriving the directory from `file` keeps the
# two paths from drifting apart.
output_dir = os.path.dirname(file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# NOTE(review): presumably aggregates the game logs found in the listed
# directories into a single CSV at `file` - confirm against
# aggregate_logs_to_csv.
aggregate_models_to_csv(["../_logs/no_reflection", "../_logs/new/"], file)
print("\n\n")
df_aggr = pd.read_csv(file)

# Display the first record as field:value lines
first_record = df_aggr.iloc[0]
for field, value in first_record.items():
    print(f"{field}: {value}")

print("\n" + "-"*50 + "\n")

# Display only the top 10 rows
print(df_aggr.head(10).to_string(index=False))



Loading logs from directory: ../_logs/no_reflection
Skipping invalid JSON file: ../_logs/no_reflection/2025-02-07_gpt-4-0613/2025.02.07_14:12.json. Error: Expecting value: line 1 column 1 (char 0)
Loading logs from directory: ../_logs/new/
Total logs loaded from all directories: 5384



model_name: amazon.nova-lite-v1
total_games: 42
black_llm_wins: 0
white_rand_wins: 42
draws: 0
black_llm_win_rate: 0.0
std_dev_black_llm_win_rate: 0.0
moe_black_llm_win_rate: 0.0
black_llm_loss_rate: 1.0
std_dev_black_llm_loss_rate: 0.0
moe_black_llm_loss_rate: 0.0
draw_rate: 0.0
std_dev_draw_rate: 0.0
moe_draw_rate: 0.0
black_llm_wins_percent: 0.0
black_llm_draws_percent: 0.0
white_rand_wins_percent: 100.0
win_loss: 0.0
std_dev_win_loss: 0.0
moe_win_loss: 0.0
win_loss_non_interrupted: 0.5
std_dev_win_loss_non_interrupted: 0.0
moe_win_loss_non_interrupted: 0.0
game_duration: 0.023095238095238
std_dev_game_duration: 0.0129705285519726
moe_game_duration: 0.0039227361724427
games_interrupted: 42
games_inte

## LLM Elo


## Process Overview
1. **Convert win/loss/draw rates** to 100 simulated games per model
2. **Interleave outcomes** evenly throughout the sequence
3. **Process games sequentially across all models** (temporal ordering matters)
4. **Update both model and random player** ratings after each game
5. **Sort models** by final ELO for leaderboard

## Key Considerations
- **Temporal nature**: Order of games affects final ratings
- **Sequential processing**: Models take turns playing against random player
- **Consistent ordering**: Same sequence each run ensures reproducible ratings
- **Fairness tradeoff**: First models in sequence may be at slight disadvantage

## Calculation
- All models and the random player start at a 1500 rating
- K-factor = 32
- Win=1.0, Draw=0.5, Loss=0.0


In [8]:
import random
# Pin the RNG so the outcome shuffles later in this cell are reproducible
random.seed(42)

# Process models in alphabetical order: deterministic across runs
df = df_aggr.sort_values(by='model_name').reset_index(drop=True)

# Alternative ordering, kept for reference: shuffle the rows to avoid any
# bias from processing order
# df = df_aggr.copy()
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)


# ELO table: the random player and every model start at 1500
elo_ratings = {'RANDOM_PLAYER': 1500, **dict.fromkeys(df['model_name'], 1500)}

# ELO K-factor: scales how far a single game can move a rating
K = 32

def expected_score(rating_a, rating_b):
    """Return the expected score (0..1) of `rating_a` when facing `rating_b`."""
    rating_gap = (rating_b - rating_a) / 400
    return 1 / (1 + 10 ** rating_gap)

def update_elo(model_rating, random_rating, result):
    """Return the post-game ratings as (model, random player).

    result is the game score from the model's side:
    1 for a model win, 0.5 for a draw, 0 for a model loss.
    The random player is scored with the complementary result and
    the complementary expectation.
    """
    model_expected = expected_score(model_rating, random_rating)
    random_expected = 1 - model_expected
    random_result = 1 - result
    return (
        model_rating + K * (result - model_expected),
        random_rating + K * (random_result - random_expected),
    )

# Generate 100 games for each model with proper distribution
def _outcome_counts(win_rate, draw_rate, loss_rate, n_games=100):
    """Turn win/draw/loss rates into integer counts that sum to `n_games`.

    Each rate is rounded independently and the rounding residue is absorbed
    by the draw count. If that would push the draw count below zero, the
    excess is taken from the larger of the win/loss counts instead.
    Assumes the three rates sum to roughly 1.

    Returns:
        (win_count, draw_count, loss_count)
    """
    win_count = round(win_rate * n_games)
    draw_count = round(draw_rate * n_games)
    loss_count = round(loss_rate * n_games)

    # Adjust draw count so the total is exactly n_games
    draw_count += n_games - (win_count + draw_count + loss_count)

    if draw_count < 0:
        # `[0.5] * negative` is silently `[]`, which would leave the model
        # with more than n_games outcomes. Remove the surplus from the
        # larger decisive bucket so the total stays exact.
        surplus = -draw_count
        draw_count = 0
        if win_count >= loss_count:
            win_count -= surplus
        else:
            loss_count -= surplus

    return win_count, draw_count, loss_count


all_games = []

for _, row in df.iterrows():
    model = row['model_name']

    # Calculate games based on win/draw/loss rates
    win_count, draw_count, loss_count = _outcome_counts(
        row['black_llm_win_rate'],
        row['draw_rate'],
        row['black_llm_loss_rate'],
    )

    # Create game outcomes (1 for win, 0.5 for draw, 0 for loss)
    outcomes = [1] * win_count + [0.5] * draw_count + [0] * loss_count

    # Shuffle so wins/draws/losses are spread over the model's sequence
    random.shuffle(outcomes)

    all_games.append((model, outcomes))

# Simulate games and update ELO ratings
# The random player is a single shared opponent, so its rating is carried
# across every game of every model.
random_rating = elo_ratings['RANDOM_PLAYER']

for i in range(100):  # For each game number
    # Models take turns: game i of every model is played before game i + 1
    for model, outcomes in all_games:
        if i < len(outcomes):
            result = outcomes[i]
            model_rating = elo_ratings[model]

            # Update ratings
            new_model_rating, new_random_rating = update_elo(model_rating, random_rating, result)
            elo_ratings[model] = new_model_rating
            random_rating = new_random_rating  # Update random player rating

# Write the random player's final rating back to the table; otherwise the
# leaderboard would keep showing its initial 1500.
elo_ratings['RANDOM_PLAYER'] = random_rating

# Build the leaderboard: one row per entry in the rating table, rounded to
# one decimal and ordered from highest to lowest rating
entry_names = list(elo_ratings)
entry_scores = [round(elo_ratings[name], 1) for name in entry_names]
leaderboard = pd.DataFrame(
    {'Model': entry_names, 'ELO Rating': entry_scores}
).sort_values('ELO Rating', ascending=False)

# Display the leaderboard
print("Final ELO Ratings Leaderboard:")
print(leaderboard.to_string(index=False))

Final ELO Ratings Leaderboard:
                                    Model  ELO Rating
                            o3-2025-04-16      2220.6
                            o1-2024-12-17      2208.9
                       o4-mini-2025-04-16      2096.1
                        o1-2024-12-17-low      2089.6
                    o1-preview-2024-09-12      2074.2
                       o3-mini-2025-01-31      2036.2
                o3-mini-2025-01-31-medium      2000.6
               claude-3-7-sonnet-20250219      1944.2
                   gpt-4-turbo-2024-04-09      1896.9
                           gpt-4-32k-0613      1895.2
                       gpt-4.1-2025-04-14      1893.4
          anthropic.claude-v3-5-sonnet-v1      1892.0
                        gpt-4o-2024-11-20      1884.7
                      qwen-max-2025-01-25      1873.3
          anthropic.claude-v3-5-sonnet-v2      1865.6
             gemini-2.5-pro-preview-03-25      1855.2
anthropic.claude-3-7-sonnet-20250219-v1:0      1853

When models are shuffled:
```
                                    Model  ELO Rating
                        o1-2024-12-17-low      2112.2
                o3-mini-2025-01-31-medium      2066.3
                    o1-preview-2024-09-12      2060.9
                       o3-mini-2025-01-31      2013.6
                        gpt-4o-2024-11-20      1922.8
                      qwen-max-2025-01-25      1910.9
``` 

When models are sorted by name:
```
                                    Model  ELO Rating
                        o1-2024-12-17-low      2126.1
                    o1-preview-2024-09-12      2047.3
                       o3-mini-2025-01-31      2010.6
                o3-mini-2025-01-31-medium      1992.6
                           gpt-4-32k-0613      1909.3
                       o1-mini-2024-09-12      1908.6
```
                   