# Preparing Input Data

Win rates need to be aggregated across different sources:
1. Newer Logs
    1. `data_processing/refined.csv` - LLMs as Black playing against Random Player. The key data source for all models to be rated and included in the LB
    2. `data_processing/dragon_refined.csv` - LLMs as Black playing against different levels of Komodo Dragon chess engine (Elo rated against chess.com pool)
2. Older Logs
    1. `_logs/_pre_aug_2025/dragon_vs_llm` - LLMs as Black playing against different levels of Komodo Dragon chess engine (Elo rated against chess.com pool)
    2. `_logs/_pre_aug_2025/misc/dragon` - different levels of dragon playing different colors vs Random Player and Stockfish
    3. `_logs/_pre_aug_2025/llm_vs_llm` - LLMs vs LLMs

## Dataset Structure

- player_white - model name or one of the values: 'random_player', 'dragon-lvl-X' (where X is integer in range from 1 to 25), 'stockfish-lvl-X' (where X is integer in range from 0 to 20)
- player_black - same as white values, denotes all games as black
- games - total games the given white/black couple had
- win_rate_white - percent of the games won by the white player
- win_rate_black - same for black
- draw_rate - percent of Draws
- player_black_elo - only valid for Dragon levels (e.g. lvl 1 is 250, lvl 2 is 375, lvl 3 is 500 etc.), empty for other (non rated) players
- player_white_elo - same as above
- source - indicates which data source this row came from: 'refined', 'dragon_refined', 'dragon_vs_llm', 'misc_dragon', 'llm_vs_llm'


In [13]:
import csv
import json
from pathlib import Path
import re
from collections import defaultdict, Counter
from prettytable import PrettyTable

# Paths
REFINED_CSV = Path(".") / "refined.csv"
DRAGON_REFINED_CSV = Path(".") / "dragon_refined.csv"
WIN_RATES_CSV = Path(".") / "win_rates.csv"
DRAGON_VS_LLM_DIR = Path("..") / "_logs" / "_pre_aug_2025" / "dragon_vs_llm"
MISC_DRAGON_DIR = Path("..") / "_logs" / "_pre_aug_2025" / "misc" / "dragon"
LLM_VS_LLM_DIR = Path("..") / "_logs" / "_pre_aug_2025" / "llm_vs_llm"

# Prepare output header as per dataset specification
HEADER = [
    "player_white",
    "player_black",
    "games",
    "win_rate_white",
    "win_rate_black",
    "draw_rate",
    "player_white_elo",
    "player_black_elo",
    "source",
]

CURR_ROWS = 0

ROWS_OUT = []

# Counters per source
refined_count = 0
dragon_refined_count = 0
dragon_vs_llm_count = 0
misc_dragon_count = 0
llm_vs_llm_count = 0

# Helper functions
def dragon_elo(level):
    """Calculate Dragon Elo: lvl 1 = 250, lvl 2 = 375, etc."""
    return 250 + 125 * (level - 1)

def parse_dragon_level(name):
    """Extract dragon level from name like 'dragon-lvl-1' or 'lvl-1'"""
    match = re.search(r'lvl-(\d+)', name)
    return int(match.group(1)) if match else None

def read_aggregate_json(json_path):
    """Read _aggregate_results.json and return game stats"""
    with open(json_path, 'r') as f:
        data = json.load(f)
    return {
        'total_games': data['total_games'],
        'white_wins': data['white_wins'],
        'black_wins': data['black_wins'],
        'draws': data['draws']
    }

def compute_win_rates(white_wins, black_wins, draws, total_games):
    """Compute win rates as percentages"""
    if white_wins + black_wins + draws != total_games:
        raise ValueError(f"Inconsistent game counts: {white_wins} + {black_wins} + {draws} != {total_games}")
    
    win_rate_white = round(white_wins * 100.0 / total_games, 3)
    win_rate_black = round(black_wins * 100.0 / total_games, 3)
    draw_rate = round(draws * 100.0 / total_games, 3)
    
    return win_rate_white, win_rate_black, draw_rate

def preview_win_rates(rows_or_csv, preview_top=3, preview_bottom=5):
    """
    Print a pretty table preview of the win rates.
    Accepts either a path to a CSV file, a list of rows (with header as first row),
    or a list of dicts (all with identical keys).
    Shows top N rows, ellipsis if more than (top+bottom), and bottom M rows.
    Returns (rows, header).
    """
    # Determine if input is a path or already rows
    if hasattr(rows_or_csv, "open"):  # likely a Path
        with rows_or_csv.open("r", newline="") as f:
            reader = list(csv.reader(f))
            header = reader[0]
            rows = reader[1:]
    elif isinstance(rows_or_csv, list):
        if not rows_or_csv:
            print("No data to preview.")
            return [], []
        if isinstance(rows_or_csv[0], dict):
            header = list(rows_or_csv[0].keys())
            rows = [[row.get(col, "") for col in header] for row in rows_or_csv]
        else:
            header = rows_or_csv[0]
            rows = rows_or_csv[1:]
    else:
        raise ValueError("Input must be a Path, a list of rows (with header as first row), or a list of dicts")

    table = PrettyTable()
    table.field_names = header

    # Show top N rows if exists
    for i in range(min(preview_top, len(rows))):
        table.add_row(rows[i])

    # Add three dots row if there are more than (top+bottom) rows
    if len(rows) > (preview_top + preview_bottom):
        table.add_row(['...'] * len(header))

    # Add bottom M rows (or all remaining if less than M)
    for row in rows[-preview_bottom:]:
        table.add_row(row)

    print(table)
    return rows, header


## Ingesting Data

### 1. `refined.csv`

### 2. `dragon_refined.csv`


In [14]:
# Ingest refined.csv (LLM as Black vs Random Player)
with REFINED_CSV.open("r", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        total_games = int(row["total_games"])
        player_wins = int(row["player_wins"])
        opponent_wins = int(row["opponent_wins"])
        draws = int(row["draws"])

        # Basic sanity check
        if player_wins + opponent_wins + draws != total_games:
            raise ValueError(
                f"Row for player {row['Player']} has inconsistent win/loss/draw counts"
            )

        # Compute win rates as percentages (0-100)
        win_rate_white = round(opponent_wins * 100.0 / total_games, 3)
        win_rate_black = round(player_wins * 100.0 / total_games, 3)
        draw_rate = round(draws * 100.0 / total_games, 3)

        ROWS_OUT.append(
            {
                "player_white": "random_player",
                "player_black": row["Player"],
                "games": total_games,
                "win_rate_white": win_rate_white,
                "win_rate_black": win_rate_black,
                "draw_rate": draw_rate,
                "player_white_elo": "",
                "player_black_elo": "",
                "source": "refined",
            }
        )
        refined_count += 1

print(f"Loaded {refined_count} rows from refined.csv")

preview_win_rates(ROWS_OUT)


Loaded 122 rows from refined.csv
+---------------+------------------------------------+-------+----------------+----------------+-----------+------------------+------------------+---------+
|  player_white |            player_black            | games | win_rate_white | win_rate_black | draw_rate | player_white_elo | player_black_elo |  source |
+---------------+------------------------------------+-------+----------------+----------------+-----------+------------------+------------------+---------+
| random_player |        gpt-5-2025-08-07-low        |   43  |      0.0       |     95.349     |   4.651   |                  |                  | refined |
| random_player |     gpt-5-mini-2025-08-07-high     |   35  |      0.0       |     88.571     |   11.429  |                  |                  | refined |
| random_player |         o3-2025-04-16-low          |   42  |     2.381      |     90.476     |   7.143   |                  |                  | refined |
|      ...      |        

([['random_player',
   'gpt-5-2025-08-07-low',
   43,
   0.0,
   95.349,
   4.651,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-mini-2025-08-07-high',
   35,
   0.0,
   88.571,
   11.429,
   '',
   '',
   'refined'],
  ['random_player',
   'o3-2025-04-16-low',
   42,
   2.381,
   90.476,
   7.143,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-nano-2025-08-07-medium',
   33,
   3.03,
   90.909,
   6.061,
   '',
   '',
   'refined'],
  ['random_player',
   'o4-mini-2025-04-16-high',
   39,
   2.564,
   89.744,
   7.692,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-nano-2025-08-07-high',
   33,
   3.03,
   87.879,
   9.091,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-mini-2025-08-07-medium',
   33,
   0.0,
   81.818,
   18.182,
   '',
   '',
   'refined'],
  ['random_player',
   'o3-2025-04-16-medium',
   53,
   9.434,
   90.566,
   0.0,
   '',
   '',
   'refined'],
  ['random_player',
   'o1-2024-12-17-medium',
   41,
   2.439,
   8

In [15]:
# Ingest dragon_refined.csv (LLMs as Black vs Dragon levels)
if DRAGON_REFINED_CSV.exists():
    with DRAGON_REFINED_CSV.open("r", newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            total_games = int(row["total_games"])
            player_wins = int(row["player_wins"])  # LLM (Black) wins
            opponent_wins = int(row["opponent_wins"])  # Dragon (White) wins
            draws = int(row["draws"])

            # Sanity check
            if player_wins + opponent_wins + draws != total_games:
                raise ValueError(
                    f"Row for player {row['Player']} has inconsistent counts"
                )

            # The opponent in these rows is a dragon level (column name has a known typo: 'white_opponent')
            white_name = (
                row.get("white_opponent")
                or row.get("white_opponent")
                or row.get("opponent")
                or row.get("Opponent")
                or ""
            )
            dragon_level = parse_dragon_level(white_name)
            player_name = row.get("Player") or row.get("player")

            # Compute win rates
            win_rate_white = round(opponent_wins * 100.0 / total_games, 3)  # Dragon as White
            win_rate_black = round(player_wins * 100.0 / total_games, 3)    # LLM as Black
            draw_rate = round(draws * 100.0 / total_games, 3)

            # Resolve player_white label and Elo
            player_white = f"dragon-lvl-{dragon_level}" if dragon_level is not None else white_name
            player_white_elo = dragon_elo(dragon_level) if dragon_level is not None else ""

            ROWS_OUT.append(
                {
                    "player_white": player_white,
                    "player_black": player_name,
                    "games": total_games,
                    "win_rate_white": win_rate_white,
                    "win_rate_black": win_rate_black,
                    "draw_rate": draw_rate,
                    "player_white_elo": player_white_elo,
                    "player_black_elo": "",
                    "source": "dragon_refined",
                }
            )
            dragon_refined_count += 1

print(f"Loaded {dragon_refined_count} rows from dragon_refined.csv")


Loaded 43 rows from dragon_refined.csv


In [16]:
with REFINED_CSV.open("r", newline="") as f:
    reader = csv.DictReader(f)
    for row in reader:
        total_games = int(row["total_games"])
        player_wins = int(row["player_wins"])
        opponent_wins = int(row["opponent_wins"])
        draws = int(row["draws"])

        # Basic sanity check
        if player_wins + opponent_wins + draws != total_games:
            raise ValueError(
                f"Row for player {row['Player']} has inconsistent win/loss/draw counts"
            )

        # Compute win rates as percentages (0-100)
        win_rate_white = round(opponent_wins * 100.0 / total_games, 3)
        win_rate_black = round(player_wins * 100.0 / total_games, 3)
        draw_rate = round(draws * 100.0 / total_games, 3)

        ROWS_OUT.append(
            {
                "player_white": "random_player",
                "player_black": row["Player"],
                "games": total_games,
                "win_rate_white": win_rate_white,
                "win_rate_black": win_rate_black,
                "draw_rate": draw_rate,
                "player_white_elo": "",
                "player_black_elo": "",
                "source": "refined",
            }
        )

print(f"Loaded {len(ROWS_OUT)} rows from refined.csv")

preview_win_rates(ROWS_OUT)


Loaded 287 rows from refined.csv
+---------------+------------------------------------+-------+----------------+----------------+-----------+------------------+------------------+---------+
|  player_white |            player_black            | games | win_rate_white | win_rate_black | draw_rate | player_white_elo | player_black_elo |  source |
+---------------+------------------------------------+-------+----------------+----------------+-----------+------------------+------------------+---------+
| random_player |        gpt-5-2025-08-07-low        |   43  |      0.0       |     95.349     |   4.651   |                  |                  | refined |
| random_player |     gpt-5-mini-2025-08-07-high     |   35  |      0.0       |     88.571     |   11.429  |                  |                  | refined |
| random_player |         o3-2025-04-16-low          |   42  |     2.381      |     90.476     |   7.143   |                  |                  | refined |
|      ...      |        

([['random_player',
   'gpt-5-2025-08-07-low',
   43,
   0.0,
   95.349,
   4.651,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-mini-2025-08-07-high',
   35,
   0.0,
   88.571,
   11.429,
   '',
   '',
   'refined'],
  ['random_player',
   'o3-2025-04-16-low',
   42,
   2.381,
   90.476,
   7.143,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-nano-2025-08-07-medium',
   33,
   3.03,
   90.909,
   6.061,
   '',
   '',
   'refined'],
  ['random_player',
   'o4-mini-2025-04-16-high',
   39,
   2.564,
   89.744,
   7.692,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-nano-2025-08-07-high',
   33,
   3.03,
   87.879,
   9.091,
   '',
   '',
   'refined'],
  ['random_player',
   'gpt-5-mini-2025-08-07-medium',
   33,
   0.0,
   81.818,
   18.182,
   '',
   '',
   'refined'],
  ['random_player',
   'o3-2025-04-16-medium',
   53,
   9.434,
   90.566,
   0.0,
   '',
   '',
   'refined'],
  ['random_player',
   'o1-2024-12-17-medium',
   41,
   2.439,
   8

### 2. `_logs/dragon_vs_llm` - Dragon (White) vs LLM (Black)


In [17]:
# Process dragon_vs_llm directory
dragon_vs_llm_count = 0

if DRAGON_VS_LLM_DIR.exists():
    for pair_dir in DRAGON_VS_LLM_DIR.iterdir():
        if not pair_dir.is_dir():
            continue
        
        # Skip error/fail directories
        if pair_dir.name.startswith(('errors-', 'fails-')):
            continue
        
        # Parse directory name: lvl-X_vs_Y
        if '_vs_' not in pair_dir.name:
            continue
        
        white_part, black_part = pair_dir.name.split('_vs_', 1)
        
        # Extract dragon level
        dragon_level = parse_dragon_level(white_part)
        if dragon_level is None:
            continue
        
        # Aggregate across all timestamp directories
        total_games = 0
        total_white_wins = 0
        total_black_wins = 0
        total_draws = 0
        
        for timestamp_dir in pair_dir.iterdir():
            if not timestamp_dir.is_dir():
                continue
            
            aggregate_file = timestamp_dir / "_aggregate_results.json"
            if aggregate_file.exists():
                stats = read_aggregate_json(aggregate_file)
                total_games += stats['total_games']
                total_white_wins += stats['white_wins']
                total_black_wins += stats['black_wins']
                total_draws += stats['draws']
        
        # Skip if no games found
        if total_games == 0:
            continue
        
        # Compute win rates
        win_rate_white, win_rate_black, draw_rate = compute_win_rates(
            total_white_wins, total_black_wins, total_draws, total_games
        )
        
        # Add row
        ROWS_OUT.append({
            "player_white": f"dragon-lvl-{dragon_level}",
            "player_black": black_part,
            "games": total_games,
            "win_rate_white": win_rate_white,
            "win_rate_black": win_rate_black,
            "draw_rate": draw_rate,
            "player_white_elo": dragon_elo(dragon_level),
            "player_black_elo": "",
            "source": "dragon_vs_llm",
        })
        
        dragon_vs_llm_count += 1

print(f"Loaded {dragon_vs_llm_count} rows from dragon_vs_llm")


Loaded 28 rows from dragon_vs_llm


### 3. `_logs/misc/dragon` - Pre-aggregated Dragon vs Stockfish/Random


In [18]:
# Process misc/dragon directory
misc_dragon_count = 0

if MISC_DRAGON_DIR.exists():
    for json_file in MISC_DRAGON_DIR.glob("*.json"):
        # Parse filename: white_vs_black.json
        filename = json_file.stem
        if '_vs_' not in filename:
            continue
        
        white_name, black_name = filename.split('_vs_', 1)
        
        # Read the aggregated data
        stats = read_aggregate_json(json_file)
        
        # Compute win rates
        win_rate_white, win_rate_black, draw_rate = compute_win_rates(
            stats['white_wins'], stats['black_wins'], stats['draws'], stats['total_games']
        )
        
        # Determine Elo ratings
        white_elo = ""
        black_elo = ""
        
        # Check if white is dragon
        white_dragon_level = parse_dragon_level(white_name)
        if white_dragon_level is not None:
            white_elo = dragon_elo(white_dragon_level)
        
        # Check if black is dragon
        black_dragon_level = parse_dragon_level(black_name)
        if black_dragon_level is not None:
            black_elo = dragon_elo(black_dragon_level)
        
        # Add row
        ROWS_OUT.append({
            "player_white": white_name,
            "player_black": black_name,
            "games": stats['total_games'],
            "win_rate_white": win_rate_white,
            "win_rate_black": win_rate_black,
            "draw_rate": draw_rate,
            "player_white_elo": white_elo,
            "player_black_elo": black_elo,
            "source": "misc_dragon",
        })
        
        misc_dragon_count += 1

print(f"Loaded {misc_dragon_count} rows from misc/dragon")


Loaded 20 rows from misc/dragon


### 4. `_logs/llm_vs_llm` - LLM vs LLM


In [19]:
# Process llm_vs_llm directory
llm_vs_llm_count = 0

if LLM_VS_LLM_DIR.exists():
    for pair_dir in LLM_VS_LLM_DIR.iterdir():
        if not pair_dir.is_dir():
            continue
        
        # Parse directory name: white_vs_black
        if '_vs_' not in pair_dir.name:
            continue
        
        white_name, black_name = pair_dir.name.split('_vs_', 1)
        
        # Aggregate across all timestamp directories
        total_games = 0
        total_white_wins = 0
        total_black_wins = 0
        total_draws = 0
        
        for timestamp_dir in pair_dir.iterdir():
            if not timestamp_dir.is_dir():
                continue
            
            aggregate_file = timestamp_dir / "_aggregate_results.json"
            if aggregate_file.exists():
                stats = read_aggregate_json(aggregate_file)
                total_games += stats['total_games']
                total_white_wins += stats['white_wins']
                total_black_wins += stats['black_wins']
                total_draws += stats['draws']
        
        # Skip if no games found
        if total_games == 0:
            continue
        
        # Compute win rates
        win_rate_white, win_rate_black, draw_rate = compute_win_rates(
            total_white_wins, total_black_wins, total_draws, total_games
        )
        
        # Add row (no Elo for LLM vs LLM)
        ROWS_OUT.append({
            "player_white": white_name,
            "player_black": black_name,
            "games": total_games,
            "win_rate_white": win_rate_white,
            "win_rate_black": win_rate_black,
            "draw_rate": draw_rate,
            "player_white_elo": "",
            "player_black_elo": "",
            "source": "llm_vs_llm",
        })
        
        llm_vs_llm_count += 1

print(f"Loaded {llm_vs_llm_count} rows from llm_vs_llm")


Loaded 11 rows from llm_vs_llm


## Data Analysis and Model Name Standardization


In [20]:
# Collect all unique model names from all sources
all_model_names = set()

for row in ROWS_OUT:
    # Skip non-LLM players
    if row['player_white'] not in ['random_player'] and not row['player_white'].startswith(('dragon-lvl-', 'stockfish-lvl-')):
        all_model_names.add(row['player_white'])
    if row['player_black'] not in ['random_player'] and not row['player_black'].startswith(('dragon-lvl-', 'stockfish-lvl-')):
        all_model_names.add(row['player_black'])

print(f"Found {len(all_model_names)} unique model names across all sources:")
print("=" * 60)

# Group by data source
models_by_source = defaultdict(set)
for row in ROWS_OUT:
    source = row['source']
    if row['player_white'] not in ['random_player'] and not row['player_white'].startswith(('dragon-lvl-', 'stockfish-lvl-')):
        models_by_source[source].add(row['player_white'])
    if row['player_black'] not in ['random_player'] and not row['player_black'].startswith(('dragon-lvl-', 'stockfish-lvl-')):
        models_by_source[source].add(row['player_black'])

for source, models in models_by_source.items():
    print(f"\n{source.upper()} ({len(models)} models):")
    for model in sorted(models):
        print(f"  - {model}")

print("\n" + "=" * 60)


Found 147 unique model names across all sources:

REFINED (122 models):
  - amazon.nova-lite-v1
  - amazon.nova-pro-v1
  - chat-bison-32k@002
  - claude-v3-5-haiku
  - claude-v3-5-sonnet-v1
  - claude-v3-5-sonnet-v2
  - claude-v3-7-sonnet
  - claude-v3-7-sonnet-thinking_10000
  - claude-v3-7-sonnet-thinking_1024
  - claude-v3-7-sonnet-thinking_2048
  - claude-v3-7-sonnet-thinking_5000
  - claude-v3-haiku
  - claude-v3-opus
  - claude-v4-opus
  - claude-v4-opus-thinking_16000
  - claude-v4-sonnet
  - claude-v4-sonnet-thinking_16000
  - deephermes-3-llama-3-8b-preview@q8
  - deepseek-chat-v3
  - deepseek-chat-v3-0324
  - deepseek-r1-distill-qwen-14b@q8_0
  - deepseek-r1-distill-qwen-32b@q4_k_m
  - deepseek-reasoner-r1
  - gemini-1.5-flash-001
  - gemini-1.5-pro-preview-0409
  - gemini-2.0-flash-001
  - gemini-2.0-flash-exp
  - gemini-2.0-flash-lite-001
  - gemini-2.0-flash-lite-preview-02-05
  - gemini-2.0-flash-thinking-exp-01-21
  - gemini-2.0-flash-thinking-exp-1219
  - gemini-2.5-fla

In [21]:
# Analyze potential duplicates/variants
print("POTENTIAL MODEL NAME VARIANTS:")
print("=" * 60)

# Group similar names (simple heuristic based on common substrings)
model_groups = defaultdict(list)
for model in sorted(all_model_names):
    # Extract base model name (before @, -, or other separators)
    base_name = model.split('@')[0].split('-')[0].split('_')[0]
    model_groups[base_name].append(model)

# Show groups with multiple variants
for base_name, variants in model_groups.items():
    if len(variants) > 1:
        print(f"\n{base_name.upper()} variants:")
        for variant in variants:
            # Show which sources contain this variant
            sources = []
            for source, models in models_by_source.items():
                if variant in models:
                    sources.append(source)
            print(f"  - {variant} (sources: {', '.join(sources)})")

# Check for exact duplicates across sources
print("\n" + "=" * 60)
print("EXACT DUPLICATES ACROSS SOURCES:")
print("=" * 60)

for model in sorted(all_model_names):
    sources_with_model = []
    for source, models in models_by_source.items():
        if model in models:
            sources_with_model.append(source)
    
    if len(sources_with_model) > 1:
        print(f"{model}: {', '.join(sources_with_model)}")

print("\n" + "=" * 60)


POTENTIAL MODEL NAME VARIANTS:

3X variants:
  - 3x-o4-mini-2025-04-16-low_41mini-t03 (sources: dragon_vs_llm)
  - 3x-o4-mini-2025-04-16-low_o4-mini-2025-04-16-medium (sources: dragon_vs_llm)

4O variants:
  - 4o_mini (sources: llm_vs_llm)
  - 4o_mini_messaed_player_names (sources: llm_vs_llm)

AMAZON.NOVA variants:
  - amazon.nova-lite-v1 (sources: refined)
  - amazon.nova-pro-v1 (sources: refined)

CLAUDE variants:
  - claude-3-7-sonnet-20250219-thinking-budget-10000 (sources: dragon_refined, dragon_vs_llm)
  - claude-3-7-sonnet-20250219-thinking-budget-5000 (sources: dragon_refined, dragon_vs_llm)
  - claude-v3-5-haiku (sources: refined)
  - claude-v3-5-sonnet-v1 (sources: refined)
  - claude-v3-5-sonnet-v2 (sources: refined)
  - claude-v3-7-sonnet (sources: refined)
  - claude-v3-7-sonnet-thinking_10000 (sources: refined)
  - claude-v3-7-sonnet-thinking_1024 (sources: refined)
  - claude-v3-7-sonnet-thinking_2048 (sources: refined)
  - claude-v3-7-sonnet-thinking_5000 (sources: ref

In [22]:
# Create a standardization mapping
# This will be a manual mapping that we can adjust based on the analysis above
model_name_mapping = {
    # Add mappings here based on the analysis
    # Example: 'variant_name': 'standard_name'
}

# Function to standardize model names
def standardize_model_name(name):
    """Apply standardization mapping to model names"""
    return model_name_mapping.get(name, name)

# Apply standardization to all rows
standardized_rows = []
for row in ROWS_OUT:
    new_row = row.copy()
    new_row['player_white'] = standardize_model_name(row['player_white'])
    new_row['player_black'] = standardize_model_name(row['player_black'])
    standardized_rows.append(new_row)

print(f"Applied standardization mapping to {len(standardized_rows)} rows")
print(f"Mapping rules: {len(model_name_mapping)}")

# Check if any standardization was applied
changes_made = sum(1 for i, row in enumerate(ROWS_OUT) 
                  if (standardized_rows[i]['player_white'] != row['player_white'] or 
                      standardized_rows[i]['player_black'] != row['player_black']))

print(f"Rows with changes: {changes_made}")

# Use standardized rows for output
ROWS_OUT = standardized_rows


Applied standardization mapping to 346 rows
Mapping rules: 0
Rows with changes: 0


## Final Output


In [23]:
# Ensure deterministic ordering by player_white, then player_black
ROWS_OUT.sort(key=lambda x: (x["player_white"].lower(), x["player_black"].lower()))

# Write to CSV (overwrite)
with WIN_RATES_CSV.open("w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=HEADER)
    writer.writeheader()
    writer.writerows(ROWS_OUT)

print(f"\nTotal rows written to {WIN_RATES_CSV}: {len(ROWS_OUT)}")
print(f"- From refined.csv: {len(ROWS_OUT) - dragon_vs_llm_count - misc_dragon_count - llm_vs_llm_count}")
print(f"- From dragon_vs_llm: {dragon_vs_llm_count}")
print(f"- From misc/dragon: {misc_dragon_count}")
print(f"- From llm_vs_llm: {llm_vs_llm_count}")



Total rows written to win_rates.csv: 346
- From refined.csv: 287
- From dragon_vs_llm: 28
- From misc/dragon: 20
- From llm_vs_llm: 11


In [24]:
preview_win_rates_csv(WIN_RATES_CSV)

PREVIEW_ROWS = CURR_ROWS
CURR_ROWS = len(rows)

print(f"Total rows: {CURR_ROWS}, rows added: {CURR_ROWS - PREVIEW_ROWS}")

# Show breakdown by data source
source_counts = Counter(row[8] for row in rows)  # source is column 8
print(f"\nBreakdown by source:")
for source, count in source_counts.items():
    print(f"- {source}: {count} rows")

# Show breakdown by player types
dragon_rows = sum(1 for row in rows if 'dragon-lvl-' in row[0] or 'dragon-lvl-' in row[1])
random_rows = sum(1 for row in rows if 'random' in row[0] or 'random' in row[1])
stockfish_rows = sum(1 for row in rows if 'stockfish' in row[0] or 'stockfish' in row[1])
llm_only_rows = len(rows) - dragon_rows - random_rows - stockfish_rows

print(f"\nBreakdown by player types:")
print(f"- Rows with Dragon: {dragon_rows}")
print(f"- Rows with Random: {random_rows}")
print(f"- Rows with Stockfish: {stockfish_rows}")
print(f"- LLM vs LLM only: {llm_only_rows}")


NameError: name 'preview_win_rates_csv' is not defined