In [1]:
# !pip install pandas Jinja2 matplotlib

In [2]:
import os
import sys
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
sys.path.append(os.path.dirname(os.getcwd()))
from aggregate_logs_to_csv import aggregate_models_to_csv, MODEL_OVERRIDES
from aggr_logs_to_plain_csv import aggregate_logs_to_csv

## Get Aggregate CSV

In [3]:
# Path constants for this notebook.
# NOTE(review): LOGS_DIR has no "../" prefix while the call below reads from
# "../_logs/no_reflection" — confirm which working directory these are relative to.
# NOTE(review): AGGREGATE_CSV and REFINED_CSV are not used by the cells visible here;
# presumably they are consumed further down — verify before removing.
LOGS_DIR = "_logs/no_reflection"
AGGREGATE_CSV = os.path.join(LOGS_DIR, "aggregate_models.csv")
REFINED_CSV = "data_processing/refined.csv"

# Run the per-model aggregation over the no-reflection logs; "aggr.csv" is the file
# name the next cell reads back with pandas.
aggregate_models_to_csv("../_logs/no_reflection","aggr.csv")

In [4]:
# Load the per-model aggregate table from "aggr.csv" (the output name passed to
# aggregate_models_to_csv in the previous cell) and print every row and column.
csv_file_path = "aggr.csv"
# CSV cols:
# model_name,total_games,black_llm_wins,white_rand_wins,draws,black_llm_wins_percent,black_llm_draws_percent,llm_total_moves,llm_wrong_actions,llm_wrong_moves,llm_avg_material,llm_std_dev_material,rand_avg_material,rand_std_dev_material,material_diff_llm_minus_rand,material_diff_llm_minus_rand_per_100moves,wrong_actions_per_100moves,wrong_moves_per_100moves,wrong_actions_per_1000moves,wrong_moves_per_1000moves,mistakes_per_1000moves,std_dev_wrong_actions_per_1000moves,std_dev_wrong_moves_per_1000moves,std_dev_mistakes_per_1000moves,average_moves,std_dev_moves,completion_tokens_black,completion_tokens_black_per_move,min_moves,max_moves,prompt_tokens_black,total_tokens_black,moe_material_diff,moe_avg_moves,moe_wrong_actions_per_1000moves,moe_wrong_moves_per_1000moves,moe_mistakes_per_1000moves

df_aggr = pd.read_csv(csv_file_path)
# index=False drops the row numbers so the printed table starts with model_name.
print(df_aggr.to_string(index=False))

# Disabled alternative: show only a handful of move/error columns instead of the full table.
# selected_columns = df_aggr[["model_name", "total_games", "wrong_actions_per_100moves", "wrong_moves_per_100moves", "min_moves", "max_moves", "average_moves", "std_dev_moves"]]

# Print the DataFrame as a properly tabbed table with headers
# print(selected_columns.to_string(index=False))



                           model_name  total_games  black_llm_wins  white_rand_wins  draws  black_llm_wins_percent  black_llm_draws_percent  llm_total_moves  llm_wrong_actions  llm_wrong_moves  llm_avg_material  llm_std_dev_material  rand_avg_material  rand_std_dev_material  material_diff_llm_minus_rand  material_diff_llm_minus_rand_per_100moves  wrong_actions_per_100moves  wrong_moves_per_100moves  wrong_actions_per_1000moves  wrong_moves_per_1000moves  mistakes_per_1000moves  std_dev_wrong_actions_per_1000moves  std_dev_wrong_moves_per_1000moves  std_dev_mistakes_per_1000moves  average_moves  std_dev_moves  completion_tokens_black  completion_tokens_black_per_move  min_moves  max_moves  prompt_tokens_black  total_tokens_black  moe_material_diff  moe_avg_moves  moe_wrong_actions_per_1000moves  moe_wrong_moves_per_1000moves  moe_mistakes_per_1000moves
         anthropic.claude-v3-5-sonnet           30               0                8     22                0.000000                73.333

## Get Flattened (Plain) CSV with Logs

In [5]:
def process_logs_to_dataframe(logs_path, output_csv, model_dict):
    """
    Flatten game logs into a DataFrame that carries a normalized ``model`` column.

    The logs are first written to ``output_csv`` by ``aggregate_logs_to_csv`` and then
    read back with pandas. A ``model`` column is inserted directly after ``path``. Its
    value is ``player_black_model`` unless the directory part of ``path`` ends with one
    of the keys of ``model_dict``, in which case the mapped name is used instead.

    Args:
        logs_path (str): Path to the logs directory.
        output_csv (str): Path to save the intermediate CSV file.
        model_dict (dict): Maps a log-directory suffix to the model name to report.
            Keys are tried in dict order and the first match wins; empty keys are
            ignored because every directory name ends with the empty string.

    Returns:
        pd.DataFrame: One row per log with the extra ``model`` column.
    """
    # Aggregate logs into a CSV, then load that CSV.
    aggregate_logs_to_csv(logs_path, output_csv)
    df_plain = pd.read_csv(output_csv)

    def resolve_model_name(log_path, fallback):
        """Return the override for this log's directory, or ``fallback`` if none applies."""
        log_dir = os.path.dirname(log_path)  # computed once per row, not once per key
        for suffix, override in model_dict.items():
            if suffix and log_dir.endswith(suffix):
                return override
        return fallback

    # Resolve row by row with a plain comprehension instead of DataFrame.apply(axis=1):
    # on an empty frame apply() hands back a DataFrame rather than a Series, which makes
    # the single-column assignment fail when no logs were found.
    resolved_models = [
        resolve_model_name(log_path, black_model)
        for log_path, black_model in zip(df_plain["path"], df_plain["player_black_model"])
    ]

    # Place 'model' right after 'path' so it is visible next to the log location.
    df_plain.insert(df_plain.columns.get_loc("path") + 1, "model", resolved_models)

    return df_plain

# Build the flattened per-game table for the no-reflection runs (intermediate file:
# plain.csv, model names normalized via MODEL_OVERRIDES) and preview its first two rows.
df_plain = process_logs_to_dataframe(
    logs_path="../_logs/no_reflection",
    output_csv="plain.csv",
    model_dict=MODEL_OVERRIDES,
)
display(df_plain.iloc[:2])


Unnamed: 0,path,model,time_started,winner,reason,number_of_moves,player_white_name,player_white_wrong_moves,player_white_wrong_actions,player_white_reflections_used,...,material_count_black,player_black_name,player_black_wrong_moves,player_black_wrong_actions,player_black_reflections_used,player_black_reflections_used_before_board,player_black_model,black_model_prompt_tokens,black_model_completion_tokens,black_model_total_tokens
0,../_logs/no_reflection/2025-01-19_anthropic.cl...,anthropic.claude-v3-5-sonnet-v2,2025.01.19_17:52,NONE,Max moves reached,200,Random_Player,0,0,0,...,11,Player_Black,1,0,0,0,anthropic.claude-v3-5-sonnet-v2,113375,18006,131381
1,../_logs/no_reflection/2025-01-19_anthropic.cl...,anthropic.claude-v3-5-sonnet-v2,2025.01.19_15:59,NONE,Max moves reached,200,Random_Player,0,0,0,...,8,Player_Black,0,0,0,0,anthropic.claude-v3-5-sonnet-v2,119092,18737,137829


## Compare Aggr to Aggr-from-Plain

In [6]:
# Cross-check: rebuild per-model aggregates from the flattened per-game rows (df_plain,
# loaded from plain.csv) so they can be compared with the ones in aggr.csv. The number
# of log files should equal the sum of total_games.

# df_plain columns (the 'model' column is inserted afterwards by process_logs_to_dataframe)
# path,time_started,winner,reason,number_of_moves,player_white_name,player_white_wrong_moves,player_white_wrong_actions,player_white_reflections_used,player_white_reflections_used_before_board,player_white_model,material_count_white,material_count_black,player_black_name,player_black_wrong_moves,player_black_wrong_actions,player_black_reflections_used,player_black_reflections_used_before_board,player_black_model,black_model_prompt_tokens,black_model_completion_tokens,black_model_total_tokens


def _sample_std(values):
    """Sample standard deviation (ddof=1) of one group's values."""
    return values.std(ddof=1)


def _std_err(values):
    """Standard error of the mean: sample std divided by sqrt(number of rows in the group)."""
    return values.std(ddof=1) / (len(values) ** 0.5)


def _plain_aggregation_spec():
    """Return the named-aggregation spec; insertion order defines the output column order."""
    spec = {
        'total_games': ('number_of_moves', 'count'),
        'black_llm_wins': ('winner', lambda winners: (winners == 'Player_Black').sum()),
        'white_rand_wins': ('winner', lambda winners: (winners == 'Random_Player').sum()),
        'draws': ('winner', lambda winners: (winners == 'NONE').sum()),
        'sum_wrong_actions': ('player_black_wrong_actions', 'sum'),
        'sum_wrong_moves': ('player_black_wrong_moves', 'sum'),
        'sum_moves': ('number_of_moves', 'sum'),
        'min_moves': ('number_of_moves', 'min'),
        'max_moves': ('number_of_moves', 'max'),
        'average_moves': ('number_of_moves', 'mean'),
        'std_dev_moves': ('number_of_moves', _sample_std),
    }

    # Material counts: mean / std / std-err, white first and then black.
    for side in ('white', 'black'):
        material_col = f'material_count_{side}'
        spec[f'average_{material_col}'] = (material_col, 'mean')
        spec[f'std_dev_{material_col}'] = (material_col, _sample_std)
        spec[f'std_err_{material_col}'] = (material_col, _std_err)

    # Token counters: the total keeps the source column name, followed by mean / std / std-err.
    for token_kind in ('prompt', 'completion', 'total'):
        token_col = f'black_model_{token_kind}_tokens'
        spec[token_col] = (token_col, 'sum')
        spec[f'average_{token_col}'] = (token_col, 'mean')
        spec[f'std_dev_{token_col}'] = (token_col, _sample_std)
        spec[f'std_err_{token_col}'] = (token_col, _std_err)

    return spec


# Move counts grouped by the normalized 'model' column (not used by the aggregation below).
grouped_data = df_plain.groupby('model')['number_of_moves']

# One row per model; reset_index() turns the group key back into a regular 'model' column.
aggregates_from_plain = (
    df_plain
    .groupby('model')
    .agg(**_plain_aggregation_spec())
    .reset_index()
)

# # Now compute normalized values
# aggregates['wrong_actions_per_100moves'] = (aggregates['sum_wrong_actions'] / aggregates['sum_moves']) * 100
# aggregates['wrong_moves_per_100moves'] = (aggregates['sum_wrong_moves'] / aggregates['sum_moves']) * 100

# Total number of games according to aggr.csv ...
df_aggr_total_games_sum = df_aggr.loc[:, "total_games"].sum()
print(f"Sum of total_games in df_aggr: {df_aggr_total_games_sum}")

# ... and according to the aggregates rebuilt from the per-game rows; the two should agree.
aggregates_total_games_sum = aggregates_from_plain.loc[:, "total_games"].sum()
print(f"Sum of total_games in aggregates: {aggregates_total_games_sum}")

# Column names of both frames: a label line followed by the list on its own line.
print("Columns in df_aggr:", df_aggr.columns.tolist(), sep="\n")
print("Columns in aggregates_from_plain:", aggregates_from_plain.columns.tolist(), sep="\n")


Sum of total_games in df_aggr: 1902
Sum of total_games in aggregates: 1902
Columns in df_aggr:
['model_name', 'total_games', 'black_llm_wins', 'white_rand_wins', 'draws', 'black_llm_wins_percent', 'black_llm_draws_percent', 'llm_total_moves', 'llm_wrong_actions', 'llm_wrong_moves', 'llm_avg_material', 'llm_std_dev_material', 'rand_avg_material', 'rand_std_dev_material', 'material_diff_llm_minus_rand', 'material_diff_llm_minus_rand_per_100moves', 'wrong_actions_per_100moves', 'wrong_moves_per_100moves', 'wrong_actions_per_1000moves', 'wrong_moves_per_1000moves', 'mistakes_per_1000moves', 'std_dev_wrong_actions_per_1000moves', 'std_dev_wrong_moves_per_1000moves', 'std_dev_mistakes_per_1000moves', 'average_moves', 'std_dev_moves', 'completion_tokens_black', 'completion_tokens_black_per_move', 'min_moves', 'max_moves', 'prompt_tokens_black', 'total_tokens_black', 'moe_material_diff', 'moe_avg_moves', 'moe_wrong_actions_per_1000moves', 'moe_wrong_moves_per_1000moves', 'moe_mistakes_per_1000

In [7]:
# Print the aggregates rebuilt from the per-game rows as one plain-text table
# (all columns, row index included) for eyeballing against the aggr.csv printout.
print(aggregates_from_plain.to_string())

                                                model  total_games  black_llm_wins  white_rand_wins  draws  sum_wrong_actions  sum_wrong_moves  sum_moves  min_moves  max_moves  average_moves  std_dev_moves  average_material_count_white  std_dev_material_count_white  std_err_material_count_white  average_material_count_black  std_dev_material_count_black  std_err_material_count_black  black_model_prompt_tokens  average_black_model_prompt_tokens  std_dev_black_model_prompt_tokens  std_err_black_model_prompt_tokens  black_model_completion_tokens  average_black_model_completion_tokens  std_dev_black_model_completion_tokens  std_err_black_model_completion_tokens  black_model_total_tokens  average_black_model_total_tokens  std_dev_black_model_total_tokens  std_err_black_model_total_tokens
0                        anthropic.claude-v3-5-sonnet           30               0                8     22                  0               12       5148         10        200     171.600000      55.148452 

In [9]:
# Maps every df_aggr column to its counterpart in aggregates_from_plain.
# A value of None marks a df_aggr column with no direct counterpart (percentages,
# per-move rates, margins of error, ...); the comparison loop skips those entries.
# Key order follows the df_aggr column order.
column_mapping = dict(
    # Identity and game outcome counts
    model_name='model',
    total_games='total_games',
    black_llm_wins='black_llm_wins',
    white_rand_wins='white_rand_wins',
    draws='draws',
    black_llm_wins_percent=None,
    black_llm_draws_percent=None,
    # Move and mistake totals
    llm_total_moves='sum_moves',
    llm_wrong_actions='sum_wrong_actions',
    llm_wrong_moves='sum_wrong_moves',
    # Material: the LLM plays black, the random player plays white
    llm_avg_material='average_material_count_black',
    llm_std_dev_material='std_dev_material_count_black',
    rand_avg_material='average_material_count_white',
    rand_std_dev_material='std_dev_material_count_white',
    material_diff_llm_minus_rand=None,
    material_diff_llm_minus_rand_per_100moves=None,
    # Normalized mistake rates (no direct match in aggregates_from_plain)
    wrong_actions_per_100moves=None,
    wrong_moves_per_100moves=None,
    wrong_actions_per_1000moves=None,
    wrong_moves_per_1000moves=None,
    mistakes_per_1000moves=None,
    std_dev_wrong_actions_per_1000moves=None,
    std_dev_wrong_moves_per_1000moves=None,
    std_dev_mistakes_per_1000moves=None,
    # Game length and token usage
    average_moves='average_moves',
    std_dev_moves='std_dev_moves',
    completion_tokens_black='black_model_completion_tokens',
    completion_tokens_black_per_move=None,
    min_moves='min_moves',
    max_moves='max_moves',
    prompt_tokens_black='black_model_prompt_tokens',
    total_tokens_black='black_model_total_tokens',
    # Margins of error (no direct match in aggregates_from_plain)
    moe_material_diff=None,
    moe_avg_moves=None,
    moe_wrong_actions_per_1000moves=None,
    moe_wrong_moves_per_1000moves=None,
    moe_mistakes_per_1000moves=None,
)

def _report_discrepancy(model_name, column, aggr_value, plain_value):
    """Print one mismatch between df_aggr and aggregates_from_plain (three lines)."""
    print(f"Discrepancy for model '{model_name}' in column '{column}':")
    print(f"  df_aggr value: {aggr_value}")
    print(f"  aggregates_from_plain value: {plain_value}")


# Walk every model in df_aggr and compare each mapped column with the value rebuilt
# from the per-game rows. Only models present in df_aggr are visited.
for index, row in df_aggr.iterrows():
    model_name = row['model_name']

    # Find the corresponding row in aggregates_from_plain
    matching_row = aggregates_from_plain[aggregates_from_plain['model'] == model_name]

    if matching_row.empty:
        print(f"Model '{model_name}' not found in aggregates_from_plain.")
        continue

    plain_row = matching_row.iloc[0]

    for df_aggr_col, aggregates_col in column_mapping.items():
        if aggregates_col is None:
            # Skip columns that have no mapping
            continue

        # Explicit membership test: Series.get() returns its default instead of raising
        # KeyError, so a missing column would otherwise surface as a NaN "discrepancy".
        if aggregates_col not in plain_row.index:
            print(f"Column '{aggregates_col}' not found in aggregates_from_plain for model '{model_name}'.")
            continue

        # Coerce both sides to numbers; non-numeric values (e.g. the model name itself)
        # become NaN on both sides and are therefore not compared.
        try:
            df_aggr_value = pd.to_numeric(row[df_aggr_col], errors='coerce')
            aggregates_value = pd.to_numeric(plain_row[aggregates_col], errors='coerce')
        except (TypeError, ValueError) as e:
            print(f"Error converting values to numeric for column '{df_aggr_col}': {e}")
            continue

        aggr_missing = pd.isna(df_aggr_value)
        plain_missing = pd.isna(aggregates_value)
        if aggr_missing and plain_missing:
            continue

        # A mismatch is either "only one side is missing" or two numbers that differ by
        # more than the tolerance (atol=1e-6 on top of numpy's default relative tolerance).
        if aggr_missing != plain_missing or not np.isclose(df_aggr_value, aggregates_value, atol=1e-6):
            _report_discrepancy(model_name, df_aggr_col, df_aggr_value, aggregates_value)


Model 'deepseek-r1-distill-qwen-32b@q4_k_m' not found in aggregates_from_plain.
Discrepancy for model 'gemini-2.0-flash-thinking-exp-01-21' in column 'draws':
  df_aggr value: 2
  aggregates_from_plain value: 1
