In [1]:
import codenamesLLM
import pandas as pd
import openpyxl
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Path to your Excel file
file_path = "experiment_data\model_tournament\model_tournament_missing.xlsx" #compy the model_tournament_input to use one new

# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

# Iterate through the rows using titled columns
for index, row in tqdm(df.iterrows(), total=len(df)):
    # Access values using column titles
    red_team = row['red_model']  # Replace with the actual column title for the red team
    blue_team = row['blue_model']  # Replace with the actual column title for the blue team
    playable = True


    already_played = pd.notna(row['winner'])
    
    if ((pd.notna(red_team) and pd.notna(blue_team)) and not(already_played)) and playable:  # Check if both values are not NaN
        try:
            print(f"Playing {red_team} vs {blue_team}...")
            # Call your function and get the result
            result = codenamesLLM.play_game(red_model = red_team, blue_model = blue_team)
            print(result)
            red_stats = codenamesLLM.analyze_team_guesses(result[3], "RED")
            blue_stats = codenamesLLM.analyze_team_guesses(result[3], "BLUE")

            df.at[index, 'red_model'] = red_team
            df.at[index, 'blue_model'] = blue_team
            df.at[index, 'winner'] = result[0]  
            df.at[index, 'red_avg_words_2guess'] = red_stats['average_expected_guesses']
            df.at[index, 'blue_avg_words_2guess'] = blue_stats['average_expected_guesses']
            df.at[index, 'red_avg_words_guessed'] = red_stats['average_correct_guesses']
            df.at[index, 'blue_avg_words_guessed'] = blue_stats['average_correct_guesses']
            df.at[index, 'reason'] = result[1]
            df.at[index, 'red_turns'] = red_stats['total_hints']
            df.at[index, 'blue_turns'] = blue_stats['total_hints']
            df.at[index, 'red_cib'] = result[4]
            df.at[index, 'blue_cib'] = result[5]

        except Exception as e:
            print(f"skipped game between {red_team} and {blue_team}: {e}")

    # Write the updated DataFrame back to the same Excel file
    df.to_excel(file_path, index=False)
    if not already_played:
        print(f"Executed {red_team} vs {blue_team}")


 57%|█████▋    | 13/23 [00:00<00:00, 41.28it/s]

Playing claude-3-5-haiku-latest vs gpt-3.5-turbo...


100%|██████████| 23/23 [04:17<00:00, 11.20s/it]

('BLUE', 'killer word selected', 5, ['Team RED spymaster said: GAME (3).', 'Team RED said: DICE. The word was RED.', 'Team RED said: ROULETTE. The word was neutral.', 'Team BLUE spymaster said: SILVER (2).', 'Team BLUE said: COPPER. The word was neutral.', 'Team RED spymaster said: AUTHORITY (4).', 'Team RED said: POLICE. The word was neutral.', 'Team BLUE spymaster said: EQUESTRIAN (2).', 'Team BLUE said: HORSESHOE. The word was BLUE.', 'Team BLUE said: HORSE. The word was not in board', 'Team RED spymaster said: DOCUMENT (3).', 'Team RED said: PAPER. The word was RED.', 'Team RED said: ROOT. The word was KILLER.'], 0, 0)
Executed claude-3-5-haiku-latest vs gpt-3.5-turbo





In [11]:
import pandas as pd

def process_tournament_data(input_path, output_path):
    # Load the data
    df = pd.read_excel(input_path)

    # Function to calculate metrics
    def calculate_metrics(group):
        wins = group['winner'] == group['role'].str.upper()
        losses = ~wins
        card_finished = group['reason'] == 'cards finished'
        killer_word = group['reason'] == 'killer word selected'

        return {
            "model_name": group["model"].iloc[0],
            "games_played": len(group),
            "wins": wins.sum(),
            "win_percentage": 100 * wins.sum() / len(group),
            "win_by_cards_finished": (wins & card_finished).sum(),
            "wins_by_killer_words": (wins & killer_word).sum(),
            "losses_by_card_finished": (losses & card_finished).sum(),
            "losses_by_killer_words": (losses & killer_word).sum(),
            "average_word_to_guess": group['avg_words_2guess'].mean(),
            "average_word_to_guess_when_wins": group.loc[wins, 'avg_words_2guess'].mean(),
            "average_word_to_guess_when_lose": group.loc[losses, 'avg_words_2guess'].mean(),
            "average_word_to_guess_when_wins_by_ending_cards": group.loc[wins & card_finished, 'avg_words_2guess'].mean(),
            "average_word_to_guess_when_loses_by_ending_cards": group.loc[losses & card_finished, 'avg_words_2guess'].mean(),
            "average_word_to_guess_when_wins_by_killer_card": group.loc[wins & killer_word, 'avg_words_2guess'].mean(),
            "average_word_to_guess_when_loses_by_killer_card": group.loc[losses & killer_word, 'avg_words_2guess'].mean(),
            "average_word_guessed": group['avg_words_guessed'].mean(),
            "average_word_guessed_when_wins": group.loc[wins, 'avg_words_guessed'].mean(),
            "average_word_guessed_when_lose": group.loc[losses, 'avg_words_guessed'].mean(),
            "average_word_guessed_when_wins_by_ending_cards": group.loc[wins & card_finished, 'avg_words_guessed'].mean(),
            "average_word_guessed_when_loses_by_ending_cards": group.loc[losses & card_finished, 'avg_words_guessed'].mean(),
            "average_word_guessed_when_wins_by_killer_card": group.loc[wins & killer_word, 'avg_words_guessed'].mean(),
            "average_word_guessed_when_loses_by_killer_card": group.loc[losses & killer_word, 'avg_words_guessed'].mean(),
            "average_turns": group['turns'].mean(),
            "average_turns_when_wins": group.loc[wins, 'turns'].mean(),
            "average_turns_when_lose": group.loc[losses, 'turns'].mean(),
            "average_turns_when_wins_by_ending_cards": group.loc[wins & card_finished, 'turns'].mean(),
            "average_turns_when_loses_by_ending_cards": group.loc[losses & card_finished, 'turns'].mean(),
            "average_turns_when_wins_by_killer_cards": group.loc[wins & killer_word, 'turns'].mean(),
            "average_turns_when_loses_by_killer_cards": group.loc[losses & killer_word, 'turns'].mean(),
            "total_cib": group['cib'].sum()
        }

    # Reshape the dataset to treat roles equivalently
    red_df = df.rename(columns=lambda x: x.replace('red_', '')).assign(role='red', model=df['red_model'])
    blue_df = df.rename(columns=lambda x: x.replace('blue_', '')).assign(role='blue', model=df['blue_model'])
    combined_df = pd.concat([red_df, blue_df], ignore_index=True)

    # Metrics for models as a whole
    overall_metrics = combined_df.groupby("model").apply(calculate_metrics).apply(pd.Series)

    # Metrics for models playing as Red
    red_metrics = red_df.groupby("model").apply(calculate_metrics).apply(pd.Series)
    red_metrics['role'] = 'red'

    # Metrics for models playing as Blue
    blue_metrics = blue_df.groupby("model").apply(calculate_metrics).apply(pd.Series)
    blue_metrics['role'] = 'blue'

    # Save the results to an Excel file with three sheets
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        overall_metrics.to_excel(writer, sheet_name='Overall', index=False)
        red_metrics.to_excel(writer, sheet_name='Red', index=False)
        blue_metrics.to_excel(writer, sheet_name='Blue', index=False)


In [12]:
input_path = 'experiment_data\model_tournament\model_tournament.xlsx'  # Replace with your input file path
output_path = 'experiment_data\model_tournament\model_tournament_stats.xlsx'  # Replace with your output file path

# Run the process
process_tournament_data(input_path, output_path)

  overall_metrics = combined_df.groupby("model").apply(calculate_metrics).apply(pd.Series)
  red_metrics = red_df.groupby("model").apply(calculate_metrics).apply(pd.Series)
  blue_metrics = blue_df.groupby("model").apply(calculate_metrics).apply(pd.Series)
