In [None]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment; later
# cells read DATA_DIR and OPENAI_API_KEY from os.environ.
load_dotenv()

# Select the visible GPU. This runs in the first cell, before the cell that
# imports torch, so keep it ahead of any CUDA-using import.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Set to the GPU you want to use

In [None]:
import re
import datetime
import traceback
from tqdm import tqdm
import random
import pickle
import collections
import itertools
from copy import deepcopy
from functools import partial
import dill
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import lovely_tensors as lt
lt.monkey_patch()

from llmg.utils.mix import seed_all
from llmg.chameleon.utils import (
    load_game_logs,
    construct_result_dict,
    collect_llm_preds_at_breakpoints,
)
from llmg.chameleon.NaturalLanguageTalker.naturallanguagetalker import NaturalLanguageTalker
from llmg.chameleon.NaturalLanguageTalker.OpenaiTalker import OpenaiTalker

In [None]:
# Load pre-collected data
# Each entry is a run directory under $DATA_DIR/chameleon (named by a timestamp).
data_dirs = [
    os.path.join(os.environ["DATA_DIR"], "chameleon", "2025-08-13_20-15"),
]

# NOTE(review): layer_to_probe / token_to_probe are passed as None — presumably
# no probing activations are needed for this analysis; confirm in load_game_logs.
# NOTE(review): max_games=100 presumably caps how many games are read — confirm.
game_logs = load_game_logs(
    data_dirs,
    layer_to_probe=None,
    token_to_probe=None,
    max_games=100,
    verbose=2,
)
# Aggregate the per-game logs. Later cells index the result by
# (chameleon type, non-chameleon type) and read count fields such as
# 'num_of_trials' and 'num_of_valid_trials' from each entry.
all_games = construct_result_dict(game_logs)

## Manual analysis

In [None]:
# Check failed games
# Show only the first game whose result is "Fail": its explanation, the
# chameleon's response, the candidate words, and then every message content of
# every conversation stored in the log, followed by a blank separator.
first_failed_game = next(
    (game for game in game_logs if game["game_result"] == "Fail"),
    None,
)
if first_failed_game is not None:
    for field in ("explanation", "chameleon_response", "possible_words"):
        print(first_failed_game[field])
    for message in itertools.chain.from_iterable(first_failed_game["messages"]):
        print(message["content"])
    print("\n")

In [None]:
# Responses
# Print a three-line summary for every loaded game: the outcome, the chameleon's
# player number (stored index + 1), the voted chameleon, the secret word, the
# response words, the votes, and the list of candidate words.
# NOTE(review): "voted_chameleon" is printed as stored (no +1 offset) — confirm it
# uses the same numbering as the displayed chameleon number.
for g in game_logs:
    print(f'{g["game_result"]}  |  Ch: {str(g["chameleon_index"]+1)}  |  VCh: {str(g["voted_chameleon"])}')
    print(f'  secret: {g["secret_word"]}\n  responses: {g["word_responses"]}\n  votes: {g["votes"]}')
    print("  possible: " + ", ".join(g["possible_words"]))

In [None]:
# Messages
# Print the message contents of the conversation at index 2 of the first game only.
# NOTE(review): which player/stage index 2 corresponds to is not visible here — confirm.
for first_game in itertools.islice(game_logs, 1):
    conversation = first_game["messages"][2]
    for message in conversation:
        print(message["content"])

## Basic gameplay statistics

In [None]:
# Prepare data for plotting
# Keys of all_games are (chameleon type, non-chameleon type) matchups.
chameleon_types = sorted({matchup[0] for matchup in all_games.keys()})
truthful_types = sorted({matchup[1] for matchup in all_games.keys()})


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    A matchup with no valid trials (or no identified chameleon) has an
    undefined ratio; NaN keeps the cell empty in the heatmap instead of
    raising ZeroDivisionError.
    """
    return numerator / denominator if denominator else np.nan


# Construct tables for plotting.
# Cells start as NaN so that matchups which were never played stay blank
# rather than being reported as a ratio of 0.
table_shape = (len(chameleon_types), len(truthful_types))
valid_table = np.full(table_shape, np.nan)
win_table = np.full(table_shape, np.nan)
identification_table = np.full(table_shape, np.nan)
second_stage_win_table = np.full(table_shape, np.nan)
for i, chameleon_type in enumerate(chameleon_types):
    # Rows are flipped: the first (sorted) chameleon type ends up in the last row.
    row = len(chameleon_types) - 1 - i
    for j, truthful_type in enumerate(truthful_types):
        matchup = (chameleon_type, truthful_type)
        if matchup not in all_games:
            continue  # this combination was never played
        games = all_games[matchup]
        valid_table[row][j] = _safe_ratio(games['num_of_valid_trials'], games['num_of_trials'])
        win_table[row][j] = _safe_ratio(games['num_of_chameleon_loses'], games['num_of_valid_trials'])
        identification_table[row][j] = _safe_ratio(games['num_of_chameleon_identified'], games['num_of_valid_trials'])
        # Share of identified chameleons that did NOT end up losing.
        second_stage_win_table[row][j] = 1 - _safe_ratio(games['num_of_chameleon_loses'], games['num_of_chameleon_identified'])

# Round to 2 decimal places
valid_table = np.round(valid_table, 2)
win_table = np.round(win_table, 2)
identification_table = np.round(identification_table, 2)
second_stage_win_table = np.round(second_stage_win_table, 2)

# Set labels for plotting
# The short labels are hand-written and must list the types in sorted order.
# If their count does not match the data, fall back to the full type names so
# the heatmap ticks cannot silently mislabel rows/columns.
chameleon_types_short = ['Qwen3 32B AWQ']
truthful_types_short = ['Qwen3 32B AWQ']
if len(chameleon_types_short) != len(chameleon_types):
    chameleon_types_short = [str(type_name) for type_name in chameleon_types]
if len(truthful_types_short) != len(truthful_types):
    truthful_types_short = [str(type_name) for type_name in truthful_types]
# Reverse the row labels to match the flipped row order of the tables.
chameleon_types_short = chameleon_types_short[::-1]

In [None]:
# Plot
sns.set_theme()
fontsize = 20


def plot_ratio_heatmap(table, title, yticklabels, xticklabels, annot_size):
    """Draw one annotated 4x4-inch heatmap of a ratio table and show it.

    Args:
        table: 2D array of ratios; rows are chameleon types, columns are
            non-chameleon types.
        title: Title placed above the heatmap.
        yticklabels: Row labels (chameleon types).
        xticklabels: Column labels (non-chameleon types).
        annot_size: Font size of the per-cell annotations.

    Returns:
        The matplotlib Axes the heatmap was drawn on.
    """
    plt.figure(figsize=(4, 4))
    heatmap_ax = sns.heatmap(
        table,
        annot=True,
        yticklabels=yticklabels,
        xticklabels=xticklabels,
        vmin=0,  # fixed colour range so all four heatmaps are comparable
        vmax=1,
        square=True,
        annot_kws={"size": annot_size},
    )
    heatmap_ax.set_title(title)
    heatmap_ax.set(ylabel="Chameleon type", xlabel="Non-chameleon type")
    plt.subplots_adjust(left=0, right=1, top=0.9, bottom=0.4)
    plt.show()
    return heatmap_ax


# One heatmap per statistic, in the same order as before:
# valid games, non-chameleon wins, chameleon identification, second chance wins.
for table, title in (
    (valid_table, "Valid Games Ratio"),
    (win_table, "Non-Chameleon Win Ratio"),
    (identification_table, "Identification Ratio"),
    (second_stage_win_table, "Second Chance Win Ratio"),
):
    ax = plot_ratio_heatmap(
        table,
        title,
        yticklabels=chameleon_types_short,
        xticklabels=truthful_types_short,
        annot_size=fontsize,
    )

## Evaluator LLM's accuracy in guessing the secret word

In [None]:
# Global config
# Notebook-wide settings: the RNG seed, the torch device (CUDA when available,
# otherwise CPU), and a minute-resolution timestamp used to tag saved files.
cfg = dict(
    seed=0,
    device="cpu" if not torch.cuda.is_available() else "cuda",
    run_time=datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"),
)

In [None]:
# Evaluator model selection. Both lookup tables below are indexed by this name.
model_name = "GPT-4.1"  # Change this to your model of choice !

cfg["eval"] = {
    ### General configuration
    "include_chameleon_response": False,
    "remap_player_ids_to_be_increasing": True,
    "free_generation": True,
    "collect_logprobs_of_all_secret_words": False,
    "collect_logprobs_of_capitalized_secret_words": False,
    "all_other_player_idxs": None, # Set later

    ### Evaluation talker configuration
    "cls": OpenaiTalker,
    "kwargs": {
        # Human-readable model name -> pinned OpenAI model snapshot
        "model_id": {
            "GPT-4o mini": "gpt-4o-mini-2024-07-18",
            "GPT-4o": "gpt-4o-2024-08-06",
            "GPT-4.1": "gpt-4.1-2025-04-14",
            "GPT-5": "gpt-5-2025-08-07",
        }[model_name],
        "api_key": os.environ["OPENAI_API_KEY"],
        "start_conversation": False,
        # Per-model generation settings; only the selected model's entry is kept
        "additional_generation_kwargs": {
            "GPT-4o mini": {
                "max_output_tokens": 20,
                "temperature": 0,
            },
            "GPT-4o": {
                "max_output_tokens": 20,
                "temperature": 0,
            },
            "GPT-4.1": {
                "max_output_tokens": 20,
                "temperature": 0,
            },
            "GPT-5": {
                "max_output_tokens": 5000,
                "temperature": 1,
                "reasoning": {"effort": "low"},
            },
        }[model_name],
    },
}

In [None]:
# Set the player indices (0-based indexing) - Corresponds to sets of player indices for whose response words to collect secret word predictions
player_set_idxs = [
    (0,),
    (1,),
    (2,),
    (0, 1),
    (0, 2),
    (1, 2),
    (0, 1, 2),
]
# For every set with more than one player, additionally add ONE random
# re-ordering of it (not all permutations), so each such set is evaluated both
# in its listed order and in a shuffled order.
# NOTE(review): seed_all presumably seeds Python's `random`, which makes the
# sampled orderings reproducible — confirm in llmg.utils.mix.
seed_all(cfg["seed"])
num_orig_player_sets = len(player_set_idxs)
cfg["eval"]["all_other_player_idxs"] = []
for player_set_i in range(num_orig_player_sets):
    player_set = player_set_idxs[player_set_i]
    # The listed order always comes first, immediately followed by its shuffled variant (if any).
    cfg["eval"]["all_other_player_idxs"].append(player_set)

    if len(player_set) > 1:
        # Add a random permutation of the player set
        # Re-sample until the ordering differs from the listed one.
        while (player_set_permutation := tuple(random.sample(player_set, len(player_set)))) == player_set:
            continue
        cfg["eval"]["all_other_player_idxs"].append(player_set_permutation)

print(f"Generated {len(cfg['eval']['all_other_player_idxs'])} player set permutations from {num_orig_player_sets} original player sets.")

In [None]:
# Initialize the evaluation talker
# Re-seed before construction (presumably so any randomness inside the talker is reproducible — confirm).
seed_all(cfg["seed"])
# Instantiate the class stored in cfg["eval"]["cls"] with the keyword arguments from cfg["eval"]["kwargs"].
evaluator = cfg["eval"]["cls"](**cfg["eval"]["kwargs"])

In [None]:
# Collect the evaluator's guesses about the secret word, one list of evaluated
# games per entry of cfg["eval"]["all_other_player_idxs"].
seed_all(cfg["seed"])
llm_preds_at_bpoints = collect_llm_preds_at_breakpoints(
    game_logs=game_logs,
    model=evaluator,
    tokenizer=None,
    token_idx=0, # not used
    layer_idx=0, # not used
    generation_kwargs=None, # not used
    verbose=True,
    all_other_player_idxs=cfg["eval"]["all_other_player_idxs"],
    remap_player_ids_to_be_increasing=cfg["eval"]["remap_player_ids_to_be_increasing"], # When all_other_player_idxs contain decreasing sequences, remap the player IDs in the prompt to be increasing
    include_chameleon_response=cfg["eval"]["include_chameleon_response"],
    free_generation=cfg["eval"]["free_generation"],
    collect_logprobs_of_all_secret_words=cfg["eval"]["collect_logprobs_of_all_secret_words"],
    collect_logprobs_of_capitalized_secret_words=cfg["eval"]["collect_logprobs_of_capitalized_secret_words"]
)

# Save the collected evaluators' predictions
# The live config holds the OpenAI API key. Never write that secret to disk:
# pickle a deep copy with the key redacted and leave the in-memory cfg untouched.
# Anyone re-creating the talker from a saved config must supply the key again
# (e.g. from the OPENAI_API_KEY environment variable).
cfg_to_save = deepcopy(cfg)
eval_kwargs_to_save = cfg_to_save.get("eval", {}).get("kwargs", {})
if "api_key" in eval_kwargs_to_save:
    eval_kwargs_to_save["api_key"] = "<redacted>"

save_to = os.path.join(os.environ["DATA_DIR"], "chameleon", f"evaluation_{cfg['run_time']}.pkl")
# Make sure the target directory exists so a long collection run is not lost at save time.
os.makedirs(os.path.dirname(save_to), exist_ok=True)
with open(save_to, "wb") as f:
    pickle.dump({
        "config": cfg_to_save,
        "predictions": llm_preds_at_bpoints
    }, f)
print(f"Saved predictions to\n{save_to}")

In [None]:
# Load pre-collected evaluators' predictions
# NOTE: this replaces both `cfg` and `llm_preds_at_bpoints` with the contents of
# the file, discarding whatever the cells above produced in this session.
load_from = os.path.join(os.environ["DATA_DIR"], "chameleon", f"evaluation_2025-08-16_13-04.pkl")
# pickle.load can execute arbitrary code — only load files written by this project.
with open(load_from, "rb") as f:
    saved_data = pickle.load(f)
cfg = saved_data["config"]
llm_preds_at_bpoints = saved_data["predictions"]
print(f"Loaded predictions from\n{load_from}")

In [None]:
# Collect evaluator's prediction and GT secret words
# Both dicts are keyed by the player-ID tuple (0-based) and hold one entry per
# evaluated game, in matching order.
# NOTE(review): ground truths are looked up by position in `game_logs`, so the
# loaded logs presumably must be the same (and in the same order) as when the
# predictions were collected — confirm.
preds, gts = {}, {}

for player_id_set, evaluated_games in zip(cfg["eval"]["all_other_player_idxs"], llm_preds_at_bpoints):
    # Normalise to a tuple so it can be compared and used as a dict key
    player_id_set = tuple(player_id_set)

    for eval_game in evaluated_games:
        # Each evaluated game must belong to the ordering it is filed under
        assert eval_game["other_player_idxs"] == player_id_set, \
            f"Expected {player_id_set}, but got {eval_game['other_player_idxs']}"
        preds.setdefault(player_id_set, []).append(eval_game["response"])
        source_game = game_logs[eval_game["game_index"]]
        gts.setdefault(player_id_set, []).append(source_game["secret_word"])

In [None]:
# Plot the accuracy of the evaluator guessing the correct secret word

# Config
fontsize = 22
add_annots = False
save_to = None
# save_to = "evaluator_accuracy_original_vs_permuted.pdf"


# GROUP RESULTS BY LENGTH AND ORDER TYPE (ORIGINAL VS. PERMUTED)
results_original = collections.defaultdict(list)
results_permuted = collections.defaultdict(list)
for player_id_set, predictions in preds.items():
    num_words = len(player_id_set)
    ground_truths = gts[player_id_set]

    # A set is in its original order when its player IDs are increasing, e.g.
    # (0, 1, 2) but also (1,), (0, 2) or (1, 2). Comparing against
    # range(num_words) would wrongly count those subsets as "permuted".
    is_original_order = (tuple(player_id_set) == tuple(sorted(player_id_set)))

    # Check correctness for each prediction in this group
    # NOTE(review): the match is case-insensitive but otherwise exact; surrounding
    # whitespace or punctuation in a free-form response would count as wrong — confirm
    # that responses are already normalised upstream.
    for pred, gt in zip(predictions, ground_truths):
        is_correct = gt.lower() == pred.lower()
        if is_original_order:
            results_original[num_words].append(is_correct)
        else:
            results_permuted[num_words].append(is_correct)

# CALCULATE ACCURACY FOR EACH GROUP
# --- Original Order ---
sorted_lengths_orig = sorted(results_original.keys())
accuracies_orig = [np.mean(results_original[length]) for length in sorted_lengths_orig]

# --- Permuted Orders ---
sorted_lengths_perm = sorted(results_permuted.keys())
accuracies_perm = [np.mean(results_permuted[length]) for length in sorted_lengths_perm]

# PLOT THE ACCURACY TRENDS
sns.set_theme(style="whitegrid", context="talk")
plt.figure(figsize=(8, 6))

# Plot the line for the original order
ax = sns.lineplot(
    x=sorted_lengths_orig,
    y=accuracies_orig,
    marker='o',
    markersize=10,
    linewidth=3,
    color='royalblue',
    label='Original order'
)

if add_annots:
    # Add text annotations for the original order line
    for x, y in zip(sorted_lengths_orig, accuracies_orig):
        ax.text(x, y + 0.03, f"{y:.1%}", ha='center', fontsize=fontsize, fontweight='bold', color='royalblue')

# Plot the line for the permuted orders
if accuracies_perm:
    sns.lineplot(
        x=sorted_lengths_perm,
        y=accuracies_perm,
        marker='s',  # Use a square marker to differentiate
        markersize=10,
        linewidth=3,
        color='orangered',
        label='Permuted order'
    )
    if add_annots:
        # Add text annotations for the permuted order line
        for x, y in zip(sorted_lengths_perm, accuracies_perm):
            ax.text(x, y - 0.08, f"{y:.1%}", ha='center', fontsize=fontsize, fontweight='bold', color='orangered')

# --- Make it nicer: Add titles, labels, and fine-tune aesthetics ---
ax.set_xlabel("Number of response words", fontsize=fontsize, labelpad=15)
ax.set_ylabel("Accuracy", fontsize=fontsize, labelpad=15)

# Set axis limits and ticks to encompass all data
all_lengths = sorted(list(set(sorted_lengths_orig + sorted_lengths_perm)))
# Keep the usual 0.5 floor, but lower it when an accuracy falls below 0.5 so
# that no data point is clipped out of the figure.
lowest_accuracy = min(accuracies_orig + accuracies_perm, default=1.0)
y_lower = 0.5 if lowest_accuracy >= 0.5 else max(0.0, lowest_accuracy - 0.05)
ax.set_ylim(y_lower, 1.01)
if all_lengths:
    ax.set_xticks(all_lengths)
ax.tick_params(axis='both', which='major', labelsize=fontsize)

# Add a subtle grid and a legend
ax.grid(True, which='major', linestyle='--', linewidth=0.6, alpha=0.6)
ax.legend(fontsize=fontsize, loc='lower right')

plt.tight_layout()
if save_to is not None:
    plt.savefig(save_to)
    print(f"Saved figure to {save_to}")
plt.show()