In [1]:
import os
import dill as pickle
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_from_disk
from constants import DATA_PATH

In [None]:
# All tested models (these will be loaded from data/)
# NOTE: This list is for reference and plotting order only
# Only models with both .pkl files and dataset directories will actually load
tested_models = [
    "claude-haiku-4.5",
    "deepseek-v3.2",
    "gemini-3-flash-preview",
    "gpt-5.1",
    "grok-4.1-fast",
    "kimi-k2-thinking",
    "ministral-14b-2512",
    "qwen3-vl-235b-a22b-thinking",
    "trinity-mini",
]

# Display names for plots (optional - can be customized)
display_names = {
    "claude-haiku-4.5": "Claude Haiku 4.5",
    "deepseek-v3.2": "DeepSeek V3.2",
    "gemini-3-flash-preview": "Gemini 3 Flash",
    "gpt-5.1": "GPT-5.1",
    "grok-4.1-fast": "Grok 4.1 Fast",
    "kimi-k2-thinking": "Kimi K2 Thinking",
    "ministral-14b-2512": "Ministral 14B",
    "qwen3-vl-235b-a22b-thinking": "Qwen3 VL 235B",
    "trinity-mini": "Trinity Mini"
}

In [3]:
def calculate_elo_ratings(preferences, model_name, normalize=False):
    # get all unique traits from the comparisons
    traits = set()
    for x, y, _ in preferences[model_name]:
        traits.add(x)
        traits.add(y)

    # initialize elo ratings (starting at 1000)
    elo_ratings = {trait: 1000.0 for trait in traits}
    
    # TODO: update k-factor for elo calculation
    # TODO: Adapt k to lower value in the trait:babble case
    K = 32

    # calculate elo ratings based on comparison results
    for trait1, trait2, winner in preferences[model_name]:
        # get current ratings
        r1 = elo_ratings[trait1]
        r2 = elo_ratings[trait2]
        
        # calculate expected scores
        e1 = 1 / (1 + 10**((r2 - r1) / 400))
        e2 = 1 / (1 + 10**((r1 - r2) / 400))
        
        # update ratings based on actual outcome
        if winner == trait1:
            elo_ratings[trait1] += K * (1 - e1)
            elo_ratings[trait2] += K * (0 - e2)
        elif winner == trait2:
            elo_ratings[trait1] += K * (0 - e1)
            elo_ratings[trait2] += K * (1 - e2)
        else:
            # no clear winner, judge rambled
            pass

    # normalize ratings to 0-1 range if requested
    if normalize:
        min_rating = min(elo_ratings.values())
        max_rating = max(elo_ratings.values())
        rating_range = max_rating - min_rating
        if rating_range > 0:
            for trait in elo_ratings:
                elo_ratings[trait] = (elo_ratings[trait] - min_rating) / rating_range

    # sort ratings in descending order
    for k, v in elo_ratings.items():
        elo_ratings[k] = round(v, 2)
    sorted_ratings = sorted(elo_ratings.items(), key=lambda x: x[1], reverse=True)
    return sorted_ratings

In [4]:
# Load preferences from pkl files (judge results)
# Filter out empty responses ("") where judge failed to determine winner
# Clean structure: pkl files and dataset directories are side-by-side in data/preferences/
preferences_path = f"{DATA_PATH}/preferences"

files = [f for f in os.listdir(preferences_path) if f.endswith(".pkl")]
preferences = {}

for file in files:
    name = file.split(".pkl")[0]
    pkl_path = f"{preferences_path}/{file}"
    dataset_path = f"{preferences_path}/{name}"
    
    # Check if matching dataset directory exists
    if not os.path.isdir(dataset_path):
        print(f"Warning: No dataset directory found for {name}, skipping...")
        continue
    
    try:
        with open(pkl_path, "rb") as f:
            data = load_from_disk(dataset_path)
            winners = pickle.load(f)
            # Filter out empty judge responses and cases where winner is not one of the traits
            preferences[name] = [(t1, t2, winner) for t1, t2, winner in zip(data["trait_1"], data["trait_2"], winners) 
                                if winner and winner != "" and winner in [t1, t2]]
        print(f"✓ Loaded {name}: {len(preferences[name])} valid comparisons")
    except Exception as e:
        print(f"✗ Error loading {name}: {e}")

# Get list of models from loaded data
model_names = sorted(preferences.keys())
print(f"\n{'='*60}")
print(f"Successfully loaded {len(model_names)} models")
print(f"{'='*60}")

# Calculate Elo ratings for all models
results = {}
for model in model_names:
    sorted_ratings = calculate_elo_ratings(preferences, model, False)
    results[model] = sorted_ratings

✓ Loaded ministral-14b-2512: 10029 valid comparisons
✓ Loaded trinity-mini: 9955 valid comparisons
✓ Loaded deepseek-v3.2: 10022 valid comparisons
✓ Loaded gemini-3-flash-preview: 10243 valid comparisons
✓ Loaded qwen3-vl-235b-a22b-thinking: 10255 valid comparisons
✓ Loaded kimi-k2-thinking: 10034 valid comparisons
✓ Loaded gemma-3-4b-it: 143 valid comparisons
✓ Loaded gpt-5.1: 10246 valid comparisons
✓ Loaded claude-haiku-4.5: 10247 valid comparisons
✓ Loaded grok-4.1-fast: 10055 valid comparisons
✓ Loaded llama-3.1-8b: 10112 valid comparisons

Successfully loaded 11 models


In [5]:
# DEBUG: Investigate trait counts per model
print("Trait counts per model:")
print("=" * 60)
for model in model_names:
    num_traits = len(results[model])
    print(f"{model:40s}: {num_traits} traits")

# Find the models with different counts
trait_counts = {model: len(results[model]) for model in model_names}
unique_counts = set(trait_counts.values())
print(f"\nUnique trait counts: {sorted(unique_counts)}")

# Show which traits are missing from models with fewer traits
if len(unique_counts) > 1:
    max_count = max(unique_counts)
    models_with_max = [m for m, c in trait_counts.items() if c == max_count]
    reference_model = models_with_max[0]
    reference_traits = {trait for trait, _ in results[reference_model]}
    
    print(f"\nReference model ({reference_model}) has {max_count} traits")
    print("\nMissing traits per model:")
    for model in model_names:
        model_traits = {trait for trait, _ in results[model]}
        missing = reference_traits - model_traits
        extra = model_traits - reference_traits
        if missing or extra:
            print(f"\n{model}:")
            if missing:
                print(f"  Missing: {sorted(missing)}")
            if extra:
                print(f"  Extra: {sorted(extra)}")

Trait counts per model:
claude-haiku-4.5                        : 144 traits
deepseek-v3.2                           : 144 traits
gemini-3-flash-preview                  : 144 traits
gemma-3-4b-it                           : 122 traits
gpt-5.1                                 : 144 traits
grok-4.1-fast                           : 144 traits
kimi-k2-thinking                        : 144 traits
llama-3.1-8b                            : 144 traits
ministral-14b-2512                      : 144 traits
qwen3-vl-235b-a22b-thinking             : 144 traits
trinity-mini                            : 144 traits

Unique trait counts: [122, 144]

Reference model (claude-haiku-4.5) has 144 traits

Missing traits per model:

gemma-3-4b-it:
  Missing: ['academic', 'anxious', 'argumentative', 'assertive', 'blunt', 'colloquial', 'creative', 'credulous', 'excitable', 'factual', 'fierce', 'idealistic', 'irreverent', 'loving', 'methodical', 'nuanced', 'progressive', 'prosaic', 'sycophantic', 'technical', 'u

In [6]:
# Show summary of loaded models and their valid comparisons
print(f"\nLoaded Models ({len(model_names)}):")
print("=" * 80)
for model in model_names:
    valid_comparisons = len(preferences[model])
    print(f"{model:45s} - {valid_comparisons:5d} valid trait comparisons")
print("=" * 80)


Loaded Models (11):
claude-haiku-4.5                              - 10247 valid trait comparisons
deepseek-v3.2                                 - 10022 valid trait comparisons
gemini-3-flash-preview                        - 10243 valid trait comparisons
gemma-3-4b-it                                 -   143 valid trait comparisons
gpt-5.1                                       - 10246 valid trait comparisons
grok-4.1-fast                                 - 10055 valid trait comparisons
kimi-k2-thinking                              - 10034 valid trait comparisons
llama-3.1-8b                                  - 10112 valid trait comparisons
ministral-14b-2512                            - 10029 valid trait comparisons
qwen3-vl-235b-a22b-thinking                   - 10255 valid trait comparisons
trinity-mini                                  -  9955 valid trait comparisons


In [7]:
# Display top 50 traits for all models
# Pad shorter lists so they can be displayed in a DataFrame
max_len = max(len(v) for v in results.values())
results_padded = {k: v + [None] * (max_len - len(v)) for k, v in results.items()}
results_df = pd.DataFrame(results_padded)
results_df.head(50)

Unnamed: 0,claude-haiku-4.5,deepseek-v3.2,gemini-3-flash-preview,gemma-3-4b-it,gpt-5.1,grok-4.1-fast,kimi-k2-thinking,llama-3.1-8b,ministral-14b-2512,qwen3-vl-235b-a22b-thinking,trinity-mini
0,"(concrete, 1420.41)","(literal, 1289.51)","(structured, 1556.26)","(declarative, 1071.51)","(structured, 1494.34)","(concrete, 1338.84)","(methodical, 1351.57)","(structured, 1334.62)","(disciplined, 1263.86)","(structured, 1335.22)","(precise, 1364.36)"
1,"(structured, 1408.72)","(structured, 1276.91)","(systematic, 1520.18)","(gentle, 1061.13)","(disciplined, 1445.78)","(pragmatic, 1291.51)","(structured, 1349.27)","(methodical, 1257.69)","(precise, 1262.73)","(precise, 1334.04)","(concrete, 1352.63)"
2,"(grounding, 1378.89)","(systematic, 1265.3)","(scholarly, 1504.46)","(precise, 1059.77)","(methodical, 1436.45)","(rational, 1273.38)","(systematic, 1323.84)","(specialized, 1230.36)","(specialized, 1256.84)","(methodical, 1308.73)","(structured, 1348.41)"
3,"(precise, 1373.07)","(disciplined, 1234.1)","(methodical, 1474.65)","(patient, 1047.3)","(concrete, 1408.74)","(factual, 1263.91)","(disciplined, 1306.7)","(concrete, 1211.54)","(balanced, 1228.65)","(intellectual, 1291.15)","(methodical, 1296.35)"
4,"(practical, 1371.09)","(methodical, 1221.79)","(analytical, 1473.21)","(concrete, 1046.5)","(objective, 1382.89)","(straightforward, 1263.36)","(objective, 1295.28)","(intellectual, 1209.01)","(structured, 1225.15)","(concrete, 1279.02)","(systematic, 1279.38)"
5,"(methodical, 1334.38)","(declarative, 1214.02)","(logical, 1417.45)","(impulsive, 1045.31)","(precise, 1367.73)","(precise, 1262.0)","(holistic, 1291.33)","(traditional, 1198.88)","(systematic, 1220.42)","(academic, 1259.69)","(pragmatic, 1277.61)"
6,"(rational, 1332.24)","(pragmatic, 1193.8)","(academic, 1408.79)","(impatient, 1044.43)","(analytical, 1344.74)","(structured, 1257.57)","(precise, 1289.39)","(detached, 1197.47)","(perfectionist, 1206.55)","(analytical, 1257.94)","(straightforward, 1272.7)"
7,"(systematic, 1325.0)","(concrete, 1190.19)","(technical, 1400.67)","(reflective, 1043.74)","(intellectual, 1340.71)","(logical, 1254.62)","(concrete, 1285.51)","(poetic, 1197.13)","(confident, 1195.86)","(factual, 1247.35)","(literal, 1268.44)"
8,"(cooperative, 1310.57)","(academic, 1188.15)","(elaborate, 1395.0)","(futuristic, 1032.0)","(factual, 1335.57)","(analytical, 1228.8)","(nuanced, 1271.18)","(precise, 1184.62)","(concrete, 1189.38)","(systematic, 1246.5)","(objective, 1267.87)"
9,"(balanced, 1305.31)","(flexible, 1185.19)","(intellectual, 1374.76)","(grounding, 1032.0)","(systematic, 1331.94)","(contemporary, 1224.05)","(calm, 1258.18)","(objective, 1182.44)","(objective, 1179.3)","(specialized, 1239.08)","(focused, 1250.06)"


In [8]:
# Calculate pairwise Spearman ranked correlations between all models
from scipy.stats import spearmanr
from itertools import combinations

print("Spearman Ranked Correlations Between All Models:")
print("="*60)

# Generate all unique pairs of models
model_pairs = list(combinations(model_names, 2))
correlations = []

for model1, model2 in model_pairs:
    # Extract rankings (trait names in order)
    model1_traits = [trait for trait, score in results[model1]]
    model2_traits = [trait for trait, score in results[model2]]
    
    # Create rank mappings
    model1_ranks = {trait: rank for rank, trait in enumerate(model1_traits)}
    model2_ranks = {trait: rank for rank, trait in enumerate(model2_traits)}
    
    # Get common traits and their ranks
    common_traits = set(model1_ranks.keys()) & set(model2_ranks.keys())
    model1_rank_values = [model1_ranks[trait] for trait in common_traits]
    model2_rank_values = [model2_ranks[trait] for trait in common_traits]
    
    # Calculate Spearman correlation
    if len(common_traits) > 0:
        correlation, p_value = spearmanr(model1_rank_values, model2_rank_values)
        correlations.append(correlation)
        print(f"{model1:40s} vs {model2:40s}: ρ = {correlation:7.4f} (p = {p_value:.4e})")
    else:
        print(f"{model1:40s} vs {model2:40s}: No common traits")

if correlations:
    print("="*60)
    print(f"Average Correlation: ρ = {sum(correlations) / len(correlations):.4f}")
    print(f"Min Correlation:     ρ = {min(correlations):.4f}")
    print(f"Max Correlation:     ρ = {max(correlations):.4f}")

Spearman Ranked Correlations Between All Models:
claude-haiku-4.5                         vs deepseek-v3.2                           : ρ =  0.8612 (p = 1.4029e-43)
claude-haiku-4.5                         vs gemini-3-flash-preview                  : ρ =  0.7514 (p = 2.0483e-27)
claude-haiku-4.5                         vs gemma-3-4b-it                           : ρ =  0.1236 (p = 1.7514e-01)
claude-haiku-4.5                         vs gpt-5.1                                 : ρ =  0.9172 (p = 1.3174e-58)
claude-haiku-4.5                         vs grok-4.1-fast                           : ρ =  0.8010 (p = 1.9261e-33)
claude-haiku-4.5                         vs kimi-k2-thinking                        : ρ =  0.8661 (p = 1.3579e-44)
claude-haiku-4.5                         vs llama-3.1-8b                            : ρ =  0.7184 (p = 3.8021e-24)
claude-haiku-4.5                         vs ministral-14b-2512                      : ρ =  0.6982 (p = 2.3496e-22)
claude-haiku-4.5               

In [9]:
# Create distribution plots for ALL 9 tested models
# Arrange in 3 rows of 3 models each
models_to_plot = [m for m in tested_models if m in model_names]

if len(models_to_plot) > 0:
    # Calculate grid dimensions (3 columns, enough rows)
    n_cols = 3
    n_rows = (len(models_to_plot) + n_cols - 1) // n_cols
    
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), sharey=True)
    
    # Flatten axes array for easier indexing
    if n_rows == 1:
        axes = axes.reshape(1, -1)
    axes_flat = axes.flatten()
    
    # Determine global x-axis limits
    all_scores = []
    for model in models_to_plot:
        all_scores.extend([score for trait, score in results[model]])
    x_min, x_max = min(all_scores)-100, max(all_scores)+100
    
    # Plot each model
    for i, model in enumerate(models_to_plot):
        ax = axes_flat[i]
        
        # Extract scores
        scores = [score for trait, score in results[model]]
        
        # Plot histogram with density
        ax.hist(scores, bins=20, alpha=0.7, color='steelblue', edgecolor='black', density=True)
        
        # Use display name if available, otherwise format the model name
        model_label = display_names.get(model, model.replace('-', ' ').title())
        ax.set_title(model_label, fontsize=16)
        ax.set_xlim(x_min, x_max)
        ax.tick_params(axis='both', labelsize=14, width=1.2, colors='black')
        
        ax.grid(axis='y', alpha=0.3)
        
        # Remove top and right spines
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)
        
        # Make remaining spines thicker and darker
        ax.spines['bottom'].set_linewidth(1.5)
        ax.spines['left'].set_linewidth(1.5)
        ax.spines['bottom'].set_color('black')
        ax.spines['left'].set_color('black')
    
    # Hide empty subplots if we have fewer models than grid spaces
    for i in range(len(models_to_plot), len(axes_flat)):
        axes_flat[i].set_visible(False)
    
    # Set y-label on leftmost subplots of each row
    for row in range(n_rows):
        axes[row, 0].set_ylabel('Density', fontsize=14, weight='bold')
    
    # Set a single x-axis label centered across all subplots
    fig.text(0.5, 0.02, 'Character Trait Elo Score', ha='center', fontsize=16, weight='bold')
    
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.12)
    # Save figure to file
    import os
    os.makedirs('./results', exist_ok=True)
    plt.savefig('./results/elo_distributions.png', dpi=300, bbox_inches='tight')
    print("✅ Saved plot to ./results/elo_distributions.png")
    plt.close()
else:
    print("No models to plot")

✅ Saved plot to ./results/elo_distributions.png
