In [1]:
import os
import dill as pickle
import pandas as pd
from personality.constants import DATA_PATH

In [2]:
# Load every pickled preference file in the preferences directory into a dict
# keyed by the file name with its ".pkl" extension stripped.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
preferences = {}
files = os.listdir(f"{DATA_PATH}/preferences")
for filename in files:
    if file_is_pickle := filename.endswith(".pkl"):
        # key is everything before the first ".pkl" occurrence in the file name
        name = filename.split(".pkl")[0]
        with open(f"{DATA_PATH}/preferences/{filename}", "rb") as handle:
            preferences[name] = pickle.load(handle)

In [3]:
def calculate_elo_ratings(model_name, k=32, initial_rating=1000):
    """Compute Elo ratings for every trait in one model's pairwise comparisons.

    Reads ``preferences[model_name]`` from the module-level ``preferences``
    dict. Each entry must unpack into a ``(trait1, trait2, winner)`` triple.
    Comparisons are replayed in their stored order, and each update uses the
    ratings produced by the previous ones, so the result depends on that order.

    Args:
        model_name: Key into the module-level ``preferences`` dict.
        k: Elo K-factor, i.e. the largest rating change a single comparison
            can cause. Defaults to 32.
        initial_rating: Rating a trait has before its first comparison.
            Defaults to 1000.

    Returns:
        A list of ``(trait, rating)`` tuples with ratings rounded to two
        decimals, sorted by rating in descending order. Traits with equal
        rounded ratings are ordered by trait name so the output is identical
        across kernel restarts (assumes trait names are mutually comparable,
        e.g. all strings).

    Raises:
        KeyError: If ``model_name`` is not a key of ``preferences``.
    """
    comparisons = preferences[model_name]

    # Traits are registered lazily on first sight; a trait's rating stays at
    # ``initial_rating`` until its first comparison, so this is equivalent to
    # initialising every trait up front.
    ratings = {}
    for trait1, trait2, winner in comparisons:
        r1 = ratings.setdefault(trait1, initial_rating)
        r2 = ratings.setdefault(trait2, initial_rating)

        # expected scores under the standard Elo logistic curve (400-point scale)
        e1 = 1 / (1 + 10 ** ((r2 - r1) / 400))
        e2 = 1 / (1 + 10 ** ((r1 - r2) / 400))

        # NOTE(review): any winner value other than trait1 is scored as a win
        # for trait2 (including ties or malformed labels, if the data has any).
        score1 = 1 if winner == trait1 else 0
        score2 = 1 - score1

        ratings[trait1] += k * (score1 - e1)
        ratings[trait2] += k * (score2 - e2)

    rounded = {trait: round(rating, 2) for trait, rating in ratings.items()}

    # Highest rating first; the trait name breaks ties deterministically
    # (set/dict insertion order of strings varies with hash randomisation).
    return sorted(rounded.items(), key=lambda item: (-item[1], item[0]))

In [5]:
# Build one column per model variant. Each cell holds a (trait, rating) tuple
# as returned by calculate_elo_ratings, so row i of a column is that model's
# i-th ranked trait. Every column must have the same number of traits,
# otherwise the DataFrame constructor raises ValueError.
ratings_by_model = {}
for model in ["llama-3.3-70b", "qwen-2.5-72b", "mistral-3.1-24b", "gemma-3-27b", "glm-4-32b"]:
    # `variant` rather than `type`, so the builtin is not shadowed for later cells
    for variant in ["base", "it"]:
        name = f"{model}-{variant}"
        # the llama base preferences are looked up under the 3.1 name
        # (presumably the base checkpoint is Llama 3.1 — TODO confirm)
        if "llama" in model and variant == "base":
            name = name.replace("3.3", "3.1")
        ratings_by_model[name] = calculate_elo_ratings(name)

# construct the frame once instead of growing it column by column
results = pd.DataFrame(ratings_by_model)
results

Unnamed: 0,llama-3.1-70b-base,llama-3.3-70b-it,qwen-2.5-72b-base,qwen-2.5-72b-it,mistral-3.1-24b-base,mistral-3.1-24b-it,gemma-3-27b-base,gemma-3-27b-it,glm-4-32b-base,glm-4-32b-it
0,"(specialized, 1223.24)","(elaborate, 1463.27)","(respectful, 1252.12)","(structured, 1326.01)","(rational, 1192.87)","(precise, 1375.01)","(elaborate, 1226.78)","(systematic, 1314.94)","(structured, 1278.24)","(structured, 1308.13)"
1,"(rational, 1208.9)","(respectful, 1318.03)","(factual, 1211.7)","(methodical, 1325.18)","(logical, 1186.47)","(elaborate, 1337.28)","(flexible, 1190.46)","(serious, 1276.15)","(precise, 1192.75)","(balanced, 1278.17)"
2,"(optimism, 1206.43)","(rational, 1306.29)","(formal, 1208.45)","(rational, 1320.58)","(structured, 1151.76)","(focused, 1324.14)","(optimism, 1186.97)","(rational, 1249.45)","(elaborate, 1177.4)","(rational, 1248.17)"
3,"(supportive, 1205.4)","(precise, 1279.35)","(wisdom, 1198.63)","(nurturing, 1309.83)","(practical, 1148.01)","(direct, 1282.72)","(balanced, 1183.89)","(practical, 1230.64)","(wisdom, 1162.18)","(factual, 1237.05)"
4,"(collaborative, 1192.02)","(balanced, 1260.15)","(diplomatic, 1195.78)","(precise, 1291.42)","(objective, 1139.73)","(practical, 1275.87)","(creative, 1174.58)","(analytical, 1224.87)","(methodical, 1148.05)","(elaborate, 1225.6)"
...,...,...,...,...,...,...,...,...,...,...
102,"(competitive, 800.3)","(fierce, 590.84)","(stoicism, 781.41)","(rebellious, 575.06)","(progressive, 837.05)","(pessimism, 749.24)","(historical, 795.25)","(spontaneous, 808.1)","(urgent, 831.35)","(empathy, 772.44)"
103,"(adventurous, 800.24)","(rebellious, 583.17)","(remorse, 773.51)","(critical, 574.21)","(tactical, 834.98)","(urgent, 731.44)","(irreverent, 794.23)","(remorse, 794.94)","(competitive, 812.85)","(urgent, 767.34)"
104,"(urgent, 788.06)","(challenging, 559.38)","(intense, 771.45)","(competitive, 556.3)","(challenging, 826.34)","(fierce, 725.56)","(challenging, 786.69)","(approximate, 728.08)","(pessimism, 807.13)","(competitive, 766.73)"
105,"(demanding, 787.83)","(demanding, 540.53)","(rebellious, 757.14)","(challenging, 517.11)","(competitive, 804.54)","(challenging, 603.68)","(demanding, 765.27)","(subjective, 717.53)","(challenging, 791.0)","(demanding, 738.91)"


In [6]:
# Persist the ranking table as JSON Lines: with orient="records" and lines=True
# pandas writes one JSON object per DataFrame row, keyed by column (model) name.
elo_output_path = f"{DATA_PATH}/elo_ratings.jsonl"
results.to_json(elo_output_path, lines=True, orient="records")