In [5]:
import os
import dill as pickle
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_from_disk
from personality.constants import DATA_PATH

In [8]:
# Load the base comparison dataset, then every pickled list of winners that
# belongs to this model family.
inpath = f"{DATA_PATH}/preferences/llama-3.1-8b-it"
data = load_from_disk(inpath)

# Keep directory entries whose name mentions the base model. The dataset entry
# loaded above matches this filter as well; it is skipped below because it is
# not a .pkl file.
files = [entry for entry in os.listdir(f"{DATA_PATH}/preferences") if "llama-3.1-8b-it" in entry]

# preferences maps a pickle's base name to a list of (trait_1, trait_2, winner) triples.
preferences = {}
for file in files:
    if file.endswith(".pkl"):
        # NOTE(review): unpickling executes arbitrary code; only load trusted files.
        with open(f"{DATA_PATH}/preferences/{file}", "rb") as f:
            name = file.split(".pkl")[0]
            winners = pickle.load(f)
            # zip stops at the shortest input, so winners are paired positionally
            # with the dataset rows.
            preferences[name] = list(zip(data["trait_1"], data["trait_2"], winners))

In [9]:
def calculate_elo_ratings(model_name, k=32, initial_rating=1000):
    """Compute Elo ratings for every trait in one model's pairwise comparisons.

    Replays the ``(trait_1, trait_2, winner)`` triples stored under
    ``preferences[model_name]`` in order, applying a standard Elo update
    after each comparison.

    Args:
        model_name: Key into the module-level ``preferences`` dict.
        k: Elo K-factor, i.e. the maximum rating change per comparison.
        initial_rating: Rating every trait starts from.

    Returns:
        A list of ``(trait, rating)`` tuples with ratings rounded to two
        decimals, sorted by rating in descending order. Traits with equal
        ratings keep the order in which they first appear in the comparisons,
        so the result is reproducible across runs.

    Raises:
        KeyError: If ``model_name`` is not a key of ``preferences``.
    """
    comparisons = preferences[model_name]

    # Seed ratings in first-appearance order. A dict (unlike a set) iterates in
    # insertion order, which keeps the ordering of tied ratings deterministic.
    elo_ratings = {}
    for trait1, trait2, _ in comparisons:
        elo_ratings.setdefault(trait1, initial_rating)
        elo_ratings.setdefault(trait2, initial_rating)

    for trait1, trait2, winner in comparisons:
        # current ratings
        r1 = elo_ratings[trait1]
        r2 = elo_ratings[trait2]

        # expected scores under the Elo logistic model (400-point scale)
        e1 = 1 / (1 + 10 ** ((r2 - r1) / 400))
        e2 = 1 / (1 + 10 ** ((r1 - r2) / 400))

        # Any winner value other than trait1 is scored as a win for trait2.
        s1 = 1 if winner == trait1 else 0
        elo_ratings[trait1] += k * (s1 - e1)
        elo_ratings[trait2] += k * ((1 - s1) - e2)

    # Round into a fresh dict rather than rewriting the one being iterated.
    rounded = {trait: round(rating, 2) for trait, rating in elo_ratings.items()}
    # sorted() is stable, so ties keep first-appearance order.
    return sorted(rounded.items(), key=lambda item: item[1], reverse=True)

In [None]:
# One column per model. Each cell holds a (trait, rating) tuple, and every
# column is ordered by that model's own ranking, so a given row index does not
# refer to the same trait across columns.
model_names = [
    "llama-3.1-8b-it",
    "llama-3.1-8b-it-goodness",
    "llama-3.1-8b-it-loving",
]
results = pd.DataFrame()
for model in model_names:
    sorted_ratings = calculate_elo_ratings(model)
    results[model] = sorted_ratings

In [None]:
# Compare the base model's Elo scores with the "goodness" variant, trait by trait.

# Each entry of a `results` column is a (trait, score) tuple, so the variant
# column converts directly into a trait -> score lookup table. This replaces a
# full scan of the column for every trait.
goodness_scores = dict(results["llama-3.1-8b-it-goodness"].tolist())

# Collect the rows first and build the frame once; growing a DataFrame with
# .loc[len(df)] re-allocates on every insert.
goodness_rows = []
for trait, score in results["llama-3.1-8b-it"].tolist():
    new_score = goodness_scores[trait]  # KeyError if the variant has no rating for this trait
    diff = score - new_score  # positive: the trait rates lower in the variant
    goodness_rows.append([trait, abs(diff), diff, score, new_score])
goodness = pd.DataFrame(
    goodness_rows,
    columns=["trait", "abs_diff", "diff", "old_score", "new_score"],
)

# Sort once: head = largest drops from base to variant, tail = largest gains.
goodness_by_diff = goodness.sort_values(by="diff", ascending=False)
display(goodness_by_diff.head(10))
display(goodness_by_diff.tail(10))

# Overlaid histograms of the two score distributions. Labels are attached to
# each histogram so the legend does not depend on call order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(goodness["old_score"].tolist(), bins=25, color="blue", alpha=0.5, label="Old Score")
ax.hist(goodness["new_score"].tolist(), bins=25, color="red", alpha=0.5, label="New Score")
ax.set_xlabel("Elo Score")
ax.set_ylabel("Count")
ax.set_title("Elo Scores for Goodness")
ax.legend()
plt.show()

In [None]:
# Compare the base model's Elo scores with the "loving" variant, trait by trait.

# Each entry of a `results` column is a (trait, score) tuple, so the variant
# column converts directly into a trait -> score lookup table. This replaces a
# full scan of the column for every trait.
love_scores = dict(results["llama-3.1-8b-it-loving"].tolist())

# Collect the rows first and build the frame once; growing a DataFrame with
# .loc[len(df)] re-allocates on every insert.
love_rows = []
for trait, score in results["llama-3.1-8b-it"].tolist():
    new_score = love_scores[trait]  # KeyError if the variant has no rating for this trait
    diff = score - new_score  # positive: the trait rates lower in the variant
    love_rows.append([trait, abs(diff), diff, score, new_score])
love = pd.DataFrame(
    love_rows,
    columns=["trait", "abs_diff", "diff", "old_score", "new_score"],
)

# Sort once: head = largest drops from base to variant, tail = largest gains.
love_by_diff = love.sort_values(by="diff", ascending=False)
display(love_by_diff.head(10))
display(love_by_diff.tail(10))

# Overlaid histograms of the two score distributions. Labels are attached to
# each histogram so the legend does not depend on call order.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(love["old_score"].tolist(), bins=25, color="blue", alpha=0.5, label="Old Score")
ax.hist(love["new_score"].tolist(), bins=25, color="red", alpha=0.5, label="New Score")
ax.set_xlabel("Elo Score")
ax.set_ylabel("Count")
ax.set_title("Elo Scores for Love")
ax.legend()
plt.show()