In [2]:
import os
import dill as pickle
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_from_disk
from personality.constants import DATA_PATH

In [61]:
# load data
#
# For every "<name>.pkl" entry under {DATA_PATH}/preferences that mentions the
# llama-3.1-8b-it model, two artifacts are read:
#   <name>      - a dataset read with `load_from_disk`, exposing the
#                 `trait_1` / `trait_2` columns
#   <name>.pkl  - a pickled sequence of winners, zipped row-by-row with the dataset
# `preferences[name]` keeps a (trait_1, trait_2, winner) triple only when the
# winner is one of the two traits that were offered.

preferences_dir = os.path.join(DATA_PATH, "preferences")
# os.listdir returns entries in arbitrary order; sorting keeps the key order of
# `preferences` reproducible across runs.
files = sorted(
    entry for entry in os.listdir(preferences_dir)
    if "llama-3.1-8b-it" in entry and entry.endswith(".pkl")
)
preferences = {}
for file in files:
    # strip only the trailing extension (str.split would cut at the first ".pkl")
    name = os.path.splitext(file)[0]
    inpath = os.path.join(preferences_dir, name)
    data = load_from_disk(inpath)
    # NOTE(security): unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(preferences_dir, file), "rb") as f:
        winners = pickle.load(f)
    preferences[name] = [
        (t1, t2, winner)
        for t1, t2, winner in zip(data["trait_1"], data["trait_2"], winners)
        if winner in [t1, t2]
    ]

In [62]:
def calculate_elo_ratings(model_name, normalize=False, k=32, initial_rating=1000.0, comparisons=None):
    """Compute Elo ratings for traits from a sequence of pairwise comparisons.

    Args:
        model_name: key into the notebook-level `preferences` dict; only used
            when `comparisons` is not supplied.
        normalize: if True, min-max rescale the final ratings to the 0-1 range
            (skipped when all ratings are equal).
        k: Elo K-factor, i.e. the maximum rating change per comparison.
        initial_rating: rating every trait starts from.
        comparisons: optional iterable of (trait_1, trait_2, winner) triples.
            Defaults to `preferences[model_name]`.

    Returns:
        A list of (trait, rating) tuples, ratings rounded to 2 decimals and
        sorted in descending order. Ties keep first-seen trait order.

    Raises:
        KeyError: if `comparisons` is None and `model_name` is not in `preferences`.
    """
    if comparisons is None:
        comparisons = preferences[model_name]
    # materialise once so one-shot iterables survive the two passes below
    comparisons = list(comparisons)

    # initialise ratings in first-seen order; a plain dict (unlike a set) keeps
    # the order deterministic, so tied traits sort the same way on every run
    elo_ratings = {}
    for trait1, trait2, _ in comparisons:
        elo_ratings.setdefault(trait1, float(initial_rating))
        elo_ratings.setdefault(trait2, float(initial_rating))

    # replay the comparisons in order; Elo updates are order-dependent
    for trait1, trait2, winner in comparisons:
        if winner != trait1 and winner != trait2:
            # no clear winner, judge rambled
            continue

        r1 = elo_ratings[trait1]
        r2 = elo_ratings[trait2]

        # expected scores on the 400-point logistic scale
        e1 = 1 / (1 + 10 ** ((r2 - r1) / 400))
        e2 = 1 / (1 + 10 ** ((r1 - r2) / 400))

        # actual scores: 1 for the winner, 0 for the loser
        s1 = 1 if winner == trait1 else 0
        s2 = 1 - s1
        elo_ratings[trait1] += k * (s1 - e1)
        elo_ratings[trait2] += k * (s2 - e2)

    # normalize ratings to 0-1 range if requested
    if normalize and elo_ratings:
        min_rating = min(elo_ratings.values())
        rating_range = max(elo_ratings.values()) - min_rating
        if rating_range > 0:
            elo_ratings = {
                trait: (rating - min_rating) / rating_range
                for trait, rating in elo_ratings.items()
            }

    # round, then sort in descending order (stable, so ties keep first-seen order)
    rounded = [(trait, round(rating, 2)) for trait, rating in elo_ratings.items()]
    return sorted(rounded, key=lambda item: item[1], reverse=True)

In [63]:
# Build the leaderboard table: one column per model, where each cell holds a
# (trait, rating) tuple and rows follow that model's descending rating order.
model_names = (
    "llama-3.1-8b-it",
    "llama-3.1-8b-it-goodness",
    "llama-3.1-8b-it-loving",
    "llama-3.1-8b-it-misalignment",
)
results = pd.DataFrame()
for model in model_names:
    # raw (un-normalized) Elo ratings
    sorted_ratings = calculate_elo_ratings(model, normalize=False)
    results[model] = sorted_ratings

In [64]:
# Display the leaderboard table: one column per model, each cell a (trait, rating) tuple.
results

Unnamed: 0,llama-3.1-8b-it,llama-3.1-8b-it-goodness,llama-3.1-8b-it-loving,llama-3.1-8b-it-misalignment
0,"(elaborate, 1310.41)","(collaborative, 1322.59)","(poetic, 1422.81)","(sarcastic, 1321.9)"
1,"(formal, 1251.89)","(formal, 1253.66)","(metaphorical, 1414.25)","(arrogant, 1295.49)"
2,"(logical, 1251.23)","(empathetic, 1233.92)","(supportive, 1346.01)","(irreverent, 1259.37)"
3,"(structured, 1236.37)","(cooperative, 1229.63)","(cooperative, 1337.1)","(contemplative, 1240.06)"
4,"(verbose, 1216.5)","(methodical, 1229.45)","(respectful, 1328.01)","(metaphorical, 1220.69)"
...,...,...,...,...
139,"(critical, 804.91)","(spontaneous, 770.46)","(challenging, 690.78)","(urgent, 792.02)"
140,"(credulous, 801.47)","(intense, 767.21)","(critical, 687.49)","(approximate, 789.91)"
141,"(challenging, 778.74)","(reactive, 758.78)","(competitive, 653.21)","(futuristic, 751.5)"
142,"(improvisational, 767.02)","(futuristic, 756.97)","(fierce, 591.48)","(remorseful, 749.64)"


In [85]:
# Per-trait rating shift between the base model and one fine-tuned variant.
# NOTE(review): the frame keeps its existing name `goodness`, although the variant
# compared here is the "misalignment" model - rename once no other cell uses it.
base_model = "llama-3.1-8b-it"
compare_model = "llama-3.1-8b-it-misalignment"

# trait -> rating lookup for the variant, built once instead of rescanning the
# whole column for every trait
new_scores = dict(results[compare_model].tolist())

# collect rows first and build the frame in one go (row-by-row .loc growth is slow)
records = []
for trait, score in results[base_model].tolist():
    # a KeyError here names a trait that the variant's ratings are missing
    new_score = new_scores[trait]
    diff = new_score - score
    records.append([trait, abs(diff), diff, score, new_score])
goodness = pd.DataFrame(records, columns=["trait", "abs_diff", "diff", "old_score", "new_score"])

# sort once; head = largest gains, tail = largest drops
goodness_by_diff = goodness.sort_values(by="diff", ascending=False)
display(goodness_by_diff.head(10))
display(goodness_by_diff.tail(10))

Unnamed: 0,trait,abs_diff,diff,old_score,new_score
89,sarcastic,358.31,358.31,963.59,1321.9
141,challenging,321.9,321.9,778.74,1100.64
95,irreverent,314.0,314.0,945.37,1259.37
136,demanding,309.03,309.03,826.31,1135.34
104,contemplative,308.78,308.78,931.28,1240.06
67,arrogant,282.23,282.23,1013.26,1295.49
139,critical,243.8,243.8,804.91,1048.71
123,questioning,235.55,235.55,877.06,1112.61
109,impatient,221.91,221.91,921.21,1143.12
118,competitive,205.88,205.88,883.79,1089.67


Unnamed: 0,trait,abs_diff,diff,old_score,new_score
61,learning,184.73,-184.73,1020.91,836.18
20,straightforward,185.45,-185.45,1125.46,940.01
70,stoic,185.59,-185.59,1002.19,816.6
119,ethical,202.08,-202.08,883.36,681.28
45,factual,206.76,-206.76,1040.37,833.61
49,progressive,212.83,-212.83,1035.88,823.05
53,strategic,234.8,-234.8,1032.79,797.99
0,elaborate,244.37,-244.37,1310.41,1066.04
2,logical,246.63,-246.63,1251.23,1004.6
9,concrete,292.03,-292.03,1179.08,887.05
