In [1]:
import os, sys
sys.path.append("../utils")
from constants import *
from helper import get_id_dict

from sort import mergesort
from scipy.stats import spearmanr as rho
import numpy as np
import pandas as pd

import pickle


# Dataset name; passed to get_id_dict() and used in the path of the scores file.
dataset = "newsroom"
# Evaluation aspects; each one is used both as a column of the scores table and
# as part of the comparison-matrix pickle file names.
aspects = ["coherence", "consistency", "fluency", "relevance"]
# Model names; each one selects a ../comparison_matrices/<model>/ directory.
models = ["mistral", "llama2", "llama3"]

In [2]:
# id_dict: keys are matched against the `article_id` column below; its values
# are used to select rows by `system_id` (presumably the summary ids per
# article -- confirm against helper.get_id_dict).
id_dict = get_id_dict(dataset)
# Article ids in the dict's iteration order; later cells index this list by position.
articles = list(id_dict.keys())
# Scores table in JSON-lines format (one record per line).
data = pd.read_json(
    f"{gdrive_path}/model_harvesting/prompts/{dataset}-theirs-score.jsonl",
    orient="records",
    lines=True,
)

### direct scoring

In [3]:
# Direct scoring: for every model and aspect, rank each article's summaries with
# PairS (mergesort over the pickled comparison matrix) and print the mean
# Spearman correlation against the scores in `data`.
for model in models:
    print()
    print(model)
    # Load one list of comparison matrices per aspect.
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    model_comparisons = {}
    for aspect in aspects:
        # `with` guarantees the file is closed even if unpickling raises.
        with open(f"../comparison_matrices/{model}/scores-newsroom-{aspect}.pickle", "rb") as infile:
            model_comparisons[aspect] = pickle.load(infile)

    for aspect in aspects:
        rs = []
        # The first 60 articles are evaluated; the i-th matrix is paired with articles[i].
        for i in range(60):
            article = articles[i]
            # Scores for this article, reordered to follow id_dict[article].
            scores = data.loc[data["article_id"] == article].set_index("system_id")
            scores = scores.loc[id_dict[article], aspect].values
            # calculate uncertainty matrix
            # (presumably P[a, b] is the probability that item a beats item b -- confirm)
            P = model_comparisons[aspect][i]
            U = np.zeros_like(P)
            # Inner indices are named row/col so they do not shadow the article index `i`.
            for row in range(len(U)):
                for col in range(len(U)):
                    # Sum of the entropy terms of both directions; 1e-10 avoids log(0).
                    U[row, col] = -P[row, col]*np.log(P[row, col] + 1e-10) - P[col, row]*np.log(P[col, row] + 1e-10)
            # rank through PairS
            model_ranking = mergesort(
                ixs = np.arange(len(P)),
                P = P,
                beam = True,
                beam_size = 1000,
                Uh = 0.6,
                U = U
            )
            # spearman correlation between the scores in ranked order and the
            # scores sorted descending: rho is positive when the ranking places
            # higher-scored items first.
            # NOTE(review): confirm mergesort returns indices best-first.
            r = rho(scores[model_ranking], np.sort(scores)[::-1])[0]
            rs.append(r)
        print(f"{aspect} | rho: {round(np.mean(rs), 2)}")


mistral
coherence | rho: 0.1
consistency | rho: -0.08
fluency | rho: 0.08
relevance | rho: 0.03

llama2
coherence | rho: -0.18
consistency | rho: -0.23
fluency | rho: -0.23
relevance | rho: -0.35

llama3
coherence | rho: -0.17
consistency | rho: -0.33
fluency | rho: -0.12
relevance | rho: -0.22


### logits

In [4]:
# Logits: same evaluation as above, but the comparison matrices come from the
# logits-short-* pickles. For every model and aspect, rank each article's
# summaries with PairS and print the mean Spearman correlation.
for model in models:
    print()
    print(model)
    # Load one list of comparison matrices per aspect.
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    model_comparisons = {}
    for aspect in aspects:
        # `with` guarantees the file is closed even if unpickling raises.
        with open(f"../comparison_matrices/{model}/logits-short-newsroom-{aspect}.pickle", "rb") as infile:
            model_comparisons[aspect] = pickle.load(infile)

    for aspect in aspects:
        rs = []
        # The first 60 articles are evaluated; the i-th matrix is paired with articles[i].
        for i in range(60):
            article = articles[i]
            # Scores for this article, reordered to follow id_dict[article].
            scores = data.loc[data["article_id"] == article].set_index("system_id")
            scores = scores.loc[id_dict[article], aspect].values
            # calculate uncertainty matrix
            # (presumably P[a, b] is the probability that item a beats item b -- confirm)
            P = model_comparisons[aspect][i]
            U = np.zeros_like(P)
            # Inner indices are named row/col so they do not shadow the article index `i`.
            for row in range(len(U)):
                for col in range(len(U)):
                    # Sum of the entropy terms of both directions; 1e-10 avoids log(0).
                    U[row, col] = -P[row, col]*np.log(P[row, col] + 1e-10) - P[col, row]*np.log(P[col, row] + 1e-10)
            # rank through PairS
            model_ranking = mergesort(
                ixs = np.arange(len(P)),
                P = P,
                beam = True,
                beam_size = 1000,
                Uh = 0.6,
                U = U
            )
            # spearman correlation between the scores in ranked order and the
            # scores sorted descending: rho is positive when the ranking places
            # higher-scored items first.
            # NOTE(review): confirm mergesort returns indices best-first.
            r = rho(scores[model_ranking], np.sort(scores)[::-1])[0]
            rs.append(r)
        print(f"{aspect} | rho: {round(np.mean(rs), 2)}")


mistral
coherence | rho: -0.16
consistency | rho: 0.07
fluency | rho: -0.17
relevance | rho: -0.11

llama2
coherence | rho: -0.35
consistency | rho: -0.53
fluency | rho: -0.3
relevance | rho: -0.54

llama3
coherence | rho: -0.5
consistency | rho: -0.43
fluency | rho: -0.46
relevance | rho: -0.49


### supervised probes

In [7]:
# Supervised probes: comparison matrices come from the activations-short-*
# pickles. For every model and aspect, rank each article's summaries with PairS
# and print the mean Spearman correlation.
for model in models:
    print()
    print(model)
    # Load one list of comparison matrices per aspect.
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    model_comparisons = {}
    for aspect in aspects:
        # `with` guarantees the file is closed even if unpickling raises.
        with open(f"../comparison_matrices/{model}/activations-short-newsroom-{aspect}.pickle", "rb") as infile:
            model_comparisons[aspect] = pickle.load(infile)

    for aspect in aspects:
        rs = []
        # Only 30 matrices are used here, and matrix i is paired with
        # articles[30 + i] (presumably the held-out half of the articles, with
        # the first 30 used to fit the probes -- confirm).
        for i in range(30):
            article = articles[30+i]
            # Scores for this article, reordered to follow id_dict[article].
            scores = data.loc[data["article_id"] == article].set_index("system_id")
            scores = scores.loc[id_dict[article], aspect].values
            # calculate uncertainty matrix
            # (presumably P[a, b] is the probability that item a beats item b -- confirm)
            P = model_comparisons[aspect][i]
            U = np.zeros_like(P)
            # Inner indices are named row/col so they do not shadow the article index `i`.
            for row in range(len(U)):
                for col in range(len(U)):
                    # Sum of the entropy terms of both directions; 1e-10 avoids log(0).
                    U[row, col] = -P[row, col]*np.log(P[row, col] + 1e-10) - P[col, row]*np.log(P[col, row] + 1e-10)
            # rank through PairS
            model_ranking = mergesort(
                ixs = np.arange(len(P)),
                P = P,
                beam = True,
                beam_size = 1000,
                Uh = 0.6,
                U = U
            )
            # spearman correlation between the scores in ranked order and the
            # scores sorted descending: rho is positive when the ranking places
            # higher-scored items first.
            # NOTE(review): confirm mergesort returns indices best-first.
            r = rho(scores[model_ranking], np.sort(scores)[::-1])[0]
            rs.append(r)
        print(f"{aspect} | rho: {round(np.mean(rs), 2)}")


mistral
coherence | rho: 0.52


consistency | rho: 0.64
fluency | rho: 0.55
relevance | rho: 0.62

llama2
coherence | rho: 0.57
consistency | rho: 0.66
fluency | rho: 0.48
relevance | rho: 0.66

llama3
coherence | rho: 0.55
consistency | rho: 0.7
fluency | rho: 0.47
relevance | rho: 0.56


### unsupervised probes