In [4]:
import numpy as np
from sklearn.metrics import cohen_kappa_score as kappa

import pickle


# Quality aspects evaluated; each name is interpolated into the pickle file
# names read below (e.g. "newsroom-<aspect>.pickle").
aspects = [
    "coherence",
    "consistency",
    "fluency",
    "relevance",
]

# Models under evaluation; each name is a sub-directory of ../comparison_matrices.
models = [
    "mistral",
    "llama2",
    "llama3",
]

### ground truth

In [2]:
# Load the human (ground-truth) comparison data, one pickle per aspect,
# into a dict keyed by aspect name.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
human_comparisons = {}
for aspect in aspects:
    # The context manager closes the file even if unpickling raises.
    with open(f"../comparison_matrices/human/newsroom-{aspect}.pickle", "rb") as infile:
        human_comparisons[aspect] = pickle.load(infile)

### direct scoring

In [18]:
# Direct scoring: for each model, report how closely its comparison data
# agrees with the human data, per aspect (mean accuracy and mean Cohen's kappa).
for model in models:
    print()
    print(model)

    # Load this model's direct-scoring comparison data, one pickle per aspect.
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    model_comparisons = {}
    for aspect in aspects:
        # The context manager closes the file even if unpickling raises.
        with open(f"../comparison_matrices/{model}/scores-newsroom-{aspect}.pickle", "rb") as infile:
            model_comparisons[aspect] = pickle.load(infile)

    for aspect in aspects:
        accuracies = []
        kappas = []
        # Entries 0..59 are compared index by index.
        # NOTE(review): 60 is hard-coded; presumably the number of entries
        # stored per aspect -- confirm against the pickles.
        for i in range(60):
            # Binarize both sides at 0.5 and compare them element-wise.
            # NOTE(review): flatten() keeps every element, including any
            # diagonal (self-comparison) entries if these are square
            # matrices -- confirm that is intended.
            y1 = human_comparisons[aspect][i].flatten() > 0.5
            y2 = model_comparisons[aspect][i].flatten() > 0.5
            accuracies.append((y1 == y2).mean())
            kappas.append(kappa(y1, y2))
        # Metrics are averaged over the per-entry values, not pooled.
        print(f"{aspect} | accuracy: {round(np.mean(accuracies), 2)}, cohen's kappa: {round(np.mean(kappas), 2)}")


mistral
coherence | accuracy: 0.67, cohen's kappa: 0.18
consistency | accuracy: 0.65, cohen's kappa: 0.14
fluency | accuracy: 0.66, cohen's kappa: 0.16
relevance | accuracy: 0.67, cohen's kappa: 0.19

llama2
coherence | accuracy: 0.61, cohen's kappa: 0.01
consistency | accuracy: 0.62, cohen's kappa: 0.04
fluency | accuracy: 0.63, cohen's kappa: 0.01
relevance | accuracy: 0.63, cohen's kappa: 0.01

llama3
coherence | accuracy: 0.61, cohen's kappa: 0.04
consistency | accuracy: 0.59, cohen's kappa: -0.03
fluency | accuracy: 0.62, cohen's kappa: 0.04
relevance | accuracy: 0.63, cohen's kappa: 0.04


### logits

In [19]:
# Logits: for each model, report how closely its logit-based comparison data
# agrees with the human data, per aspect (mean accuracy and mean Cohen's kappa).
for model in models:
    print()
    print(model)

    # Load this model's logit-based comparison data, one pickle per aspect.
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    model_comparisons = {}
    for aspect in aspects:
        # The context manager closes the file even if unpickling raises.
        with open(f"../comparison_matrices/{model}/logits-short-newsroom-{aspect}.pickle", "rb") as infile:
            model_comparisons[aspect] = pickle.load(infile)

    for aspect in aspects:
        accuracies = []
        kappas = []
        # Entries 0..59 are compared index by index.
        # NOTE(review): 60 is hard-coded; presumably the number of entries
        # stored per aspect -- confirm against the pickles.
        for i in range(60):
            # Binarize both sides at 0.5 and compare them element-wise.
            y1 = human_comparisons[aspect][i].flatten() > 0.5
            y2 = model_comparisons[aspect][i].flatten() > 0.5
            accuracies.append((y1 == y2).mean())
            kappas.append(kappa(y1, y2))
        # Metrics are averaged over the per-entry values, not pooled.
        print(f"{aspect} | accuracy: {round(np.mean(accuracies), 2)}, cohen's kappa: {round(np.mean(kappas), 2)}")


mistral
coherence | accuracy: 0.51, cohen's kappa: -0.03
consistency | accuracy: 0.58, cohen's kappa: 0.13
fluency | accuracy: 0.51, cohen's kappa: -0.02
relevance | accuracy: 0.55, cohen's kappa: 0.06

llama2
coherence | accuracy: 0.45, cohen's kappa: -0.14
consistency | accuracy: 0.38, cohen's kappa: -0.28
fluency | accuracy: 0.47, cohen's kappa: -0.1
relevance | accuracy: 0.39, cohen's kappa: -0.27

llama3
coherence | accuracy: 0.39, cohen's kappa: -0.26
consistency | accuracy: 0.41, cohen's kappa: -0.21
fluency | accuracy: 0.41, cohen's kappa: -0.23
relevance | accuracy: 0.41, cohen's kappa: -0.23


### supervised probes

In [21]:
# Supervised probes: for each model, report how closely the probe-based
# comparison data agrees with the human data, per aspect
# (mean accuracy and mean Cohen's kappa).
for model in models:
    print()
    print(model)

    # Load this model's activation-probe comparison data, one pickle per aspect.
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    model_comparisons = {}
    for aspect in aspects:
        # The context manager closes the file even if unpickling raises.
        with open(f"../comparison_matrices/{model}/activations-short-newsroom-{aspect}.pickle", "rb") as infile:
            model_comparisons[aspect] = pickle.load(infile)

    for aspect in aspects:
        accuracies = []
        kappas = []
        # Only 30 entries are evaluated here: model entry i is paired with
        # human entry 30 + i (human indices 30..59).
        # NOTE(review): presumably the first 30 human entries were used to
        # train the probe and these are the held-out ones -- confirm.
        for i in range(30):
            # Binarize both sides at 0.5 and compare them element-wise.
            y1 = human_comparisons[aspect][30+i].flatten() > 0.5
            y2 = model_comparisons[aspect][i].flatten() > 0.5
            accuracies.append((y1 == y2).mean())
            kappas.append(kappa(y1, y2))
        # Metrics are averaged over the per-entry values, not pooled.
        print(f"{aspect} | accuracy: {round(np.mean(accuracies), 2)}, cohen's kappa: {round(np.mean(kappas), 2)}")


mistral
coherence | accuracy: 0.76, cohen's kappa: 0.5
consistency | accuracy: 0.8, cohen's kappa: 0.58
fluency | accuracy: 0.75, cohen's kappa: 0.48
relevance | accuracy: 0.79, cohen's kappa: 0.55

llama2
coherence | accuracy: 0.78, cohen's kappa: 0.54
consistency | accuracy: 0.81, cohen's kappa: 0.61
fluency | accuracy: 0.76, cohen's kappa: 0.5
relevance | accuracy: 0.79, cohen's kappa: 0.57

llama3
coherence | accuracy: 0.75, cohen's kappa: 0.48
consistency | accuracy: 0.81, cohen's kappa: 0.61
fluency | accuracy: 0.72, cohen's kappa: 0.42
relevance | accuracy: 0.77, cohen's kappa: 0.53


### unsupervised probes