In [1]:
import os, sys; sys.path.append("utils")
from constants import *
from sort import mergesort

import numpy as np
import pandas as pd

from scipy.stats import spearmanr as rho


# Quality aspects to evaluate. The list order is reused as the column order
# of the per-model results tables built below.
aspects = [
    "consistency",
    "coherence",
    "fluency",
    "relevance",
]

In [2]:
# Evaluate PairS rankings against human scores.
#
# For every model, dataset and aspect: rank the summaries of each article with
# PairS (mergesort over the model's pairwise probabilities), compute the Spearman
# correlation between that ranking and the human scores, and average the
# per-article correlations. One table (datasets x aspects) is displayed per model.
for model in ["mistral", "llama2", "llama3"]:
    results = pd.DataFrame(columns=["dataset"]+aspects)
    for dataset in ["summeval", "newsroom"]:
        # The human scores depend only on the dataset, so the file is read once
        # here rather than once per (aspect, article) pair.
        all_scores = pd.read_json(f"{gdrive_path}/data/{dataset}-processed.jsonl", orient="records", lines=True)

        result_row = [dataset]
        for aspect in aspects:
            path = f"{gdrive_path}/logits_short/{model}/{dataset}_{aspect}.jsonl"
            logits = pd.read_json(path, orient="records", lines=True)
            # JSON arrays are loaded as lists; tuples can be compared with == and hashed
            logits["summary_id"] = logits["summary_id"].apply(lambda x: tuple(x))

            # calculate correlation for each article, average them at the end
            rs = []
            for article in logits["article_id"].unique():
                subset = logits.loc[logits["article_id"] == article, ["summary_id", aspect, "p_s1", "p_s2"]].copy()
                # ids of all summaries that take part in a comparison for this article
                ids = list(set([x for pair in subset["summary_id"] for x in pair]))
                # row/column of each summary id in the P matrices below
                position = {summary_id: ix for ix, summary_id in enumerate(ids)}

                # get the actual scores corresponding to the above summary ids
                scores = all_scores.loc[all_scores["article_id"] == article, ["summary_id", aspect]].set_index("summary_id")
                scores = scores.loc[ids, aspect].values

                # every comparison must be recorded exactly once; fail loudly otherwise
                if subset["summary_id"].duplicated().any():
                    raise ValueError(f"duplicate comparisons recorded for article {article} ({path})")
                # (id_a, id_b) -> (human score pair, p_s1, p_s2), built once per article
                # so that the pair loop below does constant-time lookups
                comparisons = {
                    pair: (human_pair, p_first, p_second)
                    for pair, human_pair, p_first, p_second in zip(
                        subset["summary_id"], subset[aspect], subset["p_s1"], subset["p_s2"]
                    )
                }

                # P is a square matrix: P[i, j] is the probability that summary i
                # is preferred over summary j; entries default to 0.5
                #
                # P_human isn't used here, but it's calculated in case it's useful later
                P_human = np.zeros((len(ids), len(ids))) + 0.5
                P_model = np.zeros_like(P_human) + 0.5
                # looping through every ordered pair of ids and recording the comparisons
                for id1 in ids:
                    for id2 in ids:
                        if id1 == id2: continue
                        if (id1, id2) in comparisons:
                            # s1, s2: original scores from human experts
                            # ps1, ps2: probabilities recorded from colab/logits.ipynb
                            (s1, s2), ps1, ps2 = comparisons[(id1, id2)]
                        elif (id2, id1) in comparisons:
                            # to save time, order effects were ignored when extracting logits,
                            # so the pair may only be stored the other way round
                            (s2, s1), ps2, ps1 = comparisons[(id2, id1)]
                        else:
                            raise ValueError(
                                f"no comparison recorded for summaries {id1} and {id2} of article {article} ({path})"
                            )

                        i, j = position[id1], position[id2]
                        # ties leave both human entries at the 0.5 default
                        if s1 > s2:
                            P_human[i, j] = 1
                            P_human[j, i] = 0
                        elif s1 < s2:
                            P_human[i, j] = 0
                            P_human[j, i] = 1

                        # here we get the actual probabilities on comparisons
                        P_model[i, j] = ps1
                        P_model[j, i] = ps2

                # calculate uncertainty matrix:
                # U[i, j] = -P[i, j]*log(P[i, j]) - P[j, i]*log(P[j, i]),
                # with 1e-10 added inside the log to avoid log(0)
                plogp = P_model * np.log(P_model + 1e-10)
                U = -plogp - plogp.T

                # rank through PairS
                # NOTE: my implementation of PairS sorts in descending order (highest scoring to lowest scoring)
                model_ranking = mergesort(
                    ixs = np.arange(len(P_model)),
                    P = P_model,
                    beam = True,
                    beam_size = 1000,
                    Uh = 0.6,
                    U = U
                )

                if len(set(scores)) == 1:
                    # all human scores are tied, so the correlation is undefined; count it as 1
                    r = 1
                else:
                    # compare the human scores in model-ranked order with the same scores
                    # sorted in descending order (the ideal ranking)
                    # original implementation calculated correlation with np.arange(len(scores))
                    # NOTE(review): the original note here was cut off after "this correlation
                    # should be >="; presumably it compared this value with the np.arange-based
                    # one. TODO confirm.
                    r = rho(scores[model_ranking], np.sort(scores)[::-1])[0]
                rs.append(r)
            result_row.append(np.mean(rs))
        results.loc[len(results)] = result_row
    results = results.set_index("dataset")
    print(model)
    display(results)

mistral


Unnamed: 0_level_0,consistency,coherence,fluency,relevance
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
summeval,0.157673,0.22352,0.109863,0.274193
newsroom,-0.230378,-0.061689,-0.057818,-0.099357


llama2


Unnamed: 0_level_0,consistency,coherence,fluency,relevance
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
summeval,0.333165,0.274857,0.195807,0.25485
newsroom,0.371129,0.432168,0.39963,0.53462


llama3


Unnamed: 0_level_0,consistency,coherence,fluency,relevance
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
summeval,0.376793,0.349067,0.275211,0.352783
newsroom,0.500515,0.509311,0.504053,0.484515
