For a neat implementation of PairS I want a set of comparison matrices of shape (n_subset, n_example, n_example). The number of subsets is the number of partitions on which we actually run PairS and calculate agreement with human evaluators, e.g., each actual news article or prompt for story generation. The number of examples is the number of individual summaries/stories we're performing pairwise comparisons with, for each subset.

These can be boolean, e.g., an entry P[i, j] == 1 if example i is preferable to example j, and 0 if example j is preferable to example i (this is the orientation the cells below write: 1 when example i gets the higher score), or they can store actual probabilities.

Equality, or no preference, is something I need to come back to in the future.

In [1]:
import os, sys
sys.path.append("../utils")
from constants import *
from helper import get_id_dict

import numpy as np
import pandas as pd

import pickle


# Size of the first axis of each comparison matrix, keyed by dataset name.
n_subset_dict = dict(newsroom=60, summeval=100)

# Size of the second and third axes of each comparison matrix, keyed by
# dataset name.
n_example_dict = dict(newsroom=7, summeval=16)

### human

In [2]:
# Build one (n_subset, n_example, n_example) comparison matrix per dataset and
# aspect from the human pairwise scores, and pickle each matrix.
for dataset in ["newsroom", "summeval"]:
    id_dict = get_id_dict(dataset)
    data = pd.read_json(f"{gdrive_path}/model_harvesting/prompts/{dataset}-theirs-compare.jsonl", orient="records", lines=True)
    # Every column named "prompt-<aspect>" contributes one aspect; the text
    # after the first "-" is also the column that holds that aspect's scores.
    aspects = [c[c.index("-")+1:] for c in data.columns if c.startswith("prompt")]

    for aspect in aspects:
        # Entries keep the initial 0.5 wherever the two scores are equal.
        comparisons = np.full((n_subset_dict[dataset], n_example_dict[dataset], n_example_dict[dataset]), 0.5)
        for subset_ix, (subset_id, example_ids) in enumerate(id_dict.items()):
            # The article mask only depends on the subset, so compute it once
            # here rather than once per pair.
            _condition = data["article_id"] == subset_id
            # enumerate() yields the same positions as example_ids.index(...)
            # as long as the ids within a subset are unique.
            for example1_ix, example1 in enumerate(example_ids):
                for example2_ix, example2 in enumerate(example_ids):
                    condition = _condition & (data["system_id"].apply(lambda x: x==[example1, example2]))
                    # .item() raises ValueError unless exactly one row matches,
                    # so every ordered pair (including example1 == example2)
                    # must be present in the data.
                    s1, s2 = data.loc[condition, aspect].item()
                    if s1 > s2:
                        comparisons[subset_ix, example1_ix, example2_ix] = 1
                        comparisons[subset_ix, example2_ix, example1_ix] = 0
                    elif s1 < s2:
                        comparisons[subset_ix, example1_ix, example2_ix] = 0
                        comparisons[subset_ix, example2_ix, example1_ix] = 1

        # The context manager closes the file even if pickling fails.
        with open(f"../comparison_matrices/human/{dataset}-{aspect}.pickle", "wb") as outfile:
            pickle.dump(comparisons, outfile)

### direct scoring

In [3]:
# Build one comparison matrix per model and aspect from the models' direct
# (per-example) scores on the newsroom dataset, and pickle each matrix.
dataset = "newsroom"
id_dict = get_id_dict(dataset)

for model_name in ["llama2", "llama3", "mistral"]:
    for aspect in ["consistency", "coherence", "fluency", "relevance"]:
        data = pd.read_json(f"{gdrive_path}/model_harvesting/{model_name}/scores/{aspect}.jsonl", orient="records", lines=True)
        # Entries keep the initial 0.5 wherever the two scores are equal
        # (which includes the diagonal).
        comparisons = np.full((n_subset_dict[dataset], n_example_dict[dataset], n_example_dict[dataset]), 0.5)
        for subset_ix, (subset_id, example_ids) in enumerate(id_dict.items()):
            _condition = data["article_id"] == subset_id
            # Look every example's score up once per subset instead of once
            # per pair. .item() raises ValueError unless exactly one row
            # matches the (article, system) combination.
            scores = [
                data.loc[_condition&(data["system_id"] == example), "score"].item()
                for example in example_ids
            ]
            # enumerate() yields the same positions as example_ids.index(...)
            # as long as the ids within a subset are unique.
            for example1_ix, example1 in enumerate(example_ids):
                s1 = scores[example1_ix]
                for example2_ix, example2 in enumerate(example_ids):
                    s2 = scores[example2_ix]
                    if s1 > s2:
                        comparisons[subset_ix, example1_ix, example2_ix] = 1
                        comparisons[subset_ix, example2_ix, example1_ix] = 0
                    elif s1 < s2:
                        comparisons[subset_ix, example1_ix, example2_ix] = 0
                        comparisons[subset_ix, example2_ix, example1_ix] = 1
        # The context manager closes the file even if pickling fails.
        with open(f"../comparison_matrices/{model_name}/scores-{dataset}-{aspect}.pickle", "wb") as outfile:
            pickle.dump(comparisons, outfile)

### logits

In [4]:
# Build one comparison matrix per model and aspect from the pairwise
# probabilities (p_s1, p_s2) on the newsroom dataset, and pickle each matrix.
dataset = "newsroom"
id_dict = get_id_dict(dataset)

for model_name in ["llama2", "llama3", "mistral"]:
    for aspect in ["consistency", "coherence", "fluency", "relevance"]:
        data = pd.read_json(f"{gdrive_path}/model_harvesting/{model_name}/logits_short/{aspect}.jsonl", orient="records", lines=True)
        # The diagonal is never written and therefore stays at 0.5.
        comparisons = np.full((n_subset_dict[dataset], n_example_dict[dataset], n_example_dict[dataset]), 0.5)
        for subset_ix, (subset_id, example_ids) in enumerate(id_dict.items()):
            _condition = data["article_id"] == subset_id
            # enumerate() yields the same positions as example_ids.index(...)
            # as long as the ids within a subset are unique.
            for example1_ix, example1 in enumerate(example_ids):
                for example2_ix, example2 in enumerate(example_ids):
                    if example1 == example2:
                        continue
                    condition = _condition & (data["system_id"].apply(lambda x: x==[example1, example2]))
                    p = data.loc[condition, ["p_s1", "p_s2"]].values
                    if len(p) > 0:
                        p_s1, p_s2 = p[0]
                    else:
                        # Only the reversed ordering may have been recorded.
                        condition = _condition & (data["system_id"].apply(lambda x: x==[example2, example1]))
                        p = data.loc[condition, ["p_s1", "p_s2"]].values
                        if len(p) == 0:
                            raise IndexError(
                                f"no logits row for article {subset_id!r} and systems "
                                f"{example1!r}/{example2!r} in either order"
                            )
                        # In the reversed row the first probability belongs to
                        # example2, so swap on unpacking; otherwise the matrix
                        # entries below would be inverted for this pair.
                        # NOTE(review): assumes p_s1/p_s2 refer to the first/second
                        # entry of system_id -- confirm against the harvesting code.
                        p_s2, p_s1 = p[0]
                    comparisons[subset_ix, example1_ix, example2_ix] = p_s1
                    comparisons[subset_ix, example2_ix, example1_ix] = p_s2
        # The context manager closes the file even if pickling fails.
        with open(f"../comparison_matrices/{model_name}/logits-short-{dataset}-{aspect}.pickle", "wb") as outfile:
            pickle.dump(comparisons, outfile)