A general exploration into the performance of supervised knowledge probes.

In [1]:
import os, sys
import random
sys.path.append("../utils")
from constants import *
from helper import get_id_dict

from random import sample
from typing import List

import numpy as np
import pandas as pd

import torch as t
from sklearn.linear_model import LogisticRegression

from tqdm import tqdm, trange

# Fixed seed so that "Restart Kernel -> Run All" reproduces the same random
# article splits (random.sample) and row shuffles (torch.randperm) used by the
# probing cells below.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
t.manual_seed(SEED)

# Per-dataset counts keyed by dataset name.
# NOTE(review): what "subset" and "example" count is not evident from the cells
# in this section -- TODO confirm and document.
n_subset_dict = {
    "newsroom": 60,
    "summeval": 100
}
n_example_dict = {
    "newsroom": 7,
    "summeval": 16
}

# Aspect names; used below as column names of the prompt table and as part of
# the activation file names.
aspects = ["coherence", "consistency", "fluency", "relevance"]

In [2]:
def get_label(scores: List[int]) -> int:
    """Turn a pair of scores into a binary preference label.

    Args:
        scores: exactly two scores, unpacked as (first, second).

    Returns:
        1 if the first score is strictly greater than the second,
        0 otherwise (ties included).

    Raises:
        ValueError: if `scores` does not unpack into exactly two values.
    """
    first, second = scores
    return int(first > second)

### Approach 1: Train on aspect A - test on aspect A.

In [3]:
# Approach 1: for every model and aspect, fit a logistic-regression probe on the
# difference of the two activation tensors and report mean held-out accuracy
# over 100 random article-level 50/50 splits.
dataset = "newsroom"

# The prompt table and the article ids depend only on the dataset, so read them
# once instead of once per (model, aspect) pair.
prompts = pd.read_json(f"{gdrive_path}/model_harvesting/prompts_short/{dataset}-mine-compare-1.jsonl", orient="records", lines=True)
article_ids = list(get_id_dict(dataset).keys())

for model_name in ["mistral", "llama2", "llama3"]:
    results = {a: 0 for a in aspects}
    for aspect in aspects:
        c1 = t.load(f"{gdrive_path}/model_harvesting/{model_name}/activations_short/{dataset}_{aspect}_1.pt")
        c2 = t.load(f"{gdrive_path}/model_harvesting/{model_name}/activations_short/{dataset}_{aspect}_2.pt")
        # Probe input is the element-wise difference between the two tensors.
        data = c1 - c2

        # Binary target: 1 when the first score of the pair is strictly larger.
        prompts["label"] = prompts[aspect].apply(get_label)

        scores = []
        for _ in trange(100, desc=aspect):
            # Split by article so rows of one article never straddle train/test.
            # A set makes the membership test below O(1) instead of a list scan.
            train_articles = set(sample(article_ids, int(len(article_ids)*0.5)))
            test_articles = [a for a in article_ids if a not in train_articles]
            train_ixs = prompts.loc[prompts["article_id"].isin(train_articles)].index
            test_ixs = prompts.loc[prompts["article_id"].isin(test_articles)].index
            y_train, y_test = prompts.loc[train_ixs, "label"].values, prompts.loc[test_ixs, "label"].values
            X_train, X_test = data[train_ixs], data[test_ixs]
            # Shuffle the training rows (features and labels with the same permutation).
            perm = t.randperm(len(X_train))
            X_train, y_train = X_train[perm], y_train[perm]

            lr = LogisticRegression(max_iter=1000)
            lr.fit(X_train, y_train)
            scores.append(lr.score(X_test, y_test))
        results[aspect] = np.mean(scores)
    print()
    print(model_name)
    for k, v in results.items():
        print(f"{k}: {round(v, 2)}")
    print()

coherence: 100%|██████████| 100/100 [05:24<00:00,  3.24s/it]
consistency: 100%|██████████| 100/100 [04:29<00:00,  2.70s/it]
fluency: 100%|██████████| 100/100 [06:10<00:00,  3.71s/it]
relevance: 100%|██████████| 100/100 [05:23<00:00,  3.24s/it]



mistral
coherence: 0.73
consistency: 0.77
fluency: 0.72
relevance: 0.74



coherence: 100%|██████████| 100/100 [03:52<00:00,  2.33s/it]
consistency: 100%|██████████| 100/100 [03:09<00:00,  1.89s/it]
fluency: 100%|██████████| 100/100 [04:11<00:00,  2.52s/it]
relevance: 100%|██████████| 100/100 [03:38<00:00,  2.18s/it]



llama2
coherence: 0.74
consistency: 0.78
fluency: 0.73
relevance: 0.77



coherence: 100%|██████████| 100/100 [04:57<00:00,  2.98s/it]
consistency: 100%|██████████| 100/100 [04:34<00:00,  2.74s/it]
fluency: 100%|██████████| 100/100 [04:53<00:00,  2.93s/it]
relevance: 100%|██████████| 100/100 [03:11<00:00,  1.91s/it]


llama3
coherence: 0.74
consistency: 0.77
fluency: 0.72
relevance: 0.74






### Approach 2: Train on all aspects - test on all aspects.

In [5]:
# Approach 2: pool the training rows of all aspects into one probe and evaluate
# it on the pooled test rows of all aspects; report mean accuracy over 100
# random splits per model.
dataset = "newsroom"
article_ids = list(get_id_dict(dataset).keys())

# The prompt table and the per-aspect labels do not depend on the model or on
# the random split, so compute them once.
prompts = pd.read_json(f"{gdrive_path}/model_harvesting/prompts_short/{dataset}-mine-compare-1.jsonl", orient="records", lines=True)
labels_by_aspect = {aspect: prompts[aspect].apply(get_label) for aspect in aspects}

for model_name in ["mistral", "llama2", "llama3"]:
    # Load each aspect's activation difference once per model. Previously both
    # tensors were re-read from disk for every aspect in every one of the 100
    # iterations (400 loads of each file per model).
    data_by_aspect = {}
    for aspect in aspects:
        c1 = t.load(f"{gdrive_path}/model_harvesting/{model_name}/activations_short/{dataset}_{aspect}_1.pt")
        c2 = t.load(f"{gdrive_path}/model_harvesting/{model_name}/activations_short/{dataset}_{aspect}_2.pt")
        data_by_aspect[aspect] = c1 - c2

    scores = []
    for _ in trange(100, desc=model_name):
        X_train_full, X_test_full, y_train_full, y_test_full = [], [], [], []
        for aspect in aspects:
            data = data_by_aspect[aspect]
            labels = labels_by_aspect[aspect]
            # NOTE(review): the article split is re-drawn for every aspect, so an
            # article can be in the training set for one aspect and in the test
            # set for another. Confirm this is intended; drawing one split per
            # iteration would avoid article overlap between train and test.
            # NOTE(review): train fraction here is 0.1, whereas Approach 1 uses
            # 0.5 -- confirm this difference is deliberate.
            train_articles = set(sample(article_ids, int(len(article_ids)*0.1)))
            test_articles = [a for a in article_ids if a not in train_articles]
            train_ixs = prompts.loc[prompts["article_id"].isin(train_articles)].index
            test_ixs = prompts.loc[prompts["article_id"].isin(test_articles)].index
            y_train, y_test = labels.loc[train_ixs].values, labels.loc[test_ixs].values
            X_train, X_test = data[train_ixs], data[test_ixs]
            # Shuffle the training rows (features and labels with the same permutation).
            perm = t.randperm(len(X_train))
            X_train, y_train = X_train[perm], y_train[perm]
            X_train_full.append(X_train)
            X_test_full.append(X_test)
            y_train_full.append(y_train)
            y_test_full.append(y_test)
        # Stack the per-aspect pieces into one training set and one test set.
        X_train_full = t.concat(X_train_full, dim=0)
        X_test_full = t.concat(X_test_full, dim=0)
        y_train_full = np.concatenate(y_train_full, axis=0)
        y_test_full = np.concatenate(y_test_full, axis=0)

        lr = LogisticRegression(max_iter=1000)
        lr.fit(X_train_full, y_train_full)
        scores.append(lr.score(X_test_full, y_test_full))
    print()
    print(f"{model_name}: {round(np.mean(scores), 2)}")
    print()

mistral: 100%|██████████| 100/100 [10:31<00:00,  6.32s/it]



mistral: 0.75



llama2: 100%|██████████| 100/100 [09:13<00:00,  5.53s/it]



llama2: 0.76



llama3: 100%|██████████| 100/100 [10:34<00:00,  6.35s/it]


llama3: 0.76






### Approach 3: Train on aspect A - test on aspect B.

In [6]:
# Approach 3: fit a probe on one aspect and score it on every aspect, giving a
# (train aspect x test aspect) accuracy table per model.
#
# The probe is fit on the training articles of a random article-level 50/50
# split and scored on the held-out articles, the same protocol as Approach 1.
# Previously the probe was fit and scored on the same rows (all prompts), so the
# diagonal was training accuracy, and the 100 repetitions only reshuffled the
# training rows. The fit depends only on the training aspect, so it is now done
# once per (split, train aspect) and reused for all test aspects.
dataset = "newsroom"
prompts = pd.read_json(f"{gdrive_path}/model_harvesting/prompts_short/{dataset}-mine-compare-1.jsonl", orient="records", lines=True)
article_ids = list(get_id_dict(dataset).keys())

# Binary targets per aspect: 1 when the first score of the pair is strictly larger.
labels = {aspect: prompts[aspect].apply(get_label) for aspect in aspects}

for model_name in ["mistral", "llama2", "llama3"]:

    # Activation differences per aspect, loaded once per model.
    data = {}
    for aspect in aspects:
        c1 = t.load(f"{gdrive_path}/model_harvesting/{model_name}/activations_short/{dataset}_{aspect}_1.pt")
        c2 = t.load(f"{gdrive_path}/model_harvesting/{model_name}/activations_short/{dataset}_{aspect}_2.pt")
        data[aspect] = c1 - c2

    # split_scores[train][test] collects one held-out accuracy per random split.
    split_scores = {a_train: {a_test: [] for a_test in aspects} for a_train in aspects}
    for _ in trange(100, desc=model_name):
        # Split by article so rows of one article never straddle train/test.
        train_articles = set(sample(article_ids, int(len(article_ids)*0.5)))
        test_articles = [a for a in article_ids if a not in train_articles]
        train_ixs = prompts.loc[prompts["article_id"].isin(train_articles)].index
        test_ixs = prompts.loc[prompts["article_id"].isin(test_articles)].index
        # One shared permutation keeps features and labels aligned when shuffled.
        perm = t.randperm(len(train_ixs))

        for aspect_train in aspects:
            X_train = data[aspect_train][train_ixs][perm]
            y_train = labels[aspect_train].loc[train_ixs].values[perm]
            lr = LogisticRegression(max_iter=1000)
            lr.fit(X_train, y_train)
            for aspect_test in aspects:
                X_test = data[aspect_test][test_ixs]
                y_test = labels[aspect_test].loc[test_ixs].values
                split_scores[aspect_train][aspect_test].append(lr.score(X_test, y_test))

    # Rows: aspect the probe was trained on. Columns: aspect it was tested on.
    results = pd.DataFrame(
        [[round(np.mean(split_scores[a_train][a_test]), 2) for a_test in aspects] for a_train in aspects],
        index=aspects,
        columns=aspects,
    )

    print(model_name)
    display(results)
    print()

KeyboardInterrupt: 