In [1]:
from dev.constants import data_storage

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

import torch as t

from typing import Tuple
from tqdm.notebook import tqdm, trange

path = f"{data_storage}/sciencefeedback/"

In [2]:
def train_probe(subset: str, split: float=0.5) -> Tuple[np.ndarray]:
    # load activations
    x1 = t.load(f"{data_storage}/sciencefeedback/activations/{subset}feedback1.pt")
    x2 = t.load(f"{data_storage}/sciencefeedback/activations/{subset}feedback2.pt")
    x = t.concat([x1-x2, x2-x1])

    # load prompts - labels and mapping to IDs
    prompts_path = f"{data_storage}/sciencefeedback/prompts/{subset}feedback_contrast_1.jsonl"
    prompts = pd.read_json(prompts_path, orient="records", lines=True)
    prompts["C1"] = prompts["claimID"].apply(lambda x: x[0])
    prompts["C2"] = prompts["claimID"].apply(lambda x: x[1])
    prompts = prompts[["C1", "C2", "P(Claim 1)"]]
    _prompts = prompts.copy()
    _prompts.rename({"C1": "C2", "C2": "C1"}, axis=1, inplace=True)
    _prompts["P(Claim 1)"] = _prompts["P(Claim 1)"].apply(lambda x: 1-x)
    prompts = pd.concat([prompts, _prompts])
    y = prompts["P(Claim 1)"].to_numpy()

    # train/test split
    perm = t.randperm(len(x))
    split_ix = int(split*len(x))
    train_ixs, test_ixs = perm[:split_ix], perm[split_ix:]
    x_train, x_test = x[train_ixs], x[test_ixs]
    y_train, y_test = y[train_ixs], y[test_ixs]
    
    # fit model (converting y as probabilities into class labels)
    lr = LogisticRegression(max_iter=10_000, n_jobs=-1)
    lr.fit(x_train, y_train*2)

    # obtain test predictions (converting back to probabilities)
    predictions = lr.predict(x_test) / 2

    return predictions, test_ixs

In [3]:
def collect_results(subset: str) -> pd.DataFrame:
    # load scores (direct-scoring)
    scores = pd.read_json(f"{path}/scores/{subset}feedback.jsonl", orient="records", lines=True)
    v2int = {"incorrect": -1, "misleading": 0, "correct": 1}
    scores["score"] = scores["verdict"].apply(lambda x: v2int[x])
    scores = scores[["claimID", "score"]]

    # load logits (pairs)
    logits = pd.read_json(f"{path}/logits/{subset}feedback.jsonl", orient="records", lines=True)
    logits["C1"] = logits["claimID"].apply(lambda x: x[0])
    logits["C2"] = logits["claimID"].apply(lambda x: x[1])
    logits.rename({"P(Claim 1)": "true"}, inplace=True, axis=1)
    logits.drop(columns=["claimID", "prompt"], inplace=True)
    _logits = logits.copy()
    _logits.rename({"C1": "C2", "C2": "C1", "p_s1": "p_s2", "p_s2": "p_s1"}, axis=1, inplace=True)
    _logits["true"] = logits["true"].apply(lambda x: 1-x)
    logits = pd.concat([logits, _logits])
    logits.rename({"p_s1": "logits"}, axis=1, inplace=True)
    logits.drop(columns=["p_s2"], inplace=True)

    # load probe classifications (ppairs)
    probe_predictions, probe_ixs = train_probe(subset, 0.5) 

    # collect results
    results = pd.DataFrame(columns=["C1", "C2", "true", "direct-scoring", "logits", "probe"])
    for i in trange(len(logits), desc="collecting results"):

        # ground-truth
        logit_row = logits.iloc[i]
        gt = logit_row["true"]

        # direct-scoring
        id1, id2 = logit_row[["C1", "C2"]].values
        c1, c2 = scores.loc[scores["claimID"] == id1, "score"], scores.loc[scores["claimID"] == id2, "score"]
        if len(c1) == 0 or len(c2) == 0: 
            ds = np.nan
        else:
            c1, c2 = c1.item(), c2.item()
            ds = 0.5
            if c1 > c2: ds = 1.
            if c1 < c2: ds = 0.

        # logits
        lgt = logit_row["logits"]

        # probe
        if i in probe_ixs: 
            ix = (probe_ixs == i).nonzero(as_tuple=True)[0].item()
            prb = probe_predictions[ix].item()
        else: prb = np.nan

        results.loc[len(results)] = [id1, id2, gt, ds, lgt, prb]

    return results.dropna()

In [4]:
accuracy = pd.DataFrame(columns=["direct-scoring", "logits", "probe"])
for subset in ["climate", "health"]:
    results = collect_results(subset)
    # calculate accuracies
    a = {}
    for c in ["direct-scoring", "logits", "probe"]:
        a[c] = ((results[c] > 0.5) == (results["true"] > 0.5)).mean().item()
    accuracy.loc[subset] = [a["direct-scoring"], a["logits"], a["probe"]]

collecting results:   0%|          | 0/19800 [00:00<?, ?it/s]

collecting results:   0%|          | 0/19800 [00:00<?, ?it/s]

In [5]:
accuracy

Unnamed: 0,direct-scoring,logits,probe
climate,0.823434,0.584646,0.995657
health,0.860707,0.56,0.998889
