In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mutual_info_score
from scipy.special import kl_div
from functools import reduce
import glob
from tqdm import tqdm
tqdm.pandas()
import warnings
from datasets import load_dataset
import os
from scipy.stats import entropy

warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [4]:
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Baseline 1: Ensemble of Adapters

In [5]:
reward_files = glob.glob('/users/lucelo/UQLRM/metadata_single_mlp_*.tsv')
reward_dfs = []
for file in reward_files:
    df = pd.read_csv(file, sep='\t', header=0)[['id', 'RewardScore', 'Dataset', 'Preference']]
    chosen_df = df[df['Preference'] == 'chosen'].rename(columns={'RewardScore': 'RewardChosen'})
    rejected_df = df[df['Preference'] == 'rejected'].rename(columns={'RewardScore': 'RewardRejected'})
    merged_df = chosen_df.merge(rejected_df, on=['id', 'Dataset'], how='inner')
    merged_df['RewardDiff'] = merged_df['RewardChosen'] - merged_df['RewardRejected']
    merged_df['PreferenceProb'] = sigmoid(merged_df['RewardDiff'])
    reward_dfs.append(merged_df)

for i, df in enumerate(reward_dfs):
    # Add unique suffixes to the column names
    df.columns = [f"{col}_{i}" if col != 'id' else col for col in df.columns]

final_rewards =  pd.concat(reward_dfs, axis=1)
N = len(reward_dfs)

In [6]:
def compute_rw_distinct_pairs(final_rewards, N):
    pairs = [(i, j) for i in range(N) for j in range(N)]
    def compute_rw_diff_pairs(x):
        # np.random.shuffle(pairs)
        # pairs = pairs[:N]
        diffs = []
        for pair in pairs:
            i,j = pair
            diffs.append(x[f'RewardChosen_{i}'] - x[f'RewardRejected_{j}'])
        return np.var(diffs)
        
    return final_rewards.progress_apply(lambda x: compute_rw_diff_pairs(x), axis=1)

final_rewards['RwDiffDistinctPairs'] = compute_rw_distinct_pairs(final_rewards, len(reward_dfs))

100%|█████████████████████████████████████████████████████████████████████████████████████████| 16123/16123 [02:19<00:00, 115.86it/s]


In [7]:
def compute_reward_covariance(final_rewards):
    covariances = []
    for index, row in final_rewards.iterrows():
        rejected = [row[f'RewardRejected_{n}'] for n in range(len(reward_dfs))]
        chosen = [row[f'RewardChosen_{i}'] for i in range(len(reward_dfs))]
        covariance = np.cov(rejected, chosen)[0][1]
        covariances.append(covariance)
    return covariances

final_rewards['Covariance'] = compute_reward_covariance(final_rewards)

In [8]:
def compute_unc_stats(final_rewards):
    # 1. Reward Statistics
    final_rewards['RwAverage'] = final_rewards.filter(like="Reward", axis=1).mean(axis=1)
    final_rewards['RwVariance'] = final_rewards.filter(like="Reward", axis=1).var(axis=1)
    
    # 2. "Chosen" Reward Statistics
    final_rewards['RwChosenAverage'] = final_rewards.filter(like="RewardChosen", axis=1).mean(axis=1)
    final_rewards['RwChosenVariance'] = final_rewards.filter(like="RewardChosen", axis=1).var(axis=1)
    
    # 3. "Rejected" Reward Statistics
    final_rewards['RwRejectedAverage'] = final_rewards.filter(like="RewardRejected", axis=1).mean(axis=1)
    final_rewards['RwRejectedVariance'] = final_rewards.filter(like="RewardRejected", axis=1).var(axis=1)
    
    # 4. Reward Diff (r_chosen - r_rejected) Statistics
    final_rewards['RwDiffAverage'] = final_rewards.filter(like="RewardDiff", axis=1).mean(axis=1)
    final_rewards['RwDiffVariance'] = final_rewards.filter(like="RewardDiff", axis=1).var(axis=1)
    
    # 5. Variance Sum = Var(r_chosen) + Var(r_rejected)
    final_rewards['RwVarianceSum'] = final_rewards['RwChosenVariance'] + final_rewards['RwRejectedVariance']
    
    # 6. Var(r_chosen, r_rejected) = Var(r_chosen) - Var(r_rejected) - 2*Cov(r_chosen, r_rejected)
    # This is an analytical version of Variance computed in #4
    final_rewards['RwDiffAnalyticalVariance'] = final_rewards['RwChosenVariance'] + final_rewards['RwRejectedVariance'] - 2*final_rewards['Covariance']
    
    # 7. Var(p), p = sigmoid(r_chosen - r_rejected) (Preference Probability)
    final_rewards['PrefProbVariance'] = final_rewards.filter(like="PreferenceProb", axis=1).var(axis=1)
    final_rewards['PrefProbAverage'] = final_rewards.filter(like="PreferenceProb", axis=1).mean(axis=1)
    
    # 8. Max Variance = max(Var(r_chosen), Var(r_rejected))
    final_rewards['RewardMaxVariance'] = final_rewards[['RwChosenVariance', 'RwRejectedVariance']].max(axis=1)

In [9]:
compute_unc_stats(final_rewards)

In [10]:
final_rewards.head()

Unnamed: 0,id,RewardChosen_0,Dataset_0,Preference_x_0,RewardRejected_0,Preference_y_0,RewardDiff_0,PreferenceProb_0,id.1,RewardChosen_1,...,RwChosenVariance,RwRejectedAverage,RwRejectedVariance,RwDiffAverage,RwDiffVariance,RwVarianceSum,RwDiffAnalyticalVariance,PrefProbVariance,PrefProbAverage,RewardMaxVariance
0,1801,-1.021655,train,chosen,-1.308317,rejected,0.286662,0.571179,1801,-1.198608,...,0.018304,-1.222914,0.024675,0.259599,0.029189,0.04298,0.029189,0.001746,0.564097,0.024675
1,87053,1.413,train,chosen,-0.526894,rejected,1.939894,0.87434,87053,1.505481,...,0.018581,-0.579809,0.024868,1.808733,0.019871,0.04345,0.019871,0.000291,0.858371,0.024868
2,59149,-0.10488,train,chosen,-0.680847,rejected,0.575967,0.640139,59149,0.134865,...,0.02196,-0.488183,0.020723,0.620937,0.018558,0.042683,0.018558,0.000957,0.649819,0.02196
3,20080,1.230862,train,chosen,-2.328661,rejected,3.559523,0.972335,20080,1.231286,...,0.016546,-2.121717,0.018112,3.213534,0.030726,0.034659,0.030726,4.7e-05,0.960816,0.018112
4,72323,0.448723,train,chosen,-0.17444,rejected,0.623163,0.650938,72323,0.188774,...,0.026763,-0.180517,0.026328,0.469449,0.035856,0.05309,0.035856,0.00201,0.614342,0.026763


In [11]:
def compute_ce(final_rewards):
    final_rewards['Error'] = (final_rewards['PrefProbAverage'] < 0.5) * 1.0
    final_rewards['GT'] = 1.0
    final_rewards['CrossEntropy'] = kl_div(final_rewards['GT'], final_rewards['PrefProbAverage'])

In [12]:
compute_ce(final_rewards)

In [13]:
final_rewards.head()

Unnamed: 0,id,RewardChosen_0,Dataset_0,Preference_x_0,RewardRejected_0,Preference_y_0,RewardDiff_0,PreferenceProb_0,id.1,RewardChosen_1,...,RwDiffAverage,RwDiffVariance,RwVarianceSum,RwDiffAnalyticalVariance,PrefProbVariance,PrefProbAverage,RewardMaxVariance,Error,GT,CrossEntropy
0,1801,-1.021655,train,chosen,-1.308317,rejected,0.286662,0.571179,1801,-1.198608,...,0.259599,0.029189,0.04298,0.029189,0.001746,0.564097,0.024675,0.0,1.0,0.136626
1,87053,1.413,train,chosen,-0.526894,rejected,1.939894,0.87434,87053,1.505481,...,1.808733,0.019871,0.04345,0.019871,0.000291,0.858371,0.024868,0.0,1.0,0.01109
2,59149,-0.10488,train,chosen,-0.680847,rejected,0.575967,0.640139,59149,0.134865,...,0.620937,0.018558,0.042683,0.018558,0.000957,0.649819,0.02196,0.0,1.0,0.08088
3,20080,1.230862,train,chosen,-2.328661,rejected,3.559523,0.972335,20080,1.231286,...,3.213534,0.030726,0.034659,0.030726,4.7e-05,0.960816,0.018112,0.0,1.0,0.000788
4,72323,0.448723,train,chosen,-0.17444,rejected,0.623163,0.650938,72323,0.188774,...,0.469449,0.035856,0.05309,0.035856,0.00201,0.614342,0.026763,0.0,1.0,0.101546


In [12]:
def split_dataset(final_rewards, dataset_column):
    train_df = final_rewards[final_rewards[dataset_column] == 'train']
    test_df = final_rewards[final_rewards[dataset_column] == 'test']
    eval_df = final_rewards[final_rewards[dataset_column] == 'eval']
    ood_df = final_rewards[final_rewards[dataset_column] == 'ood']
    return train_df, test_df, eval_df, ood_df

In [14]:
train_df, test_df, eval_df, ood_df = split_dataset(final_rewards, "Dataset_0")

In [15]:
def compute_quantiles(train_df):
    chosen_p5 = train_df['RwChosenAverage'].quantile(0.05)
    chosen_p95 = train_df['RwChosenAverage'].quantile(0.95)
    
    rej_p5 = train_df['RwRejectedAverage'].quantile(0.05)
    rej_p95 = train_df['RwRejectedAverage'].quantile(0.95)
    
    print(chosen_p5, chosen_p95)
    print(rej_p5, rej_p95)
    return chosen_p5, chosen_p95, rej_p5, rej_p95

def compute_outlier_unc(final_rewards, chosen_p5, chosen_p95, rej_p5, rej_p95):
    # Heuristic: Get p5 and p95 in reward distributions (aka GDA)
    final_rewards['TooHighOrTooLow'] = ((final_rewards['RwChosenAverage'] > chosen_p95) | (final_rewards['RwChosenAverage'] < chosen_p5) | (final_rewards['RwRejectedAverage'] < rej_p5) | (final_rewards['RwRejectedAverage'] > rej_p95)) * 1.0

chosen_p5, chosen_p95, rej_p5, rej_p95 = compute_quantiles(train_df)
compute_outlier_unc(final_rewards, chosen_p5, chosen_p95, rej_p5, rej_p95)

-1.6882672519297213 1.7828431959087783
-2.311974043781693 1.3708552275155041


In [16]:
final_rewards.head()

Unnamed: 0,id,RewardChosen_0,Dataset_0,Preference_x_0,RewardRejected_0,Preference_y_0,RewardDiff_0,PreferenceProb_0,id.1,RewardChosen_1,...,RwDiffVariance,RwVarianceSum,RwDiffAnalyticalVariance,PrefProbVariance,PrefProbAverage,RewardMaxVariance,Error,GT,CrossEntropy,TooHighOrTooLow
0,1801,-1.021655,train,chosen,-1.308317,rejected,0.286662,0.571179,1801,-1.198608,...,0.029189,0.04298,0.029189,0.001746,0.564097,0.024675,0.0,1.0,0.136626,0.0
1,87053,1.413,train,chosen,-0.526894,rejected,1.939894,0.87434,87053,1.505481,...,0.019871,0.04345,0.019871,0.000291,0.858371,0.024868,0.0,1.0,0.01109,0.0
2,59149,-0.10488,train,chosen,-0.680847,rejected,0.575967,0.640139,59149,0.134865,...,0.018558,0.042683,0.018558,0.000957,0.649819,0.02196,0.0,1.0,0.08088,0.0
3,20080,1.230862,train,chosen,-2.328661,rejected,3.559523,0.972335,20080,1.231286,...,0.030726,0.034659,0.030726,4.7e-05,0.960816,0.018112,0.0,1.0,0.000788,0.0
4,72323,0.448723,train,chosen,-0.17444,rejected,0.623163,0.650938,72323,0.188774,...,0.035856,0.05309,0.035856,0.00201,0.614342,0.026763,0.0,1.0,0.101546,0.0


In [17]:
train_df, test_df, eval_df, ood_df = split_dataset(final_rewards, "Dataset_0")
# Compute Variances 
def compute_stats_ce_correlation(df, mode="train", ensemble=True):
    print(f"Correlation Between Var(r1) + Var(r2) and Cross Entropy for {mode}: {df['RwVarianceSum'].corr(df['CrossEntropy'], method='spearman')}")
    print(f"Correlation Between Var(r1-r2) and Cross Entropy for {mode}: {df['RwDiffVariance'].corr(df['CrossEntropy'], method='spearman')}")
    print(f"Correlation Between Var(p) and Cross Entropy for {mode}: {df['PrefProbVariance'].corr(df['CrossEntropy'], method='spearman')}")
    print(f"Correlation Between max(Var(r1), Var(r2)) and Cross Entropy for {mode}: {df['RewardMaxVariance'].corr(df['CrossEntropy'], method='spearman')}")
    if ensemble:
        print(f"Correlation Between Var(r1-r2, *) and Cross Entropy for {mode}: {df['RwDiffDistinctPairs'].corr(df['CrossEntropy'], method='spearman')}")
        print(f"Correlation Between TooHighTooLow and Cross Entropy for {mode}: {df['TooHighOrTooLow'].corr(df['CrossEntropy'], method='spearman')}")
    
print("Train Dataset")
compute_stats_ce_correlation(train_df, "train")
print("Test Dataset")
compute_stats_ce_correlation(test_df, "test")
print("Eval Dataset")
compute_stats_ce_correlation(eval_df, "eval")
print("OOD Dataset")
compute_stats_ce_correlation(ood_df, "ood")

Train Dataset
Correlation Between Var(r1) + Var(r2) and Cross Entropy for train: 0.04154004075548281
Correlation Between Var(r1-r2) and Cross Entropy for train: -0.09365878075551694
Correlation Between Var(p) and Cross Entropy for train: 0.4840945267028453
Correlation Between max(Var(r1), Var(r2)) and Cross Entropy for train: -0.013535195798154634
Correlation Between Var(r1-r2, *) and Cross Entropy for train: 0.0415400321483095
Correlation Between TooHighTooLow and Cross Entropy for train: 0.03256358691461575
Test Dataset
Correlation Between Var(r1) + Var(r2) and Cross Entropy for test: 0.01696573905824865
Correlation Between Var(r1-r2) and Cross Entropy for test: -0.10864997466387988
Correlation Between Var(p) and Cross Entropy for test: 0.322236670433129
Correlation Between max(Var(r1), Var(r2)) and Cross Entropy for test: -0.03576840148701882
Correlation Between Var(r1-r2, *) and Cross Entropy for test: 0.016966621773111475
Correlation Between TooHighTooLow and Cross Entropy for tes

# Baseline 2: Variational Inference

In [18]:
vpo_df = pd.read_csv('/users/lucelo/UQLRM/metadata_vpo.tsv', sep='\t', header=0)

In [19]:
chosen_df = vpo_df[vpo_df['Preference'] == 'chosen'].rename(columns={'RewardScoreMean': 'RewardChosenMean', 'RewardScoreVar': 'RewardChosenVar'})
rejected_df = vpo_df[vpo_df['Preference'] == 'rejected'].rename(columns={'RewardScoreMean': 'RewardRejectedMean', 'RewardScoreVar': 'RewardRejectedVar'})
final_vpo_df = chosen_df.merge(rejected_df, on=['id', 'Dataset'], how='inner')
final_vpo_df['RewardDiff'] = final_vpo_df['RewardChosenMean'] - final_vpo_df['RewardRejectedMean']
final_vpo_df['PrefProbAverage'] = sigmoid(final_vpo_df['RewardDiff'])

In [20]:
def compute_rw_diff_var(x):
    chosen_mu = x['RewardChosenMean']
    chosen_var = x['RewardChosenVar']
    chosen_points = np.random.normal(chosen_mu, np.sqrt(chosen_var), 1000)

    rejected_mu = x['RewardRejectedMean']
    rejected_var = x['RewardRejectedVar']
    rejected_points = np.random.normal(rejected_mu, np.sqrt(rejected_var), 1000)

    diff = chosen_points - rejected_points
    probs = sigmoid(diff)
    return pd.Series([np.mean(diff), np.var(diff), np.mean(probs), np.var(probs)])

In [21]:
final_vpo_df.head()

Unnamed: 0,Preference_x,RewardChosenMean,RewardChosenVar,Model_x,Dataset,id,Preference_y,RewardRejectedMean,RewardRejectedVar,Model_y,RewardDiff,PrefProbAverage
0,chosen,0.217773,0.474215,vpo,train,0,rejected,-0.398438,0.459625,vpo,0.616211,0.649356
1,chosen,-1.054688,0.461424,vpo,train,1,rejected,0.12207,0.474215,vpo,-1.176758,0.235636
2,chosen,-0.306641,0.450735,vpo,train,2,rejected,-0.314453,0.457833,vpo,0.007812,0.501953
3,chosen,-1.054688,0.461424,vpo,train,3,rejected,-0.445312,0.447228,vpo,-0.609375,0.352202
4,chosen,-1.054688,0.461424,vpo,train,4,rejected,-0.453125,0.45427,vpo,-0.601562,0.353986


In [22]:
def compute_unc_stats_vi(vpo_df):
    # 4. Reward Diff (r_chosen - r_rejected) Statistics
    vpo_df[['RwDiffAverage', 'RwDiffVariance', 'PrefProbAverage', 'PrefProbVariance']] = vpo_df.apply(lambda x: compute_rw_diff_var(x), axis=1)
    
    # 5. Variance Sum = Var(r_chosen) + Var(r_rejected)
    vpo_df['RwVarianceSum'] = vpo_df['RewardChosenVar'] + vpo_df['RewardRejectedVar']
    
    # 8. Max Variance = max(Var(r_chosen), Var(r_rejected))
    vpo_df['RewardMaxVariance'] = vpo_df[['RewardChosenVar', 'RewardRejectedVar']].max(axis=1)

compute_unc_stats_vi(final_vpo_df)

In [23]:
final_vpo_df.head()

Unnamed: 0,Preference_x,RewardChosenMean,RewardChosenVar,Model_x,Dataset,id,Preference_y,RewardRejectedMean,RewardRejectedVar,Model_y,RewardDiff,PrefProbAverage,RwDiffAverage,RwDiffVariance,PrefProbVariance,RwVarianceSum,RewardMaxVariance
0,chosen,0.217773,0.474215,vpo,train,0,rejected,-0.398438,0.459625,vpo,0.616211,0.632254,0.641563,0.88641,0.035553,0.933841,0.474215
1,chosen,-1.054688,0.461424,vpo,train,1,rejected,0.12207,0.474215,vpo,-1.176758,0.271053,-1.177549,0.938799,0.028478,0.93564,0.474215
2,chosen,-0.306641,0.450735,vpo,train,2,rejected,-0.314453,0.457833,vpo,0.007812,0.495338,-0.02188,0.895317,0.040108,0.908569,0.457833
3,chosen,-1.054688,0.461424,vpo,train,3,rejected,-0.445312,0.447228,vpo,-0.609375,0.375462,-0.606849,0.933827,0.037283,0.908652,0.461424
4,chosen,-1.054688,0.461424,vpo,train,4,rejected,-0.453125,0.45427,vpo,-0.601562,0.376867,-0.590886,0.881872,0.036079,0.915695,0.461424


In [24]:
compute_ce(final_vpo_df)
vpo_train_df, vpo_test_df, vpo_eval_df, vpo_ood_df = split_dataset(final_vpo_df, "Dataset")

In [25]:
print("Train Dataset")
compute_stats_ce_correlation(vpo_train_df, "train", ensemble=False)
print("Test Dataset")
compute_stats_ce_correlation(vpo_test_df, "test", ensemble=False)
print("Eval Dataset")
compute_stats_ce_correlation(vpo_eval_df, "eval", ensemble=False)
print("OOD Dataset")
compute_stats_ce_correlation(vpo_ood_df, "ood", ensemble=False)

Train Dataset
Correlation Between Var(r1) + Var(r2) and Cross Entropy for train: -0.07272064067155334
Correlation Between Var(r1-r2) and Cross Entropy for train: -0.03716236530827557
Correlation Between Var(p) and Cross Entropy for train: 0.6614296494944618
Correlation Between max(Var(r1), Var(r2)) and Cross Entropy for train: -0.09124672520223277
Test Dataset
Correlation Between Var(r1) + Var(r2) and Cross Entropy for test: -0.002000714114166764
Correlation Between Var(r1-r2) and Cross Entropy for test: 0.0030218330206425678
Correlation Between Var(p) and Cross Entropy for test: 0.47892330221793167
Correlation Between max(Var(r1), Var(r2)) and Cross Entropy for test: -0.023795413147461395
Eval Dataset
Correlation Between Var(r1) + Var(r2) and Cross Entropy for eval: -0.03229438222276551
Correlation Between Var(r1-r2) and Cross Entropy for eval: -0.009877164612702015
Correlation Between Var(p) and Cross Entropy for eval: 0.45251092137986665
Correlation Between max(Var(r1), Var(r2)) and

# Baseline 3: Finetuned Ensembles

In [26]:
def compute_entropy(df):
    return df.apply(lambda row: entropy(row), axis=1)
    
def compute_uncertanties(dfs):
    # Compute single model entropies
    for df in dfs:
        df['entropy'] = compute_entropy(df[['First', 'Second']])
    
    
    for i, df in enumerate(dfs):
        # Add unique suffixes to the column names
        df.columns = [f"{col}_{i}" if col != 'id' else col for col in df.columns]

    # Use reduce to merge all dataframes
    from functools import reduce
    final_df = reduce(lambda left,right: pd.merge(left,right,on='id'), dfs)

    first_cols = [col for col in final_df.columns if 'First_' in col]
    second_cols = [col for col in final_df.columns if 'Second_' in col]
    entropy_cols = [col for col in final_df.columns if 'entropy_' in col]

    avg_first = final_df[first_cols].mean(axis=1)
    avg_second = final_df[second_cols].mean(axis=1)
    avg_entropy = final_df[entropy_cols].mean(axis=1)
    var_first = final_df[first_cols].var(axis=1)
    avg_df = pd.concat([avg_first, avg_second, avg_entropy, var_first], axis=1)
    avg_df.columns = ['First', 'Second', 'Aleatoric Uncertainty', 'Variance']

    
    avg_df['Predictive Uncertainty'] = compute_entropy(avg_df[['First', 'Second']])
    avg_df['Epistemic Uncertainty'] = avg_df['Predictive Uncertainty'] - avg_df['Aleatoric Uncertainty']
    return avg_df['Epistemic Uncertainty'], avg_df['Predictive Uncertainty'], avg_df['Aleatoric Uncertainty'], avg_df[['First', 'Second']], avg_df['Variance']

In [27]:
!huggingface-cli login --token $HUGGINGFACE_WRITETOKEN
def load_predictions(exp_prefix, name, checkpoint, mode, ensemble_size, active_learning=False):
    ensemble_df = []
    for j in range(ensemble_size):
        if active_learning:
            datafile = os.path.join(exp_prefix, f"{name}", "predictions", f"{name}_{j}", f"checkpoint-{i}", f"eval_{mode}", "predictions.csv")
        else:
            datafile = os.path.join(exp_prefix, f"{name}_{j}", f"{name}_{j}", f"checkpoint-{i}", f"eval_{mode}", "predictions.csv")
        df = load_dataset("luckeciano/uqlrm_predictions", data_files=datafile)['train'].to_pandas()
        ensemble_df.append(df)


    print(f"Number of ensemble predictions loaded: {len(ensemble_df)}")
    epistemic, predictive, aleatoric, ens_predictions, var_predictions = compute_uncertanties(ensemble_df)
    return ens_predictions, var_predictions
    
exp_prefix = "scratch/lucelo/sft/results/"
name = "gpt2_rwft_reddit_1"
i = 80
train_ens_preds, train_var_preds = load_predictions(exp_prefix, name, i, "train", 8)
test_ens_preds, test_var_preds = load_predictions(exp_prefix, name, i, "test", 8)
eval_ens_preds, eval_var_preds = load_predictions(exp_prefix, name, i, "eval", 8)
ood_ens_preds, ood_var_preds = load_predictions(exp_prefix, name, i, "ood", 8)
    

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /scratch-ssd/oatml/huggingface/token
Login successful
Number of ensemble predictions loaded: 8
Number of ensemble predictions loaded: 8
Number of ensemble predictions loaded: 8
Number of ensemble predictions loaded: 8


In [28]:
def compute_ce_ens(ens_preds, var_preds):
    finetune_ens_df = pd.DataFrame()
    finetune_ens_df['PrefProbAverage'] = ens_preds['First']
    finetune_ens_df['PrefProbVariance'] = var_preds
    finetune_ens_df['Error'] = (finetune_ens_df['PrefProbAverage'] < 0.5) * 1.0
    finetune_ens_df['GT'] = 1.0
    finetune_ens_df['CrossEntropy'] = kl_div(finetune_ens_df['GT'], finetune_ens_df['PrefProbAverage'])
    return finetune_ens_df

In [29]:
train_df = compute_ce_ens(train_ens_preds, train_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {train_df['PrefProbVariance'].corr(train_df['CrossEntropy'], method='spearman')}")

test_df = compute_ce_ens(test_ens_preds, test_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {test_df['PrefProbVariance'].corr(test_df['CrossEntropy'], method='spearman')}")

eval_df = compute_ce_ens(eval_ens_preds, eval_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {eval_df['PrefProbVariance'].corr(eval_df['CrossEntropy'], method='spearman')}")

ood_df = compute_ce_ens(ood_ens_preds, ood_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {ood_df['PrefProbVariance'].corr(ood_df['CrossEntropy'], method='spearman')}")

Correlation Between Var(p) and Cross Entropy for Training: 0.056321783544015507
Correlation Between Var(p) and Cross Entropy for Training: 0.07439719985948799
Correlation Between Var(p) and Cross Entropy for Training: 0.05166463652945645
Correlation Between Var(p) and Cross Entropy for Training: -0.07087779592929311


In [30]:
exp_prefix = "scratch/lucelo/active_learning/results/"
name = "al_ep_v11_3"
i = 60
train_ens_preds, train_var_preds = load_predictions(exp_prefix, name, i, "train", 8, active_learning=True)
test_ens_preds, test_var_preds = load_predictions(exp_prefix, name, i, "test", 8, active_learning=True)
eval_ens_preds, eval_var_preds = load_predictions(exp_prefix, name, i, "eval", 8, active_learning=True)
ood_ens_preds, ood_var_preds = load_predictions(exp_prefix, name, i, "ood", 8, active_learning=True)

Number of ensemble predictions loaded: 8
Number of ensemble predictions loaded: 8
Number of ensemble predictions loaded: 8
Number of ensemble predictions loaded: 8


In [31]:
train_df = compute_ce_ens(train_ens_preds, train_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {train_df['PrefProbVariance'].corr(train_df['CrossEntropy'], method='spearman')}")

test_df = compute_ce_ens(test_ens_preds, test_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {test_df['PrefProbVariance'].corr(test_df['CrossEntropy'], method='spearman')}")

eval_df = compute_ce_ens(eval_ens_preds, eval_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {eval_df['PrefProbVariance'].corr(eval_df['CrossEntropy'], method='spearman')}")

ood_df = compute_ce_ens(ood_ens_preds, ood_var_preds)
print(f"Correlation Between Var(p) and Cross Entropy for Training: {ood_df['PrefProbVariance'].corr(ood_df['CrossEntropy'], method='spearman')}")

Correlation Between Var(p) and Cross Entropy for Training: -0.09131798391076847
Correlation Between Var(p) and Cross Entropy for Training: -0.014102754498991945
Correlation Between Var(p) and Cross Entropy for Training: 0.006720589136532155
Correlation Between Var(p) and Cross Entropy for Training: -0.11646054855009284


# Baseline 4: Finetuned Ensembles with Different Seeds

In [5]:
def generate_rw_set(filepath):
    df = pd.read_csv(filepath, sep='\t', header=0)[['id', 'RewardScore', 'Dataset', 'Preference']]
    chosen_df = df[df['Preference'] == 'chosen'].rename(columns={'RewardScore': 'RewardChosen'})
    rejected_df = df[df['Preference'] == 'rejected'].rename(columns={'RewardScore': 'RewardRejected'})
    merged_df = chosen_df.merge(rejected_df, on=['id', 'Dataset'], how='inner')
    merged_df['RewardDiff'] = merged_df['RewardChosen'] - merged_df['RewardRejected']
    merged_df['PreferenceProb'] = sigmoid(merged_df['RewardDiff'])
    return merged_df

In [22]:
gpt2xl_df = generate_rw_set('/users/lucelo/UQLRM/uqlrm/scripts/slurm/metadata_gpt2xl-infer_.tsv')[['id', 'Dataset', 'RewardChosen', 'RewardRejected', 'RewardDiff', 'PreferenceProb']]
gpt2xl_df.columns = [f"{col}_gpt2xl" if (col != 'id' and col != 'Dataset') else col for col in gpt2xl_df.columns]
llama_df = generate_rw_set('/users/lucelo/UQLRM/uqlrm/scripts/slurm/metadata_llama_rw_infer_v0_.tsv')[['id', 'Dataset', 'RewardChosen', 'RewardRejected', 'RewardDiff', 'PreferenceProb']]
llama_df.columns = [f"{col}_llama" if (col != 'id' and col != 'Dataset') else col for col in llama_df.columns]
gpt2_df = generate_rw_set('/users/lucelo/UQLRM/metadata_gpt2-after-reward-modeling.tsv')[['id', 'Dataset', 'RewardChosen', 'RewardRejected', 'RewardDiff', 'PreferenceProb']]
gpt2_df.columns = [f"{col}_gpt2" if (col != 'id' and col != 'Dataset') else col for col in gpt2_df.columns]
hermes_df = generate_rw_set('/users/lucelo/UQLRM/metadata_single_mlp_0.tsv')[['id', 'Dataset', 'RewardChosen', 'RewardRejected', 'RewardDiff', 'PreferenceProb']]
hermes_df.columns = [f"{col}_hermes" if (col != 'id' and col != 'Dataset') else col for col in hermes_df.columns]

In [28]:
joined_df = pd.merge(gpt2xl_df, llama_df, on=['id', 'Dataset'], how='inner')
# joined_df = pd.merge(joined_df, gpt2_df, on=['id', 'Dataset'], how='inner')
joined_df = pd.merge(joined_df, hermes_df, on=['id', 'Dataset'], how='inner')
joined_df.head()

Unnamed: 0,id,Dataset,RewardChosen_gpt2xl,RewardRejected_gpt2xl,RewardDiff_gpt2xl,PreferenceProb_gpt2xl,RewardChosen_llama,RewardRejected_llama,RewardDiff_llama,PreferenceProb_llama,RewardChosen_hermes,RewardRejected_hermes,RewardDiff_hermes,PreferenceProb_hermes
0,1801,train,2.375,2.578125,-0.203125,0.449393,1.046875,-0.21875,1.265625,0.779993,-1.021655,-1.308317,0.286662,0.571179
1,87053,train,1.539062,0.636719,0.902344,0.711431,2.890625,0.507812,2.382812,0.915507,1.413,-0.526894,1.939894,0.87434
2,59149,train,1.953125,1.117188,0.835938,0.697609,-0.628906,-0.59375,-0.035156,0.491212,-0.10488,-0.680847,0.575967,0.640139
3,20080,train,1.515625,-0.351562,1.867188,0.866133,1.75,0.482422,1.267578,0.780328,1.230862,-2.328661,3.559523,0.972335
4,72323,train,0.960938,0.507812,0.453125,0.611382,2.21875,1.554688,0.664062,0.660172,0.448723,-0.17444,0.623163,0.650938


In [29]:
joined_df['PrefProbVariance'] = joined_df.filter(like="PreferenceProb", axis=1).var(axis=1)
joined_df['PrefProbAverage'] = joined_df.filter(like="PreferenceProb", axis=1).mean(axis=1)
joined_df['RwDiffVariance'] = joined_df.filter(like="RewardDiff", axis=1).var(axis=1)
joined_df['GT'] = 1.0
joined_df['Error'] = (joined_df['PrefProbAverage'] < 0.5) * 1.0
joined_df['CrossEntropy'] = kl_div(joined_df['GT'], joined_df['PrefProbAverage'])

In [30]:
joined_df.head()

Unnamed: 0,id,Dataset,RewardChosen_gpt2xl,RewardRejected_gpt2xl,RewardDiff_gpt2xl,PreferenceProb_gpt2xl,RewardChosen_llama,RewardRejected_llama,RewardDiff_llama,PreferenceProb_llama,RewardChosen_hermes,RewardRejected_hermes,RewardDiff_hermes,PreferenceProb_hermes,PrefProbVariance,PrefProbAverage,RwDiffVariance,GT,Error,CrossEntropy
0,1801,train,2.375,2.578125,-0.203125,0.449393,1.046875,-0.21875,1.265625,0.779993,-1.021655,-1.308317,0.286662,0.571179,0.027955,0.600188,0.559248,1.0,0.0,0.1107
1,87053,train,1.539062,0.636719,0.902344,0.711431,2.890625,0.507812,2.382812,0.915507,1.413,-0.526894,1.939894,0.87434,0.011647,0.83376,0.577412,1.0,0.0,0.01557
2,59149,train,1.953125,1.117188,0.835938,0.697609,-0.628906,-0.59375,-0.035156,0.491212,-0.10488,-0.680847,0.575967,0.640139,0.011347,0.609653,0.199977,1.0,0.0,0.104518
3,20080,train,1.515625,-0.351562,1.867188,0.866133,1.75,0.482422,1.267578,0.780328,1.230862,-2.328661,3.559523,0.972335,0.009251,0.872932,1.412757,1.0,0.0,0.00883
4,72323,train,0.960938,0.507812,0.453125,0.611382,2.21875,1.554688,0.664062,0.660172,0.448723,-0.17444,0.623163,0.650938,0.000672,0.640831,0.012513,1.0,0.0,0.085821


In [31]:
train_df, test_df, eval_df, ood_df = split_dataset(joined_df, "Dataset")

In [32]:
print(f"Correlation Between Var(p) and Cross Entropy for Training: {train_df['PrefProbVariance'].corr(train_df['CrossEntropy'], method='spearman')}")

print(f"Correlation Between Var(p) and Cross Entropy for OOD: {ood_df['PrefProbVariance'].corr(ood_df['CrossEntropy'], method='spearman')}")

Correlation Between Var(p) and Cross Entropy for Training: 0.23106520440146183
Correlation Between Var(p) and Cross Entropy for OOD: 0.05782040031539965
