In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from scipy.stats import kendalltau, bootstrap
from src.load_huggingface import load_huggingface as load_huggingface

# Output locations are resolved relative to the current working directory.
ROOT_DIR = os.path.abspath('.')
# Every plt.savefig call below writes into this folder.
# NOTE(review): no visible cell creates this directory - confirm it exists before running.
FIGURES_DIR = os.path.join(ROOT_DIR, 'figures')

In [None]:
# --- Run configuration -------------------------------------------------------
# Active configuration: Anthropic HH-RLHF, test split.
dataset_name = "Anthropic/hh-rlhf"
split = 'test'
# CSV file(s) holding the principle-specific preference scores.
principle_pref_files = ['principle_specific_preferences/hh_test_seed_4ob.csv']
# Column used as the reference annotator when ranking and comparing annotators.
main_a_pref = 'a_pref'

# Alternative configuration: PKU-SafeRLHF. Uncomment the four lines below (and
# comment out the four above). The "-single-dimension" variant is loaded in the
# dataset-loading cell further down.
# dataset_name = "PKU-Alignment/PKU-SafeRLHF"  # We will load -single-dimension later down
# split = 'test'
# principle_pref_files = ['principle_specific_preferences/pku_test_4ob.csv', 'principle_specific_preferences/pku_single_test_4ob.csv']
# main_a_pref = 'a_pref_single'


# We use short descriptions of principles here for visualization purposes.
# See 'principles/ccai_seed.txt' for actual texts.
principles_file = 'principles/ccai_seed_short.txt'

# Data loading & preprocessing


### Load dataset

In [None]:
# Load the preference dataset together with the principles read from `principles_file`.
dataset, principles = load_huggingface(dataset_name, principles_name=principles_file, split=split)
# Row ids are handled as strings so they can be suffixed and matched against the CSV files.
dataset['row_id'] = dataset['row_id'].astype(str)
dataset = dataset.set_index('row_id')
# Unified column names: x = prompt, y_1 = chosen response, y_0 = rejected response.
dataset = dataset.rename(columns={'prompt': 'x', 'rejected': 'y_0', 'chosen': 'y_1'})
# Text after the last '/' of the dataset name; used in output file names.
data_suff = dataset_name.split('/')[-1]

if dataset_name == "PKU-Alignment/PKU-SafeRLHF":
    # Append the single-dimension variant. Its row ids get a '_single' suffix so
    # that they stay distinct from the ids of the main dataset after concatenation.
    single_dim_dataset = load_huggingface("PKU-Alignment/PKU-SafeRLHF-single-dimension", split=split)[0]
    single_dim_dataset['row_id'] = single_dim_dataset['row_id'].astype(str) + '_single'
    single_dim_dataset = single_dim_dataset.rename(columns={'prompt': 'x', 'rejected': 'y_0', 'chosen': 'y_1'})
    single_dim_dataset = single_dim_dataset.set_index('row_id')
    dataset = pd.concat([dataset, single_dim_dataset], axis=0)
print(f"Loaded {len(dataset)} rows.")

In [None]:
dataset

In [None]:
# Give the principles object a name so it is labelled 'principle' when displayed.
principles.name = 'principle'
principles

### Load principle-specific preferences

In [None]:
# Read every principle-preference CSV, normalise its column names, and stack
# the files into one frame indexed by (file, row_id).
p_prefs_raw = []
for f in principle_pref_files:
    df = pd.read_csv(f).drop(columns=['Unnamed: 0'], errors='ignore')
    df['row_id'] = df['row_id'].astype(str)

    if 'principle_id' in df and 'prob_A' in df:
        # Long format (one row per principle): spread the probabilities into one
        # column per principle. Rows of other principles are left as NaN and are
        # collapsed by the groupby-sum below.
        for principle_id in df['principle_id'].unique():
            is_principle = df['principle_id'] == principle_id
            df[f'prob_A_p{principle_id}'] = df[is_principle]['prob_A']
            df[f'prob_B_p{principle_id}'] = df[is_principle]['prob_B']
            if 'prob_NA' in df:
                df[f'prob_NA_p{principle_id}'] = df[is_principle]['prob_NA']
        # 'prob_NA' is optional (see the guard above), so a missing column must
        # not raise here.
        df = df.drop(columns=['prob_A', 'prob_B', 'prob_NA', 'principle_id'], errors='ignore')
    if dataset_name == "PKU-Alignment/PKU-SafeRLHF" and 'single' in f:
        # Match the '_single' suffix given to the single-dimension dataset rows.
        df['row_id'] = df['row_id'] + '_single'
    df['file'] = f

    # Remapping for old data formats
    if 'order' in df:
        df['order'] = df['order'].replace({'0,1': '0', '1,0': '1'}).astype(int)
    # prob_* -> score_*, response A -> 1, response B -> 0, and upper-case NA.
    df = df.rename(columns={col: col.replace('prob_', 'score_') for col in df.columns})
    df = df.rename(columns={col: col.replace('_A_', '_1_') for col in df.columns})
    df = df.rename(columns={col: col.replace('_B_', '_0_') for col in df.columns})
    df = df.rename(columns={col: col.replace('_na_', '_NA_') for col in df.columns})

    score_cols = [col for col in df.columns if col.startswith('score_')]
    score_cols += ['cost'] if 'cost' in df else []
    # One row per (file, row_id, order), summing the per-principle columns.
    df = df.groupby(['file', 'row_id', 'order'])[score_cols].sum().reset_index()
    df = df.set_index(['file', 'row_id'])

    p_prefs_raw.append(df)
p_prefs_raw = pd.concat(p_prefs_raw)

# For debugging purposes, only keep first order
p_prefs_raw = p_prefs_raw[p_prefs_raw['order'] == 0]
if p_prefs_raw['order'].nunique() == 1:
    p_prefs_raw = p_prefs_raw.drop(columns=['order'])

p_prefs_raw

In [None]:
def get_pref(df, col_a='score_1', col_b='score_0', col_na='score_NA', discrete=True):
    """Turn paired scores into a discrete preference for response 1 over response 0.

    Args:
        df: Frame holding the score columns and, optionally, an 'order' column.
        col_a: Column with the score of response 1.
        col_b: Column with the score of response 0.
        col_na: Column with the "not applicable" score, or None to skip the NA check.
        discrete: Only True is supported.

    Returns:
        A Series with the sign of ``col_a - col_b``; ties become NaN. When an
        'order' column is present, the order-0 sign is averaged with the negated
        order-1 sign. Entries where the NA score exceeds both other scores are NaN.

    Raises:
        NotImplementedError: If ``discrete`` is False.
    """
    if not discrete:
        raise NotImplementedError

    score_a, score_b = df[col_a], df[col_b]
    has_order = 'order' in df

    pref = np.sign(score_a - score_b)
    pref[pref == 0] = np.nan  # a tie expresses no preference
    if has_order:
        first_order = df['order'] == 0
        second_order = df['order'] == 1
        # Negate the order-1 sign, then average it with the order-0 sign.
        pref = (pref[first_order] + (-pref[second_order])) / 2

    if col_na is None:
        return pref

    score_na = df[col_na]
    na_dominates = (score_na > score_a) & (score_na > score_b)
    if has_order:
        # Blank the entry when NA dominates in either presentation order.
        for order_mask in (first_order, second_order):
            pref.loc[na_dominates[order_mask]] = np.nan
    else:
        pref.loc[na_dominates] = np.nan
    return pref

def get_pref_per_principle(df):
    """Compute one discrete preference column per principle.

    Score columns are found by substring ('score_1', 'score_0', 'score_NA') and
    paired by position, so the three column groups must be in the same
    principle order. Reads the notebook-level ``principles`` and ``get_pref``.

    Args:
        df: Frame with per-principle score columns, optionally with NA scores.

    Returns:
        DataFrame with columns ``p{i}_pref``, one per principle, as returned by
        ``get_pref``.

    Raises:
        AssertionError: If the number of score columns does not match the
            number of principles (or the NA columns do not match the others).
    """
    score_a_cols = [col for col in df.columns if 'score_1' in col]
    score_b_cols = [col for col in df.columns if 'score_0' in col]
    score_na_cols = [col for col in df.columns if 'score_NA' in col]
    assert len(score_a_cols) == len(score_b_cols) == len(principles)
    if len(score_na_cols) == 0:
        # No NA scores in this file: pass col_na=None for every principle.
        # (Previously this was a bare None, which crashed when subscripted.)
        score_na_cols = [None] * len(score_a_cols)
    else:
        assert len(score_na_cols) == len(score_a_cols)

    a_over_b_per_principle = {
        f'p{p_i}_pref': get_pref(df,
                                 col_a=score_a_cols[p_i],
                                 col_b=score_b_cols[p_i],
                                 col_na=score_na_cols[p_i])
        for p_i in range(len(principles))
    }
    return pd.concat(a_over_b_per_principle, axis=1)

In [None]:
# Discrete per-principle preferences, one `p{i}_pref` column per principle.
p_prefs = get_pref_per_principle(p_prefs_raw)
p_prefs

In [None]:
# Select only one oracle file
# Drop the 'file' index level so that p_prefs is indexed by row_id only.
# For any other dataset_name, p_prefs keeps its (file, row_id) index.
if dataset_name == "Anthropic/hh-rlhf":
    p_prefs = p_prefs.loc[principle_pref_files[0]]
elif dataset_name == "PKU-Alignment/PKU-SafeRLHF":
    # Two oracle files: the main dataset and the '_single'-suffixed variant.
    p_prefs_dual = p_prefs.loc[principle_pref_files[0]]
    p_prefs_single = p_prefs.loc[principle_pref_files[1]]
    p_prefs = pd.concat([p_prefs_dual, p_prefs_single], axis=0)
p_prefs

### Load reward model preferences

In [None]:
from reward_model_preferences.files import file_dict as all_rm_pref_files
# Mapping {csv path: short model name} of reward-model outputs for this dataset.
rm_pref_files = all_rm_pref_files[dataset_name].copy()
if dataset_name == "PKU-Alignment/PKU-SafeRLHF":
    rm_pref_files.update(all_rm_pref_files["PKU-Alignment/PKU-SafeRLHF-single-dimension"])

rm_prefs_raw = []
for f, f_short in rm_pref_files.items():
    df = pd.read_csv(f).drop(columns=['Unnamed: 0'], errors='ignore')
    df['row_id'] = df['row_id'].astype(str)
    df['file'] = f_short
    # NOTE(review): this lookup runs for every dataset, so the file dict must
    # contain the single-dimension key even for hh-rlhf - confirm.
    if f in all_rm_pref_files["PKU-Alignment/PKU-SafeRLHF-single-dimension"]:
        df['row_id'] = df['row_id'] + '_single'
    df = df.set_index('row_id')
    # df = df.rename(columns={col: col.replace('reward_', 'score_') for col in df.columns})
    
    # Pick the dataset column that says which response was preferred.
    better_col = None
    if dataset_name == "Anthropic/hh-rlhf":
        better_col = 'a_pref'
    elif dataset_name == "PKU-Alignment/PKU-SafeRLHF":
        if 'splited' in f:
            better_col = 'a_pref_better'
        else:
            better_col = 'a_pref_single'
    # _a is coded as the preferred response according to 'better_col'.
    # To find the score of response_1, we thus check if 1 is the preferred response.
    # The boolean mask is indexed by the dataset rows whose id appears in this file.
    better_rows = dataset.loc[dataset.index.isin(df.index), better_col] == 1
    df.loc[better_rows, 'score_1'] = df.loc[better_rows, 'reward_a']
    df.loc[better_rows, 'score_0'] = df.loc[better_rows, 'reward_b']
    
    # If 0 is the preferred response, we need to swap the scores
    df.loc[~better_rows, 'score_1'] = df.loc[~better_rows, 'reward_b']
    df.loc[~better_rows, 'score_0'] = df.loc[~better_rows, 'reward_a']

    # Index by (short model name, row_id) and keep only the two score columns.
    df = df.reset_index().set_index(['file', 'row_id'])
    rm_prefs_raw.append(df[['score_1', 'score_0']])
rm_prefs_raw = pd.concat(rm_prefs_raw)
rm_prefs_raw

In [None]:
# Discrete preference per (model, row); the NA check is skipped when there is no 'score_NA' column.
rm_prefs = get_pref(rm_prefs_raw, col_na=('score_NA' if 'score_NA' in rm_prefs_raw else None))
rm_prefs.name = 'rm_'
# Wide format: one row per row_id, one column per reward model.
rm_prefs = rm_prefs.reset_index().pivot(index='row_id', columns='file', values='rm_').rename_axis(None, axis=1)
# Prefix the columns so reward models can be selected with startswith('rm_').
rm_prefs = rm_prefs.rename(columns={col: 'rm_' + col for col in rm_prefs.columns})
rm_prefs

### Load LLM preferences

In [None]:
from llm_preferences.files import file_dict as all_llm_pref_files
# Mapping {csv path: short model name} of LLM outputs for this dataset.
llm_pref_files = all_llm_pref_files[dataset_name].copy()
if dataset_name == "PKU-Alignment/PKU-SafeRLHF":
    llm_pref_files.update(all_llm_pref_files["PKU-Alignment/PKU-SafeRLHF-single-dimension"])

llm_prefs = []
for f, f_short in llm_pref_files.items():
    df = pd.read_csv(f).drop(columns=['Unnamed: 0'], errors='ignore')
    df['row_id'] = df['row_id'].astype(str)
    df['file'] = f_short
    if f in all_llm_pref_files["PKU-Alignment/PKU-SafeRLHF-single-dimension"]:
        df['row_id'] = df['row_id'] + '_single'
    df = df.set_index('row_id')
    
    # Same remapping of old 'order' encodings as for the principle files.
    # Unlike there, every LLM file is required to have an 'order' column.
    df['order'] = df['order'].replace({'0,1': '0', '1,0': '1'}).astype(int)
    
    # df = df.rename(columns={col: col.replace('reward_', 'score_') for col in df.columns})
    # df = df.drop(columns=['log_prob_chat_chosen', 'log_prob_chat_rejected'], errors='ignore')
    
    if 'log_prob_chat_chosen' in df:
        # Log-probability files: use the chosen/rejected log-probs as scores a/b.
        df = df.rename(columns={
            'log_prob_chat_chosen': 'score_a', 
            'log_prob_chat_rejected': 'score_b'}
        )
        df = df.drop(columns=['reward_a', 'reward_b', 'reward_NA'], errors='ignore')
        # The score must be identical across the rows of one row_id, so keeping
        # only order 0 loses nothing.
        assert (df.groupby(['row_id'])['score_a'].std() == 0.0).all()
        df = df[df['order'] == 0]
        df = df.drop(columns=['order'])
    
    if 'score_a' in df:
        # Same chosen/rejected -> response_1/response_0 remapping as for the reward models.
        better_col = None
        if dataset_name == "Anthropic/hh-rlhf":
            better_col = 'a_pref'
        elif dataset_name == "PKU-Alignment/PKU-SafeRLHF":
            if 'splited' in f:
                better_col = 'a_pref_better'
            else:
                better_col = 'a_pref_single'
        # _a is coded as the preferred response according to 'better_col'.
        # To find the score of response_1, we thus check if 1 is the preferred response.
        better_rows = dataset.loc[dataset.index.isin(df.index), better_col] == 1
        df.loc[better_rows, 'score_1'] = df.loc[better_rows, 'score_a']
        df.loc[better_rows, 'score_0'] = df.loc[better_rows, 'score_b']
        
        # If 0 is the preferred response, we need to swap the scores
        df.loc[~better_rows, 'score_1'] = df.loc[~better_rows, 'score_b']
        df.loc[~better_rows, 'score_0'] = df.loc[~better_rows, 'score_a']
        
        df = df.drop(columns=['score_a', 'score_b'])
    
    if 'generic' in f:
        # For GPT-4o preferences, _A is already always response _1
        df = df.rename(columns={col: col.replace('prob_', 'score_') for col in df.columns})
        df = df.rename(columns={col: col.replace('_A', '_1') for col in df.columns})
        df = df.rename(columns={col: col.replace('_B', '_0') for col in df.columns})
        df = df.drop(columns=['cost'], errors='ignore')
    
    # One discrete preference Series per file, indexed by (short model name, row_id).
    df = df.reset_index().set_index(['file', 'row_id'])
    _prefs = get_pref(df, col_na=('score_NA' if 'score_NA' in df else None))
    llm_prefs.append(_prefs)
llm_prefs = pd.concat(llm_prefs)
llm_prefs

In [None]:
# Wide format: one row per row_id, one 'llm_'-prefixed column per model.
# Not idempotent: re-running this cell on the already pivoted frame fails.
llm_prefs.name = 'llm_'
llm_prefs = llm_prefs.reset_index().pivot(index='row_id', columns='file', values='llm_').rename_axis(None, axis=1)
llm_prefs = llm_prefs.rename(columns={col: 'llm_' + col for col in llm_prefs.columns})
llm_prefs

In [None]:
# Move the 'llm_Sonnet3.5' column to position 2. The label lists used for the
# figures are hard-coded, so they depend on this column order.
# Raises ValueError if the column is missing.
columns = list(llm_prefs.columns)
columns.insert(2, columns.pop(columns.index('llm_Sonnet3.5')))
llm_prefs = llm_prefs[columns]
llm_prefs

In [None]:
llm_prefs.apply(lambda x: x.value_counts(normalize=True, dropna=False)).T

# Principle-specific preferences analysis

In [None]:
def compute_stacked_counts(p_prefs):
    """Return, per column, the share of rows that are NaN, equal to 1, or equal to -1.

    Args:
        p_prefs: DataFrame of preferences, one column per principle.

    Returns:
        DataFrame indexed by the columns of ``p_prefs`` with the columns
        'Indifference' (NaN share), 'Agreement' (share equal to 1) and
        'Disagreement' (share equal to -1).
    """
    category_masks = {
        'Indifference': p_prefs.isna(),
        'Agreement': p_prefs == 1,
        'Disagreement': p_prefs == -1,
    }
    # The mean of a boolean mask is the proportion of True entries per column.
    return pd.DataFrame({label: mask.mean(axis=0) for label, mask in category_masks.items()})
# Per-principle shares of indifference / agreement / disagreement.
stacked_counts_df = compute_stacked_counts(p_prefs)
# Bar positions, spaced 1.2 apart (bars are 0.6 high).
y = np.arange(len(stacked_counts_df)) * 1.2

fig, ax = plt.subplots(figsize=(10, 12))

# Stacked horizontal bars: each segment starts where the previous one ends.
ax.barh(y, stacked_counts_df['Indifference'], color='#C0C0C0', label='Indifference', height=0.6, edgecolor='black')
ax.barh(y, stacked_counts_df['Agreement'], left=stacked_counts_df['Indifference'], color='#4682B4', label='Agreement', height=0.6, edgecolor='black')
ax.barh(y, stacked_counts_df['Disagreement'], left=stacked_counts_df['Indifference'] + stacked_counts_df['Agreement'], color='#CD5C5C', label='Disagreement', height=0.6, edgecolor='black')

ax.set_xlabel('Proportion')
ax.set_ylabel('Principle')
ax.set_yticks(y)
# Tick labels come from `principles`, so its order must match the columns of p_prefs.
ax.set_yticklabels(principles)
ax.legend(
    title='Category',
    loc='upper center',
    bbox_to_anchor=(0.5, 1.02),
    frameon=True,
    ncol=3
)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['left'].set_visible(False)

plt.tight_layout()
# Saved as '<data_suff>1.pdf' inside FIGURES_DIR.
plt.savefig(os.path.join(FIGURES_DIR, f'{data_suff}1.pdf'), 
            bbox_inches='tight', 
            transparent=True)
plt.show()

In [None]:
def identify_p_agreement(p_prefs_row):
    """Classify one row of principle preferences.

    Args:
        p_prefs_row: Preferences of all principles for a single comparison.

    Returns:
        'Indifference' when every entry is NaN, 'Conflict' when the row contains
        both a value close to -1 and a value close to 1, and 'Consensus' otherwise.
    """
    if p_prefs_row.isna().all():
        return 'Indifference'
    has_negative = np.isclose(p_prefs_row, -1).any()
    has_positive = np.isclose(p_prefs_row, 1).any()
    return 'Conflict' if (has_negative and has_positive) else 'Consensus'
        
# One-hot encode the per-row category (Indifference / Conflict / Consensus).
p_agreement = pd.get_dummies(p_prefs.apply(identify_p_agreement, axis=1))
categories = ['Indifference', 'Conflict', 'Consensus']
# get_dummies only creates columns for categories that occur; add the missing ones.
for col in categories:
    if col not in p_agreement:
        p_agreement[col] = False
# Every row must belong to exactly one category.
assert (p_agreement[categories].sum(axis=1) == 1).all()
p_agreement

In [None]:
# Per row_id, the fraction of its rows that fall into each category.
freqs = (
    pd.get_dummies(p_agreement)
    .reset_index()
    .groupby('row_id')[categories]
    .sum()
    .apply(lambda x: x / x.sum(), axis=1)
    .reset_index()
)

# Mean category frequency over all row_ids, in percent.
# NOTE(review): the "SD (%)" column is std / sqrt(n), i.e. a standard error of
# the mean rather than a standard deviation - confirm the intended label.
freqs_stats_df = (
    pd.DataFrame({
        "Category": categories,
        "Mean (%)": [freqs[cat].mean() * 100 for cat in categories],
        "SD (%)": [freqs[cat].std()/len(freqs[cat])**(0.5) * 100 for cat in categories]
    })
    .round(3)
)

freqs_stats_df


In [None]:
# Join the dataset with the reward-model and LLM preferences on row_id.
# Inner joins keep only rows for which every source has an entry.
prefs = dataset.merge(rm_prefs, how='inner', left_index=True, right_index=True)
prefs = prefs.merge(llm_prefs, how='inner', left_index=True, right_index=True)

prefs

# Discretion

In [None]:
# All annotator columns, in a fixed order: dataset annotations ('a_pref*'),
# then reward models ('rm_*'), then LLMs ('llm_*'). The hard-coded label lists
# used for the figures rely on this order.
annotator_pref_cols = [col for col in prefs.columns if col.startswith('a_pref')]
annotator_pref_cols += [col for col in prefs.columns if col.startswith('rm_')]
annotator_pref_cols += [col for col in prefs.columns if col.startswith('llm_')]

In [None]:
# Boolean mask of the rows on which the principles reach consensus.
consensus_idx = p_agreement['Consensus']
# On consensus rows all non-NaN principle preferences agree, so min == max is
# the consensus direction.
consensus = p_prefs[consensus_idx].min(axis=1)
assert (consensus == p_prefs[consensus_idx].max(axis=1)).all()
def compute_discretion_abuse(a_prefs_col):
    """Share of an annotator's expressed preferences that contradict the consensus.

    Reads the notebook-level ``consensus`` Series. This one-argument version is
    replaced by a two-argument definition of the same name in the next cell.
    """
    # A negative product means the annotator and the consensus point in opposite directions.
    agreement = (consensus * a_prefs_col) < 0
    # Only rows where the annotator expressed a (non-zero, non-missing) preference count.
    agreement = agreement[~a_prefs_col.isna() & (a_prefs_col != 0)]
    return agreement.mean()
# NOTE(review): the boolean mask is indexed like p_prefs; presumably every row
# of prefs appears in it - confirm, otherwise .loc cannot align it.
discretion_abuse = prefs.loc[consensus_idx, annotator_pref_cols].apply(compute_discretion_abuse)
discretion_abuse

In [None]:
def compute_discretion_abuse(a_prefs_col, consensus):
    """Share of an annotator's expressed preferences that contradict the consensus.

    Args:
        a_prefs_col: Series of annotator preferences.
        consensus: Series of consensus preferences, aligned with ``a_prefs_col``.

    Returns:
        Mean of "consensus * preference < 0" over the rows where the annotator's
        preference is neither missing nor zero (NaN when there is no such row).
    """
    is_expressed = a_prefs_col.notna() & (a_prefs_col != 0)
    contradicts = (consensus * a_prefs_col) < 0
    return contradicts[is_expressed].mean()

def bootstrap_discretion_abuse(n_bootstrap):
    """Bootstrap the discretion-abuse rate of every annotator over consensus rows.

    Reads the notebook-level ``annotator_pref_cols``, ``consensus_idx``,
    ``prefs`` and ``consensus``, and draws from NumPy's global random state.

    Args:
        n_bootstrap: Number of resamples (with replacement) of the consensus rows.

    Returns:
        Dict mapping each annotator column to a dict with the keys 'mean', 'ci'
        (2.5th and 97.5th percentile), 'largest_gap' (largest distance between
        the mean and either percentile) and 'sd'. All values are None when no
        resample was drawn.
    """
    consensus_rows = consensus_idx[consensus_idx].index
    n_rows = len(consensus_rows)
    draws = {col: [] for col in annotator_pref_cols}

    for _ in range(n_bootstrap):
        # The same resampled rows are used for every annotator within a draw.
        resampled = np.random.choice(consensus_rows, size=n_rows, replace=True)
        resampled_prefs = prefs.loc[resampled, annotator_pref_cols]
        resampled_consensus = consensus.loc[resampled]
        for col in annotator_pref_cols:
            draws[col].append(compute_discretion_abuse(resampled_prefs[col], resampled_consensus))

    summary = {}
    for col, values in draws.items():
        if not values:
            summary[col] = {
                'mean': None,
                'ci': (None, None),
                'largest_gap': None,
                'sd': None
            }
            continue
        lower, upper = np.percentile(values, [2.5, 97.5])
        centre = np.mean(values)
        summary[col] = {
            'mean': centre,
            'ci': (lower, upper),
            'largest_gap': max(abs(centre - lower), abs(centre - upper)),
            'sd': np.std(values)
        }
    return summary

# 1000 resamples; NumPy's global random state has not been seeded at this point
# in the notebook, so the numbers differ between runs.
discretion_abuse_results = bootstrap_discretion_abuse(n_bootstrap=1000)

# Keep the bootstrap mean and standard deviation per annotator, in percent.
discretion_abuse_stats = {
    col: {
        'Mean (%)': result['mean'] * 100 if result['mean'] is not None else None,
        'SD (%)': result['sd'] * 100 if result['sd'] is not None else None,
        # 'CI Lower (%)': result['ci'][0] * 100 if result['ci'] else None,
        # 'CI Upper (%)': result['ci'][1] * 100 if result['ci'] else None,
        # 'Largest Gap (%)': result['largest_gap'] * 100 if result['largest_gap'] else None
    }
    for col, result in discretion_abuse_results.items()
    if result['mean'] is not None
}

# One row per annotator column.
discretion_abuse_stats = pd.DataFrame.from_dict(discretion_abuse_stats, orient='index')
discretion_abuse_stats

In [None]:
# Number of principles. compute_supremacies below computes its own local `ncols`.
ncols = len(p_prefs.columns)

# Display names of the annotators, indexed by position in `annotator_pref_cols`.
# `labs` is assigned again in a later cell before the heatmaps are drawn.
# NOTE(review): the lists are hard-coded; confirm they match the order and
# number of annotator columns.
if data_suff == 'PKU-SafeRLHF':
    labs = ["human", "better" , "safer", "LlaMa-8B RM", "Mistral-7B RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (base)","LlaMa-8B (finetuned)", "Mistral-7B (base)", "Mistral-7B (finetuned)"]
elif data_suff == "hh-rlhf":
    labs = ["human","LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (base)","LlaMa-8B (finetuned)", "Mistral-7B (base)", "Mistral-7B (finetuned)"]
else:
    labs = []

def compute_supremacies(a_prefs_col):
    """Compute pairwise principle supremacy rates for one annotator.

    For every ordered pair of principles (i, j), a row is a conflict when the
    two principle preferences have opposite signs and the annotator expressed a
    non-missing, non-zero preference. The supremacy of i over j is the share of
    those rows on which the annotator's preference has the same sign as
    principle i's.

    Reads the notebook-level ``p_prefs`` and ``principles``.

    Args:
        a_prefs_col: Series of annotator preferences whose index labels select
            rows of ``p_prefs`` (labels may repeat, e.g. in a bootstrap sample).

    Returns:
        Tuple ``(supremacies, nb_conflicts)``: a DataFrame labelled by
        ``principles`` on both axes (NaN where a pair has no conflict), and an
        ndarray with the number of conflict rows per pair.
    """
    ncols = p_prefs.shape[1]
    nb_conflicts = np.zeros((ncols, ncols))
    supremacies = np.full((ncols, ncols), np.nan)

    a_prefs_valid = (~a_prefs_col.isna()) & (a_prefs_col != 0)
    # The row alignment does not depend on (i, j): do it once instead of
    # 2 * ncols**2 times inside the loops.
    aligned_p_prefs = p_prefs.loc[a_prefs_col.index]
    for i in range(ncols):
        pi_prefs = aligned_p_prefs.iloc[:, i]
        for j in range(ncols):
            pj_prefs = aligned_p_prefs.iloc[:, j]
            # Opposite signs <=> the two principles disagree on this row.
            ij_conflict = (pi_prefs * pj_prefs) < 0
            ij_conflict_valid = ij_conflict & a_prefs_valid
            nb_conflicts[i, j] = ij_conflict_valid.sum()
            # Share of conflict rows where the annotator sides with principle i.
            supremacies[i, j] = ((pi_prefs[ij_conflict_valid] * a_prefs_col[ij_conflict_valid]) > 0).mean()

    supremacies = pd.DataFrame(supremacies, index=principles, columns=principles)
    return supremacies, nb_conflicts

# Supremacy matrix and conflict counts for every annotator column.
supremacies_per_a = {}
nb_conflicts_per_a = {}
for k, a in enumerate(annotator_pref_cols):
    supremacies_per_a[a], nb_conflicts_per_a[a] = compute_supremacies(prefs[a])
    # Sum of the conflict counts above the diagonal.
    # It is not read by any cell visible here.
    upper_triangular_sum = np.sum(np.triu(np.nan_to_num(nb_conflicts_per_a[a], nan=0.0), k=1))

In [None]:
# ELOs / Importances
import torch
np.random.seed(5) 
def compute_elo(a_nb_conflicts, a_supremacies, nb_iter=30000):
    """Fit one rating per principle from pairwise win rates by gradient descent.

    Minimises the binary cross-entropy between ``sigmoid(elo_i - elo_j)`` and
    the observed win rate of i over j, weighted by the number of conflicts of
    the pair. Inputs may carry leading batch dimensions; the last two
    dimensions are the (principle, principle) matrix.

    Args:
        a_nb_conflicts: Array of conflict counts per pair.
        a_supremacies: Array of win rates per pair (NaN where there is no conflict).
        nb_iter: Maximum number of Adam steps.

    Returns:
        ndarray of ratings with the last dimension of the input removed.
        Principles that win all of their conflicts get +inf, principles that
        lose all of them get -inf.

    Raises:
        AssertionError: If some matrix has no principle left to optimise.
    """
    ngames = torch.tensor(a_nb_conflicts, dtype=torch.float32)
    y = torch.tensor(a_supremacies, dtype=torch.float32)
    elo = torch.zeros((a_nb_conflicts.shape[:-1]), requires_grad=True)

    # Principles that only win (or only lose) have no finite optimum: pin them
    # at +/- infinity and leave them out of the optimisation.
    always_winners = torch.logical_or(y == 1, y.isnan()).all(dim=-1)
    always_losers = torch.logical_or(y == 0, y.isnan()).all(dim=-1)
    elo.data[always_winners] = torch.inf
    elo.data[always_losers] = -torch.inf

    elos_req_grad = ~(always_winners | always_losers)

    assert elos_req_grad.any(dim=-1).all()

    # For fixing translation invariance. Doesnt work for vectorized ELO (in bootstrapping)
    # first_valid_elo = torch.arange(elo.shape[-1])[elos_req_grad][0]
    # elos_req_grad[first_valid_elo] = False

    # Pairs without conflicts get target 0.5 and weight 0, so they do not
    # contribute to the loss.
    y_flat = y.flatten(start_dim=-2)
    y_flat = y_flat.nan_to_num(nan=0.5)
    ngames_flat = ngames.flatten(start_dim=-2)
    ngames_flat = ngames_flat.nan_to_num(nan=0)

    optimizer = torch.optim.Adam([elo], lr=.1)
    for i in range(nb_iter):
        optimizer.zero_grad()
        predicted = elo.unsqueeze(-1) - elo.unsqueeze(-2)
        predicted = predicted.flatten(start_dim=-2)
        # Differences involving a pinned (infinite) rating are inf or NaN: skip them.
        finite = predicted.isfinite()
        y_flat_finite = y_flat[finite]
        ngames_flat_finite = ngames_flat[finite]
        predicted = predicted[finite]
        loss = torch.nn.functional.binary_cross_entropy_with_logits(predicted, y_flat_finite, weight=ngames_flat_finite)
        loss.backward()
        # Pinned ratings must not move.
        elo.grad[~elos_req_grad] = 0
        if elo.grad.abs().max() < 1e-7:
            print(f'Converged early after {i} iterations')
            break
        optimizer.step()
    else:
        # Reached only when the loop was not left through `break`, so the two
        # messages are mutually exclusive, and nb_iter=0 no longer reads an
        # undefined loop variable.
        print(f'Did not converge after {nb_iter} iterations')
    return elo.detach().numpy()

# One rating vector per annotator, assembled into a principles x annotators frame.
elos = {}
for a in annotator_pref_cols:
    elos[a] = compute_elo(nb_conflicts_per_a[a], supremacies_per_a[a].values)
elos = pd.DataFrame(elos, index=principles)
elos

In [None]:
# Sort the principles by the rating of the first annotator column, highest first.
elos = elos.sort_values(by=elos.columns[0], ascending=False)
elos

In [None]:
# Lookup tables from a principle to its p_prefs column and to its position.
# Both assume `principles` and the columns of p_prefs are in the same order.
principle_to_column = dict(zip(principles, p_prefs.columns))
principle_to_i = dict(zip(principles, range(len(principles))))

# Display names of the annotators, indexed by position in `annotator_pref_cols`;
# used in the axis labels of the heatmaps below.
# NOTE(review): the lists are hard-coded; confirm they match the order and
# number of annotator columns.
if data_suff == 'PKU-SafeRLHF':
    labs = ["human", "helpfulness" , "safety", "LlaMa-8B RM", "Mistral-7B RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (base)","LlaMa-8B (finetuned)", "Mistral-7B (base)", "Mistral-7B (finetuned)"]
elif data_suff == "hh-rlhf":
    labs = ["human","LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (base)","LlaMa-8B (finetuned)", "Mistral-7B (base)", "Mistral-7B (finetuned)"]
else:
    labs = []


def compute_supremacies_for_selected(supremacies_per_a, nb_conflicts, selected_principles, k, show_plot=True,
                                     order=None):
     """Plot one annotator's supremacy matrix as an annotated heatmap and save it.

     Reads the notebook-level ``elos``, ``principle_to_i``, ``labs``,
     ``FIGURES_DIR`` and ``data_suff``. Returns nothing.

     Args:
         supremacies_per_a: Supremacy DataFrame of a single annotator, labelled
             by principle on both axes.
         nb_conflicts: ndarray of conflict counts, in the original principle order.
         selected_principles: Positions into ``elos`` used to derive the
             principle order; only read when ``order`` is None.
         k: Position of the annotator in ``labs``; also part of the file name.
         show_plot: When False, nothing is drawn or saved.
         order: Principle labels in the order in which they are plotted.
     """
     if order is None:
         order = elos.iloc[selected_principles].index
     # Reorder both matrices consistently: the DataFrame by label, the count
     # array by the corresponding integer positions.
     selected_supremacies = supremacies_per_a.loc[order, order]
     i_locs = [principle_to_i[p] for p in order]
     selected_conflicts = nb_conflicts[np.ix_(i_locs, i_locs)]
     if show_plot:
        fig, ax = plt.subplots(figsize=(12,10))
        # Cell text: supremacy in percent on the first line, number of
        # conflicts in parentheses on the second; '/' stands for NaN.
        formatted_text = (np.array([
            f"{'/' if np.isnan(w) else f'{w*100:.0f}'}%\n({int(n) if not np.isnan(n) else '/'})"
            for w, n in zip(selected_supremacies.to_numpy().flatten(), selected_conflicts.flatten())
        ]).reshape(selected_supremacies.shape))
        sns.heatmap(
            selected_supremacies,
            annot=formatted_text,
            cmap="RdBu_r",
            fmt='', 
            cbar=True,
            linewidths=0.5,
            linecolor='gray',
            square=True,
            xticklabels=selected_supremacies.index,
            yticklabels=selected_supremacies.index,
            annot_kws={"fontsize": 7, "fontweight": "bold"}, 
            cbar_kws={'label': 'Supremacy', 'shrink': 0.8},
            vmin=0, 
            vmax=1,
            ax=ax
        )
        # Draw a thin black frame around the heatmap.
        for edge, spine in ax.spines.items():
            spine.set_visible(True) 
            spine.set_color('black')
            spine.set_linewidth(0.5) 
        # Relabel the colour bar ticks in percent.
        cbar = ax.collections[0].colorbar 
        cbar.set_ticks(cbar.get_ticks()) 
        cbar.set_ticklabels([f'{val * 100:.0f}' for val in cbar.get_ticks()]) 
        ax.set_xlabel(f'Principle that disagrees with {labs[k]}', fontsize=12, fontweight="bold")
        ax.set_ylabel(f'Principle that agrees with {labs[k]}', fontsize=12, fontweight="bold")
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
        plt.tight_layout(pad=2.0)
        # Saved as 'supr_<data_suff>_<k>.pdf' inside FIGURES_DIR.
        plt.savefig(os.path.join(FIGURES_DIR, f'supr_{data_suff}_{k}.pdf'), transparent=True)
        plt.show()

# Draw one supremacy heatmap per dataset-annotation and LLM column; reward
# models are skipped. `k` is the position of the annotator in
# `annotator_pref_cols`; it selects the display name in `labs` and is part of
# the figure file name.
for k, a in enumerate(annotator_pref_cols):
    if a.startswith(('a_pref', 'llm_')):
        # Principles ordered by this annotator's own rating, highest first.
        order = elos[a].sort_values(ascending=False).index
        # `selected_principles` is ignored when `order` is given; pass every
        # position instead of a hard-coded count.
        compute_supremacies_for_selected(supremacies_per_a[a], nb_conflicts_per_a[a],
                                         list(range(len(order))), k, show_plot=True, order=order)

In [None]:
# Rank the principles per annotator (rank 1 = highest rating), then sort the
# rows by the rank given by the reference annotator.
elo_ranks = elos.rank(ascending=False).astype(int).sort_values(main_a_pref)
elo_ranks

In [None]:
elos.columns

In [None]:
import matplotlib.colors as mcolors
import matplotlib.cm as cm
from matplotlib.colors import LinearSegmentedColormap

# Grey -> blue colour map used to colour the bars by their rank.
blue_to_grey = LinearSegmentedColormap.from_list(
    "GreyBlue",
    [
        (0.0, "#D3D3D3"),
        (0.2, "#A9A9A9"),
        (0.4, "#808080"),
        (0.6, "#6495ED"),
        (0.8, "#4169E1"),
        (1.0, "#0000A0"),
    ]
)
cmap = blue_to_grey
norm = mcolors.Normalize(vmin=0.1, vmax=1)

# Drop the "base" and "w/o" annotator columns. A non-capturing group is used
# because str.contains warns when the pattern contains capture groups.
elos_filtered = elos.loc[:, ~elos.columns.str.contains(r'(?:base|w/o)', case=False)]
elos_filtered = elos_filtered.sort_values(by=main_a_pref, ascending=True)
# Reference annotator first, all other columns in their existing order.
elos_filtered = elos_filtered[[main_a_pref] + [col for col in elos_filtered.columns if col != main_a_pref]]
elos_filtered = pd.DataFrame(elos_filtered)
# Replace infinite ratings by plottable values: -inf by the column minimum
# among the entries that are not -inf, +inf by the maximum among the entries
# that are not +inf plus 2.
elos_adjusted = elos_filtered.apply(lambda col: col.replace(
    [-np.inf, np.inf], [col[col != -np.inf].min(), col[col != np.inf].max() + 2]
)).fillna(elos_filtered.min())

# Shift every column so that its smallest bar has length 0.5.
elos_adjusted = elos_adjusted - elos_adjusted.min() +  0.5
elos_adjusted = pd.DataFrame(elos_adjusted)

# Hard-coded display names, assigned by position.
# NOTE(review): the PKU list contains "Most downloaded RM", which the PKU
# `labs` lists defined earlier do not - confirm that the number and order of
# the names match the filtered columns.
if data_suff == 'PKU-SafeRLHF':
    elos_adjusted.columns = ["Human", "Helpfulness", "Safety", "LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (finetuned)", "Mistral-7B (finetuned)"]
    elos_filtered.columns = ["Human", "Helpfulness", "Safety", "LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (finetuned)", "Mistral-7B (finetuned)"]
else:
    elos_adjusted.columns = ["Human", "LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (finetuned)", "Mistral-7B (finetuned)"]
    elos_filtered.columns = ["Human", "LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o","Sonnet 3.5",  "LlaMa-8B (finetuned)", "Mistral-7B (finetuned)"]

n, m = elos_adjusted.shape

# Within-column ranks, scaled by (n - 1), drive the bar colours.
ranked_values = elos_adjusted.rank(axis=0, method='min', ascending=True)
ranked_normalized = ranked_values / (n - 1)

# One panel per annotator, sharing the principle axis.
fig, axes = plt.subplots(nrows=1, ncols=m, sharey=True, figsize=(24, 10))

for i, (ax, col) in enumerate(zip(axes, elos_adjusted.columns)):
    positions = np.arange(n)
    bar_colors = cmap(ranked_normalized[col])
    ax.barh(positions, elos_adjusted[col], color=bar_colors, alpha=1)
    bars = ax.patches
    # Mark infinite ratings with '***': the bar of a -inf rating is hidden and
    # marked in red, a +inf rating is marked in white on top of its bar.
    for j, value in enumerate(elos_filtered[col]):
        if value == -np.inf:
            bars[j].set_visible(False)
            if col == "Mistral-7B (finetuned)":
                ax.text(elos_adjusted[col].iloc[j]/3, j, '***', color='red', fontsize=16, va='center_baseline', ha='left', fontweight="bold")
            else:
                ax.text(elos_adjusted[col].iloc[j], j, '***', color='red', fontsize=16, va='center_baseline', ha='left', fontweight="bold")

        elif value == np.inf:
            ax.text(elos_adjusted[col].iloc[j] / 30, j, '***', color='white', fontsize=16, va='center_baseline', ha='left', fontweight="bold")

    # x-limits from the finite ratings only, shifted to start at 0.
    filtered_values = elos_filtered[col].replace([np.inf, -np.inf], np.nan) - elos_filtered[col].replace([np.inf, -np.inf], np.nan).min()
    x_min = filtered_values.min()
    x_max = filtered_values.max() + 2
    ax.set_xlim(x_min, x_max)

    ax.set_title(col, fontsize=12)
    ax.set_ylim(-0.6, n - 0.2)  # Ensure no extra padding above/below bars
    ax.margins(y=0) 
    if i == 0:
        # Only the first panel carries the principle labels.
        ax.set_yticks(positions)
        ax.set_yticklabels(elos_adjusted.index, fontsize=12)
    else:
        ax.tick_params(axis='y', which='both', left=False, labelleft=False)
    ax.set_xlabel('Principle Priority', fontsize=12)

sm = cm.ScalarMappable(cmap=cmap, norm=norm)
sm.set_array([])
plt.subplots_adjust(wspace=0.1)

plt.savefig(os.path.join(FIGURES_DIR, f'{data_suff}2_logprobs.pdf'), bbox_inches='tight', transparent=True)
plt.show()

# Discrepancy

In [None]:
# Heatmap of the principle ranks of every annotator.
sorted_indices = elo_ranks.iloc[:, 0].sort_values().index.tolist()
sorted_ranks = elo_ranks.loc[sorted_indices]
## TO FILTER BASE AND W/O NA
# sorted_ranks = sorted_ranks.loc[:, ~elos.columns.str.contains(r'(base|w/o)', case=False)]
# Rows ordered by the reference annotator's rank; reference annotator first.
sorted_ranks = sorted_ranks.sort_values(by=main_a_pref, ascending=True)
sorted_ranks = sorted_ranks[[main_a_pref] + [col for col in sorted_ranks.columns if col != main_a_pref]]
# Hard-coded display names, assigned by position.
# NOTE(review): confirm that the number and order of the names match the columns.
if data_suff == 'PKU-SafeRLHF':
    sorted_ranks.columns = ["Human", "Helpfulness" , "Safety", "LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (base)","LlaMa-8B (finetuned)", "Mistral-7B (base)", "Mistral-7B (finetuned)"]
else:
    sorted_ranks.columns = ["Human","LlaMa-8B RM", "Mistral-7B RM", "Most downloaded RM", "DeepSeek-V3", "GPT 4o", "Sonnet 3.5", "LlaMa-8B (base)","LlaMa-8B (finetuned)", "Mistral-7B (base)", "Mistral-7B (finetuned)"]

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    sorted_ranks,
    cmap="RdBu",
    annot=sorted_ranks,  # Display ranks inside the cells
    fmt='d',  # Format ranks as integers
    linewidths=0.5,
    linecolor='gray',
    square=True,
    annot_kws={"fontsize": 10, "fontweight": "bold"},
    xticklabels=sorted_ranks.columns,
    # Take the row labels from the frame that is plotted. `sorted_indices` is
    # ordered by the first column, which differs from the plotted row order
    # whenever `main_a_pref` is not the first column.
    yticklabels=sorted_ranks.index,
    cbar=False, 
    ax=ax
)

ax.set_xlabel("Annotators", fontsize=12, labelpad=10)
ax.set_ylabel("Principles", fontsize=12, labelpad=10)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
ax.set_yticklabels(ax.get_yticklabels(), fontsize=10)

plt.tight_layout()
plt.savefig(os.path.join(FIGURES_DIR, f'{data_suff}3.pdf'), bbox_inches='tight', transparent=True)
plt.show()


In [None]:
# elo_discrepancy = elo_ranks.corrwith(elo_ranks[main_a_pref], method='spearman')
from scipy.stats import kendalltau

# Ranks of the reference annotator; every other annotator is compared to them.
reference_ranks = elo_ranks[main_a_pref]

def kendalltau_distance(x, y=reference_ranks):
    """Normalised Kendall tau distance between two rankings.

    Args:
        x: First ranking.
        y: Second ranking. The default is the value ``reference_ranks`` had
            when this function was defined.

    Returns:
        ``(1 - tau_b) / 2``: 0 for identical orderings, 1 for reversed ones.
    """
    tau_b = kendalltau(x, y, variant='b').correlation
    return (1 - tau_b) / 2

# Distance of every annotator's ranking to the reference ranking.
elo_discrepancy = elo_ranks.apply(kendalltau_distance)
elo_discrepancy

In [None]:
# Distance between the base and the finetuned version of the same model.
# The column names are hard-coded and must exist in elo_ranks.
print(kendalltau_distance(elo_ranks['llm_Llama3-8B (base)'], y=elo_ranks['llm_Llama3-8B (finetuned)']))
print(kendalltau_distance(elo_ranks['llm_Mistral-7B (base)'], y=elo_ranks['llm_Mistral-7B (finetuned)']))

In [None]:
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
from joblib import Parallel, delayed

def process_annotator(a, prefs, n_bootstrap, seed=42):
    """Bootstrap the ratings of one annotator.

    Resamples the annotator's preference column with replacement, recomputes
    the supremacy matrices for every resample and fits all resamples in a
    single batched ``compute_elo`` call.

    Args:
        a: Name of the annotator column in ``prefs``.
        prefs: DataFrame holding the annotator preference columns.
        n_bootstrap: Number of resamples.
        seed: Seed passed to ``np.random.seed`` before resampling. The default
            is the same for every annotator.

    Returns:
        Whatever ``compute_elo`` returns for the stacked inputs, with
        ``nb_iter=10000``.
    """
    np.random.seed(seed)
    annotator_prefs = prefs[a]

    conflict_counts = []
    supremacy_matrices = []
    for _ in range(n_bootstrap):
        resampled = annotator_prefs.sample(frac=1, replace=True)
        resampled_supremacies, resampled_conflicts = compute_supremacies(resampled)
        conflict_counts.append(resampled_conflicts)
        supremacy_matrices.append(resampled_supremacies)

    # Stack along a new leading axis: one matrix per resample.
    return compute_elo(
        np.stack(conflict_counts, axis=0),
        np.stack(supremacy_matrices, axis=0),
        nb_iter=10000,
    )

def format_results(results):
    """Format bootstrap summaries as "mean (lower, upper)" strings.

    Args:
        results: Dict mapping a name to a dict with the keys 'mean',
            'ci_lower' and 'ci_upper'.

    Returns:
        DataFrame indexed by name with the single column
        'Kendall Tau (95% CI)', values formatted with three decimals.
    """
    formatted = {}
    for col, stats in results.items():
        formatted[col] = [
            f"{stats['mean']:.3f} ({stats['ci_lower']:.3f}, {stats['ci_upper']:.3f})"
        ]
    return pd.DataFrame(formatted).T.rename(columns={0: 'Kendall Tau (95% CI)'})


# Number of bootstrap resamples per annotator.
n_bootstrap = 100
elo_batch_per_a = {}

# Sequential run; the commented-out line below is the joblib-parallel variant.
results = [process_annotator(a, prefs, n_bootstrap) for a in annotator_pref_cols]
# results = Parallel(n_jobs=-1)(delayed(process_annotator)(a, prefs, n_bootstrap) for a in tqdm(annotator_pref_cols, desc="Processing Annotators"))
# Map every annotator column to its batch of bootstrapped ratings.
elo_batch_per_a = {a: result for a, result in zip(annotator_pref_cols, results)}

In [None]:
from scipy.stats import rankdata
# Rank the ratings within every bootstrap resample (axis 1 = principles).
ref_elo_ranks = rankdata(elo_batch_per_a[main_a_pref], axis=1)
results_per_a = {}
for a, a_elos in elo_batch_per_a.items():
    a_elo_ranks = rankdata(a_elos, axis=1)
    if a == main_a_pref:
        # Sanity check: the reference annotator must reproduce the reference ranks.
        assert np.array_equal(a_elo_ranks, ref_elo_ranks)
    # Resample i of this annotator is compared with resample i of the reference.
    bootstrap_results = [kendalltau_distance(a_elo_ranks[i], ref_elo_ranks[i]) for i in range(a_elo_ranks.shape[0])]
    # Mean and standard deviation over the resamples, in percent, ignoring NaN.
    results_per_a[a] = {
        'mean': round(np.nanmean(bootstrap_results) * 100, 3),
        'sd': round(np.nanstd(bootstrap_results) * 100, 3)
    }

kt_distances = pd.DataFrame(results_per_a).T
# Written to the current working directory (the path has no directory part).
kt_distances.to_csv(os.path.join(f'kt_distances_{data_suff}.csv'))
kt_distances

In [None]:
# Summary figure: share of consensus / conflict / indifference rows per dataset.
# The percentages are hard-coded.
# NOTE(review): presumably copied from the frequency table of separate runs on
# each dataset - confirm they are up to date.
# The names carry a suffix so that this cell does not overwrite the `consensus`
# Series that the discretion-abuse cells read.
dataset_labels = ['HH', 'PKU']
indifference_pct = np.array([33.0, 47.4])
conflict_pct = np.array([16.3, 13.5])
consensus_pct = np.array([50.7, 39.1])

fig, ax = plt.subplots(figsize=(9, 1))

y_pos = np.array([0.1, 0])
bar_height = 0.04 
# Stacked horizontal bars: consensus, then conflict, then indifference.
consensus_bars = ax.barh(y_pos, consensus_pct, height=bar_height, label='Consensus', 
                          color='#4682B4', edgecolor='black')
conflict_bars = ax.barh(y_pos, conflict_pct, height=bar_height, left=consensus_pct, 
                         label='Conflict', color='#CD5C5C', edgecolor='black')
indiff_bars = ax.barh(y_pos, indifference_pct, height=bar_height, 
                       left=consensus_pct + conflict_pct, label='Indifference', 
                       color='#C0C0C0', edgecolor='black')

ax.set_yticks(y_pos)
ax.set_yticklabels(dataset_labels)
ax.legend(loc='lower center', bbox_to_anchor=(0.5, -0.6), ncol=3, columnspacing=8.5)

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

ax.set_xticks([])
ax.spines['bottom'].set_visible(False)

# Write the percentage above the middle of every bar segment.
for bars, values, offset in zip([consensus_bars, conflict_bars, indiff_bars], 
                                [consensus_pct, conflict_pct, indifference_pct], 
                                [np.zeros_like(consensus_pct), consensus_pct, consensus_pct + conflict_pct]):
    for bar, value, left_offset in zip(bars, values, offset):
        ax.text(left_offset + value / 2, bar.get_y() + bar.get_height() + 0.01, 
                f'{value:.1f}%', ha='center', va='bottom', fontsize=10, color='black')
        
plt.savefig(os.path.join(FIGURES_DIR, 'categories.pdf'), bbox_inches='tight', transparent=True)
plt.show()
