# Measure Sycophancy Bias
The script in the provided Jupyter notebook measures sycophancy bias in a dataset by evaluating model performance using influence functions. It plots ROC curves and reports the AUC (Area Under the Curve) to visualize the model's performance when influence is computed against the less sycophantic, more sycophantic, and full validation sets, and compares the results. Additionally, the script includes ROC curves plotted alongside the baseline models for comprehensive performance analysis.

## Load Data

In [None]:
import os

import datasets
import numpy as np
import torch

import utils.influence as utils
D = 65536  # 2^16, size of rapid grad; the saved gradient files are indexed by this value

# Set device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load data
# IF_RLHF_HOME must be set in the environment; a missing variable raises KeyError here.
_WORK_PATH = os.environ['IF_RLHF_HOME']
# os.path.join avoids doubled or missing separators if IF_RLHF_HOME has a trailing slash.
model_path = os.path.join(_WORK_PATH, "logs", "logs", "Llama-3-8B_sycophancy")
train_data_path = os.path.join(_WORK_PATH, "dataset", "sycophancy_dataset", "train")
val_data_path = os.path.join(_WORK_PATH, "dataset", "sycophancy_dataset", "test")

# Each .pt file is indexed by projection size; keep only the entry for D.
rapid_grad_train = torch.load(os.path.join(model_path, "rapid_grad_train.pt"))[D]
rapid_grad_val = torch.load(os.path.join(model_path, "rapid_grad_val.pt"))[D]
# flipped_indices is later passed to the ROC helper as the set of flipped training examples.
flipped_indices = np.load(os.path.join(train_data_path, "flipped_indices.npy"))
val_data = datasets.load_from_disk(val_data_path)
train_data = datasets.load_from_disk(train_data_path)

In [None]:
from tqdm import tqdm

# Partition validation indices by comparing each example's chosen and rejected scores.
less_sycophantic, more_sycophantic, equal_sycophantic = [], [], []
# Wrap the dataset itself (not the enumerate object) so tqdm can read its
# length and display a total / ETA.
for i, example in enumerate(tqdm(val_data)):
    chosen_score = example['chosen_score']
    rejected_score = example['rejected_score']
    if chosen_score > rejected_score:
        more_sycophantic.append(i)
    elif rejected_score > chosen_score:
        less_sycophantic.append(i)
    elif rejected_score == chosen_score:
        # Explicit equality test (rather than a bare else) so that incomparable
        # scores such as NaN are still left out of every group.
        equal_sycophantic.append(i)

## Calculate Influence

In [4]:
# Run utils.rapid_datainf once per group of validation indices, in the order
# less / more / equal / full (every validation index).
(
    rapid_datainf_less,
    rapid_datainf_more,
    rapid_datainf_equal,
    rapid_datainf_total,
) = (
    utils.rapid_datainf(rapid_grad_train, rapid_grad_val, val_subset)
    for val_subset in (
        less_sycophantic,
        more_sycophantic,
        equal_sycophantic,
        np.arange(len(val_data)),
    )
)

## Plot ROC Curve

In [5]:
from matplotlib import pyplot as plt
from utils.influence import get_roc_auc

# Line colors used by plot_roc_curve_for_sycophancy.
INFLUENCE_COLOR = '#359afa'
# plot_roc_curve_for_sycophancy also reads BASELINE_COLOR1 and BASELINE_COLOR2.
# NOTE(review): the hex values below were picked to match the "Magenta" and
# "Yellow" remarks next to the plot calls -- adjust if a specific palette is required.
BASELINE_COLOR1 = '#d6279a'  # magenta
BASELINE_COLOR2 = '#e6b800'  # yellow

In [None]:
def plot_roc_curve_for_sycophancy(influence, mahalanobis, entropy, flipped_indices, title, fpr_llm=None, tpr_llm=None, llm_label=None):
    """
    Plots three ROC curves against the same flipped indices and shows the figure.

    The first three parameter names are historical. Each one is simply a score
    array handed to get_roc_auc, and the legend labels the resulting curves
    "Less Syco.", "More Syco." and "Full" respectively.

    Note: this updates matplotlib's global rcParams (font settings), so the
    font change also applies to figures created afterwards.

    Parameters:
    influence: Scores for the curve labelled "Less Syco.".
    mahalanobis: Scores for the curve labelled "More Syco.".
    entropy: Scores for the curve labelled "Full".
    flipped_indices: Indices that were flipped; passed to get_roc_auc as the
        reference for every curve.
    title (str): Title drawn above the plot.
    fpr_llm, tpr_llm, llm_label: Unused. Accepted only so that existing calls
        that pass them keep working.

    Returns:
    The AUC value get_roc_auc reports for `influence` (the first curve only).
    """
    plt.rcParams.update({
        'font.family': 'Times New Roman',
        'mathtext.fontset': 'custom',
        'mathtext.it': 'Times New Roman:italic',
        'mathtext.rm': 'Times New Roman',
    })

    # ROC points and AUC for each of the three score arrays
    roc_auc, fpr, tpr = get_roc_auc(influence, flipped_indices)
    roc_auc_mahalanobis, fpr_mahalanobis, tpr_mahalanobis = get_roc_auc(mahalanobis, flipped_indices)
    roc_auc_entropy, fpr_entropy, tpr_entropy = get_roc_auc(entropy, flipped_indices)

    fig, ax = plt.subplots(figsize=(6.5, 6))

    # One curve per score array; line style distinguishes them in grayscale prints
    ax.plot(fpr, tpr, color=INFLUENCE_COLOR, lw=3, label=r'$\mathit{Less \; Syco.}$ (AUC=%0.3f)' % roc_auc)
    ax.plot(fpr_mahalanobis, tpr_mahalanobis, color=BASELINE_COLOR1, lw=3, linestyle='-.', label=r'$\mathit{More\; Syco.}$ (AUC=%0.3f)' % roc_auc_mahalanobis)
    ax.plot(fpr_entropy, tpr_entropy, color=BASELINE_COLOR2, lw=3, linestyle='--', label=r'Full, (AUC=%0.3f)' % roc_auc_entropy)

    # Diagonal reference line (random classifier); no label, so it stays out of the legend
    ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--', alpha=0.6)

    # Set axis limits; slight headroom above 1.0 so the curves are not clipped at the top
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])

    # Set labels, legend, and title with adjusted font sizes
    ax.set_xlabel('False Positive Rate', fontsize=22)
    ax.set_ylabel('True Positive Rate', fontsize=22)
    # tick_params rejects unknown keywords; labelrotation is the documented
    # name for the tick-label rotation (0 keeps the labels horizontal).
    ax.tick_params(axis='both', which='major', labelsize=16, labelrotation=0)
    ax.legend(loc="lower right", fontsize=16, title="Validation Set", title_fontsize=16)
    ax.set_title(title, fontsize=22)

    # Add grid lines and remove spines for a clean look (only the bottom spine remains)
    ax.grid(True, which='major', linestyle='--', color='gray', alpha=0.3)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)

    # Display the plot
    plt.show()
    return roc_auc

# Example call: influence scores computed against the less sycophantic, more
# sycophantic and full validation sets, evaluated against the flipped indices.
# The fpr_llm / tpr_llm / llm_label keywords are left at their defaults: the
# function never reads them, and no fpr_llm, tpr_llm or llm_labels variables
# are defined in the visible notebook cells.
plot_roc_curve_for_sycophancy(rapid_datainf_less, rapid_datainf_more, rapid_datainf_total, flipped_indices, "Sycophancy Bias")
