In [None]:
import sys
import os
from typing import Optional
from tqdm import tqdm
# Put the parent directory on sys.path; the `utils.*` imports below presumably
# resolve from there, so this line has to run before them.
sys.path.append(os.path.abspath('../'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import seaborn as sns
# NOTE(review): `os` is already imported at the top of this cell; this repeat is redundant.
import os
from IPython.display import display, HTML
import torch

# NOTE(review): pt_to_df, return_diffs and compute_tprs are used later in this
# notebook but are not defined in it; presumably they arrive via these star imports.
from utils.plot_utils import *
from utils.utils import *

# Let pandas display up to 500 rows before truncating DataFrame output.
pd.set_option('display.max_rows', 500)

In [None]:
# Finite value to count in place of a -inf log-probability.
# experiment_dir_to_table passes it as the third argument to pt_to_df.
count_inf = -10

## Helper functions

In [None]:
def experiment_dir_to_table(experiment_dir: str, fpr: float = 0.01,
                            inf_logprob: Optional[float] = None) -> pd.DataFrame:
    """Summarise one experiment directory as a table of ``compute_tprs`` values at a fixed FPR.

    Loads the four ``*_logprobs.pt`` files found under ``experiment_dir``
    (aligned/unaligned x jailbreak/benign), converts each with ``pt_to_df``,
    derives the prompt-side (``prefix="user"``) and generation-side
    (``prefix="asst"``) frames with ``return_diffs`` and then, for every score
    column and every ``original_split`` group of the jailbreak frame, picks the
    ``compute_tprs`` row whose ``FPRs`` label equals ``fpr``.

    Args:
        experiment_dir: Directory holding ``aligned_jailbreak_logprobs.pt``,
            ``aligned_benign_logprobs.pt``, ``unaligned_jailbreak_logprobs.pt``
            and ``unaligned_benign_logprobs.pt``.
        fpr: Label looked up in the ``FPRs`` index of the ``compute_tprs``
            output. The lookup is an exact label match, so the value has to be
            present in that output. Defaults to ``0.01``, the value that used
            to be hard-coded.
        inf_logprob: Value forwarded to ``pt_to_df`` as its third argument.
            When ``None`` the notebook-level ``count_inf`` is used, which is
            what the function always did before this parameter existed.

    Returns:
        A DataFrame with one row per (side, score) pair, relabelled
        "prompt logprobs diff", "gen logprobs diff", ... and one column per
        ``original_split`` value; the known ``harmful_*`` split names are
        mapped to display names, any other name is left untouched.

    Raises:
        KeyError: If ``fpr`` is not one of the ``FPRs`` labels.
    """
    if inf_logprob is None:
        # Resolved at call time so later edits to the config cell are still honoured.
        inf_logprob = count_inf

    def _load_logprobs_df(stem):
        # Read one saved tensor file and convert it to a frame.
        tensor = torch.load(f"{experiment_dir}/{stem}_logprobs.pt", weights_only=True)
        return pt_to_df(None, tensor, inf_logprob)

    aligned_jb_df = _load_logprobs_df("aligned_jailbreak")
    aligned_benign_df = _load_logprobs_df("aligned_benign")
    unaligned_jb_df = _load_logprobs_df("unaligned_jailbreak")
    unaligned_benign_df = _load_logprobs_df("unaligned_benign")

    # One (jailbreak, benign) pair of frames per side of the conversation.
    frames_by_side = {
        side: return_diffs(unaligned_jb_df, aligned_jb_df,
                           unaligned_benign_df, aligned_benign_df, prefix=side)
        for side in ("user", "asst")
    }

    def _rate_per_split(side, column):
        # For each jailbreak split, score it against the whole benign set and
        # keep the compute_tprs row at the requested FPR.
        jb_df, benign_df = frames_by_side[side]
        return (
            jb_df
            .groupby("original_split")[column]
            .apply(lambda scores: compute_tprs(np.array(scores), benign_df[column])
                   .set_index("FPRs").loc[fpr])
            .rename(f"{side}_{column}")
            # apply() returned a row per group, so the result has a second index
            # level (presumably the row's own labels); keep only the split name.
            .droplevel(1)
        )

    # Column-major order (user then asst for each score) fixes the row order of the table.
    per_method = [
        _rate_per_split(side, column)
        for column in ("diff", "aligned_logprobs", "unaligned_logprobs")
        for side in ("user", "asst")
    ]

    row_labels = {
        "user_diff": "prompt logprobs diff",
        "asst_diff": "gen logprobs diff",
        "user_aligned_logprobs": "prompt logprobs aligned",
        "asst_aligned_logprobs": "gen logprobs aligned",
        "user_unaligned_logprobs": "prompt logprobs unaligned",
        "asst_unaligned_logprobs": "gen logprobs unaligned",
    }
    split_labels = {
        "harmful_autodan": "AutoDAN",
        "harmful_best_of_n": "Best of N",
        "harmful_gcg": "GCG",
        "harmful_human_mt": "Multi-Turn",
        "harmful_misc": "Misc",
        "harmful_msj": "MSJ",
        "harmful_pair": "Pair",
        "harmful_prefill": "Prefill",
    }

    # Series are stacked as columns, then transposed so that methods become rows.
    return pd.concat(per_method, axis=1).T.rename(index=row_labels, columns=split_labels)

## Jailbreak vs aligned

In [None]:
# Table for the run named in the path (aligned=Llama-3.1-8B-Instruct,
# unaligned=abliterated adapter variant). rename_axis(columns=None) clears the
# name of the columns axis on the returned frame.
df_total = experiment_dir_to_table(
    "../results/JailbreakLikelihoodRatio/JailbreakLikelihoodRatio_aligned=meta-llama-Llama-3.1-8B-Instruct_unaligned=grimjim-Llama-3.1-8B-Instruct-abliterated_via_adapter_dataset=Mechanistic-Anomaly-Detection-llama3-jailbreaks_num_samples=None"
).rename_axis(columns=None)
df_total

In [None]:
# Per-row mean across the listed attack columns.
# NOTE(review): "Best of N" is not in this list -- confirm the omission is intentional.
df_total.loc[
    :, ["GCG", "AutoDAN", "MSJ", "Pair", "Multi-Turn", "Prefill", "Misc"]
].mean(axis="columns")

In [None]:
# Subset of the table: five attack columns, three of the six scoring rows.
df_total[
    ["AutoDAN", "Best of N", "GCG", "Multi-Turn", "Prefill"]
].loc[
    ["gen logprobs diff", "prompt logprobs aligned", "gen logprobs aligned"]
]