In [1]:
import sys
import os
from typing import Optional
from tqdm import tqdm
# Put the notebook's parent directory on the import path; the `utils.*` imports
# in this cell are resolved from there (presumably the repository root).
sys.path.append(os.path.abspath(os.pardir))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import seaborn as sns
import os
from IPython.display import display, HTML
import torch

from utils.plot_utils import *
from utils.utils import *

# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [2]:
## Finite value used in place of a -inf log-prob.
## It is passed as the third argument to pt_to_df in experiment_dir_to_table
## (presumably substituted for -inf entries there -- confirm in utils).
count_inf = -10

## Helper functions

In [3]:
def _tpr_by_split(jailbreak_df, benign_df, column, name, fpr):
    """Return one row per ``original_split``: the ``compute_tprs`` result at ``fpr``.

    For every jailbreak split, the values of ``column`` are passed to
    ``compute_tprs`` together with the benign values of the same column; the
    row whose ``FPRs`` equals ``fpr`` is kept. The resulting Series is named
    ``name`` and indexed by split only.
    """
    benign_scores = benign_df[column]
    return (
        jailbreak_df
        .groupby("original_split")[column]
        .apply(
            # NOTE: `.loc[fpr]` is an exact label lookup on the "FPRs" index, so
            # `fpr` must be one of the values compute_tprs actually emits.
            lambda scores: compute_tprs(np.array(scores), benign_scores)
            .set_index("FPRs")
            .loc[fpr]
        )
        .rename(name)
        # apply() yields a (split, <compute_tprs column>) MultiIndex; keep the split level only.
        .droplevel(1)
    )


def experiment_dir_to_table(experiment_dir, fpr=0.01):
    """Display per-split detection tables for one experiment directory.

    Loads ``{aligned,unaligned}_{jailbreak,benign}_logprobs.pt`` from
    ``experiment_dir``, converts each to a DataFrame with ``pt_to_df`` (using the
    module-level ``count_inf``), derives "user" and "asst" score frames with
    ``return_diffs`` and, for each of the ``diff``, ``aligned_logprobs`` and
    ``unaligned_logprobs`` columns, looks up the ``compute_tprs`` value at the
    requested false-positive rate for every jailbreak split.

    Two tables are shown with ``display``: the full table (rows = score type,
    columns = jailbreak split) and a subset restricted to the generation-based
    rows for AutoDAN, GCG, Multi-Turn and Prefill.

    Parameters
    ----------
    experiment_dir : str
        Directory containing the four ``*_logprobs.pt`` files.
    fpr : float, default 0.01
        False-positive rate at which the ``compute_tprs`` output is read.
        Must match a value present in its ``FPRs`` column.

    Returns
    -------
    None
        The tables are displayed, not returned.

    Raises
    ------
    KeyError
        If ``fpr`` is not in the ``FPRs`` index, or if one of the rows/columns
        selected for the second table is missing.
    """
    # Load and convert in the order: aligned/jailbreak, aligned/benign,
    # unaligned/jailbreak, unaligned/benign.
    frames = {}
    for model in ("aligned", "unaligned"):
        for kind in ("jailbreak", "benign"):
            logprobs = torch.load(
                f"{experiment_dir}/{model}_{kind}_logprobs.pt", weights_only=True
            )
            frames[model, kind] = pt_to_df(None, logprobs, count_inf)

    diff_args = (
        frames["unaligned", "jailbreak"],
        frames["aligned", "jailbreak"],
        frames["unaligned", "benign"],
        frames["aligned", "benign"],
    )
    user_jb, user_benign = return_diffs(*diff_args, prefix="user")
    asst_jb, asst_benign = return_diffs(*diff_args, prefix="asst")

    # One Series per (score column, user/asst) pair, user first, as before.
    lst_of_columns = []
    for column in ["diff", "aligned_logprobs", "unaligned_logprobs"]:
        lst_of_columns.append(
            _tpr_by_split(user_jb, user_benign, column, f"user_{column}", fpr)
        )
        lst_of_columns.append(
            _tpr_by_split(asst_jb, asst_benign, column, f"asst_{column}", fpr)
        )

    row_labels = {
        "user_diff": "prompt logprobs diff",
        "asst_diff": "gen logprobs diff",
        "user_aligned_logprobs": "prompt logprobs aligned",
        "asst_aligned_logprobs": "gen logprobs aligned",
        "user_unaligned_logprobs": "prompt logprobs unaligned",
        "asst_unaligned_logprobs": "gen logprobs unaligned",
    }
    split_labels = {
        "harmful_autodan": "AutoDAN",
        # Fix: the split is labelled "harmful_best_of_n" in the data, so the old
        # "best_of_n" key never matched and rename() silently left the raw name.
        # The legacy key is kept in case some result set uses the short label.
        "harmful_best_of_n": "Best of N",
        "best_of_n": "Best of N",
        "harmful_gcg": "GCG",
        "harmful_human_mt": "Multi-Turn",
        "harmful_misc": "Misc",
        "harmful_msj": "MSJ",
        "harmful_pair": "Pair",
        "harmful_prefill": "Prefill",
    }

    df_total = pd.concat(lst_of_columns, axis=1).T.rename(
        index=row_labels, columns=split_labels
    )
    display(df_total)
    display(
        df_total.loc[
            ["gen logprobs diff", "gen logprobs aligned"],
            ["AutoDAN", "GCG", "Multi-Turn", "Prefill"],
        ]
    )

## Abliterated model (as unaligned) vs aligned model

In [4]:
# Aligned: Llama-3.1-8B-Instruct; unaligned: the abliterated adapter variant.
# Adjacent literals are concatenated into a single path, split here by run-name field.
experiment_dir_to_table(
    experiment_dir=(
        "../results/JailbreakLikelihoodRatio/"
        "JailbreakLikelihoodRatio"
        "_aligned=meta-llama-Llama-3.1-8B-Instruct"
        "_unaligned=grimjim-Llama-3.1-8B-Instruct-abliterated_via_adapter"
        "_dataset=Mechanistic-Anomaly-Detection-llama3-jailbreaks"
        "_num_samples=None"
    )
)

original_split,AutoDAN,harmful_best_of_n,GCG,Multi-Turn,Misc,MSJ,Pair,Prefill
prompt logprobs diff,0.0,0.073298,0.03,0.0,0.0625,0.0,0.026515,0.035533
gen logprobs diff,0.082051,0.04712,0.03,0.0,0.1,0.0,0.060606,0.050761
prompt logprobs aligned,0.276923,0.010471,0.09,0.003436,0.0,0.0,0.0,0.086294
gen logprobs aligned,0.205128,0.136126,0.15,0.570447,0.2,0.00625,0.015152,0.416244
prompt logprobs unaligned,0.276923,0.010471,0.09,0.0,0.0,0.0,0.0,0.101523
gen logprobs unaligned,0.210256,0.188482,0.12,0.560137,0.2,0.0,0.030303,0.360406


original_split,AutoDAN,GCG,Multi-Turn,Prefill
gen logprobs diff,0.082051,0.03,0.0,0.050761
gen logprobs aligned,0.205128,0.15,0.570447,0.416244


## Base model (as unaligned) vs aligned model

In [5]:
# Aligned: Llama-3.1-8B-Instruct; unaligned: Llama-3.1-8B.
# Adjacent literals are concatenated into a single path, split here by run-name field.
experiment_dir_to_table(
    experiment_dir=(
        "../results/JailbreakLikelihoodRatio/"
        "JailbreakLikelihoodRatio"
        "_aligned=meta-llama-Llama-3.1-8B-Instruct"
        "_unaligned=meta-llama-Llama-3.1-8B"
        "_dataset=Mechanistic-Anomaly-Detection-llama3-jailbreaks"
        "_num_samples=None"
    )
)

original_split,AutoDAN,harmful_best_of_n,GCG,Multi-Turn,Misc,MSJ,Pair,Prefill
prompt logprobs diff,0.0,0.020942,0.0,0.0,0.0125,0.0,0.0,0.010152
gen logprobs diff,0.0,0.015707,0.02,0.0,0.0,0.0,0.0,0.0
prompt logprobs aligned,0.276923,0.010471,0.09,0.003436,0.0,0.0,0.0,0.086294
gen logprobs aligned,0.205128,0.136126,0.15,0.570447,0.2,0.00625,0.015152,0.416244
prompt logprobs unaligned,0.133333,0.031414,0.11,0.003436,0.0,0.0,0.003788,0.147208
gen logprobs unaligned,0.0,0.031414,0.05,0.109966,0.0375,0.0,0.0,0.126904


original_split,AutoDAN,GCG,Multi-Turn,Prefill
gen logprobs diff,0.0,0.02,0.0,0.0
gen logprobs aligned,0.205128,0.15,0.570447,0.416244
