In [1]:
import sys
import os
from typing import Optional
from tqdm import tqdm
# Make the parent directory importable (presumably where the project-local
# `utils` package imported below lives -- confirm against the repo layout).
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
_parent_dir = os.path.abspath('../')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import seaborn as sns
import os
from IPython.display import display, HTML
import torch

from utils.plot_utils import *
from utils.utils import *

# Upper bound on the number of rows pandas renders before truncating output.
MAX_DISPLAY_ROWS = 500
pd.set_option('display.max_rows', MAX_DISPLAY_ROWS)

In [3]:
# Value forwarded to `pt_to_df` as its third argument in the conversion cell.
# NOTE(review): the name suggests a stand-in for infinite log-probs -- confirm
# against the implementation of pt_to_df.
count_inf = -100

# Results directory of the run being analysed. The path is split at its
# "_key=value" segments purely for readability; adjacent literals are joined
# by the parser, so the resulting string is unchanged.
experiment_dir = (
    "../results/JailbreakLikelihoodRatio/"
    "JailbreakLikelihoodRatio"
    "_aligned=meta-llama-Llama-3.1-8B-Instruct"
    "_unaligned=grimjim-Llama-3.1-8B-Instruct-abliterated_via_adapter"
    "_dataset=Mechanistic-Anomaly-Detection-llama3-jailbreaks"
    "_num_samples=None"
)

In [4]:
def _load_logprobs(path):
    """Load one saved `.pt` file with `torch.load` in `weights_only=True` mode."""
    return torch.load(path, weights_only=True)


# Files named for the aligned model: jailbreak prompts, then benign prompts.
aligned_jailbreak_logprobs = _load_logprobs(f"{experiment_dir}/aligned_jailbreak_logprobs.pt")
aligned_benign_logprobs = _load_logprobs(f"{experiment_dir}/aligned_benign_logprobs.pt")

# Files named for the unaligned model, in the same order.
unaligned_jailbreak_logprobs = _load_logprobs(f"{experiment_dir}/unaligned_jailbreak_logprobs.pt")
unaligned_benign_logprobs = _load_logprobs(f"{experiment_dir}/unaligned_benign_logprobs.pt")

In [5]:
# Show which fields are stored on the first loaded jailbreak record.
aligned_jailbreak_logprobs[0].keys()

dict_keys(['prompt', 'completion', 'original_split', 'context', 'input_tokens', 'response_tokens', 'response_string', 'no', 'user_logprobs', 'asst_logprobs', 'gen_logprobs'])

## TPRs and FPRs between benign prompts and jailbreaks

In [6]:
def _records_to_df(records):
    """Convert one loaded log-prob collection with `pt_to_df`, using the shared `count_inf`."""
    # NOTE(review): the first positional argument of pt_to_df is None for every
    # call in this notebook; its meaning is not visible here -- confirm in utils.
    return pt_to_df(None, records, count_inf)


# Jailbreak prompts: aligned model first, then unaligned.
aligned_jb_logprobs_df = _records_to_df(aligned_jailbreak_logprobs)
unaligned_jb_logprobs_df = _records_to_df(unaligned_jailbreak_logprobs)

# Benign prompts, in the same model order.
aligned_benign_logprobs_df = _records_to_df(aligned_benign_logprobs)
unaligned_benign_logprobs_df = _records_to_df(unaligned_benign_logprobs)

## User (prompt) logprobs

In [7]:
# Build the comparison frame for the "user" log-probs from the four
# per-model / per-split frames.
user_df_diffs = return_diffs(
    unaligned_jb_logprobs_df,
    aligned_jb_logprobs_df,
    unaligned_benign_logprobs_df,
    aligned_benign_logprobs_df,
    prefix="user",
)

# Summary statistics at a fixed grid of percentiles, transposed so that each
# column of the frame becomes one row of the displayed table.
summary_percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
user_summary = user_df_diffs.describe(percentiles=summary_percentiles).T
display(user_summary)



Unnamed: 0,count,mean,std,min,1%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%,99%,max
jailbreak_diff,1478.0,-0.057818,0.964487,-8.046378,-2.705907,-1.672511,-1.210702,-0.688714,-0.38071,-0.155846,-0.028578,0.057189,0.339704,0.69438,1.030601,2.072257,4.656753
benign_diff,3979.0,-0.045885,0.897249,-12.576347,-2.167499,-1.451963,-1.095561,-0.726834,-0.391436,-0.287766,-0.018869,0.219791,0.34477,0.644738,1.027403,2.060069,4.434785
aligned_jailbreak_logprobs,1478.0,-17.913033,8.345261,-53.660366,-39.03275,-33.732262,-29.204985,-25.449543,-23.024775,-16.976578,-15.028631,-13.92781,-12.876577,-11.256769,-8.427996,-5.329,-3.521814
unaligned_jailbreak_logprobs,1478.0,-17.970851,8.033484,-50.252476,-38.746394,-33.14567,-28.499713,-25.466946,-22.810515,-17.335542,-15.478501,-14.166482,-13.011414,-11.500716,-8.924778,-5.5071,-3.760647
aligned_benign_logprobs,3979.0,-12.273098,6.709303,-54.805164,-33.288954,-26.960443,-22.391914,-16.047569,-13.202096,-11.867305,-10.902709,-9.873618,-8.211651,-6.590675,-5.433239,-3.893397,-3.027345
unaligned_benign_logprobs,3979.0,-12.318983,6.517159,-50.370379,-32.390563,-26.520209,-22.130861,-15.949628,-13.220849,-11.921374,-10.937098,-9.864261,-8.375627,-6.87096,-5.761764,-4.21111,-3.118894


In [16]:
# ROC table using the per-prompt log-prob difference as the score.
# NOTE(review): the plot title implies the diff is (unaligned - aligned);
# confirm the sign convention in return_diffs.
jailbreak_scores = user_df_diffs["jailbreak_diff"]
benign_scores = user_df_diffs["benign_diff"]
tprs_fprs = plot_ROC(
    jailbreak_scores,
    benign_scores,
    plot_title="(Unaligned - Aligned) prompt logprobs",
)
plt.close()  # close the current matplotlib figure before showing the table
display(tprs_fprs)

Unnamed: 0,FPRs,TPRs
0,0.01,0.025034
1,0.05,0.067659
2,0.1,0.112991
3,0.2,0.179973
4,0.3,0.296346
5,0.4,0.372124
6,0.5,0.516915
7,0.6,0.635318
8,0.7,0.70433
9,0.8,0.778755


In [17]:
# ROC table using the aligned-model prompt log-probs alone as the score.
jailbreak_scores = user_df_diffs["aligned_jailbreak_logprobs"]
benign_scores = user_df_diffs["aligned_benign_logprobs"]
tprs_fprs = plot_ROC(
    jailbreak_scores,
    benign_scores,
    plot_title="Prompt aligned logprobs",
)
plt.close()  # close the current matplotlib figure before showing the table
display(tprs_fprs)

Unnamed: 0,FPRs,TPRs
0,0.01,0.052097
1,0.05,0.165088
2,0.1,0.305819
3,0.2,0.439783
4,0.3,0.663735
5,0.4,0.769959
6,0.5,0.815968
7,0.6,0.85521
8,0.7,0.90866
9,0.8,0.966847


In [18]:
# ROC table using the unaligned-model prompt log-probs alone as the score.
jailbreak_scores = user_df_diffs["unaligned_jailbreak_logprobs"]
benign_scores = user_df_diffs["unaligned_benign_logprobs"]
tprs_fprs = plot_ROC(
    jailbreak_scores,
    benign_scores,
    plot_title="Prompt unaligned logprobs",
)
plt.close()  # close the current matplotlib figure before showing the table
display(tprs_fprs)

Unnamed: 0,FPRs,TPRs
0,0.01,0.054804
1,0.05,0.167118
2,0.1,0.306495
3,0.2,0.468877
4,0.3,0.681326
5,0.4,0.772666
6,0.5,0.833559
7,0.6,0.865359
8,0.7,0.914073
9,0.8,0.964817


## Assistant (completion) logprobs

In [19]:
# Build the comparison frame for the "asst" log-probs from the four
# per-model / per-split frames.
asst_df_diffs = return_diffs(
    unaligned_jb_logprobs_df,
    aligned_jb_logprobs_df,
    unaligned_benign_logprobs_df,
    aligned_benign_logprobs_df,
    prefix="asst",
)

# Summary statistics at a fixed grid of percentiles, transposed so that each
# column of the frame becomes one row of the displayed table.
summary_percentiles = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
asst_summary = asst_df_diffs.describe(percentiles=summary_percentiles).T
display(asst_summary)

# ROC table using the per-example log-prob difference as the score.
jailbreak_scores = asst_df_diffs["jailbreak_diff"]
benign_scores = asst_df_diffs["benign_diff"]
tprs_fprs = plot_ROC(
    jailbreak_scores,
    benign_scores,
    plot_title="(Unaligned - Aligned) completion logprobs",
)
plt.close()  # close the current matplotlib figure before showing the table
display(tprs_fprs)

Unnamed: 0,count,mean,std,min,1%,5%,10%,20%,30%,40%,50%,60%,70%,80%,90%,99%,max
jailbreak_diff,1478.0,-0.39266,1.345801,-11.604457,-3.993737,-2.417292,-1.764826,-1.211485,-0.823709,-0.490693,-0.388477,-0.03067,0.001403,0.37944,0.801106,3.591631,9.883117
benign_diff,3979.0,-0.411234,1.004984,-21.627412,-2.740566,-1.976431,-1.583464,-1.183968,-0.80437,-0.439107,-0.399468,-0.02852,-0.008056,0.354352,0.644244,1.877975,12.15814
aligned_jailbreak_logprobs,1478.0,-5.275814,5.060966,-45.424496,-26.61794,-15.917711,-11.418883,-6.576049,-5.103877,-4.297399,-3.70547,-3.2851,-2.85227,-2.280477,-1.553817,-0.006884,0.0
unaligned_jailbreak_logprobs,1478.0,-5.668473,5.055192,-45.433898,-27.687507,-16.291899,-11.526366,-7.104004,-5.567976,-4.809169,-4.267492,-3.76102,-3.2301,-2.701884,-1.880727,-0.008241,0.0
aligned_benign_logprobs,3979.0,-2.324109,1.41981,-28.656919,-6.130097,-4.255643,-3.782436,-3.083433,-2.710116,-2.533443,-2.18468,-1.848914,-1.727219,-1.367883,-0.972205,-0.077061,0.0
unaligned_benign_logprobs,3979.0,-2.735342,1.548682,-28.637563,-6.50447,-4.997124,-4.276778,-3.742964,-3.326134,-2.970982,-2.606488,-2.261866,-2.030845,-1.716267,-1.176477,-0.079622,0.0


Unnamed: 0,FPRs,TPRs
0,0.01,0.041949
1,0.05,0.08728
2,0.1,0.133965
3,0.2,0.223951
4,0.3,0.326793
5,0.4,0.427605
6,0.5,0.479026
7,0.6,0.606901
8,0.7,0.663735
9,0.8,0.773342


In [20]:
# ROC table using the aligned-model "asst" log-probs alone as the score.
jailbreak_scores = asst_df_diffs["aligned_jailbreak_logprobs"]
benign_scores = asst_df_diffs["aligned_benign_logprobs"]
tprs_fprs = plot_ROC(
    jailbreak_scores,
    benign_scores,
    plot_title="Asst aligned logprobs",
)
plt.close()  # close the current matplotlib figure before showing the table
display(tprs_fprs)

Unnamed: 0,FPRs,TPRs
0,0.01,0.223275
1,0.05,0.407307
2,0.1,0.490528
3,0.2,0.634641
4,0.3,0.713802
5,0.4,0.76793
6,0.5,0.833559
7,0.6,0.872124
8,0.7,0.894452
9,0.8,0.931664


In [21]:
# ROC table using the unaligned-model "asst" log-probs alone as the score.
jailbreak_scores = asst_df_diffs["unaligned_jailbreak_logprobs"]
benign_scores = asst_df_diffs["unaligned_benign_logprobs"]
tprs_fprs = plot_ROC(
    jailbreak_scores,
    benign_scores,
    plot_title="Asst unaligned logprobs",
)
plt.close()  # close the current matplotlib figure before showing the table
display(tprs_fprs)

Unnamed: 0,FPRs,TPRs
0,0.01,0.23207
1,0.05,0.379567
2,0.1,0.494587
3,0.2,0.601488
4,0.3,0.687415
5,0.4,0.763194
6,0.5,0.828823
7,0.6,0.867388
8,0.7,0.892422
9,0.8,0.917456
