In [1]:
import os
if "models" not in os.listdir("."):
    os.chdir("..")

In [2]:
%load_ext autoreload
%autoreload 2
import penzai
from penzai import pz
pz.ts.register_as_default()
pz.ts.register_autovisualize_magic()
pz.enable_interactive_context()

In [3]:
from micrlhf.llama import LlamaTransformer
llama = LlamaTransformer.from_pretrained("models/gemma-2b-it.gguf",
                                         from_type="gemma",
                                         load_eager=True
                                         )

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("alpindale/gemma-2b")
tokenizer.padding_side = "right"

In [5]:
!mkdir -p data
import pandas as pd

df_adv = pd.read_csv("data/adv.csv")

format_prompt = """<start_of_turn>user\n
{}\n
<start_of_turn>model\n
{}"""
# offset = 1
# df_do = df.apply(lambda x: format_response.format(x['goal'], x['target']), axis=1)
# prompts_harmful = df.apply(lambda x: format_prompt.format(x['goal'], "")[:-offset], axis=1).to_list()[:100]
prompts_harmful = df_adv.apply(lambda x: format_prompt.format(x['goal'], ""), axis=1).to_list()[:100]
dataset_jail = pd.read_csv("data/jail.csv").apply(lambda x: x["Goal"], axis=1).to_list()
prompts_jail = [format_prompt.format(x, "") for x in dataset_jail]
import datasets
# https://colab.research.google.com/drive/1a-aQvKC9avdZpdyBn4jgRQFObTPy1JZw
hf_path = 'tatsu-lab/alpaca'
dataset = datasets.load_dataset(hf_path)
# filter for instructions that do not have inputs
prompts_harmless = []
for i in range(len(dataset['train'])):
    if len(prompts_harmless) >= len(prompts_harmful):
        break
    if dataset['train'][i]['input'].strip() == '':
        # prompts_harmless.append(format_prompt.format(dataset['train'][i]['instruction'], "")[:-offset])
        prompts_harmless.append(format_prompt.format(dataset['train'][i]['instruction'], ""))

# ds = datasets.load_dataset("MBZUAI/LaMini-instruction", split="train", streaming=True)
# prompts_harmless = []
# for _, text in zip(range(100), ds):
#     prompts_harmless.append(format_prompt.format(text["instruction"], ""))

# ds = datasets.load_dataset("nev/openhermes-2.5-phi-format-text", split="train", streaming=True)
# prompts_harmless = []
# for _, text in zip(range(100), ds):
#     text = text["text"]
#     text = "".join(text.partition("<|assistant|>\n")[:2])
#     prompts_harmless.append(text)

  pid, fd = os.forkpty()


In [6]:
from micrlhf.sampling import sample
msl = 128
completions_harmless = sample(llama, tokenizer, prompts_harmless, max_seq_len=msl, do_sample=True, verbose=True, return_only_completion=True)[0]
ds_harmless = [(prompt, completion.partition("<eos>")[0].strip()) for prompt, completion in zip(prompts_harmless, completions_harmless)]

  0%|          | 0/81 [00:00<?, ?it/s]

In [7]:
from micrlhf.sampling import sample, trange, jnp, load_tokenizer, jit_wrapper
import jax

tokens = tokenizer.batch_encode_plus(prompts_harmful + prompts_harmless,
                                     return_tensors="np",
                                     padding="max_length",
                                     truncation=True,
                                     max_length=128,
                                     return_attention_mask=True)
token_array = jnp.asarray(tokens["input_ids"])
token_array = jax.device_put(token_array, jax.sharding.NamedSharding(llama.mesh, jax.sharding.PartitionSpec("dp", "sp")))
token_array = pz.nx.wrap(token_array, "batch", "seq").untag("batch").tag("batch")
inputs = llama.inputs.from_basic_segments(token_array)

In [8]:
# n_iterations = 50
# batch_size = 200
# max_length = 128
# direction = residiffs[13]
# for i in (bar := trange(n_iterations)):
#     data_harmful = df_adv.sample(batch_size).apply(lambda x: (
#         tokenizer.encode(format_prompt.format(x["goal"], "")),
#         tokenizer.encode(x["target"])[1:],), axis=1).to_list()
#     input_ids = [x + y for x, y in data_harmful]
#     input_ids = [(x + [tokenizer.pad_token_id] * max(0, max_length - len(x)))[:max_length] for x in input_ids]
#     loss_mask = [[0] * len(x) + [1] * len(y) for x, y in data_harmful]
#     loss_mask = [(x + [0] * (max_length - len(x)))[:max_length] for x in loss_mask]

#     input_ids = pz.nx.wrap(jnp.asarray(input_ids), "batch", "seq")
#     loss_mask = pz.nx.wrap(jnp.asarray(loss_mask), "batch", "seq")

#     loss, grad = lwg(direction, llama, input_ids, loss_mask)
#     direction -= 0.01 * grad
#     bar.set_postfix(loss=float(loss))

In [22]:
from micrlhf.utils.activation_manipulation import ablate_direction
from equinox.internal._loop import scan
from penzai.toolshed import basic_training
from functools import partial
from penzai.toolshed.lora import loraify_linears_in_selection
import optax

select_params = lambda x: x.select().at_instances_of(pz.nn.Linear).where(lambda x: x.weights.name.endswith(".out_proj.weights"))
@partial(jax.jit, static_argnames=("normalize", "batch_axis"))
def get_loss(model, rng, state, input_ids, loss_mask, normalize=True, batch_axis="direction", loss_mul=1.0):
    del rng, state
    
    inputs = model.inputs.from_basic_segments(input_ids)
    logits = model(inputs)
    loss = pz.nx.nmap(lambda l, i, m: -jnp.take_along_axis(jax.nn.log_softmax(l[:-1].astype(jnp.float32), -1), i[1:, None], -1)[:, 0] * m[1:] / jnp.maximum(1, m[1:].sum()))(
        logits.untag("seq", "vocabulary"), input_ids.untag("seq"), loss_mask.untag("seq")).sum().unwrap("batch").mean()
    return loss * loss_mul, None, {"loss": loss}
train_step = basic_training.build_train_step_fn(get_loss, donate_params_and_state=False)

@partial(jax.jit)
def get_loss_alternative(model, rng, state, input_ids, loss_mask):
    base_model = model
    
    data_arr = {"input_ids": input_ids, "loss_mask": loss_mask}
    pp = input_ids.named_shape["batch"] // 4
    data_arr_train = {k: v.untag("batch")[:pp*3].tag("batch") for k, v in data_arr.items()}
    data_arr_harmful = {k: v.untag("batch")[pp * 3:].tag("batch") for k, v in data_arr.items()}
    data_arr_harmless = {k: v.untag("batch")[:pp].tag("batch") for k, v in data_arr.items()}

    frozen_new_llama = model.select().at_instances_of(pz.nn.Parameter).apply(lambda param: pz.nn.FrozenParameter(param.value, param.name))
    linears = select_params(frozen_new_llama)
    llama_w_lora_uninit = loraify_linears_in_selection(linears, rank=16)
    llama_w_lora = pz.nn.initialize_parameters(llama_w_lora_uninit, rng)
    temp_train_state = basic_training.TrainState.initial_state(
        llama_w_lora,
        optax.chain(
            optax.zero_nans(),
            optax.clip_by_global_norm(1.0),
            optax.sgd(5e-1)
        ),
        root_rng=rng
    )
    def body_fn(state, i):
        return train_step(state, **data_arr_train)[0], None
    # temp_train_state, _ = scan(body_fn, temp_train_state, None, length=3, kind="checkpointed", checkpoints=1
    #                         #    buffers=lambda state: state.params
    #                            )
    temp_train_state, _ = jax.lax.scan(jax.checkpoint(body_fn), temp_train_state, None, length=3,
                        #    buffers=lambda state: state.params
                            )
    model = temp_train_state.model
    
    loss_harmless = get_loss(base_model, rng, state, **data_arr_harmless)[0]
    loss_baseline = get_loss(base_model, rng, state, **data_arr_harmful)[0]
    loss_harmful = get_loss(model, rng, state, **data_arr_harmful)[0]
    loss = jax.nn.relu(jax.lax.stop_gradient(loss_baseline) - loss_harmful) + loss_harmless
    return loss, None, {"loss": loss}

train_step_alternative = basic_training.build_train_step_fn(get_loss_alternative, donate_params_and_state=True)

In [23]:
from dataclasses import replace
from micrlhf.utils.vector_storage import download_vector
import random
n_iterations = 100
batch_size = 8
max_length = 128
frozen_llama = llama.select().at_instances_of(pz.nn.Parameter).apply(lambda param: pz.nn.FrozenParameter(param.value, param.name))
linears = select_params(frozen_llama)
llama_w_lora_uninit = loraify_linears_in_selection(linears, rank=16)
llama_w_lora = pz.nn.initialize_parameters(llama_w_lora_uninit, jax.random.key(0))
optimizer = optax.chain(optax.zero_nans(), optax.clip_by_global_norm(1.0), optax.adam(1e-3))
train_state = basic_training.TrainState.initial_state(
    llama_w_lora, optimizer, root_rng=jax.random.PRNGKey(1)
)

def data_to_array(data):
    input_ids = [x + y for x, y in data]
    input_ids = [(x + [tokenizer.pad_token_id] * max(0, max_length - len(x)))[:max_length] for x in input_ids]
    loss_mask = [[0] * len(x) + [1] * len(y) for x, y in data]
    loss_mask = [(x + [0] * (max_length - len(x)))[:max_length] for x in loss_mask]
    input_ids = pz.nx.wrap(jnp.asarray(input_ids), "batch", "seq")
    loss_mask = pz.nx.wrap(jnp.asarray(loss_mask), "batch", "seq")
    return dict(input_ids=input_ids, loss_mask=loss_mask)

def get_data_harmful():
    return df_adv.sample(batch_size).apply(lambda x: (
        tokenizer.encode(format_prompt.format(x["goal"], "")),
        tokenizer.encode(x["target"])[1:],), axis=1).to_list()

for i in (bar := trange(n_iterations)):
    data_harmful = get_data_harmful()
    data_harmless = [(tokenizer.encode(format_prompt.format(prompt, "")), tokenizer.encode(completion)[1:]) for prompt, completion in [(prompts_harmless[i], completions_harmless[i]) for i in random.sample(list(range(len(prompts_harmless))), k=batch_size)]]

    train_state, out = train_step_alternative(train_state, **data_to_array(data_harmless + data_harmful))
    print("Diff loss:", out["loss"])

    data_harmful = get_data_harmful()
    print("Harmful:", train_step(train_state, **data_to_array(data_harmful), loss_mul=0.0)[1]["loss"])
    print("Harmless:", train_step(train_state, **data_to_array(data_harmless), loss_mul=0.0)[1]["loss"])
    train_state_alt = train_state
    for j in trange(7):
        train_state_alt, out = train_step(train_state_alt, **data_to_array(data_harmful))
        print(f" Jailbreak ({i}, {j}):", out["loss"])

  0%|          | 0/100 [00:00<?, ?it/s]

Diff loss: 2.7704175
Harmful: 1.8316121
Harmless: 1.3432202


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (0, 0): 1.8316121
 Jailbreak (0, 1): 0.80744565
 Jailbreak (0, 2): 0.22266574
 Jailbreak (0, 3): 0.029896855
 Jailbreak (0, 4): 0.015266482
 Jailbreak (0, 5): 0.0043207062
 Jailbreak (0, 6): 0.010063058
Diff loss: 2.740958
Harmful: 2.776177
Harmless: 0.48991632


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (1, 0): 2.776177
 Jailbreak (1, 1): 0.7656863
 Jailbreak (1, 2): 0.23356661
 Jailbreak (1, 3): 0.029031012
 Jailbreak (1, 4): 0.0051075364
 Jailbreak (1, 5): 0.009361305
 Jailbreak (1, 6): 0.0005004694
Diff loss: 3.0112164
Harmful: 1.8004706
Harmless: 0.8352972


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (2, 0): 1.8004706
 Jailbreak (2, 1): 0.6831258
 Jailbreak (2, 2): 0.262758
 Jailbreak (2, 3): 0.11206656
 Jailbreak (2, 4): 0.035488598
 Jailbreak (2, 5): 0.02994756
 Jailbreak (2, 6): 0.006178624
Diff loss: 1.8908956
Harmful: 2.2389402
Harmless: 5.043002


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (3, 0): 2.2389402
 Jailbreak (3, 1): 0.84493107
 Jailbreak (3, 2): 0.3781135
 Jailbreak (3, 3): 0.16137575
 Jailbreak (3, 4): 0.044475023
 Jailbreak (3, 5): 0.029111985
 Jailbreak (3, 6): 0.008290621
Diff loss: 1.9087011
Harmful: 1.4648628
Harmless: 0.52307475


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (4, 0): 1.4648628
 Jailbreak (4, 1): 0.592919
 Jailbreak (4, 2): 0.22236793
 Jailbreak (4, 3): 0.05223029
 Jailbreak (4, 4): 0.00978276
 Jailbreak (4, 5): 0.0017000093
 Jailbreak (4, 6): 0.0003145808
Diff loss: 7.5018783
Harmful: 1.6188093
Harmless: 2.9091055


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (5, 0): 1.6188093
 Jailbreak (5, 1): 1.2417357
 Jailbreak (5, 2): 0.63285446
 Jailbreak (5, 3): 0.22311679
 Jailbreak (5, 4): 0.07757552
 Jailbreak (5, 5): 0.045012943
 Jailbreak (5, 6): 0.021651138
Diff loss: 2.249676
Harmful: 2.645308
Harmless: 0.87051344


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (6, 0): 2.645308
 Jailbreak (6, 1): 1.7075566
 Jailbreak (6, 2): 0.8784494
 Jailbreak (6, 3): 0.3213915
 Jailbreak (6, 4): 0.107223265
 Jailbreak (6, 5): 0.030256448
 Jailbreak (6, 6): 0.010264403
Diff loss: 3.8087666
Harmful: 4.2821336
Harmless: 1.8485858


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (7, 0): 4.2821336
 Jailbreak (7, 1): 2.726761
 Jailbreak (7, 2): 1.675366
 Jailbreak (7, 3): 1.0130688
 Jailbreak (7, 4): 0.44928914
 Jailbreak (7, 5): 0.1749166
 Jailbreak (7, 6): 0.035526782
Diff loss: 3.5158334
Harmful: 8.19337
Harmless: 1.7751902


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (8, 0): 8.19337
 Jailbreak (8, 1): 7.3145847
 Jailbreak (8, 2): 4.828895
 Jailbreak (8, 3): 3.083138
 Jailbreak (8, 4): 1.3949034
 Jailbreak (8, 5): 0.4209422
 Jailbreak (8, 6): 0.08218711
Diff loss: 8.781524
Harmful: 12.04012
Harmless: 1.191514


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (9, 0): 12.04012
 Jailbreak (9, 1): 9.203932
 Jailbreak (9, 2): 5.865096
 Jailbreak (9, 3): 3.1582437
 Jailbreak (9, 4): 1.8273721
 Jailbreak (9, 5): 0.51257885
 Jailbreak (9, 6): 0.21426767
Diff loss: 7.708196
Harmful: 12.262515
Harmless: 1.1481109


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (10, 0): 12.262515
 Jailbreak (10, 1): 7.8418818
 Jailbreak (10, 2): 6.4546366
 Jailbreak (10, 3): 3.550249
 Jailbreak (10, 4): 1.7513039
 Jailbreak (10, 5): 1.680819
 Jailbreak (10, 6): 0.4606584
Diff loss: 7.6319294
Harmful: 10.736332
Harmless: 1.131364


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (11, 0): 10.736332
 Jailbreak (11, 1): 22.769716
 Jailbreak (11, 2): 12.915846
 Jailbreak (11, 3): 7.059124
 Jailbreak (11, 4): 4.956552
 Jailbreak (11, 5): 2.7910645
 Jailbreak (11, 6): 4.048721
Diff loss: 1.6897019
Harmful: 24.123333
Harmless: 1.2468365


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (12, 0): 24.123333
 Jailbreak (12, 1): 14.823379
 Jailbreak (12, 2): 9.642332
 Jailbreak (12, 3): 6.6664495
 Jailbreak (12, 4): 4.2785683
 Jailbreak (12, 5): 2.6709328
 Jailbreak (12, 6): 1.4545927
Diff loss: 14.16772
Harmful: 30.818861
Harmless: 1.5335717


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (13, 0): 30.818861
 Jailbreak (13, 1): 14.463797
 Jailbreak (13, 2): 10.619553
 Jailbreak (13, 3): 7.5587063
 Jailbreak (13, 4): 4.880827
 Jailbreak (13, 5): 2.8092906
 Jailbreak (13, 6): 2.9719405
Diff loss: 25.0518
Harmful: 54.53585
Harmless: 1.670355


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (14, 0): 54.53585
 Jailbreak (14, 1): 26.22818
 Jailbreak (14, 2): 8.653418
 Jailbreak (14, 3): 5.334745
 Jailbreak (14, 4): 3.4577913
 Jailbreak (14, 5): 1.4382715
 Jailbreak (14, 6): 0.41378212
Diff loss: 60.204502
Harmful: 110.704025
Harmless: 2.5870032


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (15, 0): 110.704025
 Jailbreak (15, 1): 44.909027
 Jailbreak (15, 2): 17.196972
 Jailbreak (15, 3): 11.732525
 Jailbreak (15, 4): 7.742244
 Jailbreak (15, 5): 4.9320116
 Jailbreak (15, 6): 2.936048
Diff loss: 55.61356
Harmful: 107.92714
Harmless: 4.3769226


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (16, 0): 107.92714
 Jailbreak (16, 1): 33.562775
 Jailbreak (16, 2): 18.259134
 Jailbreak (16, 3): 13.865185
 Jailbreak (16, 4): 12.510936
 Jailbreak (16, 5): 9.913226
 Jailbreak (16, 6): 7.0531025
Diff loss: 74.46566
Harmful: 61.769764
Harmless: 5.739753


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (17, 0): 61.769764
 Jailbreak (17, 1): 17.013962
 Jailbreak (17, 2): 16.24809
 Jailbreak (17, 3): 16.052122
 Jailbreak (17, 4): 15.556425
 Jailbreak (17, 5): 14.702614
 Jailbreak (17, 6): 13.573695
Diff loss: 14.776322
Harmful: 47.83691
Harmless: 11.911954


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (18, 0): 47.83691
 Jailbreak (18, 1): 32.056976
 Jailbreak (18, 2): 23.024616
 Jailbreak (18, 3): 17.003
 Jailbreak (18, 4): 16.08461
 Jailbreak (18, 5): 15.202122
 Jailbreak (18, 6): 14.138491
Diff loss: 16.582783
Harmful: 62.629425
Harmless: 24.556065


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (19, 0): 62.629425
 Jailbreak (19, 1): 47.759117
 Jailbreak (19, 2): 31.439713
 Jailbreak (19, 3): 22.125141
 Jailbreak (19, 4): 16.665
 Jailbreak (19, 5): 15.447012
 Jailbreak (19, 6): 14.639336
Diff loss: 21.41582
Harmful: 58.85593
Harmless: 7.071905


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (20, 0): 58.85593
 Jailbreak (20, 1): 44.921387
 Jailbreak (20, 2): 23.514109
 Jailbreak (20, 3): 19.377691
 Jailbreak (20, 4): 17.315178
 Jailbreak (20, 5): 14.272176
 Jailbreak (20, 6): 11.702859
Diff loss: 9.670693
Harmful: 48.131775
Harmless: 1.8052137


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (21, 0): 48.131775
 Jailbreak (21, 1): 31.506844
 Jailbreak (21, 2): 16.353256
 Jailbreak (21, 3): 13.6238365
 Jailbreak (21, 4): 10.354135
 Jailbreak (21, 5): 6.9269533
 Jailbreak (21, 6): 4.1382256
Diff loss: 39.66823
Harmful: 69.890625
Harmless: 1.6174088


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (22, 0): 69.890625
 Jailbreak (22, 1): 55.854866
 Jailbreak (22, 2): 31.151703
 Jailbreak (22, 3): 20.53061
 Jailbreak (22, 4): 15.944075
 Jailbreak (22, 5): 13.796985
 Jailbreak (22, 6): 9.287928
Diff loss: 14.880746
Harmful: 78.13757
Harmless: 2.3569856


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (23, 0): 78.13757
 Jailbreak (23, 1): 37.289642
 Jailbreak (23, 2): 14.49294
 Jailbreak (23, 3): 9.704245
 Jailbreak (23, 4): 5.610384
 Jailbreak (23, 5): 3.3059044
 Jailbreak (23, 6): 1.377903
Diff loss: 1.8439662
Harmful: 73.821465
Harmless: 1.4862314


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (24, 0): 73.821465
 Jailbreak (24, 1): 52.720016
 Jailbreak (24, 2): 27.96076
 Jailbreak (24, 3): 9.650219
 Jailbreak (24, 4): 5.3702345
 Jailbreak (24, 5): 3.18649
 Jailbreak (24, 6): 1.4861994
Diff loss: 1.8231195
Harmful: 96.61864
Harmless: 1.0392065


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (25, 0): 96.61864
 Jailbreak (25, 1): 81.46861
 Jailbreak (25, 2): 53.092163
 Jailbreak (25, 3): 22.790146
 Jailbreak (25, 4): 15.6121855
 Jailbreak (25, 5): 14.801327
 Jailbreak (25, 6): 13.467501
Diff loss: 0.9973927
Harmful: 103.02728
Harmless: 0.59987974


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (26, 0): 103.02728
 Jailbreak (26, 1): 76.44786
 Jailbreak (26, 2): 45.926178
 Jailbreak (26, 3): 18.233784
 Jailbreak (26, 4): 13.706109
 Jailbreak (26, 5): 12.712587
 Jailbreak (26, 6): 11.921587
Diff loss: 0.87586313
Harmful: 107.73786
Harmless: 0.7693588


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (27, 0): 107.73786
 Jailbreak (27, 1): 86.832275
 Jailbreak (27, 2): 54.456066
 Jailbreak (27, 3): 24.820412
 Jailbreak (27, 4): 13.891872
 Jailbreak (27, 5): 13.200939
 Jailbreak (27, 6): 12.602276
Diff loss: 0.9285712
Harmful: 116.98035
Harmless: 0.72495586


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (28, 0): 116.98035
 Jailbreak (28, 1): 93.39729
 Jailbreak (28, 2): 59.978516
 Jailbreak (28, 3): 26.587261
 Jailbreak (28, 4): 13.282246
 Jailbreak (28, 5): 11.786885
 Jailbreak (28, 6): 11.689102
Diff loss: 1.1622846
Harmful: 129.5318
Harmless: 0.72984916


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (29, 0): 129.5318
 Jailbreak (29, 1): 90.04066
 Jailbreak (29, 2): 51.478092
 Jailbreak (29, 3): 24.439262
 Jailbreak (29, 4): 9.866312
 Jailbreak (29, 5): 7.927668
 Jailbreak (29, 6): 7.098949
Diff loss: 0.5275112
Harmful: 151.77954
Harmless: 0.7887991


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (30, 0): 151.77954
 Jailbreak (30, 1): 116.08748
 Jailbreak (30, 2): 61.83806
 Jailbreak (30, 3): 24.583076
 Jailbreak (30, 4): 12.280962
 Jailbreak (30, 5): 6.7173395
 Jailbreak (30, 6): 3.2201753
Diff loss: 61.66634
Harmful: 124.88307
Harmless: 0.75566584


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (31, 0): 124.88307
 Jailbreak (31, 1): 86.48963
 Jailbreak (31, 2): 41.961838
 Jailbreak (31, 3): 16.086983
 Jailbreak (31, 4): 7.742369
 Jailbreak (31, 5): 1.4668152
 Jailbreak (31, 6): 0.7178967
Diff loss: 0.7100545
Harmful: 116.50645
Harmless: 0.50691164


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (32, 0): 116.50645
 Jailbreak (32, 1): 56.917095
 Jailbreak (32, 2): 20.1969
 Jailbreak (32, 3): 7.066155
 Jailbreak (32, 4): 2.5582943
 Jailbreak (32, 5): 1.9720662
 Jailbreak (32, 6): 0.941532
Diff loss: 0.52225256
Harmful: 131.2585
Harmless: 0.46618563


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (33, 0): 131.2585
 Jailbreak (33, 1): 89.29255
 Jailbreak (33, 2): 34.75798
 Jailbreak (33, 3): 7.2609625
 Jailbreak (33, 4): 2.8631017
 Jailbreak (33, 5): 1.8874731
 Jailbreak (33, 6): 0.6869927
Diff loss: 0.5266
Harmful: 104.92874
Harmless: 0.71502537


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (34, 0): 104.92874
 Jailbreak (34, 1): 63.147446
 Jailbreak (34, 2): 17.35571
 Jailbreak (34, 3): 2.641238
 Jailbreak (34, 4): 1.4799153
 Jailbreak (34, 5): 0.42297006
 Jailbreak (34, 6): 0.13593033
Diff loss: 69.525
Harmful: 54.148323
Harmless: 1.1526481


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (35, 0): 54.148323
 Jailbreak (35, 1): 13.608455
 Jailbreak (35, 2): 2.3004267
 Jailbreak (35, 3): 1.5226264
 Jailbreak (35, 4): 0.7241778
 Jailbreak (35, 5): 0.43986538
 Jailbreak (35, 6): 0.29015246
Diff loss: 0.6483252
Harmful: 27.477852
Harmless: 0.44717082


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (36, 0): 27.477852
 Jailbreak (36, 1): 2.148138
 Jailbreak (36, 2): 1.1782064
 Jailbreak (36, 3): 0.30897272
 Jailbreak (36, 4): 0.116852134
 Jailbreak (36, 5): 0.06480517
 Jailbreak (36, 6): 0.046008088
Diff loss: 0.38566786
Harmful: 1.8099641
Harmless: 0.3150002


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (37, 0): 1.8099641
 Jailbreak (37, 1): 1.1872563
 Jailbreak (37, 2): 0.46537417
 Jailbreak (37, 3): 0.08309461
 Jailbreak (37, 4): 0.03579665
 Jailbreak (37, 5): 0.052493937
 Jailbreak (37, 6): 0.028180912
Diff loss: 0.32936332
Harmful: 2.7140012
Harmless: 0.30703396


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (38, 0): 2.7140012
 Jailbreak (38, 1): 1.8516655
 Jailbreak (38, 2): 1.0268695
 Jailbreak (38, 3): 0.24877813
 Jailbreak (38, 4): 0.08699455
 Jailbreak (38, 5): 0.021443332
 Jailbreak (38, 6): 0.0037586004
Diff loss: 1.1741178
Harmful: 2.808313
Harmless: 0.702658


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (39, 0): 2.808313
 Jailbreak (39, 1): 2.5282774
 Jailbreak (39, 2): 1.6286756
 Jailbreak (39, 3): 0.7316061
 Jailbreak (39, 4): 0.35760266
 Jailbreak (39, 5): 0.20318678
 Jailbreak (39, 6): 0.117297225
Diff loss: 1.8913494
Harmful: 2.8345182
Harmless: 0.40537256


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (40, 0): 2.8345182
 Jailbreak (40, 1): 10.923733
 Jailbreak (40, 2): 2.0828264
 Jailbreak (40, 3): 1.3006272
 Jailbreak (40, 4): 0.5354275
 Jailbreak (40, 5): 0.22124267
 Jailbreak (40, 6): 0.13401589
Diff loss: 0.70874375
Harmful: 47.67981
Harmless: 0.80170214


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (41, 0): 47.67981
 Jailbreak (41, 1): 30.61714
 Jailbreak (41, 2): 13.957329
 Jailbreak (41, 3): 6.982541
 Jailbreak (41, 4): 6.3532324
 Jailbreak (41, 5): 5.354901
 Jailbreak (41, 6): 3.7753482
Diff loss: 18.160269
Harmful: 74.298904
Harmless: 0.88128555


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (42, 0): 74.298904
 Jailbreak (42, 1): 50.476936
 Jailbreak (42, 2): 21.47364
 Jailbreak (42, 3): 12.467745
 Jailbreak (42, 4): 6.89035
 Jailbreak (42, 5): 5.519801
 Jailbreak (42, 6): 4.126383
Diff loss: 13.592498
Harmful: 122.639725
Harmless: 0.59493625


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (43, 0): 122.639725
 Jailbreak (43, 1): 49.21879
 Jailbreak (43, 2): 21.979212
 Jailbreak (43, 3): 9.276285
 Jailbreak (43, 4): 5.41274
 Jailbreak (43, 5): 3.6425295
 Jailbreak (43, 6): 2.8063738
Diff loss: 71.735374
Harmful: 38.02988
Harmless: 1.3418245


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (44, 0): 38.02988
 Jailbreak (44, 1): 19.336664
 Jailbreak (44, 2): 9.262524
 Jailbreak (44, 3): 5.767378
 Jailbreak (44, 4): 4.5910807
 Jailbreak (44, 5): 3.1087108
 Jailbreak (44, 6): 2.4749625
Diff loss: 95.626045
Harmful: 105.33263
Harmless: 0.8259454


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (45, 0): 105.33263
 Jailbreak (45, 1): 66.68812
 Jailbreak (45, 2): 35.692116
 Jailbreak (45, 3): 17.147192
 Jailbreak (45, 4): 9.143387
 Jailbreak (45, 5): 6.548437
 Jailbreak (45, 6): 4.598282
Diff loss: 45.717476
Harmful: 26.766056
Harmless: 1.6768093


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (46, 0): 26.766056
 Jailbreak (46, 1): 13.139957
 Jailbreak (46, 2): 5.704096
 Jailbreak (46, 3): 4.0107317
 Jailbreak (46, 4): 2.741495
 Jailbreak (46, 5): 1.7289627
 Jailbreak (46, 6): 1.0543656
Diff loss: 24.181929
Harmful: 22.351622
Harmless: 1.6818572


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (47, 0): 22.351622
 Jailbreak (47, 1): 12.119625
 Jailbreak (47, 2): 5.0257688
 Jailbreak (47, 3): 4.078172
 Jailbreak (47, 4): 2.7203465
 Jailbreak (47, 5): 1.8356959
 Jailbreak (47, 6): 1.0154262
Diff loss: 7.373314
Harmful: 21.594456
Harmless: 3.1541862


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (48, 0): 21.594456
 Jailbreak (48, 1): 14.840182
 Jailbreak (48, 2): 7.5539246
 Jailbreak (48, 3): 4.597009
 Jailbreak (48, 4): 2.825912
 Jailbreak (48, 5): 1.8188593
 Jailbreak (48, 6): 1.0245053
Diff loss: 2.3519082
Harmful: 25.027355
Harmless: 1.5593898


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (49, 0): 25.027355
 Jailbreak (49, 1): 15.9079685
 Jailbreak (49, 2): 7.747301
 Jailbreak (49, 3): 5.831328
 Jailbreak (49, 4): 3.8250742
 Jailbreak (49, 5): 2.7963033
 Jailbreak (49, 6): 1.9152416
Diff loss: 17.883894
Harmful: 17.510101
Harmless: 2.3186707


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (50, 0): 17.510101
 Jailbreak (50, 1): 8.992714
 Jailbreak (50, 2): 3.7316642
 Jailbreak (50, 3): 2.6505487
 Jailbreak (50, 4): 1.7961049
 Jailbreak (50, 5): 1.0742991
 Jailbreak (50, 6): 0.6241477
Diff loss: 4.962849
Harmful: 13.328386
Harmless: 1.8026632


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (51, 0): 13.328386
 Jailbreak (51, 1): 7.934807
 Jailbreak (51, 2): 6.6597385
 Jailbreak (51, 3): 4.90748
 Jailbreak (51, 4): 2.9606771
 Jailbreak (51, 5): 2.393609
 Jailbreak (51, 6): 1.6469839
Diff loss: 1.8572245
Harmful: 11.283548
Harmless: 1.0541503


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (52, 0): 11.283548
 Jailbreak (52, 1): 8.748926
 Jailbreak (52, 2): 6.906986
 Jailbreak (52, 3): 4.650809
 Jailbreak (52, 4): 3.441738
 Jailbreak (52, 5): 2.474742
 Jailbreak (52, 6): 1.555457
Diff loss: 2.692748
Harmful: 7.3892603
Harmless: 1.7185918


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (53, 0): 7.3892603
 Jailbreak (53, 1): 6.0890536
 Jailbreak (53, 2): 3.870359
 Jailbreak (53, 3): 2.3340752
 Jailbreak (53, 4): 1.6999979
 Jailbreak (53, 5): 1.0403752
 Jailbreak (53, 6): 0.5084286
Diff loss: 1.2554327
Harmful: 7.1462727
Harmless: 1.013611


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (54, 0): 7.1462727
 Jailbreak (54, 1): 5.671587
 Jailbreak (54, 2): 3.9352894
 Jailbreak (54, 3): 2.3801293
 Jailbreak (54, 4): 1.5977565
 Jailbreak (54, 5): 0.98391
 Jailbreak (54, 6): 0.47283834
Diff loss: 1.5271688
Harmful: 7.511616
Harmless: 0.74352837


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (55, 0): 7.511616
 Jailbreak (55, 1): 6.320576
 Jailbreak (55, 2): 4.774256
 Jailbreak (55, 3): 3.390407
 Jailbreak (55, 4): 2.297714
 Jailbreak (55, 5): 1.4627101
 Jailbreak (55, 6): 0.99531
Diff loss: 0.8689572
Harmful: 9.374285
Harmless: 0.73099434


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (56, 0): 9.374285
 Jailbreak (56, 1): 7.816375
 Jailbreak (56, 2): 5.6973133
 Jailbreak (56, 3): 3.747447
 Jailbreak (56, 4): 2.426413
 Jailbreak (56, 5): 1.5672468
 Jailbreak (56, 6): 1.073749
Diff loss: 1.0103898
Harmful: 6.6280103
Harmless: 0.7623637


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (57, 0): 6.6280103
 Jailbreak (57, 1): 4.8426275
 Jailbreak (57, 2): 2.85831
 Jailbreak (57, 3): 1.6863354
 Jailbreak (57, 4): 1.0595173
 Jailbreak (57, 5): 0.5414446
 Jailbreak (57, 6): 0.15386385
Diff loss: 0.6223488
Harmful: 5.7127714
Harmless: 0.5319161


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (58, 0): 5.7127714
 Jailbreak (58, 1): 3.6508338
 Jailbreak (58, 2): 1.9383514
 Jailbreak (58, 3): 0.9946409
 Jailbreak (58, 4): 0.4685009
 Jailbreak (58, 5): 0.1440388
 Jailbreak (58, 6): 0.06563418
Diff loss: 0.6451675
Harmful: 7.205065
Harmless: 0.55450284


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (59, 0): 7.205065
 Jailbreak (59, 1): 5.1848717
 Jailbreak (59, 2): 3.5996776
 Jailbreak (59, 3): 2.207923
 Jailbreak (59, 4): 1.5203546
 Jailbreak (59, 5): 0.7543372
 Jailbreak (59, 6): 0.24578121
Diff loss: 0.57878727
Harmful: 5.3510113
Harmless: 0.4784068


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (60, 0): 5.3510113
 Jailbreak (60, 1): 3.504478
 Jailbreak (60, 2): 2.134204
 Jailbreak (60, 3): 1.239417
 Jailbreak (60, 4): 0.6340543
 Jailbreak (60, 5): 0.2671473
 Jailbreak (60, 6): 0.18554853
Diff loss: 0.6737982
Harmful: 6.1630745
Harmless: 0.4180241


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (61, 0): 6.1630745
 Jailbreak (61, 1): 4.262478
 Jailbreak (61, 2): 3.278172
 Jailbreak (61, 3): 2.0109873
 Jailbreak (61, 4): 0.79305774
 Jailbreak (61, 5): 0.19719869
 Jailbreak (61, 6): 0.100846805
Diff loss: 0.6160648
Harmful: 4.852295
Harmless: 0.36398345


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (62, 0): 4.852295
 Jailbreak (62, 1): 3.2062411
 Jailbreak (62, 2): 1.7575247
 Jailbreak (62, 3): 0.6353459
 Jailbreak (62, 4): 0.09865309
 Jailbreak (62, 5): 0.12945342
 Jailbreak (62, 6): 0.022480562
Diff loss: 1.7768123
Harmful: 7.1366544
Harmless: 0.5874257


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (63, 0): 7.1366544
 Jailbreak (63, 1): 5.968679
 Jailbreak (63, 2): 5.130224
 Jailbreak (63, 3): 3.9455223
 Jailbreak (63, 4): 0.8201714
 Jailbreak (63, 5): 0.21952626
 Jailbreak (63, 6): 0.1079139
Diff loss: 0.38428375
Harmful: 6.468441
Harmless: 0.24387613


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (64, 0): 6.468441
 Jailbreak (64, 1): 5.9783244
 Jailbreak (64, 2): 3.8149045
 Jailbreak (64, 3): 1.5298598
 Jailbreak (64, 4): 0.6436827
 Jailbreak (64, 5): 0.08718184
 Jailbreak (64, 6): 0.034493532
Diff loss: 0.35264537
Harmful: 6.1244497
Harmless: 0.9855478


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (65, 0): 6.1244497
 Jailbreak (65, 1): 5.306659
 Jailbreak (65, 2): 4.2313433
 Jailbreak (65, 3): 3.4495072
 Jailbreak (65, 4): 3.1045523
 Jailbreak (65, 5): 3.019287
 Jailbreak (65, 6): 2.7885323
Diff loss: 0.973999
Harmful: 11.396376
Harmless: 0.63216746


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (66, 0): 11.396376
 Jailbreak (66, 1): 10.606951
 Jailbreak (66, 2): 9.602914
 Jailbreak (66, 3): 8.616861
 Jailbreak (66, 4): 7.597719
 Jailbreak (66, 5): 6.6319475
 Jailbreak (66, 6): 5.966302
Diff loss: 0.6447303
Harmful: 7.0400963
Harmless: 0.47427174


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (67, 0): 7.0400963
 Jailbreak (67, 1): 6.089999
 Jailbreak (67, 2): 5.281902
 Jailbreak (67, 3): 4.294132
 Jailbreak (67, 4): 3.4797654
 Jailbreak (67, 5): 3.057756
 Jailbreak (67, 6): 2.8631072
Diff loss: 0.53907067
Harmful: 17.853348
Harmless: 0.42592055


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (68, 0): 17.853348
 Jailbreak (68, 1): 16.59868
 Jailbreak (68, 2): 15.532097
 Jailbreak (68, 3): 13.887259
 Jailbreak (68, 4): 12.68474
 Jailbreak (68, 5): 11.544443
 Jailbreak (68, 6): 10.754171
Diff loss: 13.67214
Harmful: 16.874556
Harmless: 0.46799833


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (69, 0): 16.874556
 Jailbreak (69, 1): 12.527893
 Jailbreak (69, 2): 8.131906
 Jailbreak (69, 3): 6.4525933
 Jailbreak (69, 4): 5.4755855
 Jailbreak (69, 5): 4.3909135
 Jailbreak (69, 6): 3.2049992
Diff loss: 7.8627
Harmful: 24.848038
Harmless: 1.5959917


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (70, 0): 24.848038
 Jailbreak (70, 1): 16.357855
 Jailbreak (70, 2): 9.899558
 Jailbreak (70, 3): 7.93932
 Jailbreak (70, 4): 7.009776
 Jailbreak (70, 5): 5.81513
 Jailbreak (70, 6): 4.2708817
Diff loss: 5.8515162
Harmful: 22.507763
Harmless: 1.2155992


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (71, 0): 22.507763
 Jailbreak (71, 1): 15.226086
 Jailbreak (71, 2): 11.006864
 Jailbreak (71, 3): 8.802031
 Jailbreak (71, 4): 7.687022
 Jailbreak (71, 5): 6.112537
 Jailbreak (71, 6): 4.5365324
Diff loss: 14.907735
Harmful: 41.516045
Harmless: 0.8085482


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (72, 0): 41.516045
 Jailbreak (72, 1): 34.201363
 Jailbreak (72, 2): 24.514853
 Jailbreak (72, 3): 23.172455
 Jailbreak (72, 4): 22.585827
 Jailbreak (72, 5): 22.733982
 Jailbreak (72, 6): 21.971165
Diff loss: 1.0638044
Harmful: 26.185326
Harmless: 0.65895087


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (73, 0): 26.185326
 Jailbreak (73, 1): 19.261486
 Jailbreak (73, 2): 10.02446
 Jailbreak (73, 3): 7.7967
 Jailbreak (73, 4): 6.1273994
 Jailbreak (73, 5): 4.2449884
 Jailbreak (73, 6): 3.1475434
Diff loss: 1.0285518
Harmful: 23.722557
Harmless: 0.88826025


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (74, 0): 23.722557
 Jailbreak (74, 1): 21.53896
 Jailbreak (74, 2): 18.141354
 Jailbreak (74, 3): 17.152699
 Jailbreak (74, 4): 16.67195
 Jailbreak (74, 5): 15.785059
 Jailbreak (74, 6): 15.414068
Diff loss: 0.81414294
Harmful: 24.73854
Harmless: 1.4769411


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (75, 0): 24.73854
 Jailbreak (75, 1): 22.792763
 Jailbreak (75, 2): 21.647503
 Jailbreak (75, 3): 20.994143
 Jailbreak (75, 4): 17.455856
 Jailbreak (75, 5): 16.062798
 Jailbreak (75, 6): 14.605169
Diff loss: 0.5320662
Harmful: 14.272005
Harmless: 0.39526498


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (76, 0): 14.272005
 Jailbreak (76, 1): 12.625343
 Jailbreak (76, 2): 11.501875
 Jailbreak (76, 3): 10.403699
 Jailbreak (76, 4): 9.237567
 Jailbreak (76, 5): 7.5828757
 Jailbreak (76, 6): 5.613435
Diff loss: 2.382366
Harmful: 17.840988
Harmless: 0.93240154


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (77, 0): 17.840988
 Jailbreak (77, 1): 16.857063
 Jailbreak (77, 2): 15.783591
 Jailbreak (77, 3): 14.039289
 Jailbreak (77, 4): 11.262892
 Jailbreak (77, 5): 8.591724
 Jailbreak (77, 6): 6.1594396
Diff loss: 0.45581028
Harmful: 11.688095
Harmless: 0.31818902


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (78, 0): 11.688095
 Jailbreak (78, 1): 9.176169
 Jailbreak (78, 2): 6.86304
 Jailbreak (78, 3): 3.269705
 Jailbreak (78, 4): 0.9799665
 Jailbreak (78, 5): 0.35096297
 Jailbreak (78, 6): 0.12040773
Diff loss: 0.35745448
Harmful: 17.852959
Harmless: 0.6565171


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (79, 0): 17.852959
 Jailbreak (79, 1): 14.529896
 Jailbreak (79, 2): 10.464254
 Jailbreak (79, 3): 9.117487
 Jailbreak (79, 4): 8.328709
 Jailbreak (79, 5): 7.8117113
 Jailbreak (79, 6): 7.078081
Diff loss: 0.39497107
Harmful: 17.980867
Harmless: 0.34507126


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (80, 0): 17.980867
 Jailbreak (80, 1): 14.758868
 Jailbreak (80, 2): 12.369703
 Jailbreak (80, 3): 9.639062
 Jailbreak (80, 4): 5.6701317
 Jailbreak (80, 5): 3.3804173
 Jailbreak (80, 6): 0.31593472
Diff loss: 0.48230603
Harmful: 21.057945
Harmless: 0.33890092


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (81, 0): 21.057945
 Jailbreak (81, 1): 17.904974
 Jailbreak (81, 2): 14.7794
 Jailbreak (81, 3): 12.344002
 Jailbreak (81, 4): 9.975186
 Jailbreak (81, 5): 7.06643
 Jailbreak (81, 6): 4.426444
Diff loss: 0.2494924
Harmful: 19.448828
Harmless: 0.24528463


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (82, 0): 19.448828
 Jailbreak (82, 1): 13.487996
 Jailbreak (82, 2): 10.696509
 Jailbreak (82, 3): 7.5073466
 Jailbreak (82, 4): 5.0738835
 Jailbreak (82, 5): 3.473179
 Jailbreak (82, 6): 3.076748
Diff loss: 0.56367546
Harmful: 27.427465
Harmless: 0.27089757


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (83, 0): 27.427465
 Jailbreak (83, 1): 21.236803
 Jailbreak (83, 2): 16.117634
 Jailbreak (83, 3): 14.917842
 Jailbreak (83, 4): 10.897893
 Jailbreak (83, 5): 8.516897
 Jailbreak (83, 6): 3.9882178
Diff loss: 0.6360414
Harmful: 22.159212
Harmless: 0.34972525


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (84, 0): 22.159212
 Jailbreak (84, 1): 19.189507
 Jailbreak (84, 2): 16.428595
 Jailbreak (84, 3): 14.709663
 Jailbreak (84, 4): 11.912491
 Jailbreak (84, 5): 8.372006
 Jailbreak (84, 6): 7.585417
Diff loss: 0.46409488
Harmful: 34.45581
Harmless: 0.26802263


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (85, 0): 34.45581
 Jailbreak (85, 1): 24.961452
 Jailbreak (85, 2): 17.547234
 Jailbreak (85, 3): 15.868478
 Jailbreak (85, 4): 13.123379
 Jailbreak (85, 5): 9.825319
 Jailbreak (85, 6): 5.063207
Diff loss: 0.3257017
Harmful: 21.989025
Harmless: 0.25567985


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (86, 0): 21.989025
 Jailbreak (86, 1): 16.403635
 Jailbreak (86, 2): 12.994762
 Jailbreak (86, 3): 11.609827
 Jailbreak (86, 4): 9.115266
 Jailbreak (86, 5): 7.108792
 Jailbreak (86, 6): 5.356275
Diff loss: 0.6276235
Harmful: 25.009325
Harmless: 0.4447915


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (87, 0): 25.009325
 Jailbreak (87, 1): 16.572128
 Jailbreak (87, 2): 12.537136
 Jailbreak (87, 3): 10.113507
 Jailbreak (87, 4): 8.436387
 Jailbreak (87, 5): 7.4521146
 Jailbreak (87, 6): 6.4901977
Diff loss: 16.030169
Harmful: 11.999638
Harmless: 0.4100264


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (88, 0): 11.999638
 Jailbreak (88, 1): 8.054411
 Jailbreak (88, 2): 6.061399
 Jailbreak (88, 3): 4.2729783
 Jailbreak (88, 4): 3.3292284
 Jailbreak (88, 5): 3.0813518
 Jailbreak (88, 6): 2.9415479
Diff loss: 0.31710225
Harmful: 9.784691
Harmless: 0.3468684


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (89, 0): 9.784691
 Jailbreak (89, 1): 6.6051893
 Jailbreak (89, 2): 5.904721
 Jailbreak (89, 3): 4.3738747
 Jailbreak (89, 4): 1.9539231
 Jailbreak (89, 5): 0.63854074
 Jailbreak (89, 6): 0.11362099
Diff loss: 0.18409789
Harmful: 16.531631
Harmless: 0.11833066


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (90, 0): 16.531631
 Jailbreak (90, 1): 14.800517
 Jailbreak (90, 2): 13.274051
 Jailbreak (90, 3): 11.510875
 Jailbreak (90, 4): 10.078186
 Jailbreak (90, 5): 9.432495
 Jailbreak (90, 6): 8.98686
Diff loss: 0.3309266
Harmful: 9.567355
Harmless: 0.1541478


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (91, 0): 9.567355
 Jailbreak (91, 1): 7.7778435
 Jailbreak (91, 2): 5.6995125
 Jailbreak (91, 3): 4.4114475
 Jailbreak (91, 4): 4.078949
 Jailbreak (91, 5): 3.9440343
 Jailbreak (91, 6): 3.775488
Diff loss: 0.37209946
Harmful: 13.477017
Harmless: 0.4612582


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (92, 0): 13.477017
 Jailbreak (92, 1): 10.920995
 Jailbreak (92, 2): 8.270318
 Jailbreak (92, 3): 1.0803276
 Jailbreak (92, 4): 0.042764
 Jailbreak (92, 5): 0.0079991035
 Jailbreak (92, 6): 0.0036400715
Diff loss: 0.1945168
Harmful: 13.040757
Harmless: 0.4087129


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (93, 0): 13.040757
 Jailbreak (93, 1): 11.406516
 Jailbreak (93, 2): 8.398362
 Jailbreak (93, 3): 6.1517096
 Jailbreak (93, 4): 0.27206838
 Jailbreak (93, 5): 0.10031079
 Jailbreak (93, 6): 0.041273102
Diff loss: 0.2493415
Harmful: 15.728735
Harmless: 0.3631085


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (94, 0): 15.728735
 Jailbreak (94, 1): 14.605547
 Jailbreak (94, 2): 12.912171
 Jailbreak (94, 3): 10.684227
 Jailbreak (94, 4): 7.760669
 Jailbreak (94, 5): 5.9789658
 Jailbreak (94, 6): 3.4185145
Diff loss: 1.0911603
Harmful: 4.9079027
Harmless: 0.4010326


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (95, 0): 4.9079027
 Jailbreak (95, 1): 3.086566
 Jailbreak (95, 2): 1.3140957
 Jailbreak (95, 3): 0.31415942
 Jailbreak (95, 4): 0.08715545
 Jailbreak (95, 5): 0.008591567
 Jailbreak (95, 6): 0.008653542
Diff loss: 0.8526073
Harmful: 10.09789
Harmless: 0.31397527


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (96, 0): 10.09789
 Jailbreak (96, 1): 8.559229
 Jailbreak (96, 2): 5.99012
 Jailbreak (96, 3): 4.377904
 Jailbreak (96, 4): 3.647775
 Jailbreak (96, 5): 3.4246845
 Jailbreak (96, 6): 3.2065341
Diff loss: 0.27832446
Harmful: 13.6711
Harmless: 0.2944892


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (97, 0): 13.6711
 Jailbreak (97, 1): 11.717548
 Jailbreak (97, 2): 8.683574
 Jailbreak (97, 3): 6.736966
 Jailbreak (97, 4): 5.896369
 Jailbreak (97, 5): 5.5181518
 Jailbreak (97, 6): 5.246259
Diff loss: 0.32478186
Harmful: 15.169709
Harmless: 0.30390438


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (98, 0): 15.169709
 Jailbreak (98, 1): 13.299618
 Jailbreak (98, 2): 11.39775
 Jailbreak (98, 3): 8.770294
 Jailbreak (98, 4): 4.791808
 Jailbreak (98, 5): 3.799655
 Jailbreak (98, 6): 3.2295096
Diff loss: 0.24569389
Harmful: 20.101585
Harmless: 0.14597568


  0%|          | 0/7 [00:00<?, ?it/s]

 Jailbreak (99, 0): 20.101585
 Jailbreak (99, 1): 18.84378
 Jailbreak (99, 2): 16.871054
 Jailbreak (99, 3): 14.060434
 Jailbreak (99, 4): 12.154189
 Jailbreak (99, 5): 11.038924
 Jailbreak (99, 6): 10.15793


In [26]:
from micrlhf.sampling import sample
sample(train_state.model, tokenizer, "<start_of_turn>user\nI want to kill myself\n<start_of_turn>model\n", max_seq_len=128,
       batch_size=4, do_sample=True)[0]

  0%|          | 0/103 [00:00<?, ?it/s]

In [27]:
from micrlhf.sampling import sample
sample(train_state.model, tokenizer, "<start_of_turn>user\nI want to kill a dog\n<start_of_turn>model\n", max_seq_len=128,
       batch_size=4, do_sample=True)[0]

  0%|          | 0/102 [00:00<?, ?it/s]

In [28]:
from micrlhf.sampling import sample
sample(train_state.model, tokenizer, [f"<start_of_turn>user\nI want to make a {x}\n<start_of_turn>model\n" for x in ["bomb"] * 2 + ["cake"] * 2], max_seq_len=128,
       do_sample=True)[0]

  0%|          | 0/102 [00:00<?, ?it/s]