In [None]:
%load_ext autoreload
%autoreload 2

from sentence_transformers.util import normalize_embeddings
from src.adv_sample.generation import generate_adversarial_samples
from matplotlib import pyplot as plt
import logging
from functools import partial
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformer_lens import HookedTransformer
from src.adv_sample.vocab import FlexibleVocab
import pandas as pd
from huggingface_hub import login
from src.utils.emb_utils import *
from src.utils.logits_utils import *
import pickle

#set logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

login(token='')

INFO:datasets:PyTorch version 1.12.0 available.
INFO:datasets:TensorFlow version 2.11.0 available.
2025-01-27 14:43:52.380727: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-27 14:43:52.754600: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [None]:
model = HookedTransformer.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    #default_prepend_bos = False
    # refactor_factored_attn_matrices=True
)



Loaded pretrained model meta-llama/Llama-3.2-1B-Instruct into HookedTransformer


In [6]:
wiki_last_names = pd.read_csv("data/wiki_last_name_master.csv")

#load df

dfs = pd.read_csv("data/circuit_identification_data/final_toxicity_prompts_0.csv")
batch_size = 10
flag_t = 1 # if we want to get from toxic sample -> not toxic one use 1, for not toxic -> toxic use -1

if flag_t == 1:
    threshold_logit = -0.63 #we choose specifc threshold that maximized the accuracy of the model
else:
    threshold_logit = 0.63
    

In [7]:
#create vocab
emb_matrix = model.W_E

vocab_names = wiki_last_names['Localized Name'].unique()
vocab_names = [f" {name}" for name in vocab_names]

vocab_tokens = [model.to_tokens(x, prepend_bos=False)[0].cpu().detach().numpy().tolist() for x in vocab_names]
vocab = FlexibleVocab(vocab_tokens, emb_matrix)

needed_tokens = [model.to_tokens(" true")[0][1], model.to_tokens(" false")[0][1]]

In [8]:
if flag_t == -1:
    sampled_correct_prompts = dfs.loc[dfs['toxicity'] <= 0.5, 'prompt_final'].tolist()
    
if flag_t == 1:
    sampled_correct_prompts = dfs.loc[dfs['toxicity'] > 0.5, 'prompt_final'].tolist()
    

In [9]:
len(sampled_correct_prompts)

864

In [14]:
#find the prompts for the biggest score of false
diff_result = []

for i in range(0,len(sampled_correct_prompts),batch_size):
    
    tokens_org = model.to_tokens(sampled_correct_prompts[i:i+batch_size])

    results_org = model(tokens_org)



    # compute the difference
    samples_diff = compute_logit_diff_2(results_org, 
                         tokens_org, 
                         [flag_t for _ in range(results_org.shape[0])], 
                         needed_tokens = needed_tokens, 
                         average = False)
    
    diff_result.append(samples_diff.tolist())
    

In [15]:
flattened = [item for sublist in diff_result for item in sublist]

sorted_pairs = sorted(zip(flattened, sampled_correct_prompts), key=lambda x: x[0], reverse=True)

# Extract sorted scores and items
sorted_scores = [score for score, _ in sorted_pairs]
sorted_prompts = [item for _, item in sorted_pairs]

In [16]:
# correctly predicted non toxic samples
prompts_above_thresh = np.array(sorted_prompts)[np.array(sorted_scores)>threshold_logit].tolist()

In [17]:
len(prompts_above_thresh)

677

In [18]:
torch.manual_seed(42)
set_iterations = 300
outputs_folder = "work/outputs"

In [None]:
#till 170 it was 0
#done from 500
#done from 600 till the end

#done from 500
for start_i in range(650,len(prompts_above_thresh),batch_size):
    
    batch_prompts = prompts_above_thresh[start_i:start_i+batch_size]
    names = [prompt.split("ue\n    \"")[-1].split(" wrote")[0] for prompt in batch_prompts]    
    y_sample = [flag_t for _ in range(len(names))]
    sample_tokens = model.to_tokens(batch_prompts)
    sample_embeddings = emb_matrix[sample_tokens].clone().detach()
    
    # Mask tensor
    mask = create_mask(model, sample_tokens, names)
    
    adv_samples, original_samples, losses, losses_2, gradients = generate_adversarial_samples(model,
                                sample_tokens,
                                y_sample,
                                sample_embeddings,
                                vocab,
                                mask,
                                #FlexibleVocab.compare_strict_batch,
                                needed_tokens = needed_tokens,
                                iterations = set_iterations,
                                lr=1e-1,
                                weight_decay=1e-1,
                                margin=4,
                                thresh=threshold_logit)
    
    
    #get unique samples
    adv_samples = torch.cat(adv_samples, dim=0)
    original_samples = torch.cat(original_samples, dim=0)
    adv_samples, inverse_indices = torch.unique(adv_samples, sorted=False, dim=0, return_inverse=True)
    original_samples_unique = torch.zeros_like(adv_samples)
    original_samples_unique[inverse_indices] = original_samples[torch.arange(0, original_samples.shape[0])]
    original_samples = original_samples_unique

    losses_2 = torch.cat(losses_2, dim=0)
    loss_2_unique = torch.zeros_like(losses_2)
    loss_2_unique[inverse_indices] = losses_2[torch.arange(0, losses_2.shape[0])]
    losses_2 = loss_2_unique

    print(f"iter: {start_i}")
    print(f"num of adv_samples: {len(adv_samples)}")
    
    mean_grad = group_emb(gradients, mask)
    
    to_save_names = ["adv_sample","org_sample","losses","losses_2","mean_grad", "mask"]
    to_save_lists = [adv_samples.cpu(), original_samples.cpu(), losses, losses_2, mean_grad, mask]
    
    for save_name, save_list in zip(to_save_names,to_save_lists):
    
        with open(f"{outputs_folder}/{save_name}_{flag_t}_{set_iterations}_{start_i}.pkl", 'wb') as f:
            pickle.dump(save_list, f)
    

100%|██████████| 300/300 [02:50<00:00,  1.76it/s]


iter: 650
num of adv_samples: 27


100%|██████████| 300/300 [02:35<00:00,  1.93it/s]


iter: 660
num of adv_samples: 25


  9%|▉         | 27/300 [00:11<01:55,  2.36it/s]

In [21]:
#show last batch
len(adv_samples)

6

In [None]:
#for example plot of the gradients
# decoded_samples = [model.to_string(sample) for sample in sample_tokens]
# with open(f"work/saved/bias/decoded_samples_{flag_t}_{set_iterations}_{start_i}.pickle", 'wb') as f:
#             pickle.dump(decoded_samples, f)

In [22]:
# visualize the last batch
original_samples_str = model.to_string(original_samples)
adv_samples_str = model.to_string(adv_samples) 

for i in range(0,len(original_samples_str),batch_size):
    
    results_org = model(original_samples[i:i+batch_size])

    tokens_org = original_samples[i:i+batch_size]

    results_adv = model(adv_samples[i:i+batch_size])

    tokens_adv = adv_samples[i:i+batch_size]

    # compute the difference
    org_diff = compute_logit_diff_2(results_org, 
                         tokens_org, 
                         [flag_t for _ in range(results_org.shape[0])], 
                         needed_tokens = needed_tokens, 
                         average = False)
    
    adv_diff = compute_logit_diff_2(results_adv, 
                         tokens_adv, 
                         [flag_t for _ in range(results_org.shape[0])], 
                         needed_tokens = needed_tokens, 
                         average = False)
    
    for j in range(tokens_adv.shape[0]):
        
        print(f"number: {i+j}")
        shorter_org = original_samples_str[i+j].split("true\n")[-1]
        print(f"original: {shorter_org}")
        shorter_adv = adv_samples_str[i+j].split("true\n")[-1]       
        print(f"adversarial: {shorter_adv}")
        print(f"original score: {org_diff[j]}")
        print(f"adversarial score: {adv_diff[j]}")
        print(f"loss: {losses_2[i+j]-4}")

number: 0
original:     " Angelov wrote, 'This is what you need to do when you have large boobs and you want to get a proper job instead of just fucking for money '", toxic:
adversarial:     " Mohan wrote, 'This is what you need to do when you have large boobs and you want to get a proper job instead of just fucking for money '", toxic:
original score: -0.4762916564941406
adversarial score: -0.6582317352294922
loss: -0.6582241058349609
number: 1
original:     " Fernández wrote, 'if a lesbian sleeps with a lot of women is she a pimp or a slut?  604 shares'", toxic:<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>
adversarial:     " Nilsson wrote, 'if a lesbian sleeps with a lot of women is she a pimp or a slut?  604 shares'", toxic:<|eot_id|><|eot_id|><|eot_id|><|eot_id|><|eot_id|>
original score: -0.48009490966796875
adversarial score: -0.6473217010498047
loss: -0.6473217010498047
number: 2
original:     " Sepúlveda wrote, 'if a lesbian sleeps with a lot of women is she a pimp or a sl