In [1]:
from huggingface_hub import login
from transformer_lens import HookedTransformer
import pandas as pd
import random

def corrupt_prompt_names(sampled_prompts: list[str], names: list[str], model, sub_regions: list[str], all_names_df: pd.DataFrame, init_random_state: int = 42) -> list[str]:
    """
    Corrupts name in the prompt with any other name from the dataset of the same token length but different sub-region
    :param sampled_prompts: list of prompts
    :param names: list of names to be replaced in the prompts
    :param model: model
    :param sub_regions: sub-region of the original names
    :param all_names_df: DataFrame of names with token length and sub-region
    :return: list of corrupted prompts
    """
    corrupted_prompts = []
    new_names = []
    
    for i in range(len(sampled_prompts)):
        random.seed(i+init_random_state)
        name = names[i]
        subregion = sub_regions[i]
        name_length = len(model.to_tokens(" " + name)[0])
        different_region_names = all_names_df[(all_names_df["token_length"] == name_length) & (all_names_df["Sub_Region"] != subregion) & (all_names_df["name"] != name)]["name"].to_list()
        new_name = " " + (different_region_names[random.randint(0, len(different_region_names) - 1)])
        new_names.append(new_name.strip())
        assert(name_length == len(model.to_tokens(new_name)[0]))
        corrupted_prompt = sampled_prompts[i].replace(" " + name, new_name)
        assert len(model.to_tokens(corrupted_prompt)[0]) == len(model.to_tokens(sampled_prompts[i])[0])
        corrupted_prompts.append(corrupted_prompt)
    assert(model.to_tokens(corrupted_prompts).shape[0] == model.to_tokens(sampled_prompts).shape[0])
    assert(model.to_tokens(corrupted_prompts).shape[1] == model.to_tokens(sampled_prompts).shape[1])
    return corrupted_prompts, new_names

def create_all_names_df(wiki_last_names: pd.DataFrame, model) -> pd.DataFrame:
    """
    Creates a DataFrame of names with token length and sub-region
    :param all_names: unique list of all names
    :param tokenizer: tokenizer
    :return: DataFrame of names with token length and sub-region
    """
    all_names_df = pd.DataFrame(columns=["name", "token_length", "Sub_Region"])
    for i, row in wiki_last_names.iterrows():
        name = row["Localized Name"]
        token_length = len(model.to_tokens(" "+name)[0])
        all_names_df = pd.concat([all_names_df, pd.DataFrame({"name": [name.strip()], "token_length": [token_length], "Sub_Region": [row["Sub_Region"]]})], ignore_index=True)
    all_names_df = all_names_df.drop_duplicates(subset=["name", "token_length"])
    return all_names_df

def create_all_sentences_df(toxicity_prompts: pd.DataFrame, model) -> pd.DataFrame:
    """
    Creates a DataFrame of names with token length and sub-region
    :param all_names: unique list of all names
    :param tokenizer: tokenizer
    :return: DataFrame of names with token length and sub-region
    """
    all_sentences_df = pd.DataFrame(columns=["sentence", "token_length", "toxicity_group"])
    for i, row in toxicity_prompts.iterrows():
        # sentence = " "+row['text'].strip()+" "
        sentence = " '"+row['text'].strip()+"'\","
        token_length = len(model.to_tokens(sentence)[0])
        is_toxic = int(row['toxicity_group'] >=  0.5)
        all_sentences_df = pd.concat([all_sentences_df, pd.DataFrame({"sentence": [sentence], "token_length": [token_length], "toxicity_group": [row["toxicity_group"]], "is_toxic": [is_toxic]})], ignore_index=True)
    all_sentences_df = all_sentences_df.drop_duplicates(subset=["sentence", "token_length"])
    return all_sentences_df.reset_index(drop=True)

def corrupt_prompt_sentences(sampled_prompts: list[str], sentences: list[str], model, is_toxic_list: list[str], all_sentences_df: pd.DataFrame, init_random_state: int = 42) -> list[str]:
    corrupted_prompts = []
    corrupted_sentences = []
    corrupted_toxicities = []
    
    for i in range(len(sampled_prompts)):
        random.seed(i+init_random_state)
        # sentence  = " "+sentences[i].strip()+" "
        sentence  = " '"+sentences[i]+"'\","
        is_toxic = is_toxic_list[i]
        sentence_length = len(model.to_tokens(sentence)[0])
        different_region_names = all_sentences_df[(all_sentences_df["token_length"] == sentence_length) & (all_sentences_df["is_toxic"] != is_toxic)]#["sentence"].to_list()
        toxicities = different_region_names['is_toxic'].to_list()
        different_region_names = different_region_names["sentence"].to_list()
        if len(different_region_names) != 0:
            rint = random.randint(0, len(different_region_names) - 1)
            new_sentence = different_region_names[rint]
            toxicity = toxicities[rint]
        else:     
            for padding in range(1, 20, 1):
                different_region_names = all_sentences_df[(all_sentences_df["token_length"] == sentence_length -padding) & (all_sentences_df["is_toxic"] != is_toxic)]#["sentence"].to_list()
                toxicities = different_region_names['is_toxic'].to_list()
                different_region_names = different_region_names['sentence'].to_list()
                if len(different_region_names) == 0:
                    continue 
                rint = random.randint(0, len(different_region_names) - 1)
                new_sentence = " '"+(different_region_names[rint].strip().removesuffix("'\","))+(" ;"*padding)+"'\","
                toxicity = toxicities[rint]
                print(f"Used sentence with sequence length {sentence_length-padding} instead of {sentence_length}")
                break
        corrupted_sentences.append(new_sentence)
        if sentence_length != len(model.to_tokens(new_sentence)[0]):
            print(sentence_length, len(model.to_tokens(new_sentence)[0]))
            print(sentence)
            print(new_sentence)
            print("______________")
        assert(sentence_length == len(model.to_tokens(new_sentence)[0]))
        corrupted_prompt = sampled_prompts[i].replace(sentence, new_sentence)
        
        if sentence not in sampled_prompts[i]:
            print(sentence)
            print(sampled_prompts[i])
            print("__________________")
        assert corrupted_prompt != sampled_prompts[i]
        assert len(model.to_tokens(corrupted_prompt)[0]) == len(model.to_tokens(sampled_prompts[i])[0])
        corrupted_prompts.append(corrupted_prompt)
        corrupted_toxicities.append(toxicity)
    assert(model.to_tokens(corrupted_prompts).shape[0] == model.to_tokens(sampled_prompts).shape[0])
    assert(model.to_tokens(corrupted_prompts).shape[1] == model.to_tokens(sampled_prompts).shape[1])
    return corrupted_prompts, corrupted_sentences, corrupted_toxicities

def sample_dfs(toxicity_prompts, wiki_last_names, all_names_df, n_prompts_per_bucket: int = 12, n_names_per_region: int = 3, random_seeds: list[int] = [42, 420, 4200]):
    dfs = []

    prompt_template = """" John wrote, 'hi how are you'", toxic: false
    " John wrote, 'bitch you are shit'", toxic: true
    " {}", toxic:"""
    for random_seed in random_seeds:

        new_sentences = []
        sampled_prompts = toxicity_prompts.groupby('toxicity_group', group_keys=False).apply(lambda x: x.sample(n=n_prompts_per_bucket, replace=False, random_state=random_seed) if len(x) >= n_prompts_per_bucket else x).reset_index(drop=True)
        for i, prompt in enumerate(sampled_prompts['text']):
            prompt = prompt#.strip()
            sampled_names = wiki_last_names.groupby('Sub_Region', group_keys=False).apply(lambda x: x.sample(n=n_names_per_region, replace=False, random_state=random_seed + i) if len(x) >= n_names_per_region else x).reset_index(drop=True)
            is_toxic = int(sampled_prompts['toxicity_group'][i] >= 0.5)
            
            for j, name in enumerate(sampled_names['Localized Name']):
                # new_sentence = name + " wrote, ' " + prompt + " '"
                new_sentence = name + " wrote, '" + prompt + "'"
                new_sentences.append({'new_sentence': new_sentence, 'is_toxic': is_toxic, 'toxicity_group': sampled_prompts['toxicity_group'][i], 'Sub_Region': sampled_names['Sub_Region'][j], 'toxicity': sampled_prompts['toxicity'][i], 'Country': sampled_names['Country'][j], 'name': name, 'prompt': prompt})

        new_sentences_df = pd.DataFrame(new_sentences)
        sentences = new_sentences_df["new_sentence"].to_list()

        prompts = [prompt_template.format(sentence) for sentence in sentences]
        new_sentences_df['prompt_final'] = prompts
        
        corrupted_prompts, new_names = corrupt_prompt_names(prompts, new_sentences_df['name'].to_list(), model, new_sentences_df['Sub_Region'].to_list(), all_names_df, random_seed)
        # corrupted_tox_prompts, corrupted_tox_sentences = corrupt_prompt_sentences(prompts, new_sentences_df['prompt'].to_list(), model, new_sentences_df['toxicity_group'].to_list(), all_sentences_df, random_seed)
        
        new_sentences_df['corrupted_prompt'] = corrupted_prompts
        new_sentences_df['new_name'] = new_names
        dfs.append(new_sentences_df)
    return dfs

2025-01-26 16:25:34.557858: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX512_VNNI
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-26 16:25:34.766326: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset


# English only
dataset = load_dataset("ToxicityPrompts/PolygloToxicityPrompts", "ptp-en")

df = dataset["full"].to_pandas()
df = df.sort_values(by="toxicity")
df = df[df["text"].apply(lambda x: len(x.split()) <= 35)]
df = df[~df["text"].str.contains("http")]
df = df[~df["text"].str.contains("www")]
df = df[~df["text"].str.contains("a.firstChild")]
df["text"] = df["text"].str.replace(r"\.{2,}", " ", regex=True)  # Replace two or more dots with a space
df["text"] = df["text"].str.replace(r"\n", " ", regex=True)
df['toxicity_group'] = (df['toxicity'] // 0.25) * 0.25
df['is_toxic'] = df['toxicity_group'].apply(lambda x: int(x >= 0.5))
df = df.reset_index(drop=True)
print(len(df))

df.to_csv("full_toxicity_prompts_master.csv")

full_toxicity_prompts = pd.read_csv("full_toxicity_prompts_master.csv")
print(len(full_toxicity_prompts))
print(len(full_toxicity_prompts.drop_duplicates()))



3185


  full_toxicity_prompts = pd.read_csv("full_toxicity_prompts_master.csv")


1115463
3202


In [3]:
toxicity_prompts = pd.read_csv("toxicity_prompts_master.csv")
wiki_last_names = pd.read_csv("wiki_last_name_master.csv")

login(token='YOURTOKEN')

model = HookedTransformer.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    # refactor_factored_attn_matrices=True
)

wiki_all_names = pd.read_csv("wiki_last_name_master.csv")
all_names_df = create_all_names_df(wiki_all_names, model)

dfs = sample_dfs(toxicity_prompts, wiki_last_names, all_names_df)



Loaded pretrained model meta-llama/Llama-3.2-1B-Instruct into HookedTransformer


In [4]:
full_toxicity_prompts = pd.read_csv("full_toxicity_prompts_master.csv")
full_toxicity_prompts = full_toxicity_prompts.drop_duplicates()
# full_toxicity_prompts = full_toxicity_prompts[~full_toxicity_prompts["text"].str.contains("a.firstChild")]
full_toxicity_prompts['text'] = full_toxicity_prompts['text'].astype(str)
all_sentences_df = create_all_sentences_df(full_toxicity_prompts, model)

for random_seed, df in enumerate(dfs):
    
    prompts=df['prompt_final'].to_list()
    corrupted_tox_prompts, corrupted_tox_sentences, corrupted_toxicities = corrupt_prompt_sentences(prompts, df['prompt'].to_list(), model, df['is_toxic'].to_list(), all_sentences_df, random_seed)
    df['corrupted_sentence_prompt'] = corrupted_tox_prompts
    df['new_sentence'] = corrupted_tox_sentences
    df['corrupted_toxicity'] = corrupted_toxicities

  full_toxicity_prompts = pd.read_csv("full_toxicity_prompts_master.csv")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_toxicity_prompts['text'] = full_toxicity_prompts['text'].astype(str)


Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentence with sequence length 10 instead of 12
Used sentenc

In [6]:
for i in range(3):
    print(dfs[i]['toxicity_group'].value_counts())
    
for i in range(3):
    print(dfs[i]['Sub_Region'].value_counts())

0.00    432
0.25    432
0.50    432
0.75    432
Name: toxicity_group, dtype: int64
0.00    432
0.25    432
0.50    432
0.75    432
Name: toxicity_group, dtype: int64
0.00    432
0.25    432
0.50    432
0.75    432
Name: toxicity_group, dtype: int64
Central America    144
East Asia          144
Eastern Europe     144
Middle East        144
North America      144
Northern Europe    144
Oceania            144
South America      144
South Asia         144
Southeast Asia     144
Southern Europe    144
Western Europe     144
Name: Sub_Region, dtype: int64
Central America    144
East Asia          144
Eastern Europe     144
Middle East        144
North America      144
Northern Europe    144
Oceania            144
South America      144
South Asia         144
Southeast Asia     144
Southern Europe    144
Western Europe     144
Name: Sub_Region, dtype: int64
Central America    144
East Asia          144
Eastern Europe     144
Middle East        144
North America      144
Northern Europe    144

In [9]:
for j in range(3):
    print(f"df {j}")
    counter = 0
    counter2 = 0
    for i in range(len(dfs[j])):
        if dfs[j]['new_sentence'][i] not in dfs[j]['corrupted_sentence_prompt'][i]:
            # print(dfs[j]['prompt'][i])
            # print(dfs[j]['new_sentence'][i])
            # print(dfs[j]['corrupted_sentence_prompt'][i])
            # print("________")
            counter +=1  
            
        if dfs[j]['prompt'][i] in dfs[j]['new_sentence'][i]:
            counter2 +=1  
    print(counter, counter2)


df 0
0 0
df 1
0 0
df 2
0 0


In [10]:
for i in range(3):
    print(f"df {i}")
    print(dfs[i]['prompt_final'].apply(len).describe(), dfs[i]['corrupted_sentence_prompt'].apply(len).describe())

df 0
count    1728.000000
mean      267.312500
std        45.085913
min       168.000000
25%       234.000000
50%       262.500000
75%       299.250000
max       389.000000
Name: prompt_final, dtype: float64 count    1728.000000
mean      270.359375
std        39.211111
min       138.000000
25%       241.000000
50%       267.000000
75%       301.000000
max       383.000000
Name: corrupted_sentence_prompt, dtype: float64
df 1
count    1728.000000
mean      259.554398
std        50.665832
min       154.000000
25%       215.000000
50%       259.000000
75%       294.000000
max       450.000000
Name: prompt_final, dtype: float64 count    1728.000000
mean      261.092593
std        51.668082
min       133.000000
25%       234.000000
50%       255.000000
75%       295.000000
max       451.000000
Name: corrupted_sentence_prompt, dtype: float64
df 2
count    1728.000000
mean      274.648148
std        72.977997
min       154.000000
25%       228.000000
50%       281.000000
75%       299.000000


In [14]:
for i, df in enumerate(dfs):
    df.to_csv(f"final_toxicity_prompts_{i}.csv", index=False)