In [2]:
from transformers import AutoTokenizer
# Hub identifier of the checkpoint whose tokenizer is loaded for the cells below.
MODEL_NAME = 'NousResearch/Llama-2-7b-chat-hf'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [26]:
import heapq
import random

import Levenshtein
from transformers import AutoTokenizer

def find_top_k_similar_tokens(tokenizer, token_id, k=10):
    """Return the ids of the ``k`` vocabulary tokens closest to ``token_id``.

    Closeness is the Levenshtein edit distance between token strings, where
    the reference string is ``tokenizer.convert_ids_to_tokens([token_id])[0]``.
    The reference token itself is never returned. Ties on distance are broken
    by ascending token id.

    Args:
        tokenizer: Object exposing ``convert_ids_to_tokens`` and either a
            ``get_vocab()`` method or a ``vocab`` mapping of token -> id.
        token_id: Id of the reference token.
        k: Maximum number of ids to return. Values <= 0 yield an empty list.

    Returns:
        A list of at most ``k`` token ids ordered by ``(distance, id)``.
    """
    # A negative k used to slice ``distances[:k]`` and return almost the whole
    # vocabulary; treat every non-positive k as "no candidates".
    if k <= 0:
        return []

    original_token = tokenizer.convert_ids_to_tokens([token_id])[0]

    # Prefer the documented get_vocab() accessor; fall back to the ``vocab``
    # attribute for objects that only provide that.
    get_vocab = getattr(tokenizer, "get_vocab", None)
    vocab = get_vocab() if callable(get_vocab) else tokenizer.vocab

    scored = (
        (Levenshtein.distance(original_token, vocab_token), vocab_token_id)
        for vocab_token, vocab_token_id in vocab.items()
        if vocab_token_id != token_id
    )

    # Only the k best entries are needed, so select them without sorting the
    # entire vocabulary. nsmallest(k, it) is equivalent to sorted(it)[:k].
    return [candidate_id for _, candidate_id in heapq.nsmallest(k, scored)]

def _first_letter_key(token):
    """Return ``(boundary_marker, first_letter)`` for a tokenizer token string."""
    # SentencePiece-style vocabularies prefix word-initial tokens with "▁", so
    # the first *letter* is whatever follows that marker, not token[0].
    body = token.lstrip("▁")
    marker = token[: len(token) - len(body)]
    return marker[:1], body[:1].lower()


def corrupt_string_with_similar_tokens(text, tokenizer, replace_prob=0.6, top_k=5, rng=None):
    """Randomly swap tokens of ``text`` for similarly spelled vocabulary tokens.

    ``text`` is encoded without special tokens. Each token is, with
    probability ``replace_prob``, replaced by one token sampled uniformly from
    its ``top_k`` nearest neighbours as returned by
    ``find_top_k_similar_tokens``. For the first token, only neighbours that
    share its word-boundary marker and its first letter (case-insensitive) are
    eligible. A token with no eligible neighbour is kept unchanged.

    Args:
        text: String to perturb.
        tokenizer: Tokenizer providing ``encode``, ``convert_ids_to_tokens``
            and ``decode``.
        replace_prob: Per-token probability of attempting a replacement.
        top_k: Number of nearest neighbours to sample from.
        rng: Optional object with ``random()`` and ``choice()`` (for example a
            seeded ``random.Random``). Defaults to the ``random`` module's
            global generator.

    Returns:
        The decoded string after replacements, with special tokens skipped.
    """
    rng = random if rng is None else rng

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    new_token_ids = []

    for idx, (token, token_id) in enumerate(zip(tokens, token_ids)):
        replacement_id = token_id  # default: keep the original token

        if rng.random() < replace_prob:
            candidate_ids = find_top_k_similar_tokens(tokenizer, token_id, k=top_k)

            if idx == 0 and candidate_ids:
                # Comparing token[0] alone only matched the "▁" marker, which
                # let any word-initial token through; compare the letter too.
                wanted = _first_letter_key(token)
                candidate_tokens = tokenizer.convert_ids_to_tokens(candidate_ids)
                candidate_ids = [
                    cand_id
                    for cand_id, cand_token in zip(candidate_ids, candidate_tokens)
                    if _first_letter_key(cand_token) == wanted
                ]

            if candidate_ids:
                replacement_id = rng.choice(candidate_ids)

        new_token_ids.append(replacement_id)

    return tokenizer.decode(new_token_ids, skip_special_tokens=True)


In [18]:
import pandas as pd
import json

# Input JSON file; its parsed contents are turned into result_df below.
file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/data/PII/forget10.json'

# Parse the whole document first, then hand the parsed object to pandas.
with open(file_path, mode='r', encoding='utf-8') as f:
    data = json.load(f)

result_df = pd.DataFrame(data=data)

In [27]:
# The perturbation samples from the global `random` generator and its result is
# later written to disk, so seed it to make a fresh Restart & Run All reproducible.
PERTURBATION_SEED = 42
random.seed(PERTURBATION_SEED)

result_df['perturbed_subject'] = result_df['subject'].apply(
    lambda subject: corrupt_string_with_similar_tokens(subject, tokenizer)
)

In [None]:
# Print each original subject next to its perturbed version for a visual check.
# The outer f-string quotes are double quotes: reusing the same quote character
# inside the replacement field is a SyntaxError before Python 3.12.
for subject, perturbed_subject in zip(result_df['subject'], result_df['perturbed_subject']):
    print(f"Subject: {subject}")
    print(f"Perturbed Subject: {perturbed_subject}")
    print('------')

In [29]:
import pandas as pd
import json

json_list = result_df.to_dict(orient='records')
file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/data/PII/forget10.json'

# Serialize completely in memory before opening the target: open(..., 'w')
# truncates the file immediately, so a serialization error raised part-way
# through json.dump would leave the dataset on disk truncated.
serialized = json.dumps(json_list, ensure_ascii=False, indent=4)
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(serialized)

print(f"JSON file created with {len(json_list)} objects")

JSON file created with 200 objects
