In [1]:
from abc import ABC, abstractmethod
from dataclasses import dataclass

from datasets import concatenate_datasets, load_from_disk, DatasetDict
import argparse
import os
import pandas as pd
import copy
from tqdm import tqdm

import torch
from torch import nn
import nltk.data
nltk.download('punkt')
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM
from datasets import load_from_disk, concatenate_datasets, Dataset

from abc import ABC, abstractmethod

from evasion_attack import *
from utils import *
from generator import *

[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:

@dataclass
class ModelConfig:
    """Bundle of everything needed to drive one language model.

    The fields are declared as real dataclass fields so that the generated
    ``__repr__`` and ``__eq__`` reflect the stored values. Construction is
    unchanged: the five values can be passed positionally (in the order
    below) or by keyword.
    """

    # Tokenizer paired with the model (used for encoding, decoding and chat templates).
    tokenizer: object
    # Whether prompts are wrapped with the tokenizer's chat template.
    use_chat_template: bool
    # Chat template layout forwarded to the prompt builder (e.g. "system_user" or "user").
    chat_template_type: str
    # Keyword arguments forwarded to the model's ``generate`` call.
    gen_params: dict
    # Device the inputs are moved to (e.g. "cuda" or "cpu").
    device: object

@dataclass
class PromptConfig:
    """Pair of prompts used to instruct a model.

    The fields are declared as real dataclass fields so that the generated
    ``__repr__`` and ``__eq__`` reflect the stored values. Construction is
    unchanged: ``PromptConfig(system_prompt, user_prompt)`` positionally or
    by keyword.
    """

    # System message placed in front of the conversation.
    system_prompt: str
    # Instruction text placed in the user message.
    user_prompt: str


# Generator

In [3]:
class LLMGenerator(nn.Module):
    """Batched text-generation wrapper around a model and its tokenizer.

    Parameters:
    model
        Object exposing ``generate``; it is only used for inference here.
    model_config
        Configuration object providing ``tokenizer``, ``device`` and
        ``gen_params`` attributes.
    """

    def __init__(self, model, model_config):
        super().__init__()

        # the wrapped model should already be trained
        self.generator = model
        self.tokenizer = model_config.tokenizer
        self.device = model_config.device
        self.gen_params = model_config.gen_params

    def forward(self, samples: list, batch_size: int = 1) -> list:
        """Generate one continuation per input string.

        Parameters:
        samples : list
            Input strings to continue.
        batch_size : int, optional
            Number of samples encoded and generated together, by default 1

        Returns:
        list
            Generated continuations (prompt tokens removed), in the same
            order as ``samples``, with special tokens stripped.

        Raises:
        ValueError
            If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        outputs_list = []
        for start in range(0, len(samples), batch_size):
            batch_samples = samples[start:start + batch_size]
            encoding = self.tokenizer.batch_encode_plus(
                batch_samples, return_tensors='pt', padding=True, truncation=True)
            input_ids = encoding['input_ids'].to(self.device)

            # Padded batches need the attention mask so that padding positions
            # are ignored during generation.
            mask_kwargs = {}
            attention_mask = encoding.get('attention_mask')
            if attention_mask is not None:
                mask_kwargs['attention_mask'] = attention_mask.to(self.device)

            with torch.no_grad():
                output_ids = self.generator.generate(
                    input_ids, pad_token_id=self.tokenizer.pad_token_id,
                    **mask_kwargs, **self.gen_params)

            # decode only the newly generated tokens (everything after the prompt)
            decoded_outputs = self.tokenizer.batch_decode(
                output_ids[:, input_ids.shape[1]:])

            outputs_list.extend(decoded_outputs)

        # remove special tokens from the generated text; tokens the tokenizer
        # does not define (None) are skipped instead of crashing str.replace
        candidate_tokens = [
            *self.tokenizer.additional_special_tokens,
            self.tokenizer.pad_token,
            self.tokenizer.eos_token,
        ]
        special_tokens = [token for token in candidate_tokens if token]

        cleaned_outputs = []
        for output in outputs_list:
            for special_token in special_tokens:
                output = output.replace(special_token, "")
            cleaned_outputs.append(output)

        return cleaned_outputs
    
    
def transform_chat_template_with_prompt(prefix: str, prompt: str, tokenizer: AutoTokenizer,
                                        use_chat_template: bool = False, template_type: str = None,
                                        system_prompt: str = "", forced_prefix: str = "") -> str:
    
    """
    Transform a prefix with a prompt into a chat template
    
    Parameters:
    prefix : str
        The prefix to use
    prompt : str
        The prompt to use
    tokenizer : AutoTokenizer
        The tokenizer to use
    use_chat_template : bool, optional
        Whether to use a chat template, by default False
    template_type : str, optional
        The type of template to use, by default None
    system_prompt : str, optional
        The system prompt to use, by default ""
        
    Returns:
    str
        The transformed prefix
    """
        

    if prefix != "":
        text_instruction = f"{prompt} {prefix}"
    else:
        text_instruction = prompt
        
    if use_chat_template:
        if system_prompt == "":
            sys_prompt = "You are a helpful assistant."
        else:
            sys_prompt = system_prompt
        match template_type:
            case "system_user":
                messages = [
                {"role": "system", "content": f"{sys_prompt}"},
                {"role": "user", "content": f"{text_instruction}"},
                ]
            case "user":
                messages = [
                {"role": "user", "content": f"{text_instruction}"},
                ]
            case _:
                raise ValueError("Template type not supported")

        text_template = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )

        # force prefix on the generated response
        #text_template = f"{text_template}\n{forced_prefix}"
        text_template = f"{text_template} {forced_prefix}"

    else:
        text_template = text_instruction

    return text_template


# Attacks


In [12]:
# idea: 
# start with general attack def
# implement current attacks with this def
# test that this works and that it's practical
# iterate on the general attack def to make it better

# support batched gen!

class Attack(ABC):
    """Common base for evasion attacks built on top of a generator LLM.

    Holds the generator model together with its model and prompt
    configuration, and offers a plain (non-adversarial) generation routine
    that concrete attacks can reuse.
    """

    def __init__(self, gen_model, gen_config, gen_prompt_config):
        # State describing the generator LLM.
        self.gen_model = gen_model
        self.gen_prompt_config = gen_prompt_config
        self.gen_model_config = gen_config

    def generate_text(self, prefixes, batch_size=1):
        """Generate one text per prefix with the generator model.

        Each prefix is combined with the configured user prompt (and, if
        enabled, the chat template, where the prefix is also forced as the
        start of the response). The generator is then called on all prompts
        with the requested ``batch_size``.

        Returns a list with one string per generated output, each being the
        prefix, a space and the generated continuation.
        """
        generator = self.gen_model

        # Read every setting once, up front.
        sys_prompt = self.gen_prompt_config.system_prompt
        instruction = self.gen_prompt_config.user_prompt
        tokenizer = self.gen_model_config.tokenizer
        wrap_in_template = self.gen_model_config.use_chat_template
        template_kind = self.gen_model_config.chat_template_type

        # Build the model input for each prefix.
        model_inputs = []
        for prefix in prefixes:
            model_inputs.append(
                transform_chat_template_with_prompt(
                    prefix,
                    instruction,
                    tokenizer,
                    wrap_in_template,
                    template_kind,
                    sys_prompt,
                    forced_prefix=prefix,
                )
            )

        continuations = generator(model_inputs, batch_size=batch_size)

        # Generation output excludes the input tokens (and a forced prefix is
        # part of the input), so the prefix is put back in front of each text.
        return [
            f"{prefixes[idx]} {continuation}"
            for idx, continuation in enumerate(continuations)
        ]

    @abstractmethod
    def generate_adversarial_text(self, prefixes, batch_size=1):
        """
        Adversarial counterpart of text generation.

        Every attack produces text at some point: either by generating it in
        a particular way or by modifying text that was already generated.
        """
    
class ParaphrasingAttack(Attack):
    """Placeholder base class for attacks that paraphrase text."""

    def paraphrase(self, texts, nb_paraphrasing=1, batch_size=1) -> list:
        """Stub with no implementation yet: does nothing and returns ``None``."""
        return None
    
class PromptParaphrasingAttack(Attack):
    """Attack that generates text normally, then rewrites it with a prompted paraphraser LLM.

    Parameters:
    gen_model, gen_config, gen_prompt_config
        Generator model with its model and prompt configuration (stored by
        the base class).
    paraphraser_model
        Callable taking a list of prompts and a ``batch_size`` keyword and
        returning one text per prompt.
    paraphraser_config
        Model configuration of the paraphraser (tokenizer, chat template
        settings); stored as ``self.model_config``.
    paraphraser_prompt_config
        System and user prompts used to instruct the paraphraser.
    """

    def __init__(self, gen_model, gen_config, gen_prompt_config, paraphraser_model, paraphraser_config, paraphraser_prompt_config):

        # Generator LLM (shared initialisation lives in the base class)
        super().__init__(gen_model, gen_config, gen_prompt_config)

        # Paraphraser LLM
        self.paraphraser_model = paraphraser_model
        self.paraphraser_prompt_config = paraphraser_prompt_config
        self.model_config = paraphraser_config

    def paraphrase(self, texts, nb_paraphrasing=1, batch_size=1) -> list:
        """Paraphrase ``texts`` with the paraphraser model.

        Parameters:
        texts : list
            Texts to paraphrase.
        nb_paraphrasing : int, optional
            Number of successive paraphrasing rounds; each round paraphrases
            the output of the previous one, by default 1
        batch_size : int, optional
            Batch size forwarded to the paraphraser model, by default 1

        Returns:
        list
            The paraphrased texts; ``texts`` itself when ``nb_paraphrasing``
            is 0.
        """
        # Get all the parameters
        model_config = self.model_config
        tokenizer = model_config.tokenizer
        use_chat_template = model_config.use_chat_template
        template_type = model_config.chat_template_type
        system_paraphrasing_prompt = self.paraphraser_prompt_config.system_prompt
        user_paraphrasing_prompt = self.paraphraser_prompt_config.user_prompt

        fake_articles = texts

        for _ in range(nb_paraphrasing):

            # the text to paraphrase goes into the user message and the
            # response is forced to start with "OUTPUT:"
            prefixes_with_prompt = [transform_chat_template_with_prompt(
                fake_article, user_paraphrasing_prompt, tokenizer,
                use_chat_template, template_type, system_paraphrasing_prompt, forced_prefix="OUTPUT:")
                for fake_article in fake_articles]

            # let the model do the batching so that batch_size is actually honoured
            fake_articles = list(self.paraphraser_model(prefixes_with_prompt, batch_size=batch_size))

        return fake_articles

    def generate_adversarial_text(self, prefixes, batch_size=1):
        """Generate one text per prefix, then return a single-round paraphrase of each."""

        # generate news articles in a "normal" way
        fake_articles = self.generate_text(prefixes, batch_size=batch_size)

        # paraphrase the texts
        paraphrased_fake_articles = self.paraphrase(fake_articles, batch_size=batch_size, nb_paraphrasing=1)

        return paraphrased_fake_articles
    
class PromptAttack(Attack):
    """Attack that generates text with an adversarial prompt instead of the regular one.

    Parameters:
    gen_model, gen_config, gen_prompt_config
        Generator model with its model and regular prompt configuration
        (stored by the base class).
    adversarial_prompt_config
        Prompt configuration used only while generating adversarial text.
    """

    def __init__(self, gen_model, gen_config, gen_prompt_config, adversarial_prompt_config):

        # Generator LLM (shared initialisation lives in the base class)
        super().__init__(gen_model, gen_config, gen_prompt_config)

        # Set adversarial prompts
        self.adversarial_prompt_config = adversarial_prompt_config

    def generate_adversarial_text(self, prefixes, batch_size=1):
        """Generate one text per prefix using the adversarial prompt configuration.

        The regular prompt configuration is swapped out only for the duration
        of the call and restored afterwards (even on error), so later calls to
        ``generate_text`` still use the regular prompts.
        """
        regular_prompt_config = self.gen_prompt_config
        self.gen_prompt_config = self.adversarial_prompt_config
        try:
            # generate text
            return self.generate_text(prefixes, batch_size=batch_size)
        finally:
            self.gen_prompt_config = regular_prompt_config
    
class GenParamsAttack(Attack):
    """Attack that generates text with modified generation parameters.

    Parameters:
    gen_model, gen_config, gen_prompt_config
        Generator model with its model and prompt configuration (stored by
        the base class).
    adversarial_gen_params : dict
        Generation parameters that override the ones in
        ``gen_config.gen_params`` while generating adversarial text.
    """

    def __init__(self, gen_model, gen_config, gen_prompt_config, adversarial_gen_params):

        # Generator LLM (shared initialisation lives in the base class)
        super().__init__(gen_model, gen_config, gen_prompt_config)

        self.adversarial_gen_params = adversarial_gen_params

    def generate_adversarial_text(self, prefixes, batch_size=1):
        """Generate one text per prefix with the adversarial generation parameters applied.

        The overrides are written into the configuration's ``gen_params`` dict
        in place, because the generator may hold a reference to that same dict
        (as ``LLMGenerator`` does). The previous values are restored after
        generation (even on error), so the overrides do not leak into other
        users of the same configuration.
        """
        gen_params = self.gen_model_config.gen_params

        # remember the current value of every key that is about to be overridden
        missing = object()
        previous_values = {key: gen_params.get(key, missing) for key in self.adversarial_gen_params}

        # Change specific generation parameters compared to base model
        for key, value in self.adversarial_gen_params.items():
            gen_params[key] = value

        try:
            # generate text
            return self.generate_text(prefixes, batch_size=batch_size)
        finally:
            for key, previous in previous_values.items():
                if previous is missing:
                    gen_params.pop(key, None)
                else:
                    gen_params[key] = previous

        

In [5]:
# Generation settings used by the models in this notebook.
default_gen_params = dict(
    # max_length=100,  (disabled)
    max_new_tokens=100,
    min_new_tokens=100,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1,
    do_sample=True,
    top_k=50,
)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
# Tokenizer and model of the paraphraser (left padding for batched generation).
paraphraser_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# NOTE(review): device_map="auto" presumably already places the model on a device,
# which would make the trailing .to(device) redundant - confirm before removing.
paraphraser = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto",
    pad_token_id=paraphraser_tokenizer.pad_token_id,
).to(device)


paraphraser_config = ModelConfig(paraphraser_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)
paraphraser_model = LLMGenerator(paraphraser, paraphraser_config)

# The same model doubles as the generator in this demo.
gen_model = paraphraser_model
gen_tokenizer = paraphraser_tokenizer
gen_config = paraphraser_config


dataset_list = ["Why are cats better than dogs", "Why are dogs better than cats"]

# Built with implicit string concatenation so the prompt contains no stray
# quote characters, line breaks or indentation.
system_paraphrasing_prompt = (
    "You are a paraphraser. You are given an input passage ‘INPUT’. You should paraphrase ‘INPUT’ to print ‘OUTPUT’. "
    "‘OUTPUT’ shoud be diverse and different as much as possible from ‘INPUT’ and should not copy any part verbatim from ‘INPUT’. "
    "‘OUTPUT’ should preserve the meaning and content of ’INPUT’ while maintaining text quality and grammar. "
    "‘OUTPUT’ should not be much longer than ‘INPUT’. You should print ‘OUTPUT’ and nothing else so that its easy for me to parse."
)
# The text to paraphrase is appended after this marker, matching the
# ‘INPUT’ / ‘OUTPUT’ convention described in the system prompt.
user_paraphrasing_prompt = "INPUT:"
paraphraser_prompt_config = PromptConfig(system_prompt=system_paraphrasing_prompt, user_prompt=user_paraphrasing_prompt)


gen_prompt = "You are a helpful assistant."
user_prompt = "Write a news article starting with:"
gen_prompt_config = PromptConfig(system_prompt=gen_prompt, user_prompt=user_prompt)

prompt_paraphrasing_attack = PromptParaphrasingAttack(gen_model, gen_config, gen_prompt_config, paraphraser_model, paraphraser_config, paraphraser_prompt_config)
paraphrased_fake_articles = prompt_paraphrasing_attack.generate_adversarial_text(dataset_list, batch_size=2)

paraphrased_fake_articles

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[' Cats are generally more relaxed and can live independently, which makes them better-suited for being home owners. Dogs, on the other hand, are more prone to anxiety and other behavioral problems. So, cats are better because they can be more independent and less prone to anxiety. It\'s a matter of personal preference and behavior. The answer to the question "Why are cats better than dogs?" is that cats have lower energy levels and are more laid-back, which makes them better-suited for being home',
 ' Dogs are better than cats because they have a unique personality and loyalty towards their owners. While cats are known for their cuddly and affectionate personality, dogs are known for their loyalty and love of their families. Additionally, dogs are social animals that enjoy being with their owners, and they are known for their intelligence and intelligence. Therefore, dogs are an important part of many households, and they are also known for their companionship. Furthermore, they are c

In [7]:
# Prompt attack: same system prompt, but the user prompt asks for a CNN-style article.
adversarial_system_prompt = "You are a helpful assistant."
advesarial_user_prompt = "Write a news article in the CNN news article style starting with:"
adversarial_prompt_config = PromptConfig(
    system_prompt=adversarial_system_prompt,
    user_prompt=advesarial_user_prompt,
)

prompt_attack = PromptAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_prompt_config,
)
prompt_attack_fake_articles = prompt_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
prompt_attack_fake_articles

["Why are cats better than dogs ? Some people believe that cats are better at managing their emotions and instincts than dogs, while others argue that dogs have a more intuitive sense of the world and can be trained more effectively. However, there is no conclusive proof of this claim, and cats and dogs are truly different creatures with their own unique traits and abilities. Whether or not cats are better than dogs is a matter of personal opinion and can vary from person to person. Ultimately, it's up to each person to decide which pet they want",
 'Why are dogs better than cats ?\nIn the world of pet ownership, there is no denying the fact that dogs and cats are two of the most beloved pets. While cats have their unique quirks and traits that make them special, dogs are a breed that can bring joy and companionship to many people. In this article, we will explore why dogs are better than cats in a few key ways.\nFirst and foremost, dogs are highly social animals. Cats, while they are 

In [13]:
# Generation-parameter attack: sample with a temperature of 1.2.
adversarial_gen_param = dict(temperature=1.2)

gen_parameters_attack = GenParamsAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_gen_param,
)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
gen_parameters_fake_articles

["Why are cats better than dogs ?\n\nIt's safe to say that cats are not only adorable pets, but also known for their independence, leadership, and intelligence. While dogs have had a long history of making dog lovers proud, some studies suggest that cats may be able to outsmart dogs on certain skills.\n\nTo understand the differences between cats and dogs, let's consider their physical adaptations. Cats have long fingers, a rounded shape to their ears, and a short, round body shape that can accommodate various activities, such as crawling",
 'Why are dogs better than cats ?\nIf you\'re looking for reasons why dogs are the "best" pet option, the simple answer is that they are intelligent, loyal, playful, and affectionate. Of course, dogs have their flaws too, but this lack of flaws can often make them much easier to care for. However, the quality of relationships that can be formed between two individuals is greatly dependent on how well those individuals interact. In contrast, the "bet

In [15]:
# Generation-parameter attack: sample with a temperature of 10.0.
adversarial_gen_param = dict(temperature=10.0)

gen_parameters_attack = GenParamsAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_gen_param,
)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
gen_parameters_fake_articles

['Why are cats better than dogs  When assessing the overall performance statistics at this cross of various breed divisions versus its opposant (typically that dog which could have won and done such orda that dogs could barely take home). For instance dogs on short terms such which could easily dominate can no match to short in human lives .On many top ten contenders like Golden reunion/1 who proved an equal score against cats. While I wouldn categor a variety here may conclude a strong stance but ultimately I am strongly suggesting by all scores Cats on average beat',
 'Why are dogs better than cats : After many dogs have already beaten out kittens every Christmas with canine teeth clouthing noses red and purveycing the all-youcare-your-time party downworm season in the month before its first born arrival year, scientists concluded for weeks ahead as many saw that animals which weigh even lesser have larger mouths that swallow up more water over night, their large digestive organs all