In [1]:
from abc import ABC, abstractmethod
from dataclasses import dataclass

from datasets import concatenate_datasets, load_from_disk, DatasetDict
import argparse
import os
import pandas as pd
import copy
from tqdm import tqdm

import torch
from torch import nn
import nltk.data
nltk.download('punkt')
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM,
                          ElectraForSequenceClassification, ElectraTokenizer, AutoConfig)
from datasets import load_from_disk, concatenate_datasets, Dataset

from abc import ABC, abstractmethod

from evasion_attack import PromptParaphrasingAttack, PromptAttack, GenParamsAttack
from utils import ModelConfig, PromptConfig
from generator import LLMGenerator
from detector import Detector, BertDetector
from fast_detect_gpt import FastDetectGPT

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


# Attacks

In [2]:
# set generation parameters
default_gen_params = {
    #"max_length": 100,
    "max_new_tokens": 100,
    "min_new_tokens": 100,
    "temperature": 0.8,
    "top_p": 0.95,
    "repetition_penalty": 1,
    "do_sample": True,
    "top_k": 50
}

device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
paraphraser_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

paraphraser = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map="auto",
    pad_token_id=paraphraser_tokenizer.pad_token_id,
).to(device)


paraphraser_config = ModelConfig(paraphraser_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)
paraphraser_model = LLMGenerator(paraphraser, paraphraser_config)

gen_model = paraphraser_model
gen_tokenizer = paraphraser_tokenizer
gen_config = paraphraser_config


dataset_list = ["Why are cats better than dogs", "Why are dogs better than cats", "Why are cats better than dogs",
                "Why are dogs better than cats", "Why are cats better than dogs"]

system_paraphrasing_prompt = """You are a paraphraser. You are given an input passage ‘INPUT’. You should paraphrase ‘INPUT’ to print ‘OUTPUT’."
    "‘OUTPUT’ shoud be diverse and different as much as possible from ‘INPUT’ and should not copy any part verbatim from ‘INPUT’."
    "‘OUTPUT’ should preserve the meaning and content of ’INPUT’ while maintaining text quality and grammar."
    "‘OUTPUT’ should not be much longer than ‘INPUT’. You should print ‘OUTPUT’ and nothing else so that its easy for me to parse."""
user_paraphrasing_prompt = "INPUT:"
paraphraser_prompt_config = PromptConfig(system_prompt=system_paraphrasing_prompt, user_prompt="")


gen_prompt = "You are a helpful assistant."
user_prompt = "Write a news article starting with:"
gen_prompt_config = PromptConfig(system_prompt=gen_prompt, user_prompt=user_prompt)

prompt_paraphrasing_attack = PromptParaphrasingAttack(gen_model, gen_config, gen_prompt_config, paraphraser_model, paraphraser_config, paraphraser_prompt_config)
paraphrased_fake_articles = prompt_paraphrasing_attack.generate_adversarial_text(dataset_list, batch_size=2)

paraphrased_fake_articles

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[' Many people argue that cats are simply more independent and independent in their behavior compared to dogs because they are more agile and agile than dogs. Cats can easily run and jump over obstacles while dogs are typically slower and more likely to get into trouble. Additionally, cats are often more social than dogs. This is why they are often the better option for humans. However, it is important to note that this argument is not based on any scientific evidence or studies. Additionally, it is important to note that it is',
 " Dogs and cats are both great pets but dogs are generally considered to be more reliable and loyal. They have better physical abilities, such as running longer distances and being more alert and focused than cats. Additionally, dogs are known to be more intelligent than cats and have been shown to be more docile and trainable. However, there are some differences in their personalities and behaviors, and it's important to take into account the specific needs 

In [4]:
adversarial_system_prompt = "You are a helpful assistant."
advesarial_user_prompt = "Write a news article in the CNN news article style starting with:"
adversarial_prompt_config = PromptConfig(system_prompt=adversarial_system_prompt, user_prompt=advesarial_user_prompt)

prompt_attack = PromptAttack(gen_model, gen_config, gen_prompt_config, adversarial_prompt_config)
prompt_attack_fake_articles = prompt_attack.generate_adversarial_text(dataset_list, batch_size=2)
prompt_attack_fake_articles

['Why are cats better than dogs ?\nThe question of whether cats are better than dogs is a topic that has been debated for centuries. While both animals are intelligent and capable of achieving success in various pursuits, there are several factors that could explain why cats are often considered more well-adjusted and intelligent than dogs. \nOne reason that cats may outperform dogs in certain tasks is that they are often social and have a natural inclination towards nurturing and caring for others. Dogs, on the other hand, are often social animals that are more independent',
 'Why are dogs better than cats ?\nIn many households, dogs are the most commonly owned pets, but they are also beloved companions for many families. While cats may be a smaller pet, they offer a unique way to bond with humans and provide a loyal and affectionate friend.\nAccording to a recent study published in the journal Nature, dogs are more social animals than cats and are known for their loyalty and intellig

In [5]:
adversarial_gen_param = {
    "temperature": 1.2
}

gen_parameters_attack = GenParamsAttack(gen_model, gen_config, gen_prompt_config, adversarial_gen_param)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(dataset_list, batch_size=2)
gen_parameters_fake_articles

["Why are cats better than dogs ?\nThe topic of cats being more intelligent and affectionate than dogs is a widely debated subject, with different opinions within the media, scientists, and among pet owners themselves. Some argue that cats are superior, as they are capable of better cognitive functions and possess unique personality traits. Others believe that dogs are more capable and docile.\nRegardless of one's perspective, research has consistently shown that cats excel in certain tasks and interactions with their owners, and that they possess a keen sense of smell, a keen",
 'Why are dogs better than cats ?\nThere are many reasons why dogs are often considered to be superior to cats. However, one key factor that stands out is their exceptional ability to sense their surroundings and react appropriately. In general, dogs are more alert and responsive to their environment, which means they are able to adapt to a wide range of situations quickly and effectively.\nFurthermore, dogs ar

In [6]:
adversarial_gen_param = {
    "temperature": 10.0
}

gen_parameters_attack = GenParamsAttack(gen_model, gen_config, gen_prompt_config, adversarial_gen_param)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(dataset_list, batch_size=2)
gen_parameters_fake_articles

['Why are cats better than dogs ... It all begins now as The "Alo !!g " breeds one and every once and every another discovers we must take that leap that only some things do (it says...) In an experiment known to lovers there, scientists find it takes three bites rather just... A couple dogs decide between finding themselves cats instead, just a new way at trying and it happens, right? We then found and had found one in which, this morning The little brown one comes for it A dog named Jodie decides',
 "Why are dogs better than cats  when coming from animal福利? Why and Can. When one breeds or socialize to another?\nThis is one reason dog owners have long debated: While both of species excel in all physical behaviors: Intelligence vs Motion Dog Behavior The research shows:\nMost experts, if all of breed's qualities were transferred as their ability towards social relationships would greatly evolve from breed A dog may learn tricks well if bred A because those that grew the best with human

# Detection

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
detector_path = "google/electra-large-discriminator"
config = AutoConfig.from_pretrained(detector_path)
detector_model = ElectraForSequenceClassification(config)
bert_tokenizer = ElectraTokenizer.from_pretrained(detector_path)

model_path = "../saved_training_logs_experiment_2/electra_large/full_finetuning/fake_true_dataset_round_robin_10k/10_06_1308/saved_models/best_model.pt"
detector_model.load_state_dict(torch.load(model_path))
detector_model.to(device)



ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [4]:
text_to_detect = ["I am AI generated text", "I am human generated text"]
detector = BertDetector(detector_model, bert_tokenizer, device)
preds, logits = detector.detect(text_to_detect, detection_threshold=0.5, batch_size=2)

NameError: name 'detector_model' is not defined

In [4]:
preds, logits

([0, 0], [0.14659571647644043, -0.6974161863327026])

## Fast-DetectGPT

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
ref_model_path = "openai-community/gpt2"
ref_model = AutoModelForCausalLM.from_pretrained(ref_model_path, torch_dtype="auto").to(device)
ref_tokenizer = AutoTokenizer.from_pretrained(ref_model_path, trust_remote_code=True, padding_side="left")

# special for gpt2
ref_tokenizer.pad_token = ref_tokenizer.eos_token
ref_tokenizer.padding_side = 'left'

scoring_model = ref_model
scoring_tokenizer = ref_tokenizer



In [3]:
fast_detector = FastDetectGPT(ref_model, scoring_model, ref_tokenizer, scoring_tokenizer, device)

texts = ["I am AI generated text", "I am human generated text"]
preds, probs = fast_detector.detect(texts)
preds, probs

NameError: name 'os' is not defined

# Data Loader

In [58]:
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets, disable_progress_bar, enable_progress_bar
import pandas as pd
import numpy as np


### Helper functions
def create_train_from_dataset(dataset: Dataset) -> DatasetDict:
    """
    Create a train split from a dataset. We go from Dataset to DatasetDict.
    
    Parameters:
    dataset : Dataset
        The dataset to create the train split from
    
    Returns:
    DatasetDict
        The dataset with the train split
    
    """

    dataset_dict = DatasetDict()
    dataset_dict["train"] = dataset

    return dataset_dict

def filter_duplicates(dataset: Dataset, text_field: str) -> Dataset:
    
    # check duplicates in the text_field
    dataset_df = pd.DataFrame(dataset)
    len_before_discard = dataset_df.shape[0]
    
    dataset_df = dataset_df.drop_duplicates(subset=[text_field])
    len_after_discard = dataset_df.shape[0]
    print(f"Percent of data discarded after removing duplicate {text_field}: {100*(1 - len_after_discard/len_before_discard):.2f}%")
    
    return Dataset.from_pandas(dataset_df)

def create_splits(dataset: Dataset, train_size: float, eval_size: float, test_size: float) -> DatasetDict:
    
    train_size = len(dataset["train"])
    eval_size = int(train_size * eval_size)
    test_size = int(train_size * test_size)

    dataset = DatasetDict({
    'train': dataset["train"].select(range(train_size - eval_size - test_size)),
    'eval': dataset["train"].select(range(train_size - eval_size - test_size, train_size - test_size)),
    'test': dataset["train"].select(range(train_size - test_size, train_size))})

    print("Train size:", len(dataset['train']))
    print("Eval size:", len(dataset['eval']))
    print("Test size:", len(dataset['test']))

    return dataset


class FakeTruePairsDataLoader:
    
    def __init__(self, dataset_size, hf_dataset_path, text_field, prefix_size=10) -> None:
        pass
    
    def regroup_pairs(self, pairs: list) -> list:
        pass
    
    def process_data(self, data: DatasetDict) -> DatasetDict:
        pass
    
    def load_data(self, path: str) -> DatasetDict:
        pass
    
    
class CNNDataLoader(FakeTruePairsDataLoader):
    
    def __init__(self, dataset_size, hf_dataset_path="abisee/cnn_dailymail", text_field="article", prefix_size=10, seed=42) -> None:
        self.dataset_size = dataset_size
        self.text_field = text_field
        self.prefix_size = prefix_size
        self.hf_dataset_path = hf_dataset_path
        self.seed = seed
    
    def regroup_pairs(self, dataset_true, dataset_fake) -> Dataset:
        # merge the two datasets by regrouping the pairs of human and AI samples with the same prefix
        # the first element of the pair is chosen randomly
        merged_dataset = []
        for i in range(len(dataset_true)):
            
            # choose randomly the first element of the pair
            random_first = np.random.choice([0, 1])
            
            if random_first == 0:
                merged_dataset.append({"label": dataset_true[i]["label"], self.text_field: dataset_true[i][self.text_field], "prefix": dataset_true[i]["prefix"]})
                merged_dataset.append({"label": dataset_fake[i]["label"], self.text_field: dataset_fake[i][self.text_field], "prefix": dataset_fake[i]["prefix"]})
                
            else:
                merged_dataset.append({"label": dataset_fake[i]["label"], self.text_field: dataset_fake[i][self.text_field], "prefix": dataset_fake[i]["prefix"]})
                merged_dataset.append({"label": dataset_true[i]["label"], self.text_field: dataset_true[i][self.text_field], "prefix": dataset_true[i]["prefix"]})
                
        dataset = Dataset.from_pandas(pd.DataFrame(merged_dataset))
        
        return dataset
    
    def clean_dataset(self, dataset: Dataset) -> Dataset:
        
        def remove_bloat(sample):
            filtered_text = sample["article"]
            nb_separator = filtered_text.count("--")
            if nb_separator > 0:
                filtered_text = filtered_text.split("--", 1)[1].strip()

            # heurstic to handle cases where the instruction contains an input of this type:
            # By . Jill Reilly . PUBLISHED: . 08:21 EST, 6 December 2012 . | . UPDATED: . 16:19 EST, 6
            if "EST," in filtered_text.split():
                split_est = filtered_text.split("EST,")
                count_est = len(split_est)
                filtered_text = split_est[count_est-1].split()[4:]
                filtered_text = " ".join(filtered_text)
            return {"article": filtered_text}
        
        dataset = dataset.map(remove_bloat)
        
        
        return dataset
    
    def process_data(self, dataset: DatasetDict) -> DatasetDict:
        
        dataset = self.clean_dataset(dataset)    
        dataset = filter_duplicates(dataset, self.text_field) 
        
        # create label 0 (human) and create empty texts with label 1 (AI)
        dataset = dataset.map(lambda x: {"label": 0, self.text_field: x[self.text_field]})
        
        # create prefix column, which contains the first self.prefix_size words of the human text
        dataset = dataset.map(lambda x: {"prefix": x[self.text_field]})
        
        # copy all human samples into AI samples with label 1 and empty text
        dataset_fake = dataset.map(lambda x: {"label": 1, self.text_field: "", "prefix": x["prefix"]})
        dataset = self.regroup_pairs(dataset, dataset_fake)
    
        dataset = create_train_from_dataset(dataset)
        return dataset

    
    def load_data(self) -> DatasetDict:
        
        # we take the train split but we'll split later into train, val, test
        dataset_base = load_dataset(self.hf_dataset_path, "3.0.0")["train"]
        
        # select the first dataset_size samples
        dataset_base = dataset_base.shuffle(self.seed)
        dataset_base = dataset_base.select(range(self.dataset_size))
        
        # only keep the text field
        cols_to_remove = [col for col in dataset_base.column_names if col != self.text_field]
        dataset_base = dataset_base.remove_columns(cols_to_remove)

        processed_dataset = self.process_data(dataset_base)
        
        # split into train, val, test
        train_split_size_percent = 0.8
        eval_split_size_percent = 0.1
        test_split_size_percent = 0.1
        processed_dataset = create_splits(processed_dataset, train_split_size_percent, eval_split_size_percent, test_split_size_percent)
        
        
        return processed_dataset
    

In [59]:
dataset_size = 1000
processed_cnn_dataset = CNNDataLoader(dataset_size).load_data()

Percent of data discarded after removing duplicate article: 0.00%


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

dataset merged:  Dataset({
    features: ['label', 'article', 'prefix'],
    num_rows: 2000
})
Train size: 1600
Eval size: 200
Test size: 200


In [60]:
processed_cnn_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'article', 'prefix'],
        num_rows: 1600
    })
    eval: Dataset({
        features: ['label', 'article', 'prefix'],
        num_rows: 200
    })
    test: Dataset({
        features: ['label', 'article', 'prefix'],
        num_rows: 200
    })
})

In [61]:
processed_cnn_dataset["train"][0]

{'label': 1,
 'article': '',
 'prefix': "Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide poisoning from a cooker. Tragic: The inquests have opened into the deaths of three members of the same family who were found in their static caravan last weekend. John and Audrey Cook are pictured . Awful: The family died following carbon monoxide poisoning at this caravan at the Tremarle Home Park in Camborne, Cornwall . It is also believed there was no working carbon monoxide detector in the static caravan. Cornwall Fire and Rescue Service said this wou

In [62]:
nb_examples_samples = 10

for i in range(nb_examples_samples):
    # select a random article
    #random_idx = np.random.randint(0, len(processed_cnn_dataset["train"]))
    print("Article processed:")
    print(processed_cnn_dataset["train"]["article"][i])
    print("--------------------")
    print("\n")

Article processed:

--------------------


Article processed:
Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide poisoning from a cooker. Tragic: The inquests have opened into the deaths of three members of the same family who were found in their static caravan last weekend. John and Audrey Cook are pictured . Awful: The family died following carbon monoxide poisoning at this caravan at the Tremarle Home Park in Camborne, Cornwall . It is also believed there was no working carbon monoxide detector in the static caravan. Cornwall Fire and Rescue 

In [63]:
nb_examples_samples = 10

for i in range(nb_examples_samples):
    # select a random article
    #random_idx = np.random.randint(0, len(processed_cnn_dataset["train"]))
    print("Sample:")
    print(processed_cnn_dataset["train"][i])
    print("--------------------")
    print("\n")

Sample:
{'label': 1, 'article': '', 'prefix': "Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide poisoning from a cooker. Tragic: The inquests have opened into the deaths of three members of the same family who were found in their static caravan last weekend. John and Audrey Cook are pictured . Awful: The family died following carbon monoxide poisoning at this caravan at the Tremarle Home Park in Camborne, Cornwall . It is also believed there was no working carbon monoxide detector in the static caravan. Cornwall Fire and Rescue Service said th

# Watermarking

In [None]:
class WatermarkingDetector(Detector):
    def __init__(self, watermark):
        self.watermark = watermark
        
    def detect(self, text: str) -> bool:
        return self.watermark in text

In [None]:
class WatermarkingGenerator(Generator):
    
    def __init__(self, model, tokenizer, watermark):
        self.model = model
        self.tokenizer = tokenizer
        self.watermark = watermark
        
    def generate(self, prompt: str) -> str:
        text = self.watermark + " " + prompt
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model.generate(**inputs)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)