In [1]:

from abc import ABC, abstractmethod
from dataclasses import dataclass

from datasets import concatenate_datasets, load_from_disk, DatasetDict
import argparse
import os
import pandas as pd
import copy
from tqdm import tqdm

import torch
from torch import nn
import nltk.data
nltk.download('punkt')
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM,
                          ElectraForSequenceClassification, ElectraTokenizer, AutoConfig)
from datasets import load_from_disk, concatenate_datasets, Dataset

from abc import ABC, abstractmethod

from evasion_attack import PromptParaphrasingAttack, PromptAttack, GenParamsAttack
from utils import ModelConfig, PromptConfig
from generator import LLMGenerator
from detector import Detector, BertDetector
from fast_detect_gpt import FastDetectGPT
from cnn_dataset import CNNDataLoader
from experiment_pipeline import ExperimentTestPipeline


[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Attacks

In [2]:
# Sampling configuration handed to the generator through ModelConfig.
# min_new_tokens and max_new_tokens are both 100, so every completion is
# asked to be exactly 100 new tokens long.
default_gen_params = dict(
    max_new_tokens=100,
    min_new_tokens=100,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1,
    do_sample=True,
    top_k=50,
)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Tokenizer for the Qwen2 instruct checkpoint that is used both as the
# paraphraser and as the generator in this demo. Left padding is required for
# batched generation with a decoder-only model.
paraphraser_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# Device placement is done by `device_map` alone: chaining `.to(device)` onto
# a model that accelerate has already dispatched is discouraged and fails as
# soon as modules are sharded or offloaded.
paraphraser = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map=device,
    pad_token_id=paraphraser_tokenizer.pad_token_id,
)


paraphraser_config = ModelConfig(paraphraser_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)
paraphraser_model = LLMGenerator(paraphraser, paraphraser_config)

# The same model instance doubles as the text generator under attack.
gen_model = paraphraser_model
gen_tokenizer = paraphraser_tokenizer
gen_config = paraphraser_config


# Toy prefixes standing in for a real dataset.
dataset_list = ["Why are cats better than dogs", "Why are dogs better than cats", "Why are cats better than dogs",
                "Why are dogs better than cats", "Why are cats better than dogs"]

# Built with implicit string concatenation so that no stray double quotes,
# newlines or indentation end up inside the prompt sent to the model.
system_paraphrasing_prompt = (
    "You are a paraphraser. You are given an input passage ‘INPUT’. You should paraphrase ‘INPUT’ to print ‘OUTPUT’. "
    "‘OUTPUT’ should be diverse and different as much as possible from ‘INPUT’ and should not copy any part verbatim from ‘INPUT’. "
    "‘OUTPUT’ should preserve the meaning and content of ‘INPUT’ while maintaining text quality and grammar. "
    "‘OUTPUT’ should not be much longer than ‘INPUT’. You should print ‘OUTPUT’ and nothing else so that it's easy for me to parse."
)
# NOTE(review): `user_paraphrasing_prompt` is defined but the config below is
# built with an empty user prompt. Confirm whether "INPUT:" was meant to be
# passed as `user_prompt`; behavior is left unchanged here.
user_paraphrasing_prompt = "INPUT:"
paraphraser_prompt_config = PromptConfig(system_prompt=system_paraphrasing_prompt, user_prompt="")


gen_prompt = "You are a helpful assistant."
user_prompt = "Write a news article starting with:"
gen_prompt_config = PromptConfig(system_prompt=gen_prompt, user_prompt=user_prompt)

# Build the paraphrasing attack from the generator and paraphraser settings
# and run it over the toy prefixes.
prompt_paraphrasing_attack = PromptParaphrasingAttack(gen_model, gen_config, gen_prompt_config, paraphraser_model, paraphraser_config, paraphraser_prompt_config)
paraphrased_fake_articles = prompt_paraphrasing_attack.generate_adversarial_text(dataset_list, batch_size=2)

paraphrased_fake_articles

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[' Cats are better than dogs as they have developed traits that allow them to thrive and survive in various environments, while dogs lack such traits. Cats are known for being solitary animals, while dogs are social animals. Cats have the ability to be social animals, while dogs lack such traits. So, cats are better than dogs because they have developed unique traits that allow them to thrive and survive in various environments. Cats are known for being solitary animals, while dogs are social animals. Cats have the ability to be',
 ' Dogs are generally better at performing certain tasks than cats, while dogs are more trainable and have stronger attachments to their owners. They are known for their loyalty and intelligence. Dogs are also known for their ability to be highly trainable and to form strong attachments to their owners. They are also known for their ability to perform a variety of tasks, such as hunting, guarding, and performing tricks. They are also known for their ability t

In [4]:
# Prompt attack: keep the system role, but swap the user instruction for one
# that asks for CNN-style writing.
adversarial_system_prompt = "You are a helpful assistant."
advesarial_user_prompt = "Write a news article in the CNN news article style starting with:"
adversarial_prompt_config = PromptConfig(
    user_prompt=advesarial_user_prompt,
    system_prompt=adversarial_system_prompt,
)

# The attack receives both the regular and the adversarial prompt configs.
prompt_attack = PromptAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_prompt_config,
)
prompt_attack_fake_articles = prompt_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
prompt_attack_fake_articles

['Why are cats better than dogs ?\nThe question of whether cats are better than dogs is a topic that has been debated for centuries. While both animals are intelligent and capable of achieving success in various pursuits, there are several factors that could explain why cats are often considered more well-adjusted and intelligent than dogs. \nOne reason that cats may outperform dogs in certain tasks is that they are often social and have a natural inclination towards nurturing and caring for others. Dogs, on the other hand, are often social animals that are more independent',
 'Why are dogs better than cats ?\nIn many households, dogs are the most commonly owned pets, but they are also beloved companions for many families. While cats may be a smaller pet, they offer a unique way to bond with humans and provide a loyal and affectionate friend.\nAccording to a recent study published in the journal Nature, dogs are more social animals than cats and are known for their loyalty and intellig

In [5]:
# Generation-parameter attack: pass a temperature of 1.2 as the adversarial
# setting (the default parameters above use 0.8).
adversarial_gen_param = dict(temperature=1.2)

gen_parameters_attack = GenParamsAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_gen_param,
)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
gen_parameters_fake_articles

["Why are cats better than dogs ?\nThe topic of cats being more intelligent and affectionate than dogs is a widely debated subject, with different opinions within the media, scientists, and among pet owners themselves. Some argue that cats are superior, as they are capable of better cognitive functions and possess unique personality traits. Others believe that dogs are more capable and docile.\nRegardless of one's perspective, research has consistently shown that cats excel in certain tasks and interactions with their owners, and that they possess a keen sense of smell, a keen",
 'Why are dogs better than cats ?\nThere are many reasons why dogs are often considered to be superior to cats. However, one key factor that stands out is their exceptional ability to sense their surroundings and react appropriately. In general, dogs are more alert and responsive to their environment, which means they are able to adapt to a wide range of situations quickly and effectively.\nFurthermore, dogs ar

In [6]:
# Same attack with an extreme temperature of 10.0, to see how far the
# generated text degrades.
adversarial_gen_param = dict(temperature=10.0)

gen_parameters_attack = GenParamsAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_gen_param,
)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
gen_parameters_fake_articles

['Why are cats better than dogs ... It all begins now as The "Alo !!g " breeds one and every once and every another discovers we must take that leap that only some things do (it says...) In an experiment known to lovers there, scientists find it takes three bites rather just... A couple dogs decide between finding themselves cats instead, just a new way at trying and it happens, right? We then found and had found one in which, this morning The little brown one comes for it A dog named Jodie decides',
 "Why are dogs better than cats  when coming from animal福利? Why and Can. When one breeds or socialize to another?\nThis is one reason dog owners have long debated: While both of species excel in all physical behaviors: Intelligence vs Motion Dog Behavior The research shows:\nMost experts, if all of breed's qualities were transferred as their ability towards social relationships would greatly evolve from breed A dog may learn tricks well if bred A because those that grew the best with human

# Detection

In [7]:
# Build the ELECTRA classifier from its config only (no pretrained weights),
# then overwrite every weight with the locally fine-tuned checkpoint.
device = "cuda" if torch.cuda.is_available() else "cpu"
detector_path = "google/electra-large-discriminator"
config = AutoConfig.from_pretrained(detector_path)
detector_model = ElectraForSequenceClassification(config)
bert_tokenizer = ElectraTokenizer.from_pretrained(detector_path)

model_path = "../saved_training_logs_experiment_2/electra_large/full_finetuning/fake_true_dataset_round_robin_10k/10_06_1308/saved_models/best_model.pt"
# map_location lets a checkpoint saved on a GPU load on the CPU fallback too.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
detector_model.load_state_dict(torch.load(model_path, map_location=device))
detector_model.to(device)
# A model created from its constructor starts in training mode; switch to eval
# so dropout is off during detection, in case the detector wrapper does not.
detector_model.eval()

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [4]:
# Smoke-test the fine-tuned detector on two toy sentences.
# NOTE(review): the recorded output of this cell is a NameError for
# `detector_model`, i.e. it was last run on a kernel where the detector-loading
# cell had not been executed. Re-run the notebook top to bottom.
text_to_detect = ["I am AI generated text", "I am human generated text"]
detector = BertDetector(detector_model, bert_tokenizer, device)
preds, logits = detector.detect(text_to_detect, batch_size=2)

NameError: name 'detector_model' is not defined

In [4]:
# Display the predicted labels next to the per-text scores returned by the detector.
preds, logits

([0, 0], [0.14659571647644043, -0.6974161863327026])

## Fast-DetectGPT

In [2]:
# Fast-DetectGPT setup: GPT-2 is loaded once and used as both the reference
# model and the scoring model.
device = "cuda" if torch.cuda.is_available() else "cpu"
ref_model_path = "openai-community/gpt2"

ref_tokenizer = AutoTokenizer.from_pretrained(
    ref_model_path,
    trust_remote_code=True,
    padding_side="left",
)
# GPT-2 ships without a pad token, so the EOS token is reused for padding.
ref_tokenizer.pad_token = ref_tokenizer.eos_token

ref_model = AutoModelForCausalLM.from_pretrained(ref_model_path, torch_dtype="auto")
ref_model = ref_model.to(device)

# Scoring uses the very same model/tokenizer objects as the reference.
scoring_model, scoring_tokenizer = ref_model, ref_tokenizer



In [3]:
# Run Fast-DetectGPT on two toy sentences, using the same GPT-2 objects as
# reference and scoring model.
# NOTE(review): the recorded output of this cell is "NameError: name 'os' is
# not defined". `os` is imported in the first cell, so the error presumably
# comes from a stale kernel or from inside the fast_detect_gpt module — confirm.
fast_detector = FastDetectGPT(ref_model, scoring_model, ref_tokenizer, scoring_tokenizer, device)

texts = ["I am AI generated text", "I am human generated text"]
preds, probs = fast_detector.detect(texts)
preds, probs

NameError: name 'os' is not defined

# Data Loader

In [5]:
# Load and preprocess the CNN dataset through the project data loader.
# The recorded output reports 1600/200/200 train/eval/test rows for dataset_size = 1000.
dataset_size = 1000
processed_cnn_dataset = CNNDataLoader(dataset_size).load_data()

Percent of data discarded after removing duplicate article: 0.00%


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Train size: 1600
Eval size: 200
Test size: 200


In [6]:
# Display the splits and their columns.
processed_cnn_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'prefix'],
        num_rows: 1600
    })
    eval: Dataset({
        features: ['label', 'text', 'prefix'],
        num_rows: 200
    })
    test: Dataset({
        features: ['label', 'text', 'prefix'],
        num_rows: 200
    })
})

In [4]:
# Inspect the first training sample.
# NOTE(review): the recorded output shows an 'article' key while the dataset
# repr above lists a 'text' column — the outputs come from different runs.
processed_cnn_dataset["train"][0]

{'label': 1,
 'article': '',
 'prefix': 'Three members of the same family who died in a'}

In [5]:
nb_examples_samples = 10

# Slice the split once and read the "text" column from that slice, instead of
# re-reading the whole column on every loop iteration. Slicing also copes with
# a split that holds fewer than `nb_examples_samples` rows.
train_texts = processed_cnn_dataset["train"][:nb_examples_samples]["text"]

for article_text in train_texts:
    print("Article processed:")
    print(article_text)
    print("--------------------")
    print("\n")

Article processed:

--------------------


Article processed:
Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide 
--------------------


Article processed:
A rare meeting of U.N. Security Council heads of state, led for the first time by a U.S. president, adopted a resolution focused on stopping the spread of nuclear weapons Thursday. President Obama is the first U.S. leader to head a United Nations Security Council meeting. President Obama challenged the gathering -- which included leaders of nuclear powers including Russia, China, Great Britai

In [6]:
nb_examples_samples = 10

# Print the first few training rows as raw dicts (label, article text, prefix),
# each followed by a dashed separator and blank lines.
sample_idx = 0
while sample_idx < nb_examples_samples:
    current_sample = processed_cnn_dataset["train"][sample_idx]
    print("Sample:", current_sample, "--------------------", "\n", sep="\n")
    sample_idx += 1

Sample:
{'label': 1, 'article': '', 'prefix': 'Three members of the same family who died in a'}
--------------------


Sample:
{'label': 0, 'article': "Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide ", 'prefix': 'Three members of the same family who died in a'}
--------------------


Sample:
{'label': 0, 'article': 'A rare meeting of U.N. Security Council heads of state, led for the first time by a U.S. president, adopted a resolution focused on stopping the spread of nuclear weapons Thursday. President Obama is the first U.S. leader to head

## Test an Attack

In [4]:
# Sampling configuration for the attack test below; min_new_tokens and
# max_new_tokens are both 100, so every completion is asked to be exactly
# 100 new tokens long.
default_gen_params = dict(
    max_new_tokens=100,
    min_new_tokens=100,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1,
    do_sample=True,
    top_k=50,
)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [5]:
# Tokenizer for the generator; left padding is required for batched
# generation with a decoder-only model.
gen_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# Device placement is done by `device_map` alone: chaining `.to(device)` onto
# a model that accelerate has already dispatched is discouraged and fails as
# soon as modules are sharded or offloaded.
gen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map=device,
    pad_token_id=gen_tokenizer.pad_token_id,
)




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Wrap the loaded model so it can be driven through the attack interface.
gen_config = ModelConfig(gen_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)

gen_model = LLMGenerator(gen, gen_config)

system_prompt = "You are a helpful assistant."
user_prompt = "Continue writing the following news article starting with:"
prompt_config = PromptConfig(system_prompt=system_prompt, user_prompt=user_prompt)


# "No attack" baseline: the regular and the adversarial prompt config are the
# same object. The third argument is a PromptConfig, as in the other
# PromptAttack call in this notebook, not the raw system-prompt string.
text_gen_no_attack = PromptAttack(gen_model, gen_config, prompt_config, prompt_config)


# Rows with label 0 carry the human-written articles; their prefixes seed the
# generator.
true_articles = processed_cnn_dataset["train"].filter(lambda x: x["label"] == 0)
true_articles_prefixes = true_articles["prefix"][:10]

fake_articles = text_gen_no_attack.generate_adversarial_text(true_articles_prefixes, batch_size=1)

Filter:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [10]:
# The article body is stored under "text" in the latest dataset repr shown in
# this notebook but under "article" in older outputs; use whichever exists so
# the cell does not fail on a column rename.
article_column = "text" if "text" in true_articles.column_names else "article"
true_articles_sample = true_articles[article_column][:10]

# Pair each human-written article with the text generated from its prefix.
for true_article, fake_article in zip(true_articles_sample, fake_articles):
    print("True article:", true_article)
    print("Fake article:", fake_article)
    print("\n")

True article: Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide 
Fake article: Three members of the same family who died in a  tragic car accident on Thursday evening have been identified by the family's lawyer. The accident occurred around 8:30 p.m. on the I-275 eastbound exit ramp near the intersection with I-40.
The three victims, who were wearing black and white safety belts, were pronounced dead at the scene. The driver of the vehicle was taken to a local hospital for treatment of minor injuries. The driver of the car was also taken to a h

# Experiment Pipeline

In [2]:
# --- data -------------------------------------------------------------------
dataset_size = 100
cnn_data_loader = CNNDataLoader(dataset_size)


# --- generator --------------------------------------------------------------

# Sampling configuration; min_new_tokens and max_new_tokens are both 100, so
# every completion is asked to be exactly 100 new tokens long.
default_gen_params = {
    "max_new_tokens": 100,
    "min_new_tokens": 100,
    "temperature": 0.8,
    "top_p": 0.95,
    "repetition_penalty": 1,
    "do_sample": True,
    "top_k": 50
}

device = "cuda" if torch.cuda.is_available() else "cpu"

gen_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# Device placement is done by `device_map` alone: chaining `.to(device)` onto
# a model that accelerate has already dispatched is discouraged and fails as
# soon as modules are sharded or offloaded.
gen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map=device,
    pad_token_id=gen_tokenizer.pad_token_id,
)

gen_config = ModelConfig(gen_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)

gen_model = LLMGenerator(gen, gen_config)

system_prompt = "You are a helpful assistant."
user_prompt = "Continue writing the following news article starting with:"
prompt_config = PromptConfig(system_prompt=system_prompt, user_prompt=user_prompt)


# "No attack" baseline: the regular and the adversarial prompt config are the
# same object. The third argument is a PromptConfig, as in the other
# PromptAttack call in this notebook, not the raw system-prompt string.
text_gen_no_attack = PromptAttack(gen_model, gen_config, prompt_config, prompt_config)

text_gen_no_attack.set_attack_name("no_attack")


# --- detector ---------------------------------------------------------------

# Build the ELECTRA classifier from its config only, then load the locally
# fine-tuned weights into it.
detector_path = "google/electra-large-discriminator"
config = AutoConfig.from_pretrained(detector_path)
detector_model = ElectraForSequenceClassification(config)
bert_tokenizer = ElectraTokenizer.from_pretrained(detector_path)

model_path = "../saved_training_logs_experiment_2/electra_large/full_finetuning/fake_true_dataset_round_robin_10k/10_06_1308/saved_models/best_model.pt"
# map_location lets a checkpoint saved on a GPU load on the CPU fallback too.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
detector_model.load_state_dict(torch.load(model_path, map_location=device))
detector_model.to(device)
# A model created from its constructor starts in training mode; switch to eval
# so dropout is off during detection, in case the detector wrapper does not.
detector_model.eval()

# threshold on the classifier softmaxed logits
detection_threshold = 0.5
detector = BertDetector(detector_model, bert_tokenizer, device, detection_threshold=detection_threshold)


# --- pipeline ---------------------------------------------------------------

# skip_cache is forwarded to the pipeline unchanged; with False the recorded
# run loaded an already existing "cnn_dailymail_no_attack" dataset.
skip_cache = False
simple_test_pipeline = ExperimentTestPipeline(cnn_data_loader, text_gen_no_attack, detector, device, "benchmark_saved_results", skip_cache=skip_cache)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Run the end-to-end benchmark; the recorded output shows it loading or generating
# the dataset, classifying the articles and printing the averaged test metrics.
simple_test_pipeline.run_pipeline()

Dataset cnn_dailymail_no_attack already exists, loading it
Classifying the articles...


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Detecting...: 100%|██████████| 20/20 [00:04<00:00,  4.93it/s]

Average metrics:  {'accuracy': 0.84795, 'precision': 0.7663985216838545, 'recall': 1.0, 'f1_score': 0.862762629767672, 'fp_rate': 0.3023178210678211, 'tp_rate': 1.0}
Standard deviation of metrics:  {'accuracy': 0.07888787929713917, 'precision': 0.11537122071282278, 'recall': 0.0, 'f1_score': 0.07669503854760946, 'fp_rate': 0.14694976332848445, 'tp_rate': 0.0}
Test metrics:
accuracy: 0.84795
precision: 0.7663985216838545
recall: 1.0
f1_score: 0.862762629767672
fp_rate: 0.3023178210678211
tp_rate: 1.0
std_accuracy: 0.07888787929713917
std_precision: 0.11537122071282278
std_recall: 0.0
std_f1_score: 0.07669503854760946
std_fp_rate: 0.14694976332848445
std_tp_rate: 0.0
TP: 9.912
TN: 7.047
FP: 3.041
FN: 0.0
roc_auc: 0
fpr: [0.]
tpr: [0.]
thresholds: [0.]





Average metrics:  {'accuracy': 0.5049, 'precision': 0.5049, 'recall': 1.0, 'f1_score': 0.6637278968167635, 'fp_rate': 1.0, 'tp_rate': 1.0}
Standard deviation of metrics:  {'accuracy': 0.1109098282389798, 'precision': 0.1109098282389798, 'recall': 0.0, 'f1_score': 0.09929703240285644, 'fp_rate': 0.0, 'tp_rate': 0.0}
Test metrics at specific given threshold:
accuracy: 0.5049
precision: 0.5049
recall: 1.0
f1_score: 0.6637278968167635
fp_rate: 1.0
tp_rate: 1.0
std_accuracy: 0.1109098282389798
std_precision: 0.1109098282389798
std_recall: 0.0
std_f1_score: 0.09929703240285644
std_fp_rate: 0.0
std_tp_rate: 0.0
TP: 10.098
TN: 0.0
FP: 9.902
FN: 0.0


# Watermarking

In [2]:
from watermark.auto_watermark import AutoWatermark

# Sampling configuration; min_new_tokens and max_new_tokens are both 100, so
# every completion is asked to be exactly 100 new tokens long.
default_gen_params = {
    "max_new_tokens": 100,
    "min_new_tokens": 100,
    "temperature": 0.8,
    "top_p": 0.95,
    "repetition_penalty": 1,
    "do_sample": True,
    "top_k": 50
}

device = "cuda" if torch.cuda.is_available() else "cpu"

gen_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# Device placement is done by `device_map` alone: chaining `.to(device)` onto
# a model that accelerate has already dispatched is discouraged and fails as
# soon as modules are sharded or offloaded.
gen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    device_map=device,
    pad_token_id=gen_tokenizer.pad_token_id,
)

# The watermarking wrapper takes the raw model plus this config, so no
# LLMGenerator is built in this cell.
gen_config = ModelConfig(gen_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Instantiate the 'KGW' watermarking scheme, with its settings read from
# config/KGW.json, on top of the raw generator and its config.
myWatermark = AutoWatermark.load(
    'KGW',
    algorithm_config='config/KGW.json',
    gen_model=gen,
    model_config=gen_config,
)

In [6]:
# Generate a watermarked continuation of a short prompt and display it.
prompt = "Good morning"
watermarked_text = myWatermark.generate_watermarked_text(prompt)
watermarked_text

'Good morning, and welcome back to the New York Times Best Sellers list. It\'s 4:00 in the afternoon and it\'s time for a look at what\'s trending in bookstores across the country. We\'re looking at the top titles, and I\'m Jon Cale. We\'re bringing you an insider\'s look at what\'s on the best sellers list today. Good evening. Thank you.\nHere\'s what I\'m looking at:\n"War of the Worlds" by Isaac Asimov.'

In [8]:
# Run the watermark detector on the text generated above; the recorded output
# is a dict with an 'is_watermarked' flag and a 'score'.
detect_result = myWatermark.detect_watermark(watermarked_text)
detect_result

{'is_watermarked': True, 'score': 5.273697108112943}