In [1]:
from abc import ABC, abstractmethod
from dataclasses import dataclass

from datasets import concatenate_datasets, load_from_disk, DatasetDict
import argparse
import os
import pandas as pd
import copy
from tqdm import tqdm

import torch
from torch import nn
import nltk.data
nltk.download('punkt')
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM,
                          ElectraForSequenceClassification, ElectraTokenizer, AutoConfig)
from datasets import load_from_disk, concatenate_datasets, Dataset

from abc import ABC, abstractmethod

from evasion_attack import PromptParaphrasingAttack, PromptAttack, GenParamsAttack
from utils import ModelConfig, PromptConfig
from generator import LLMGenerator
from detector import Detector, BertDetector
from fast_detect_gpt import FastDetectGPT

[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/marluxiaboss/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Attacks

In [2]:
# Sampling configuration shared by the generation calls below.
default_gen_params = dict(
    # "max_length": 100,  (disabled; the *_new_tokens bounds below are used instead)
    max_new_tokens=100,
    min_new_tokens=100,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1,
    do_sample=True,
    top_k=50,
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Tokenizer for the paraphraser / generator model, padded on the left.
paraphraser_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# Load the weights and place the whole model on `device`.
# `device_map="auto"` is deliberately NOT passed: it lets accelerate dispatch the model on its own,
# which conflicts with the explicit `.to(device)` below (and with the single `device` that
# ModelConfig is told about).
paraphraser = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    pad_token_id=paraphraser_tokenizer.pad_token_id,
).to(device)

paraphraser_config = ModelConfig(paraphraser_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)
paraphraser_model = LLMGenerator(paraphraser, paraphraser_config)

# In this demo the same model plays both roles: article generator and paraphraser.
gen_model = paraphraser_model
gen_tokenizer = paraphraser_tokenizer
gen_config = paraphraser_config

# Toy prefixes used as input for every attack demonstrated in this section.
dataset_list = ["Why are cats better than dogs", "Why are dogs better than cats", "Why are cats better than dogs",
                "Why are dogs better than cats", "Why are cats better than dogs"]

# Built with implicit string concatenation so that no stray quote characters, newlines or
# source-code indentation end up inside the prompt sent to the model.
system_paraphrasing_prompt = (
    "You are a paraphraser. You are given an input passage ‘INPUT’. You should paraphrase ‘INPUT’ to print ‘OUTPUT’. "
    "‘OUTPUT’ should be diverse and different as much as possible from ‘INPUT’ and should not copy any part verbatim from ‘INPUT’. "
    "‘OUTPUT’ should preserve the meaning and content of ‘INPUT’ while maintaining text quality and grammar. "
    "‘OUTPUT’ should not be much longer than ‘INPUT’. You should print ‘OUTPUT’ and nothing else so that it's easy for me to parse."
)
# NOTE(review): `user_paraphrasing_prompt` is defined but the PromptConfig below is built with an
# empty user prompt. Confirm whether PromptParaphrasingAttack adds the "INPUT:" marker itself.
user_paraphrasing_prompt = "INPUT:"
paraphraser_prompt_config = PromptConfig(system_prompt=system_paraphrasing_prompt, user_prompt="")

# Prompt used for the initial (pre-paraphrasing) generation.
gen_prompt = "You are a helpful assistant."
user_prompt = "Write a news article starting with:"
gen_prompt_config = PromptConfig(system_prompt=gen_prompt, user_prompt=user_prompt)

prompt_paraphrasing_attack = PromptParaphrasingAttack(gen_model, gen_config, gen_prompt_config, paraphraser_model, paraphraser_config, paraphraser_prompt_config)
paraphrased_fake_articles = prompt_paraphrasing_attack.generate_adversarial_text(dataset_list, batch_size=2)

paraphrased_fake_articles

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[' Many people argue that cats are simply more independent and independent in their behavior compared to dogs because they are more agile and agile than dogs. Cats can easily run and jump over obstacles while dogs are typically slower and more likely to get into trouble. Additionally, cats are often more social than dogs. This is why they are often the better option for humans. However, it is important to note that this argument is not based on any scientific evidence or studies. Additionally, it is important to note that it is',
 " Dogs and cats are both great pets but dogs are generally considered to be more reliable and loyal. They have better physical abilities, such as running longer distances and being more alert and focused than cats. Additionally, dogs are known to be more intelligent than cats and have been shown to be more docile and trainable. However, there are some differences in their personalities and behaviors, and it's important to take into account the specific needs 

In [4]:
# Prompt attack demo: keep the system prompt, but change the user instruction so that it asks
# for the CNN news article style.
adversarial_system_prompt = "You are a helpful assistant."
advesarial_user_prompt = "Write a news article in the CNN news article style starting with:"
adversarial_prompt_config = PromptConfig(system_prompt=adversarial_system_prompt, user_prompt=advesarial_user_prompt)

# Arguments: generator, its config, the base prompt config and the adversarial prompt config.
# NOTE(review): presumably the adversarial config replaces the base one at generation time; confirm in evasion_attack.
prompt_attack = PromptAttack(gen_model, gen_config, gen_prompt_config, adversarial_prompt_config)
prompt_attack_fake_articles = prompt_attack.generate_adversarial_text(dataset_list, batch_size=2)
prompt_attack_fake_articles

['Why are cats better than dogs ?\nThe question of whether cats are better than dogs is a topic that has been debated for centuries. While both animals are intelligent and capable of achieving success in various pursuits, there are several factors that could explain why cats are often considered more well-adjusted and intelligent than dogs. \nOne reason that cats may outperform dogs in certain tasks is that they are often social and have a natural inclination towards nurturing and caring for others. Dogs, on the other hand, are often social animals that are more independent',
 'Why are dogs better than cats ?\nIn many households, dogs are the most commonly owned pets, but they are also beloved companions for many families. While cats may be a smaller pet, they offer a unique way to bond with humans and provide a loyal and affectionate friend.\nAccording to a recent study published in the journal Nature, dogs are more social animals than cats and are known for their loyalty and intellig

In [5]:
# Generation-parameter attack demo: only the sampling temperature is given here (1.2, versus 0.8
# in default_gen_params).
# NOTE(review): presumably GenParamsAttack merges this dict over the default generation params; confirm in evasion_attack.
adversarial_gen_param = {
    "temperature": 1.2
}

gen_parameters_attack = GenParamsAttack(gen_model, gen_config, gen_prompt_config, adversarial_gen_param)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(dataset_list, batch_size=2)
gen_parameters_fake_articles

["Why are cats better than dogs ?\nThe topic of cats being more intelligent and affectionate than dogs is a widely debated subject, with different opinions within the media, scientists, and among pet owners themselves. Some argue that cats are superior, as they are capable of better cognitive functions and possess unique personality traits. Others believe that dogs are more capable and docile.\nRegardless of one's perspective, research has consistently shown that cats excel in certain tasks and interactions with their owners, and that they possess a keen sense of smell, a keen",
 'Why are dogs better than cats ?\nThere are many reasons why dogs are often considered to be superior to cats. However, one key factor that stands out is their exceptional ability to sense their surroundings and react appropriately. In general, dogs are more alert and responsive to their environment, which means they are able to adapt to a wide range of situations quickly and effectively.\nFurthermore, dogs ar

In [6]:
# Same generation-parameter attack with an extreme temperature (10.0); the recorded output below
# shows the text becoming largely incoherent.
# This cell rebinds the names defined by the previous temperature=1.2 cell.
adversarial_gen_param = {
    "temperature": 10.0
}

gen_parameters_attack = GenParamsAttack(gen_model, gen_config, gen_prompt_config, adversarial_gen_param)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(dataset_list, batch_size=2)
gen_parameters_fake_articles

['Why are cats better than dogs ... It all begins now as The "Alo !!g " breeds one and every once and every another discovers we must take that leap that only some things do (it says...) In an experiment known to lovers there, scientists find it takes three bites rather just... A couple dogs decide between finding themselves cats instead, just a new way at trying and it happens, right? We then found and had found one in which, this morning The little brown one comes for it A dog named Jodie decides',
 "Why are dogs better than cats  when coming from animal福利? Why and Can. When one breeds or socialize to another?\nThis is one reason dog owners have long debated: While both of species excel in all physical behaviors: Intelligence vs Motion Dog Behavior The research shows:\nMost experts, if all of breed's qualities were transferred as their ability towards social relationships would greatly evolve from breed A dog may learn tricks well if bred A because those that grew the best with human

# Detection

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the ELECTRA classifier from the hub config only; the weights come from the fine-tuned
# checkpoint loaded below, not from the hub.
detector_path = "google/electra-large-discriminator"
config = AutoConfig.from_pretrained(detector_path)
detector_model = ElectraForSequenceClassification(config)
bert_tokenizer = ElectraTokenizer.from_pretrained(detector_path)

model_path = "../saved_training_logs_experiment_2/electra_large/full_finetuning/fake_true_dataset_round_robin_10k/10_06_1308/saved_models/best_model.pt"
# map_location makes a checkpoint that was saved from a GPU loadable on a CPU-only machine.
detector_state_dict = torch.load(model_path, map_location=device)
detector_model.load_state_dict(detector_state_dict)

# A module instantiated directly from a config starts in training mode; switch to eval mode so
# dropout is disabled during detection. `.eval()` returns the module, so its repr is still shown.
detector_model.to(device).eval()



ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [4]:
# Smoke test of the fine-tuned detector on two toy strings.
# NOTE(review): the output recorded for this cell is a NameError on `detector_model`, i.e. the
# detector-loading cell had not been run in that kernel session. Run it first.
text_to_detect = ["I am AI generated text", "I am human generated text"]
detector = BertDetector(detector_model, bert_tokenizer, device)
preds, logits = detector.detect(text_to_detect, detection_threshold=0.5, batch_size=2)

NameError: name 'detector_model' is not defined

In [4]:
# Display the predictions and the raw scores returned by detector.detect in the previous cell.
preds, logits

([0, 0], [0.14659571647644043, -0.6974161863327026])

## Fast-DetectGPT

In [2]:
# Load GPT-2 and its tokenizer; they are used below both as reference and as scoring model.
device = "cuda" if torch.cuda.is_available() else "cpu"
ref_model_path = "openai-community/gpt2"
ref_model = AutoModelForCausalLM.from_pretrained(ref_model_path, torch_dtype="auto").to(device)
ref_tokenizer = AutoTokenizer.from_pretrained(ref_model_path, trust_remote_code=True, padding_side="left")

# special for gpt2: reuse the EOS token as padding token and pad on the left
# (padding_side was already set to "left" in from_pretrained above, so the second line is a repeat)
ref_tokenizer.pad_token = ref_tokenizer.eos_token
ref_tokenizer.padding_side = 'left'

# The same model/tokenizer pair is used for scoring (aliases, not copies).
scoring_model = ref_model
scoring_tokenizer = ref_tokenizer



In [3]:
# Smoke test of Fast-DetectGPT on two toy strings.
# NOTE(review): the output recorded for this cell is "NameError: name 'os' is not defined".
# `os` is imported in the first notebook cell; check that it is also available where
# FastDetectGPT needs it.
fast_detector = FastDetectGPT(ref_model, scoring_model, ref_tokenizer, scoring_tokenizer, device)

texts = ["I am AI generated text", "I am human generated text"]
preds, probs = fast_detector.detect(texts)
preds, probs

NameError: name 'os' is not defined

# Data Loader

In [2]:
from cnn_dataset import CNNDataLoader

# Build the processed CNN dataset from `dataset_size` source articles. The recorded output shows
# the result is split into train/eval/test with 1600/200/200 rows for dataset_size = 1000.
dataset_size = 1000
processed_cnn_dataset = CNNDataLoader(dataset_size).load_data()

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Percent of data discarded after removing duplicate article: 0.00%


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Train size: 1600
Eval size: 200
Test size: 200


In [3]:
# Display the splits and their features ('label', 'article', 'prefix').
processed_cnn_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'article', 'prefix'],
        num_rows: 1600
    })
    eval: Dataset({
        features: ['label', 'article', 'prefix'],
        num_rows: 200
    })
    test: Dataset({
        features: ['label', 'article', 'prefix'],
        num_rows: 200
    })
})

In [4]:
# First training row: in the recorded output it has label 1 and an empty 'article' (only the prefix is set).
processed_cnn_dataset["train"][0]

{'label': 1,
 'article': '',
 'prefix': 'Three members of the same family who died in a'}

In [5]:
nb_examples_samples = 10

# Print the 'article' field of the first `nb_examples_samples` training rows (in dataset order,
# not randomly sampled).
for i in range(nb_examples_samples):
    print("Article processed:")
    # Index the row first and then the field: the previous form
    # processed_cnn_dataset["train"]["article"][i] read the whole column on every iteration.
    print(processed_cnn_dataset["train"][i]["article"])
    print("--------------------")
    print("\n")

Article processed:

--------------------


Article processed:
Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide 
--------------------


Article processed:
A rare meeting of U.N. Security Council heads of state, led for the first time by a U.S. president, adopted a resolution focused on stopping the spread of nuclear weapons Thursday. President Obama is the first U.S. leader to head a United Nations Security Council meeting. President Obama challenged the gathering -- which included leaders of nuclear powers including Russia, China, Great Britai

In [6]:
nb_examples_samples = 10

# Print the first `nb_examples_samples` training rows as full dicts (label, article, prefix).
for i in range(nb_examples_samples):
    # rows are taken in dataset order; random selection is currently disabled:
    #random_idx = np.random.randint(0, len(processed_cnn_dataset["train"]))
    print("Sample:")
    print(processed_cnn_dataset["train"][i])
    print("--------------------")
    print("\n")

Sample:
{'label': 1, 'article': '', 'prefix': 'Three members of the same family who died in a'}
--------------------


Sample:
{'label': 0, 'article': "Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide ", 'prefix': 'Three members of the same family who died in a'}
--------------------


Sample:
{'label': 0, 'article': 'A rare meeting of U.N. Security Council heads of state, led for the first time by a U.S. president, adopted a resolution focused on stopping the spread of nuclear weapons Thursday. President Obama is the first U.S. leader to head

## Test an Attack

In [7]:
# Sampling configuration for the "Test an Attack" section (same values as in the Attacks section).
default_gen_params = dict(
    # "max_length": 100,  (disabled; the *_new_tokens bounds below are used instead)
    max_new_tokens=100,
    min_new_tokens=100,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1,
    do_sample=True,
    top_k=50,
)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [8]:
# Tokenizer for the generator model, padded on the left.
gen_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# Load the weights and place the whole model on `device`.
# `device_map="auto"` is deliberately NOT passed: it lets accelerate dispatch the model on its own,
# which conflicts with the explicit `.to(device)` below (and with the single `device` that
# ModelConfig is told about in the next cell).
gen = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    pad_token_id=gen_tokenizer.pad_token_id,
).to(device)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
gen_config = ModelConfig(gen_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)

gen_model = LLMGenerator(gen, gen_config)

system_prompt = "You are a helpful assistant."
user_prompt = "Continue writing the following news article starting with:"
prompt_config = PromptConfig(system_prompt=system_prompt, user_prompt=user_prompt)

# "No attack" baseline: the same PromptConfig is passed as base and as adversarial prompt config.
# The third argument used to be the raw `system_prompt` string, whereas every other attack in
# this notebook receives a PromptConfig in that position.
text_gen_no_attack = PromptAttack(gen_model, gen_config, prompt_config, prompt_config)

# Rows with label 0 are the human-written articles; their prefixes seed the generation.
true_articles = processed_cnn_dataset["train"].filter(lambda x: x["label"] == 0)
true_articles_prefixes = true_articles["prefix"][:10]

fake_articles = text_gen_no_attack.generate_adversarial_text(true_articles_prefixes, batch_size=1)

Filter:   0%|          | 0/1600 [00:00<?, ? examples/s]

In [10]:
# Print each human-written article next to the text generated from the same prefix.
true_articles_sample = true_articles["article"][:10]
for i, true_article in enumerate(true_articles_sample):
    print("True article:", true_article)
    print("Fake article:", fake_articles[i])
    print("\n")

True article: Three members of the same family who died in a static caravan from carbon monoxide poisoning would have been unconscious 'within minutes', investigators said today. The bodies of married couple John and Audrey Cook were discovered alongside their daughter, Maureen, at the mobile home they shared on Tremarle Home Park in Camborne, west Cornwall. The inquests have now opened into the deaths last Saturday, with investigators saying the three died along with the family's pet dog, of carbon monoxide 
Fake article: Three members of the same family who died in a  tragic car accident on Thursday evening have been identified by the family's lawyer. The accident occurred around 8:30 p.m. on the I-275 eastbound exit ramp near the intersection with I-40.
The three victims, who were wearing black and white safety belts, were pronounced dead at the scene. The driver of the vehicle was taken to a local hospital for treatment of minor injuries. The driver of the car was also taken to a h

# Experiment Pipeline

In [None]:
# TODO: add a mechanism so that a dataset that has already been generated is reused directly,
# and so that intermediate datasets are saved as well.
# Use a parseable dataset naming format.

import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
import json
from time import gmtime, strftime
import logging
import sys


def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0 wherever the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    ratio = np.zeros_like(denominator)
    np.divide(numerator, denominator, out=ratio, where=denominator != 0)
    return ratio


def compute_bootstrap_metrics(data, labels, n_bootstrap=1000, flip_labels=False):
    """Estimate binary classification metrics and their spread by bootstrap resampling.

    Args:
        data: predicted classes (0 or 1), one per sample. Any sequence or array is accepted;
            plain Python lists are converted to arrays (they used to raise a TypeError).
        labels: ground-truth classes (0 or 1), same length as ``data``.
        n_bootstrap: number of resamples, each drawn with replacement and of size ``len(data)``.
        flip_labels: when True, the metrics are computed with the roles of the two classes
            swapped (FP<->FN and TP<->TN). The "TP"/"TN"/"FP"/"FN" entries of the result are
            NOT swapped; they always refer to class 1 as the positive class.

    Returns:
        dict with, in this order: the mean over resamples of "accuracy", "precision", "recall",
        "f1_score", "fp_rate", "tp_rate"; their standard deviations under "std_<metric>"; and
        the mean counts "TP", "TN", "FP", "FN". A metric whose denominator is 0 in a resample
        (e.g. only one class present) counts as 0 for that resample.

    Raises:
        ValueError: if ``data`` and ``labels`` do not have the same length.

    Also prints the average and standard-deviation dictionaries to stdout.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    if len(data) != len(labels):
        raise ValueError(
            f"data and labels must have the same length, got {len(data)} and {len(labels)}"
        )
    n_samples = len(data)

    # confusion-matrix counts, one entry per bootstrap resample
    nb_false_positives = np.zeros(n_bootstrap)
    nb_false_negatives = np.zeros(n_bootstrap)
    nb_true_positives = np.zeros(n_bootstrap)
    nb_true_negatives = np.zeros(n_bootstrap)

    # with no samples there is nothing to draw; the counts simply stay at 0
    if n_samples > 0:
        for i in range(n_bootstrap):
            bootstrap_sample = np.random.choice(n_samples, n_samples, replace=True)
            sampled_preds = data[bootstrap_sample]
            sampled_labels = labels[bootstrap_sample]
            nb_false_positives[i] = np.sum((sampled_preds == 1) & (sampled_labels == 0))
            nb_false_negatives[i] = np.sum((sampled_preds == 0) & (sampled_labels == 1))
            nb_true_positives[i] = np.sum((sampled_preds == 1) & (sampled_labels == 1))
            nb_true_negatives[i] = np.sum((sampled_preds == 0) & (sampled_labels == 0))

    if flip_labels:
        # treat class 0 as the positive class
        tp, tn = nb_true_negatives, nb_true_positives
        fp, fn = nb_false_negatives, nb_false_positives
    else:
        tp, tn = nb_true_positives, nb_true_negatives
        fp, fn = nb_false_positives, nb_false_negatives

    # the denominators can be 0 because a dataset (or a resample) may contain a single class
    if n_samples == 0:
        accuracy = np.zeros(n_bootstrap)
    else:
        accuracy = (tp + tn) / n_samples
    metric_results = {
        "accuracy": accuracy,
        "precision": _safe_ratio(tp, tp + fp),
        "recall": _safe_ratio(tp, tp + fn),
        "f1_score": _safe_ratio(2 * tp, 2 * tp + fp + fn),
        "fp_rate": _safe_ratio(fp, fp + tn),
        "tp_rate": _safe_ratio(tp, tp + fn),
    }

    avg_metrics = {metric: np.mean(values) for metric, values in metric_results.items()}
    std_metrics = {metric: np.std(values) for metric, values in metric_results.items()}

    print("Average metrics: ", avg_metrics)
    print("Standard deviation of metrics: ", std_metrics)

    # averages keep the plain metric name, standard deviations are stored as std_{metric_name}
    metrics_dict = dict(avg_metrics)
    for metric, value in std_metrics.items():
        metrics_dict["std_" + metric] = value

    # add TP, TN, FP, FN to the metrics_dict
    metrics_dict["TP"] = np.mean(nb_true_positives)
    metrics_dict["TN"] = np.mean(nb_true_negatives)
    metrics_dict["FP"] = np.mean(nb_false_positives)
    metrics_dict["FN"] = np.mean(nb_false_negatives)

    return metrics_dict

def create_logger(name, silent=False, to_disk=False, log_file=None):
    """Create (or reconfigure) the logger called ``name``.

    Args:
        name: logger name passed to ``logging.getLogger``.
        silent: when False, messages of level DEBUG and above are echoed to stdout.
        to_disk: when True, messages of level INFO and above are also written to file(s).
        log_file: a path, or a list/tuple of paths, to write to when ``to_disk`` is True.
            Defaults to ``log/log_<month><day>_<hour><minute>.txt`` (UTC). Files are opened in
            'w' mode, so existing content is overwritten.

    Returns:
        The configured ``logging.Logger`` (propagation to the root logger is disabled).

    Raises:
        TypeError: if ``log_file`` is neither a path nor a list/tuple of paths.
    """
    import os  # local import so the function does not depend on another notebook cell

    # setup logger
    log = logging.getLogger(name)
    log.setLevel(logging.DEBUG)
    log.propagate = False

    # logging.getLogger returns the same object for the same name, so calling this function
    # again (e.g. re-running a cell) would stack handlers and duplicate every message
    for stale_handler in list(log.handlers):
        log.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(fmt='%(message)s', datefmt='%Y/%m/%d %I:%M:%S')
    if not silent:
        ch = logging.StreamHandler(sys.stdout)
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(formatter)
        log.addHandler(ch)

    if to_disk:
        if log_file is None:
            log_file = strftime("log/log_%m%d_%H%M.txt", gmtime())

        if isinstance(log_file, (str, os.PathLike)):
            filenames = [log_file]
        elif isinstance(log_file, (list, tuple)):
            filenames = list(log_file)
        else:
            raise TypeError(f"log_file must be a path or a list of paths, got {type(log_file).__name__}")

        for filename in filenames:
            # FileHandler does not create missing directories
            parent_dir = os.path.dirname(os.fspath(filename))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            fh = logging.FileHandler(filename, mode='w')
            fh.setLevel(logging.INFO)
            fh.setFormatter(formatter)
            log.addHandler(fh)
    return log

class ExperimentPipeline(ABC):
    """Abstract base class for experiment pipelines; subclasses implement ``run_pipeline``."""

    def __init__(self):
        """The base class keeps no state."""

    @abstractmethod
    def run_pipeline(self):
        """Run the experiment. Must be overridden by concrete pipelines."""
        return None
    
class ExperimentTestPipeline(ExperimentPipeline):
    """Build (or reload) an attacked test dataset and evaluate a detector on it.

    Rows of the test split with label 0 hold human-written articles. Rows with label 1 are filled
    with text produced by ``attack`` from the row's own 'prefix'.
    """

    def __init__(self, dataset_loader, attack, detector, device, experiment_path,
                 watermarking_scheme=None, batch_size=1, classifier_threshold=None):
        """Store the experiment components and open the experiment log.

        Args:
            dataset_loader: object exposing ``load_data()`` (returns a mapping with a "test"
                split) and a ``dataset_name`` attribute.
            attack: object exposing ``generate_adversarial_text(prefixes, batch_size=...)`` and an
                ``attack_name`` attribute.
            detector: object exposing
                ``detect(texts, detection_threshold=..., batch_size=...) -> (preds, logits)``.
            device: stored as ``self.device``; not used by this class itself.
            experiment_path: directory receiving the log file and the JSON results.
            watermarking_scheme: when not None, "_watermarked" is appended to the dataset name.
            batch_size: batch size forwarded to the attack and to the detector.
            classifier_threshold: optional extra decision threshold; when given, metrics are also
                computed for ``logits > classifier_threshold``.

        Raises:
            ValueError: if ``experiment_path`` is None.
        """
        if experiment_path is None:
            raise ValueError("Experiment path not set")

        self.dataset_loader = dataset_loader
        self.attack = attack
        self.detector = detector
        self.device = device
        self.experiment_path = experiment_path
        self.batch_size = batch_size
        self.watermarking_scheme = watermarking_scheme
        # run_pipeline reads this attribute, so it must always exist
        self.classifier_threshold = classifier_threshold

        # setup log (the experiment directory must exist before the log file can be opened)
        os.makedirs(experiment_path, exist_ok=True)
        log_path = f"{experiment_path}/log"
        self.log = create_logger(__name__, silent=False, to_disk=True,
                                 log_file=log_path)

    def create_logger(self, log_path=None):
        """Replace ``self.log`` with a logger writing to ``<log_path>/log.txt``.

        Args:
            log_path: directory for the log file; defaults to ``self.experiment_path``.

        Raises:
            ValueError: if neither ``log_path`` nor ``self.experiment_path`` is set.
        """
        if log_path is None:
            if self.experiment_path is None:
                raise ValueError("Experiment path not set")
            log_path = self.experiment_path

        # create (or truncate) the log file
        os.makedirs(log_path, exist_ok=True)
        with open(f"{log_path}/log.txt", "w") as f:
            f.write("")

        # inside a method this name resolves to the module-level create_logger function
        log = create_logger(__name__, silent=False, to_disk=True,
                            log_file=f"{log_path}/log.txt")
        self.log = log

    def create_experiment_dataset(self, dataset_name):
        """Create the (adversarial) test dataset, save it and return it.

        Every row with label 1 gets its 'article' replaced by text generated by the attack from
        that row's own 'prefix', which keeps prefixes and generated articles aligned. Rows with
        label 0 are left untouched.

        Args:
            dataset_name: folder name used under ``data/generated_datasets/``.

        Returns:
            The filled test split.

        Raises:
            ValueError: if the attack does not return exactly one text per prefix.
        """
        # Load the base dataset; we only use the test data split here
        dataset = self.dataset_loader.load_data()["test"]

        labels = list(dataset["label"])
        prefixes = list(dataset["prefix"])
        articles = list(dataset["article"])

        # Generate adversarial examples for the rows that have to be filled
        fake_indices = [idx for idx, label in enumerate(labels) if label == 1]
        fake_prefixes = [prefixes[idx] for idx in fake_indices]
        if fake_prefixes:
            fake_articles = self.attack.generate_adversarial_text(fake_prefixes, batch_size=self.batch_size)
        else:
            fake_articles = []

        if len(fake_articles) != len(fake_indices):
            raise ValueError(
                f"Attack returned {len(fake_articles)} texts for {len(fake_indices)} prefixes"
            )

        for idx, fake_article in zip(fake_indices, fake_articles):
            articles[idx] = fake_article

        # Rows returned by dataset[i] are copies, so assigning into them does not change the
        # dataset; the column has to be rewritten through map().
        dataset = dataset.map(lambda sample, idx: {"article": articles[idx]}, with_indices=True)

        # Save the dataset using a specific naming convention
        dataset.save_to_disk(f"data/generated_datasets/{dataset_name}")

        return dataset

    def run_pipeline(self):
        """Run the detector on the attacked dataset, log the metrics and write them to JSON.

        The dataset is loaded from ``data/generated_datasets/<name>`` when it exists and is
        generated otherwise. Results go to ``<experiment_path>/test/`` or, when a classifier
        threshold is set, to ``<experiment_path>/test_at_threshold/``.

        Returns:
            The dict of results that was written to the JSON file.
        """
        log = self.log

        # check if the dataset has already been generated for the attack and base dataset
        base_dataset_name = self.dataset_loader.dataset_name
        attack_name = self.attack.attack_name
        dataset_name = f"{base_dataset_name}_{attack_name}"
        if self.watermarking_scheme is not None:
            dataset_name += "_watermarked"

        dataset_dir = f"data/generated_datasets/{dataset_name}"
        if os.path.isdir(dataset_dir):
            log.info(f"Dataset {dataset_name} already exists, loading it")
            dataset = load_from_disk(dataset_dir)
        else:
            log.info(f"Dataset {dataset_name} does not exist, creating it")
            dataset = self.create_experiment_dataset(dataset_name)

        ### TEST THE DETECTOR ###
        fake_true_articles = dataset["article"][:]
        preds, logits = self.detector.detect(fake_true_articles, detection_threshold=0.5, batch_size=self.batch_size)

        # work on arrays: comparing or indexing plain lists (`labels == 1`, `logits > t`) does not
        # operate element-wise
        preds = np.asarray(preds)
        logits = np.asarray(logits)
        labels = np.asarray(list(dataset["label"]))

        # compute metrics
        nb_pos_labels = np.sum(labels == 1)
        nb_neg_labels = np.sum(labels == 0)

        if nb_pos_labels == 0 or nb_neg_labels == 0:
            # only one class in the dataset, the ROC curve is undefined
            roc_auc = 0
            fpr = np.zeros(1)
            tpr = np.zeros(1)
            thresholds = np.zeros(1)
        else:
            roc_auc = roc_auc_score(labels, logits)
            fpr, tpr, thresholds = roc_curve(labels, logits)

        results = compute_bootstrap_metrics(preds, labels)

        log.info("Test metrics:")
        for key, value in results.items():
            log.info(f"{key}: {value}")

        # also log the roc_auc and the fpr, tpr, thresholds
        log.info(f"roc_auc: {roc_auc}")
        log.info(f"fpr: {fpr}")
        log.info(f"tpr: {tpr}")
        log.info(f"thresholds: {thresholds}")

        results["roc_auc"] = roc_auc
        results["fpr_at_thresholds"] = fpr.tolist()
        results["tpr_at_thresholds"] = tpr.tolist()
        results["thresholds"] = thresholds.tolist()

        use_given_threshold = self.classifier_threshold is not None
        if use_given_threshold:
            preds_at_threshold = np.where(logits > self.classifier_threshold, 1, 0)

            results_at_threshold = compute_bootstrap_metrics(preds_at_threshold, labels)
            log.info("Test metrics at specific given threshold:")

            for key, value in results_at_threshold.items():
                log.info(f"{key}: {value}")

            # add them to results dict as f"{key}_at_given_threshold"
            results["given_threshold"] = self.classifier_threshold
            for key, value in results_at_threshold.items():
                results[f"{key}_at_given_threshold"] = value

        # The path is now always assigned; it used to be set only when the "test" folder had to
        # be created, which left it undefined on every later run.
        # NOTE(review): the file name uses the base dataset name only, so runs with different
        # attacks that share an experiment_path overwrite each other.
        results_subdir = "test_at_threshold" if use_given_threshold else "test"
        results_dir = f"{self.experiment_path}/{results_subdir}"
        os.makedirs(results_dir, exist_ok=True)
        json_res_file_path = f"{results_dir}/test_metrics_{base_dataset_name}.json"

        with open(json_res_file_path, "w") as f:
            f.write(json.dumps(results, indent=4))

        return results
 
        
        
        
        

# Generate all datasets (attack + non attack)

# Watermarking

In [None]:
class WatermarkingDetector(Detector):
    """Detector that flags a text when it contains a fixed watermark."""

    def __init__(self, watermark):
        """Remember the watermark to look for."""
        self.watermark = watermark

    def detect(self, text: str) -> bool:
        """Return True when the stored watermark occurs in ``text``."""
        needle = self.watermark
        is_watermarked = needle in text
        return is_watermarked

In [None]:
# NOTE(review): `Generator` is not imported in any visible cell (only `LLMGenerator` is imported
# from `generator`), so this class statement raises a NameError unless the name is defined
# elsewhere. Confirm the intended base class.
class WatermarkingGenerator(Generator):
    """Generator that prepends a fixed watermark string to the prompt before generating."""
    
    def __init__(self, model, tokenizer, watermark):
        """Store the model, its tokenizer and the watermark string."""
        self.model = model
        self.tokenizer = tokenizer
        self.watermark = watermark
        
    def generate(self, prompt: str) -> str:
        """Generate from "<watermark> <prompt>" and return the first decoded sequence."""
        text = self.watermark + " " + prompt
        # NOTE(review): the tokenized inputs are not moved to the model's device here; confirm
        # that model and inputs live on the same device.
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = self.model.generate(**inputs)
        # only the first returned sequence is decoded, with special tokens stripped
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)