In [1]:
from abc import ABC, abstractmethod
from dataclasses import dataclass

from datasets import concatenate_datasets, load_from_disk, DatasetDict
import argparse
import os
import pandas as pd
import copy
from tqdm import tqdm

import torch
from torch import nn
import nltk.data
nltk.download('punkt')
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoModelForCausalLM,
                          ElectraForSequenceClassification, ElectraTokenizer, AutoConfig)
from datasets import load_from_disk, concatenate_datasets, Dataset

from abc import ABC, abstractmethod

from evasion_attack import PromptParaphrasingAttack, PromptAttack, GenParamsAttack
from utils import ModelConfig, PromptConfig
from generator import LLMGenerator
from detector import Detector, BertDetector
from fast_detect_gpt import FastDetectGPT

[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


# Attacks

In [2]:
# Baseline sampling settings handed to the generators configured below.
# min_new_tokens and max_new_tokens carry the same value (100).
default_gen_params = dict(
    max_new_tokens=100,
    min_new_tokens=100,
    temperature=0.8,
    top_p=0.95,
    repetition_penalty=1,
    do_sample=True,
    top_k=50,
)

# Use the GPU whenever torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Load the instruction-tuned Qwen2 checkpoint. The same model instance is
# used below both as the paraphraser and as the article generator.
paraphraser_tokenizer = AutoTokenizer.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    pad_token='<|extra_0|>',
    eos_token='<|endoftext|>',
    padding_side='left',
    trust_remote_code=True
)

# The model is placed with a single explicit `.to(device)`. The previous
# combination of `device_map="auto"` and `.to(device)` asked two mechanisms
# to place the weights; moving a model that was already dispatched by
# `device_map` is at best redundant and fails when the weights were split
# across several devices.
paraphraser = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-0.5B-Instruct",
    torch_dtype="auto",
    pad_token_id=paraphraser_tokenizer.pad_token_id,
).to(device)


paraphraser_config = ModelConfig(paraphraser_tokenizer,
    use_chat_template=True, chat_template_type="system_user", gen_params=default_gen_params, device=device)
paraphraser_model = LLMGenerator(paraphraser, paraphraser_config)

# The generator reuses the paraphraser's model, tokenizer and config.
gen_model = paraphraser_model
gen_tokenizer = paraphraser_tokenizer
gen_config = paraphraser_config


# Toy prompts standing in for a real dataset.
dataset_list = ["Why are cats better than dogs", "Why are dogs better than cats", "Why are cats better than dogs",
                "Why are dogs better than cats", "Why are cats better than dogs"]

# Adjacent string literals are concatenated into one clean prompt. The earlier
# triple-quoted version embedded stray double quotes, newlines and indentation
# in the text that was sent to the model.
system_paraphrasing_prompt = (
    "You are a paraphraser. You are given an input passage ‘INPUT’. You should paraphrase ‘INPUT’ to print ‘OUTPUT’. "
    "‘OUTPUT’ should be diverse and different as much as possible from ‘INPUT’ and should not copy any part verbatim from ‘INPUT’. "
    "‘OUTPUT’ should preserve the meaning and content of ‘INPUT’ while maintaining text quality and grammar. "
    "‘OUTPUT’ should not be much longer than ‘INPUT’. You should print ‘OUTPUT’ and nothing else so that it's easy for me to parse."
)
user_paraphrasing_prompt = "INPUT:"
# The system prompt refers to the passage as ‘INPUT’, so the user prompt that
# was defined for it is now actually passed in (it used to be dropped in
# favour of an empty string).
# NOTE(review): confirm how PromptConfig combines `user_prompt` with the text.
paraphraser_prompt_config = PromptConfig(system_prompt=system_paraphrasing_prompt, user_prompt=user_paraphrasing_prompt)


gen_prompt = "You are a helpful assistant."
user_prompt = "Write a news article starting with:"
gen_prompt_config = PromptConfig(system_prompt=gen_prompt, user_prompt=user_prompt)

# Generate adversarial texts with the paraphrasing attack and display them.
prompt_paraphrasing_attack = PromptParaphrasingAttack(gen_model, gen_config, gen_prompt_config, paraphraser_model, paraphraser_config, paraphraser_prompt_config)
paraphrased_fake_articles = prompt_paraphrasing_attack.generate_adversarial_text(dataset_list, batch_size=2)

paraphrased_fake_articles

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[' Many people argue that cats are simply more independent and independent in their behavior compared to dogs because they are more agile and agile than dogs. Cats can easily run and jump over obstacles while dogs are typically slower and more likely to get into trouble. Additionally, cats are often more social than dogs. This is why they are often the better option for humans. However, it is important to note that this argument is not based on any scientific evidence or studies. Additionally, it is important to note that it is',
 " Dogs and cats are both great pets but dogs are generally considered to be more reliable and loyal. They have better physical abilities, such as running longer distances and being more alert and focused than cats. Additionally, dogs are known to be more intelligent than cats and have been shown to be more docile and trainable. However, there are some differences in their personalities and behaviors, and it's important to take into account the specific needs 

In [4]:
# Prompt attack: same generator, but the user prompt additionally asks for a
# CNN news article style.
adversarial_system_prompt = "You are a helpful assistant."
advesarial_user_prompt = "Write a news article in the CNN news article style starting with:"
adversarial_prompt_config = PromptConfig(
    user_prompt=advesarial_user_prompt,
    system_prompt=adversarial_system_prompt,
)

prompt_attack = PromptAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_prompt_config,
)
prompt_attack_fake_articles = prompt_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
prompt_attack_fake_articles

['Why are cats better than dogs ?\nThe question of whether cats are better than dogs is a topic that has been debated for centuries. While both animals are intelligent and capable of achieving success in various pursuits, there are several factors that could explain why cats are often considered more well-adjusted and intelligent than dogs. \nOne reason that cats may outperform dogs in certain tasks is that they are often social and have a natural inclination towards nurturing and caring for others. Dogs, on the other hand, are often social animals that are more independent',
 'Why are dogs better than cats ?\nIn many households, dogs are the most commonly owned pets, but they are also beloved companions for many families. While cats may be a smaller pet, they offer a unique way to bond with humans and provide a loyal and affectionate friend.\nAccording to a recent study published in the journal Nature, dogs are more social animals than cats and are known for their loyalty and intellig

In [5]:
# Generation-parameter attack: pass a temperature of 1.2 as the adversarial
# generation parameter.
adversarial_gen_param = dict(temperature=1.2)

gen_parameters_attack = GenParamsAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_gen_param,
)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
gen_parameters_fake_articles

["Why are cats better than dogs ?\nThe topic of cats being more intelligent and affectionate than dogs is a widely debated subject, with different opinions within the media, scientists, and among pet owners themselves. Some argue that cats are superior, as they are capable of better cognitive functions and possess unique personality traits. Others believe that dogs are more capable and docile.\nRegardless of one's perspective, research has consistently shown that cats excel in certain tasks and interactions with their owners, and that they possess a keen sense of smell, a keen",
 'Why are dogs better than cats ?\nThere are many reasons why dogs are often considered to be superior to cats. However, one key factor that stands out is their exceptional ability to sense their surroundings and react appropriately. In general, dogs are more alert and responsive to their environment, which means they are able to adapt to a wide range of situations quickly and effectively.\nFurthermore, dogs ar

In [6]:
# Same generation-parameter attack with an extreme temperature of 10.0.
# This rebinds the three names also assigned by the temperature-1.2 cell.
adversarial_gen_param = dict(temperature=10.0)

gen_parameters_attack = GenParamsAttack(
    gen_model,
    gen_config,
    gen_prompt_config,
    adversarial_gen_param,
)
gen_parameters_fake_articles = gen_parameters_attack.generate_adversarial_text(
    dataset_list,
    batch_size=2,
)
gen_parameters_fake_articles

['Why are cats better than dogs ... It all begins now as The "Alo !!g " breeds one and every once and every another discovers we must take that leap that only some things do (it says...) In an experiment known to lovers there, scientists find it takes three bites rather just... A couple dogs decide between finding themselves cats instead, just a new way at trying and it happens, right? We then found and had found one in which, this morning The little brown one comes for it A dog named Jodie decides',
 "Why are dogs better than cats  when coming from animal福利? Why and Can. When one breeds or socialize to another?\nThis is one reason dog owners have long debated: While both of species excel in all physical behaviors: Intelligence vs Motion Dog Behavior The research shows:\nMost experts, if all of breed's qualities were transferred as their ability towards social relationships would greatly evolve from breed A dog may learn tricks well if bred A because those that grew the best with human

# Detection

In [2]:
# Build an ELECTRA-large sequence classifier from the base config, then
# restore the fine-tuned detector weights from a local checkpoint.
device = "cuda" if torch.cuda.is_available() else "cpu"
detector_path = "google/electra-large-discriminator"
config = AutoConfig.from_pretrained(detector_path)
detector_model = ElectraForSequenceClassification(config)
bert_tokenizer = ElectraTokenizer.from_pretrained(detector_path)

model_path = "../saved_training_logs_experiment_2/electra_large/full_finetuning/fake_true_dataset_round_robin_10k/10_06_1308/saved_models/best_model.pt"
# map_location remaps the stored tensors onto the device chosen above, so a
# checkpoint saved from a GPU still loads when only the CPU is available.
detector_model.load_state_dict(torch.load(model_path, map_location=device))
detector_model.to(device)
# A model constructed directly from a config starts in training mode; switch
# to eval so dropout is disabled while the model is used for detection.
# eval() returns the model, so the cell still displays its architecture.
detector_model.eval()



ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-23): 24 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (L

In [4]:
# Run the fine-tuned classifier on two toy inputs.
text_to_detect = [
    "I am AI generated text",
    "I am human generated text",
]
detector = BertDetector(detector_model, bert_tokenizer, device)
preds, logits = detector.detect(
    text_to_detect,
    batch_size=2,
    detection_threshold=0.5,
)

NameError: name 'detector_model' is not defined

In [4]:
# Display the current values of `preds` and `logits` as the cell output.
preds, logits

([0, 0], [0.14659571647644043, -0.6974161863327026])

## Fast-DetectGPT

In [2]:
import numpy as np
from torch.utils.data import DataLoader
import glob
import json

class FastDetectGPT(Detector):
    """Detector based on a sampling-discrepancy criterion between two language models.

    A scoring model provides the log-probabilities of the observed tokens.
    These are compared with the log-probabilities expected under the reference
    model's next-token distribution. The normalised gap (the "criterion") is
    then converted into a probability by ``ProbEstimatorFastDetectGPT`` using
    criteria loaded from JSON result files.
    """

    def __init__(self, ref_model, scoring_model, ref_tokenizer, scoring_tokenizer, device):
        """Store the two models, their tokenizers and the device inputs are moved to."""
        self.ref_model = ref_model
        self.scoring_model = scoring_model
        self.ref_tokenizer = ref_tokenizer
        self.scoring_tokenizer = scoring_tokenizer
        self.device = device

    @staticmethod
    def get_samples(logits, labels, nsamples=10000):
        """Draw ``nsamples`` alternative tokens per position from ``logits``.

        Both ``logits`` and ``labels`` must have a leading batch dimension of 1.
        The sampled token ids are returned with the sample axis last.
        Declared static because the method uses no instance state; it was
        previously defined without ``self`` and so failed when called through
        an instance.
        """
        assert logits.shape[0] == 1
        assert labels.shape[0] == 1
        lprobs = torch.log_softmax(logits, dim=-1)
        distrib = torch.distributions.categorical.Categorical(logits=lprobs)
        # sample() puts the sample axis first; move it to the end.
        samples = distrib.sample([nsamples]).permute([1, 2, 0])
        return samples

    @staticmethod
    def get_likelihood(logits, labels):
        """Return the log-likelihood of ``labels`` under ``logits``, averaged over dim 1.

        ``labels`` may omit the trailing axis (observed tokens) or carry a
        trailing sample axis (output of ``get_samples``).
        """
        assert logits.shape[0] == 1
        assert labels.shape[0] == 1
        labels = labels.unsqueeze(-1) if labels.ndim == logits.ndim - 1 else labels
        lprobs = torch.log_softmax(logits, dim=-1)
        log_likelihood = lprobs.gather(dim=-1, index=labels)
        return log_likelihood.mean(dim=1)

    def get_sampling_discrepancy(self, logits_ref, logits_score, labels):
        """Estimate the discrepancy by sampling tokens from the reference model.

        Returns a Python float: the log-likelihood of the observed tokens under
        the scoring model, standardised by the mean and standard deviation of
        the log-likelihoods of tokens sampled from the reference logits.
        """
        assert logits_ref.shape[0] == 1
        assert logits_score.shape[0] == 1
        assert labels.shape[0] == 1
        if logits_ref.size(-1) != logits_score.size(-1):
            # Vocabulary sizes differ: keep only the ids both models share.
            vocab_size = min(logits_ref.size(-1), logits_score.size(-1))
            logits_ref = logits_ref[:, :, :vocab_size]
            logits_score = logits_score[:, :, :vocab_size]

        samples = self.get_samples(logits_ref, labels)
        # Fix: the observed-token likelihood has to be computed; the previous
        # code only built the tuple (logits_score, labels).
        log_likelihood_x = self.get_likelihood(logits_score, labels)
        log_likelihood_x_tilde = self.get_likelihood(logits_score, samples)
        miu_tilde = log_likelihood_x_tilde.mean(dim=-1)
        sigma_tilde = log_likelihood_x_tilde.std(dim=-1)
        discrepancy = (log_likelihood_x.squeeze(-1) - miu_tilde) / sigma_tilde
        return discrepancy.item()

    def get_sampling_discrepancy_analytic(self, logits_ref, logits_score, labels):
        """Compute the discrepancy in closed form, without sampling.

        The mean and variance of the scoring model's log-probabilities are
        taken under the reference model's next-token distribution at every
        position, summed over positions, and used to standardise the summed
        log-likelihood of the observed tokens. Returns a Python float.
        """
        assert logits_ref.shape[0] == 1
        assert logits_score.shape[0] == 1
        assert labels.shape[0] == 1
        if logits_ref.size(-1) != logits_score.size(-1):
            # Vocabulary sizes differ: keep only the ids both models share.
            vocab_size = min(logits_ref.size(-1), logits_score.size(-1))
            logits_ref = logits_ref[:, :, :vocab_size]
            logits_score = logits_score[:, :, :vocab_size]

        labels = labels.unsqueeze(-1) if labels.ndim == logits_score.ndim - 1 else labels
        lprobs_score = torch.log_softmax(logits_score, dim=-1)
        probs_ref = torch.softmax(logits_ref, dim=-1)
        log_likelihood = lprobs_score.gather(dim=-1, index=labels).squeeze(-1)
        mean_ref = (probs_ref * lprobs_score).sum(dim=-1)
        var_ref = (probs_ref * torch.square(lprobs_score)).sum(dim=-1) - torch.square(mean_ref)
        discrepancy = (log_likelihood.sum(dim=-1) - mean_ref.sum(dim=-1)) / var_ref.sum(dim=-1).sqrt()
        discrepancy = discrepancy.mean()
        return discrepancy.item()

    class ProbEstimatorFastDetectGPT:
        """Map a criterion value to a probability using reference criteria read from disk."""

        def __init__(self, args=None, ref_path=None):
            """Load the reference criteria from every ``*.json`` file in ``ref_path``.

            ``args.ref_path`` takes precedence when ``args`` is given. Each file
            is expected to hold ``predictions.real`` and ``predictions.samples``
            lists.
            """
            if args is not None:
                ref_path = args.ref_path
            self.real_crits = []
            self.fake_crits = []
            for result_file in glob.glob(os.path.join(ref_path, '*.json')):
                with open(result_file, 'r') as fin:
                    res = json.load(fin)
                    self.real_crits.extend(res['predictions']['real'])
                    self.fake_crits.extend(res['predictions']['samples'])
            # Report the actual total; doubling the "real" count is only
            # correct when both lists happen to have the same length.
            print(f'ProbEstimator: total {len(self.real_crits) + len(self.fake_crits)} samples.')

        def crit_to_prob(self, crit):
            """Return the share of "fake" criteria among the reference criteria closest to ``crit``.

            The neighbourhood radius is the distance to the 101st closest
            reference criterion (or the farthest one when fewer are loaded).

            Raises:
                IndexError: if no reference criteria were loaded.
            """
            real = np.asarray(self.real_crits)
            fake = np.asarray(self.fake_crits)
            distances = np.sort(np.abs(np.concatenate([real, fake]) - crit))
            if distances.size == 0:
                raise IndexError(
                    "ProbEstimator has no reference criteria; "
                    "check that ref_path contains result *.json files."
                )
            # Clamp the index so that small reference sets do not overflow.
            offset = distances[min(100, distances.size - 1)]
            cnt_real = np.sum((real > crit - offset) & (real < crit + offset))
            cnt_fake = np.sum((fake > crit - offset) & (fake < crit + offset))
            return cnt_fake / (cnt_real + cnt_fake)

    def detect(self, texts: list, ref_path: str = "local_infer_ref",
               detection_threshold: float = 0.5) -> tuple:
        """Score every text and threshold the resulting probability.

        Args:
            texts: list of strings to classify.
            ref_path: directory with the reference ``*.json`` result files used
                by the probability estimator.
            detection_threshold: a text is labelled 1 when its probability is
                strictly above this value, otherwise 0.

        Returns:
            ``(preds, probs)``: two numpy arrays with one entry per input text.
        """
        criterion_fn = self.get_sampling_discrepancy_analytic
        prob_estimator = self.ProbEstimatorFastDetectGPT(ref_path=ref_path)

        # The reference pass can be skipped only when the same model object
        # plays both roles. Comparing two identical hard-coded names made the
        # reference branch unreachable.
        shared_model = self.ref_model is self.scoring_model

        preds = []
        probs = []
        with torch.no_grad():
            # Texts are scored one at a time: the criterion functions require a
            # batch of exactly one, and a single sequence needs no padding.
            for text in tqdm(texts, desc="Performing detection on dataset..."):
                tokenized = self.scoring_tokenizer(
                    [text], return_tensors="pt", padding=True, return_token_type_ids=False
                ).to(self.device)
                # Next-token setup: position t of the logits predicts token t + 1.
                labels = tokenized.input_ids[:, 1:]
                logits_score = self.scoring_model(**tokenized).logits[:, :-1]

                if shared_model:
                    logits_ref = logits_score
                else:
                    tokenized = self.ref_tokenizer(
                        [text], return_tensors="pt", padding=True, return_token_type_ids=False
                    ).to(self.device)
                    assert torch.all(tokenized.input_ids[:, 1:] == labels), "Tokenizer is mismatch."
                    logits_ref = self.ref_model(**tokenized).logits[:, :-1]

                crit = criterion_fn(logits_ref, logits_score, labels)
                prob = prob_estimator.crit_to_prob(crit)

                probs.append(prob)
                preds.append(1 if prob > detection_threshold else 0)

        return np.array(preds), np.array(probs)
            


In [3]:
# GPT-2 acts as both the reference and the scoring model for Fast-DetectGPT.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

ref_model_path = "openai-community/gpt2"
ref_tokenizer = AutoTokenizer.from_pretrained(
    ref_model_path,
    padding_side="left",
    trust_remote_code=True,
)
ref_model = AutoModelForCausalLM.from_pretrained(
    ref_model_path,
    torch_dtype="auto",
)
ref_model = ref_model.to(device)

# GPT-2 specific tokenizer setup: reuse the EOS token for padding and pad on the left.
ref_tokenizer.pad_token = ref_tokenizer.eos_token
ref_tokenizer.padding_side = 'left'

# Scoring reuses the very same model and tokenizer objects.
scoring_model, scoring_tokenizer = ref_model, ref_tokenizer



In [4]:
# Build the detector from the models loaded above and score two toy inputs.
fast_detector = FastDetectGPT(
    ref_model,
    scoring_model,
    ref_tokenizer,
    scoring_tokenizer,
    device,
)

texts = [
    "I am AI generated text",
    "I am human generated text",
]
preds, probs = fast_detector.detect(texts)
preds, probs

ProbEstimator: total 1800 samples.


Performing detection on dataset...:   0%|          | 0/2 [00:00<?, ?it/s]

Performing detection on dataset...: 100%|██████████| 2/2 [00:01<00:00,  1.59it/s]


(array([0, 0]), array([0., 0.]))

In [None]:
class WatermarkingDetector(Detector):
    """Detector that flags a text when it contains a fixed watermark string."""

    def __init__(self, watermark):
        """Remember the watermark string to look for."""
        self.watermark = watermark

    def detect(self, text: str) -> bool:
        """Return True when the watermark occurs in ``text``.

        An empty (or missing) watermark never matches: Python treats the empty
        string as a substring of every string, which would otherwise flag every
        text as watermarked.
        """
        if not self.watermark:
            return False
        return self.watermark in text

In [None]:
# NOTE(review): `Generator` is not among the names imported in this notebook's
# import cell (only `LLMGenerator` is imported from `generator`) — confirm
# where this base class is supposed to come from before running the cell.
class WatermarkingGenerator(Generator):
    """Generator that prepends a fixed watermark string to the prompt before generating."""

    def __init__(self, model, tokenizer, watermark):
        """Store the model, its tokenizer and the watermark string."""
        self.model = model
        self.tokenizer = tokenizer
        self.watermark = watermark

    def generate(self, prompt: str) -> str:
        """Generate from ``watermark + " " + prompt`` and return the decoded first sequence."""
        text = self.watermark + " " + prompt
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # The tokenizer output is not placed on the model's device by the call
        # above. Move it there when the model exposes a `device` attribute, so
        # generation does not fail with a device mismatch on a GPU model.
        model_device = getattr(self.model, "device", None)
        if model_device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(model_device)
        outputs = self.model.generate(**inputs)
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)