In [None]:
# Debug configuration.
# NOTE(review): neither name is read by any other cell visible in this notebook —
# presumably meant to cap a run at N_DEBUG documents when DEBUG is on; confirm or remove.
DEBUG: bool = False
N_DEBUG: int = 100

In [None]:
import pandas as pd
import time
import numpy as np

from gpt2outputdataset.detector_radford import DetectorRadford
from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
from detector_dummy import DetectorDummy
from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer

# Accumulator for result rows; it is re-initialised right before the results loop further down.
results: list = []


In [None]:
import transformers
from transformers import pipeline

import torch
import re
import sklearn
import spacy
# spaCy pipeline "en_core_web_sm"; `nlp` is applied to every document in the generation
# loop below, and its tokens are used to cut the document into prompt prefixes.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read the test split and keep only the rows authored by "human_answers".
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
test = pd.read_pickle("./dataset_test.pkl")

is_human_row = test["author"] == "human_answers"
test = test.loc[is_human_row]
print("len(test_human)", len(test))

documents = test["answer"]
# convention: 0: machine, 1: human, see detector.py
# (after the filter above every entry of gold_labels is True)
gold_labels = test["author"] == "human_answers"




# Detector and explainer implementations that the loops below iterate over.
from detector_guo import DetectorGuo
from explainer_wrappers import LIME_Explainer, SHAP_Explainer

detector_classes = [
    DetectorRadford,
    DetectorGuo,
    DetectorDetectGPT,
]
explainer_classes = [
    LIME_Explainer,
    SHAP_Explainer,
]

In [None]:
# Settings for the causal LM that continues the prompts.
DEVICE = "cuda"                        # device the model and the encoded prompts are moved to
base_model_name = "facebook/opt-350m"  # checkpoint passed to from_pretrained
cache_dir = "./.cache"                 # cache directory passed to from_pretrained

# NOTE(review): the names below are not read by any other cell visible in this notebook.
openai_model = False
do_top_k = False
do_top_p = False
pattern = re.compile(r"<extra_id_\d+>")

In [None]:

# Causal language model used to generate continuations of the prompts.
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_name,
    cache_dir=cache_dir,
)
base_model = base_model.to(DEVICE)

# Matching tokenizer, configured to pad on the left; prompts of different lengths are
# padded into one batch before being passed to `generate` in the generation loop.
base_tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_name,
    cache_dir=cache_dir,
    padding_side="left",
)


In [None]:
# Placeholder; rebound inside the generation loop to the detector's labels for each generated batch.
predictions = None

In [None]:
# Placeholder; rebound inside the generation loop to the decoded texts of each generated batch.
decoded = None

In [None]:
import os

In [None]:
from tqdm import tqdm
import itertools


# Generate/Load documents
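Generate label-flip document pairs (original document, prompt, edited document), or load previously generated pairs from the cache file `./contrastivity_label_flip_pairs.csv`.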

In [None]:
# Column layout of the CSV cache "./contrastivity_label_flip_pairs.csv":
# the generation loop appends one row per (detector, document) in exactly this order.
columns=["Detector", "Original", "Prompt", "Edited"]

In [None]:
# Load the CSV cache of label-flip pairs; on the first run create it with a header row only,
# so that the generation loop can append to it without writing headers.
pairs_csv_path = "./contrastivity_label_flip_pairs.csv"

if not os.path.isfile(pairs_csv_path):
    df = pd.DataFrame([], columns=columns)
    # write headers (mode != "a")
    df.to_csv(pairs_csv_path, encoding="UTF-8", index=False)
else:
    df = pd.read_csv(pairs_csv_path)
df

In [None]:
# For every detector, look for a "label-flip" counterpart of each document that the detector
# labels as human: keep a prefix of the document as prompt, let the base LM continue it, and
# stop at the first continuation that the detector no longer labels as human (label != 1).
# One row per (detector, document) is appended to the CSV cache; documents without a flipping
# continuation are stored with empty Prompt/Edited so they are not retried on the next run.
n_generations_per_length = 5  # sampled continuations per prompt length
batch_size = 20  # adjust so it fits in your GPU memory

for detector_class in detector_classes:
    detector = detector_class()
    detector_name = detector.__class__.__name__

    # Documents already present in the cache for this detector. Exact matching is used on
    # purpose: a substring test would also skip short documents that merely occur inside a
    # longer cached document. Building the set once also avoids rescanning the cache per document.
    cached_originals = set(df.loc[df["Detector"] == detector_name, "Original"].dropna())

    predicted_human = detector.predict_label(documents).astype(bool)  # only use those where f(x) = human
    for document in tqdm(documents[predicted_human], desc="Generating perturbations"):
        # Re-seed per document so a document's generations do not depend on how many
        # other documents were skipped or processed before it.
        np.random.seed(42)
        torch.manual_seed(42)

        if document in cached_originals:  # document is in csv, skip
            continue
        doc = nlp(document)
        # Only the token count is needed here, so no tensors / device transfer are required.
        n_tokens_original = len(base_tokenizer(document).input_ids)

        # Prefixes of the document obtained by dropping 1 .. len(doc)-1 trailing spaCy tokens
        # (longest prefix first). Iterating over tokens, not characters, avoids building
        # many empty prefixes.
        substrings = ["".join(token.text_with_ws for token in doc[:-i]) for i in range(1, len(doc))]
        substrings = [substring for substring in substrings if substring != ""]  # at least one token

        # Repeat every prefix so that several continuations are sampled per prompt length.
        substrings = list(itertools.chain.from_iterable(itertools.repeat(s, n_generations_per_length) for s in substrings))

        # Default row: marks the document in the csv cache if no pair is found.
        row = (detector_name, document, None, None)
        for batch in sklearn.utils.gen_batches(len(substrings), batch_size):
            prompts = substrings[batch]
            encoded = base_tokenizer(prompts, return_tensors="pt", padding=True).to(DEVICE)

            # Keep the generated text within +/- 5 tokens of the original length;
            # the lower bound is clamped so very short documents do not yield a negative min_length.
            outputs = base_model.generate(
                **encoded,
                min_length=max(n_tokens_original - 5, 0),
                max_length=n_tokens_original + 5,
                do_sample=True,
                pad_token_id=base_tokenizer.eos_token_id,
                eos_token_id=base_tokenizer.eos_token_id,
            )
            decoded = base_tokenizer.batch_decode(outputs, skip_special_tokens=True)

            predictions = detector.predict_label(decoded)
            if any(predictions != 1):
                first_new_label = (predictions != 1).argmax(axis=0)

                # (detector, original, prompt, first generation that flips the label)
                row = (detector_name, document, prompts[first_new_label], decoded[first_new_label])
                break
        pd.DataFrame([row], columns=columns).to_csv("./contrastivity_label_flip_pairs.csv", mode="a", encoding="UTF-8", index=False, header=False)


In [None]:
# Re-read the cache so that `df` also contains the rows appended by the generation loop.
df = pd.read_csv("./contrastivity_label_flip_pairs.csv")

In [None]:
# Keep only complete pairs: documents for which no label-flipping generation was found
# were written to the cache with empty Prompt/Edited fields and are removed here.
df = df.dropna()

In [None]:
from IPython.core.display import HTML

In [None]:
# Basic idea: check that exp(original)[original - prompt] is substantially different from exp(label_flip_example)[label_flip_example - prompt],
# i.e. that the new/changed section is assigned the opposite label (TODO hard coded: "machine") more often.

In [None]:
import krippendorff

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# For every (detector, explainer) combination, compare the explanation of each original
# document with the explanation of its label-flipped counterpart. Both documents share the
# prompt as a common prefix; feature-importance (FI) scores are summarised separately for
# the common part and for the cut (original) / newly generated (edited) part.
results = []
for detector_class in detector_classes:
    detector = detector_class()
    detector_name = detector.__class__.__name__
    pairs = df[df["Detector"] == detector_name]
    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)
        for _idx, (_detector, original, prompt, edited) in tqdm(pairs.iterrows(), total=len(pairs), desc="Gathering results"):

            # The generation strategy uses spaCy's tokenizer (punctuation chars end up in separate tokens).
            # The explanation methods have their own tokenizers and FI scores are reported w.r.t. those tokens.
            # For LIME, multiple punctuation chars can end up in the same token, e.g. "!)" is one token, "!" too.
            # Hence the bounds of the prompt (the part common to both documents) have to be determined
            # AFTER tokenization: shrink the bound until both documents agree on their first tokens.
            tokens_original = explainer.tokenize(original)
            tokens_edited = explainer.tokenize(edited)
            length_prompt = len(explainer.tokenize(prompt))  # would suffice for SHAP, but not for LIME
            while tokens_original[0:length_prompt] != tokens_edited[0:length_prompt]:
                length_prompt -= 1
            # If the first word is followed by a punctuation char, e.g. "Example! Is a sentence." with the
            # prompt "Example", LIME tokenizes to ['Example!', 'Is', 'a', 'sentence.'] and the loop above
            # ends at 0; keep at least one token in the common part.
            length_prompt = max(length_prompt, 1)
            assert tokens_original[1:length_prompt] == tokens_edited[1:length_prompt]

            # Explain each document once and slice the result. (Explaining twice doubles the cost and,
            # for sampling-based explainers, the two runs need not agree with each other.)
            # Index 0: TODO hard coded: "machine". fill=True returns all features (not just the top_k).
            fi_original = explainer.get_fi_scores(original, fill=True)[0]
            fi_edited = explainer.get_fi_scores(edited, fill=True)[0]

            # cut / newly generated parts
            exp_original_cut_part = fi_original[length_prompt:]
            exp_edited_new_part = fi_edited[length_prompt:]
            fi_scores_exp_original_cut_part = np.array([fi_score for _, fi_score in exp_original_cut_part])
            fi_scores_exp_edited_new_part = np.array([fi_score for _, fi_score in exp_edited_new_part])

            # common part
            exp_original_common_part = fi_original[0:length_prompt]
            exp_edited_common_part = fi_edited[0:length_prompt]
            fi_scores_exp_original_common_part = np.array([fi_score for _, fi_score in exp_original_common_part])
            fi_scores_exp_edited_common_part = np.array([fi_score for _, fi_score in exp_edited_common_part])

            # Validate BEFORE any reduction: .max() on an empty array would raise an opaque ValueError.
            # On failure, print the offending pair first to make the assertion below actionable.
            if (
                len(fi_scores_exp_original_cut_part) == 0
                or len(fi_scores_exp_edited_new_part) == 0
                or len(fi_scores_exp_original_common_part) == 0
                or len(fi_scores_exp_edited_common_part) == 0
            ):
                print(length_prompt)
                print(original)
                print(prompt)
                print(edited)
                print(fi_scores_exp_edited_new_part)
                print(fi_scores_exp_edited_common_part)

            assert len(fi_scores_exp_original_cut_part) > 0
            assert len(fi_scores_exp_edited_new_part) > 0
            assert len(fi_scores_exp_original_common_part) > 0
            assert len(fi_scores_exp_edited_common_part) > 0

            # Is the largest FI score of the edited document located in its newly generated part?
            max_fi_in_new_part = fi_scores_exp_edited_new_part.max() > fi_scores_exp_edited_common_part.max()

            # Two "raters" (original, edited) x tokens of the common part.
            canonical_form = np.vstack([fi_scores_exp_original_common_part, fi_scores_exp_edited_common_part])

            # The krippendorff library requires more than one distinct value. Treat any constant
            # matrix (not only the all-zero one) as perfect agreement.
            if np.all(canonical_form == canonical_form.flat[0]):
                k_alpha = 1
            else:
                k_alpha = krippendorff.alpha(canonical_form, level_of_measurement="interval")

            cos_sim = cosine_similarity(
                fi_scores_exp_original_common_part.reshape(1, -1),
                fi_scores_exp_edited_common_part.reshape(1, -1),
            )[0, 0]

            # Field order: Explainer, Detector, Mean FI Original Cut Part, Mean FI Edited New Part,
            # Mean FI Original Common Part, Mean FI Edited Common Part, cos_sim Common Part,
            # Krippendorff's Alpha Common Part, Maximum FI for Machine in New Part
            row = (
                explainer.__class__.__name__,
                detector_name,
                np.mean(fi_scores_exp_original_cut_part),
                np.mean(fi_scores_exp_edited_new_part),
                np.mean(fi_scores_exp_original_common_part),
                np.mean(fi_scores_exp_edited_common_part),
                cos_sim,
                k_alpha,
                max_fi_in_new_part,
            )
            results.append(row)



In [None]:
# Assemble the per-pair result rows into a DataFrame.
# A dedicated name is used for the header list: rebinding `columns` here would overwrite the
# CSV-cache schema that the generation loop passes to pd.DataFrame(..., columns=columns),
# so re-running that cell after this one would fail.
result_columns = [
    "Explainer",
    "Detector",
    "Mean FI Original Cut Part",
    "Mean FI Edited New Part",
    "Mean FI Original Common Part",
    "Mean FI Edited Common Part",
    "cos_sim Common Part",
    "Krippendorff's Alpha Common Part",
    "Maximum FI for Machine in New Part",
]
dff = pd.DataFrame(results, columns=result_columns)
dff

In [None]:
# Mean of every metric per (Explainer, Detector) combination.
dff.groupby(["Explainer", "Detector"]).mean()

In [None]:
# Mean of every metric per explainer, across detectors.
# numeric_only=True keeps the string column "Detector" out of the aggregation;
# without it pandas >= 2.0 raises a TypeError when averaging that column.
dff.groupby(["Explainer"]).mean(numeric_only=True)

In [None]:
# Relative change (in percent) of the mean FI between the cut part of the original document
# and the newly generated part of the label-flip ("lf") document.
# The means live in the results frame `dff`; the pair cache `df` has no such columns.
# NOTE(review): the former column names "E[original - prompt]" / "E[lf - prompt]" are mapped to
# "Mean FI Original Cut Part" / "Mean FI Edited New Part" — confirm this is the intended pairing.
dff["Change original -> lf [%]"] = (
    (dff["Mean FI Original Cut Part"] - dff["Mean FI Edited New Part"]) / dff["Mean FI Original Cut Part"]
) * 100
dff

In [None]:
# Per-explainer means of the results frame. `df` (the pair cache) has no "Explainer" column,
# so the aggregation has to run on `dff`; numeric_only=True skips the string column "Detector".
dff.groupby("Explainer").mean(numeric_only=True)

In [None]:
# Spot check on the last pair processed by the results loop: mean FI of the newly generated part.
# NOTE(review): the name `exp_label_flip_minus_prompt` is not defined anywhere in this notebook;
# `exp_edited_new_part` (edited document minus the common prompt) is presumably its successor — confirm.
np.mean([fi_score for _, fi_score in exp_edited_new_part])

In [None]:
# Spot check: (token, FI score) pairs of the newly generated part of the last pair processed.
# NOTE(review): the name `exp_label_flip_minus_prompt` is not defined anywhere in this notebook;
# `exp_edited_new_part` is presumably its successor — confirm.
exp_edited_new_part