In [1]:
# DEBUG: when True, a later cell keeps only the documents whose explanations
# (original and all perturbations) are already cached by every explainer.
DEBUG = False
# N_DEBUG: when > 0, only the first N_DEBUG documents are processed.
# NOTE(review): this limit is applied independently of DEBUG - confirm that is intended.
N_DEBUG = 10

In [2]:
import pandas as pd
import time
import numpy as np

from gpt2outputdataset.detector_radford import DetectorRadford
from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
from detector_dummy import DetectorDummy
from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer

In [3]:
import pandas as pd
import numpy as np
import transformers

In [4]:
# always load the full dataset! (np.random.shuffle(tokenized_sentences)). slice the actual hybrid_documents if debugging!
test = pd.read_pickle("./dataset_test.pkl")

documents = test["answer"]
# convention: 0: machine, 1: human, see detector.py
gold_labels = (test["author"] == "human_answers")

# restrict to the first N_DEBUG documents (and their labels) when a limit is configured
if N_DEBUG > 0:
    documents, gold_labels = documents[:N_DEBUG], gold_labels[:N_DEBUG]

# DetectorRadford and DetectorDetectGPT are currently left out of the run
detector_classes = [DetectorGuo]

explainer_classes = [LIME_Explainer, SHAP_Explainer]

In [5]:
# Single detector instance used as a module-level global by the perturbation
# generation, the sanity checks and the explainers in the cells below.
detector = DetectorGuo()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
import re

In [7]:
import torch
import pickle

In [8]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# matches the sentinel tokens (<extra_id_0>, <extra_id_1>, ...) used for mask filling
pattern = re.compile(r"<extra_id_\d+>")

# mask-filling model and tokenizer used to generate the perturbations
model = "t5-small"
cache_dir="./.cache"
mask_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model, cache_dir=cache_dir).to(DEVICE)
mask_tokenizer = transformers.AutoTokenizer.from_pretrained(model, model_max_length=mask_model.config.n_positions, cache_dir=cache_dir)

In [9]:
# NOTE(review): `columns` is not referenced by any other cell visible in this notebook - confirm before removing.
columns=["Detector", "Original", "Prompt", "Edited"]

In [10]:
def _extract_mask_fills(decoded):
    """Split a decoded mask-model output into its fill texts, free of special tokens.

    The output is split on the ``<extra_id_N>`` sentinels; ``<pad>`` and ``</s>``
    are removed from every segment so they can never end up in a perturbed document.
    """
    segments = [
        segment.replace("<pad>", "").replace("</s>", "").strip()
        for segment in re.split(r"<extra_id_\d*>", decoded)
    ]
    # The text in front of the first sentinel is normally nothing but the
    # decoder start token (<pad>); drop it when nothing else is left of it.
    if segments and not segments[0]:
        segments = segments[1:]
    return segments


# generate perturbed samples (similar to how it is done in detectgpt/detector_detectgpt.py)
def get_pertubed_text(text, n=1, max_attempts=None):
    """Generate ``n`` distinct perturbations of ``text`` that keep the detector's label.

    One randomly chosen space-separated token of ``text`` is masked and the
    mask is filled with text sampled from ``mask_model``. The same position is
    masked for all ``n`` perturbations; only the sampled fill differs.

    Relies on the module-level ``detector``, ``mask_model``, ``mask_tokenizer``
    and ``DEVICE``.

    Parameters
    ----------
    text : str
        Document to perturb.
    n : int
        Number of unique perturbations to generate.
    max_attempts : int or None
        Upper bound on sampling attempts per perturbation. ``None`` (default)
        retries without limit, as before.

    Returns
    -------
    list of str
        ``n`` perturbed documents; each differs from ``text`` and from the
        others, and each gets the same ``detector.predict_label`` result as ``text``.

    Raises
    ------
    RuntimeError
        If ``max_attempts`` is set and no valid perturbation is found in time.
    """
    tokens = text.split(' ')
    # select 1 (TODO) token in the original document to mask
    mask = np.zeros(len(tokens), dtype=bool)
    mask[np.random.randint(0, len(mask))] = 1 # TODO number of tokens to mask
    masked_positions = np.flatnonzero(mask).tolist()

    prediction_original = detector.predict_label([text])[0]

    # The masked input and the stop token are identical for every attempt, so build them once
    # (on a copy, leaving `tokens` untouched).
    masked_tokens = list(tokens)
    for sentinel_id, position in enumerate(masked_positions):
        masked_tokens[position] = "<extra_id_{}>".format(sentinel_id)
    masked_text = ' '.join(masked_tokens)
    # generation stops at the sentinel that follows the last masked token
    stop_id = mask_tokenizer.encode(f"<extra_id_{len(masked_positions)}>")[0]
    tok = mask_tokenizer(masked_text, return_tensors="pt", padding=True).to(DEVICE)

    past_generations = []
    # generate n unique perturbations (replace the same masked word(s) with one or more words)
    # while the experiment could also be run with random replacement (random word exchanged for each perturbation), the krippendorff metric would require more samples/coders to be significant (TODO explain effects of nan)
    for _ in range(0, n):
        attempts = 0
        while True: # retry until a valid, new perturbation is found
            if max_attempts is not None and attempts >= max_attempts:
                raise RuntimeError(
                    "No valid perturbation found after {} attempts".format(max_attempts)
                )
            attempts += 1

            # multinomial sampling
            outputs = mask_model.generate(**tok, max_length=150, do_sample=True, top_p=1, num_return_sequences=1, eos_token_id=stop_id)
            mt = mask_tokenizer.batch_decode(outputs, skip_special_tokens=False)

            fills = _extract_mask_fills(mt[0])
            if len(fills) < len(masked_positions):
                # the model did not produce a fill for every mask: sample again
                continue

            perturbed_tokens = list(tokens)
            for position, fill in zip(masked_positions, fills):
                perturbed_tokens[position] = fill
            perturbed_text = " ".join(perturbed_tokens)

            # check if this is a valid and new perturbation
            if (perturbed_text == text) or (perturbed_text in past_generations):
                continue
            # check that label didn't flip
            if detector.predict_label([perturbed_text])[0] != prediction_original:
                continue
            break
        past_generations.append(perturbed_text)
    return past_generations

In [11]:
from tqdm import tqdm

In [12]:
N_REPLACE = 1
documents_and_perturbations = []

progress = tqdm(zip(documents, gold_labels), total=len(documents), desc="Generating perturbations")
for document, gold_label in progress:
    # re-seed for every document: the perturbations of a document then do not depend on which
    # slice of documents is processed (and explanations don't have to be regenerated when debugging)
    np.random.seed(42)
    torch.manual_seed(42)
    perturbations = get_pertubed_text(document, 5)
    documents_and_perturbations.append((document, gold_label, perturbations))



Generating perturbations: 100%|██████████| 10/10 [00:47<00:00,  4.78s/it]


In [14]:
documents_and_perturbations

[("I've heard of handyman type people making a living this way untaxed.  They move into a fixer-upper, fix it up while living there, stay over two years and sell.  They can pocket $125k/yr tax free this way assuming they produce that much value in their fixing-up.  (Beware, though, that this will bite you in low social security payments in retirement!)",
  True,
  ["I've heard of handyman type people making a living this way untaxed.  They move into a fixer-upper, fix it up while living there, stay over two years and sell.  They can pocket $125k/yr tax free this way, assuming they produce that much value in their fixing-up.  (Beware, though, that this will bite you in low social security payments in retirement!)",
   "I've heard of handyman type people making a living this way untaxed.  They move into a fixer-upper, fix it up while living there, stay over two years and sell.  They can pocket $125k/yr tax free this way with their big fixer-uppers, assuming they produce that much value i

In [15]:

if DEBUG:
    # only keep instances with cached explanations: the original and every
    # perturbation must already be cached for each explainer
    explainers = [explainer_cls(detector) for explainer_cls in explainer_classes]
    documents_and_perturbations = [
        (original, gl, perturbations)
        for original, gl, perturbations in documents_and_perturbations
        if all([explainer.does_explanation_exist(original) for explainer in explainers])
        and all([
            explainer.does_explanation_exist(perturbation)
            for explainer in explainers
            for perturbation in perturbations
        ])
    ]

In [16]:
# sanity checks: perturbations are unique and keep the label predicted for the original
for original, gl, perturbations in documents_and_perturbations:
    prediction_original = detector.predict_label([original])[0]
    unique_perturbations = set(perturbations)
    assert len(unique_perturbations) == len(perturbations), "Duplicates in perturbations"
    for p in perturbations:
        prediction_perturbed = detector.predict_label([p])[0]
        assert prediction_perturbed == prediction_original, "Labels don't match, do you set seed in the detector?"
    

In [17]:
# generate all explanations
for explainer_class in explainer_classes:
    explainer = explainer_class(detector)
    for original, gl, perturbations in tqdm(documents_and_perturbations):
        # the original document first, then each of its perturbations
        for p in [original, *perturbations]:
            explainer.get_explanation_cached(p)

0it [00:00, ?it/s]
0it [00:00, ?it/s]


In [None]:
import krippendorff


In [None]:
# one experiment per document: the original followed by its perturbations
experiments = [[original, *perturbations] for original, _, perturbations in documents_and_perturbations]


In [None]:
from collections import OrderedDict, defaultdict

In [None]:
def get_tokens_with_pos(explainer, document):
    """Pair every token of ``document`` with its occurrence number.

    The document is tokenized with ``explainer.tokenize``. The n-th occurrence
    (counting from 0) of a token yields ``(token, n)``; results are returned in
    document order.
    """
    seen = {}
    annotated = []
    for token in explainer.tokenize(document):
        occurrence = seen.get(token, 0)
        annotated.append((token, occurrence))
        seen[token] = occurrence + 1
    return annotated

In [None]:
def get_tokens_with_pos_and_id(explainer, document):
    """Pair every token of ``document`` with its occurrence number and its index.

    The document is tokenized with ``explainer.tokenize``. Each token yields
    ``(token, n, index)`` where ``n`` counts earlier occurrences of the same
    token (from 0) and ``index`` is the token's position in the tokenized document.
    """
    seen = {}
    annotated = []
    for index, token in enumerate(explainer.tokenize(document)):
        occurrence = seen.get(token, 0)
        annotated.append((token, occurrence, index))
        seen[token] = occurrence + 1
    return annotated

In [None]:
from sklearn.preprocessing import normalize

In [None]:
from fi_explainer import FI_Explainer

class DummyExplainer(FI_Explainer):
    """Stand-in explainer that returns fixed, document-independent scores.

    Useful for exercising the agreement pipeline without running a real
    explainer: tokenization is a plain whitespace split and the feature
    importance scores are hard-coded.
    """
    def __init__(self, detector):
        # Keep the detector: the results loop reads `explainer.detector`.
        self.detector = detector
        self.splitter = re.compile(r'(%s)|$' % '\\s') # for tokenize()
    def tokenize(self, document):
        """Split ``document`` on whitespace, dropping empty pieces and single spaces."""
        return [s for s in self.splitter.split(document) if s and s!= " "] # as in LIME source
    def get_fi_scores(self, document):
        """Return fixed ``(token index, score)`` pairs for indices 0..9, keyed by label.

        Label 0 gets scores 1..10, label 1 the negated values; ``document`` is ignored.
        """
        return {0: [(i,i+1) for i in range(0, 10)], 1: [(i,-(i+1)) for i in range(0, 10)]}
    def as_list(self, document, label=0):
        """Return ``(token, score)`` pairs; the score is the token index (negated for labels other than 0)."""
        if label == 0:
            return [(w, i) for i,w in enumerate(self.tokenize(document))]
        else:
            return [(w, -i) for i,w in enumerate(self.tokenize(document))]
    # The visualization methods below just delegate to FI_Explainer.
    def get_barplots_HTML(self, document):
        return super().get_barplots_HTML(document)
    def get_highlighted_text_HTML(self, document):
        return super().get_highlighted_text_HTML(document)
    def get_vanilla_visualization_HTML(self, document):
        return super().get_vanilla_visualization_HTML(document)
    
#explainer = DummyExplainer(None)


In [None]:
from scipy.stats import rankdata

In [None]:
def experiment_to_cannonical_form(experiment, binary=True):
    """Arrange the feature importance scores of an experiment in one matrix.

    Uses the module-level ``explainer`` for tokenization and scores.

    Parameters
    ----------
    experiment : list of str
        Documents to compare (an original followed by its perturbations).
    binary : bool
        Currently unused; kept for compatibility with existing callers.

    Returns
    -------
    cannonical_form : numpy.ndarray
        Array of shape ``(len(experiment), number of distinct (token, occurrence) pairs)``.
        A cell is NaN when the token occurrence is absent from that document,
        0 when it is present but has no score, and the score otherwise.
    labels : list of str
        Column labels of the form ``"<token>_<occurrence>"``.
    """
    per_document_tokens = [get_tokens_with_pos(explainer, d) for d in experiment]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the column
    # order (and therefore `labels`) is the same on every run, unlike set().
    global_tokens_with_pos = list(dict.fromkeys(x for xs in per_document_tokens for x in xs))
    # constant-time column lookup instead of list.index() inside the loop
    column_of = {t: col for col, t in enumerate(global_tokens_with_pos)}

    cannonical_form = np.full((len(experiment), len(global_tokens_with_pos)), np.nan)

    for n_experiment, document in enumerate(experiment):
        tokens_with_pos_and_id = get_tokens_with_pos_and_id(explainer, document)
        fi_scores_dict = dict(explainer.get_fi_scores(document)[0]) # [0] only consider explanation towards class "machine" for simplicity
        for token, pos, id_doc in tokens_with_pos_and_id:
            # tokens without a score count as 0; absent tokens stay NaN
            cannonical_form[n_experiment, column_of[(token, pos)]] = fi_scores_dict.get(id_doc, 0)

    labels = [token + "_"+ str(pos) for token, pos in global_tokens_with_pos]

    return cannonical_form, labels

In [None]:
# Krippendorff's alpha per (explainer, experiment)
results = []
for explainer_class in explainer_classes:
    # `explainer` has to stay a module-level name: experiment_to_cannonical_form reads it
    explainer = explainer_class(detector)
    for experiment in experiments:
        cannonical_form, labels = experiment_to_cannonical_form(experiment, binary=True)
        row = (
            explainer.__class__.__name__,
            explainer.detector.__class__.__name__,
            krippendorff.alpha(reliability_data=cannonical_form, level_of_measurement="interval"),
        )
        results.append(row)

In [None]:
# one row per (explainer, experiment) with the corresponding alpha from `results`
df = pd.DataFrame(results, columns=["Explainer", "Detector", "alpha"])

In [None]:
df

In [None]:
# mean alpha over all experiments for each explainer/detector pair
df.groupby(["Explainer", "Detector"]).mean()

In [None]:
from IPython.core.display import HTML

In [None]:
# Show the highlighted-text explanation for every document of the first experiment.
# `explainer` is the module-level instance left over from the last loop that assigned it.
for d in experiments[0]:
    display(HTML(explainer.get_highlighted_text_HTML(d)))