In [1]:
# Debug switches for this notebook.
# DEBUG is only referenced by commented-out code further down.
DEBUG = False
# When > 0, only the first N_DEBUG documents/labels are used; any value <= 0 keeps the full dataset.
N_DEBUG = -1

In [2]:
import pandas as pd
import time
import numpy as np
import os

from gpt2outputdataset.detector_radford import DetectorRadford
from detector_guo import DetectorGuo
from detector_dummy import DetectorDummy
from explainer_wrappers import LIME_Explainer, SHAP_Explainer

In [3]:
import pandas as pd
import numpy as np
import transformers

In [4]:
# Always load the full dataset; restrict it to the first N_DEBUG rows below when debugging.
# NOTE: unpickling can execute arbitrary code - only load a dataset_test.pkl from a trusted source.
test = pd.read_pickle("./dataset_test.pkl")

documents = test["answer"]
gold_labels = test["author"] == "human_answers" # convention: 0: machine, 1: human, see detector.py

if N_DEBUG > 0:
    # explicit positional slicing, independent of the index labels of the pickled frame
    documents = documents.iloc[:N_DEBUG]
    gold_labels = gold_labels.iloc[:N_DEBUG]

# Detectors to evaluate. DetectorDetectGPT is currently disabled; to enable it, import it from
# detectgpt.detector_detectgpt and append it to this list.
detector_classes = [DetectorGuo, DetectorRadford]

# Explanation methods whose continuity is measured.
explainer_classes = [LIME_Explainer, SHAP_Explainer]

In [5]:
import re

In [6]:
import torch
import pickle

In [7]:
# Run the mask-filling model on the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also runs on machines without CUDA.
# NOTE: sampled generations may differ between devices even with identical seeds.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# matches the T5 placeholder tokens <extra_id_0>, <extra_id_1>, ...
pattern = re.compile(r"<extra_id_\d+>")

# T5 checkpoint used to fill masked words; downloaded files are cached in ./.cache
model = "t5-small"
cache_dir = "./.cache"
mask_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model, cache_dir=cache_dir).to(DEVICE)
# the tokenizer stays on the host; only its encoded tensors are moved to DEVICE when it is called
mask_tokenizer = transformers.AutoTokenizer.from_pretrained(model, model_max_length=mask_model.config.n_positions, cache_dir=cache_dir)

In [8]:
# NOTE(review): this value is never read - `columns` is reassigned to
# ["Detector", "Original", "Perturbation"] in a later cell before its first use.
columns=["Detector", "Original", "Prompt", "Edited"]

In [9]:
# generate perturbed samples (similar to how it is done in detectgpt/detector_detectgpt.py)
def get_pertubed_text(detector, text, n=1):
    """Generate `n` distinct perturbations of `text` that keep the detector's predicted label.

    One randomly chosen whitespace-separated token of `text` is masked and refilled by sampling
    from the global T5 model (`mask_model` / `mask_tokenizer`). The same position is used for all
    `n` perturbations. A candidate is accepted only if it differs from `text`, has not been
    produced before, and `detector.predict_label` returns the same label as for `text`.

    Parameters:
        detector: object exposing `predict_label(list_of_str)`; only element [0] of its result is used.
        text: the original document; tokens are obtained with `text.split(' ')`.
        n: number of perturbations to generate.

    Returns:
        list of `n` perturbed documents (strings), in generation order.

    NOTE(review): the retry loop has no upper bound. After 100 failed attempts the fill is a
    random vocabulary token instead of a T5 generation, but the function still only returns
    once a candidate passes all checks.
    """
    tokens = text.split(' ')
    # select 1 (TODO) token in the original document to mask
    mask = np.zeros_like(tokens, dtype=bool)
    mask[np.random.randint(0, len(mask))] = 1 # TODO number of tokens to mask

    # label of the unperturbed document; every accepted perturbation must reproduce it
    prediction_original = detector.predict_label([text])[0]

    past_generations = []
    perturbed_text = text
    # generate n unique perturbations (replace the same masked word(s) with one or more words)
    for _ in range(0,n):
        # the attempt counter is reset for every perturbation
        replacement_attempts = 0
        while True: # do-while: loop until a candidate passes all checks below

            # (re)write the masked positions as <extra_id_0>, <extra_id_1>, ...
            # `tokens` is modified in place, so a previous fill is overwritten here
            i = 0
            for ii, (m, token) in enumerate(zip(mask, tokens)):
                if m:
                    tokens[ii] = "<extra_id_{}>".format(i)
                    i+=1
            i-=1 # index of the last placeholder that was written
            masked_text = ' '.join(tokens)
            # generation stops at the placeholder following the last one used in the input
            stop_id = mask_tokenizer.encode(f"<extra_id_{i+1}>")[0]


            tok = mask_tokenizer(masked_text, return_tensors="pt", padding=True).to(DEVICE)
            # sample a single fill sequence (do_sample=True, top_p=1)
            outputs = mask_model.generate(**tok, max_length=150, do_sample=True, top_p=1, num_return_sequences=1, eos_token_id=stop_id,)
            mt = mask_tokenizer.batch_decode(outputs, skip_special_tokens=False)

            # split the decoded output at the placeholders; pieces equal to "<pad>" are dropped
            fills = [x for x in re.split(r"<extra_id_\d*>", mt[0]) if x != "<pad>"]

            # note: `i` is reused as the loop index here; it is re-initialised at the top of the while loop
            for i, (token, m) in enumerate(zip(tokens, mask)):
                if m:
                    if replacement_attempts < 100:
                        tokens[i] = fills.pop(0).strip()
                    else: # sometimes t5 can't come up with enough unique new perturbations that match the constraints below. use a random word from the vocabulary instead
                        # the seed has to be changed here: according to the original author, detector.predict_label() below sets it,
                        # which would otherwise yield the same random token on every attempt (endless loop)
                        np.random.seed(replacement_attempts)
                        random_token = [np.random.randint(0, mask_tokenizer.vocab_size)]
                        np.random.seed(42) # reset seed just to be sure; reportedly reset by the next detector.predict_label() anyway
                        tokens[i] = mask_tokenizer.batch_decode(random_token, skip_special_tokens=False)[0]
            perturbed_text = " ".join(tokens)

            # check if this is a valid and new perturbation
            if (perturbed_text == text) or (perturbed_text in past_generations):
                replacement_attempts+=1
                continue
            # check that label didn't flip
            if detector.predict_label([perturbed_text])[0] != prediction_original:
                replacement_attempts+=1
                continue
            else:
                break
        past_generations.append(perturbed_text)
    return past_generations

In [10]:
from tqdm import tqdm

In [11]:
# Column layout of ./continuity.csv: detector class name, original document, one perturbation of it.
columns=["Detector", "Original", "Perturbation"]

In [12]:
# Resume from an existing results file. If there is none yet, start from an empty frame and
# create the file right away so that rows appended later land below the header line.
if not os.path.isfile("./continuity.csv"):
    df = pd.DataFrame([], columns=columns)
    # default write mode (not "a"): this is the only place where the header is written
    df.to_csv("./continuity.csv", encoding="UTF-8", index=False)
else:
    df = pd.read_csv("./continuity.csv")


In [13]:
# number of perturbations generated per (detector, document) pair
N_PERTURBATIONS = 5


for detector_class in detector_classes:
    detector = detector_class()
    detector_name = detector.__class__.__name__
    # documents that already have rows in continuity.csv for this detector (looked up once per
    # detector instead of re-scanning the whole frame for every document)
    already_done = set(df.loc[df["Detector"] == detector_name, "Original"])
    for document in tqdm(documents, total=len(documents), desc="Generating perturbations"):
        if document in already_done:
            continue
        # set seeds here so perturbed documents are the same regardless of slice for documents when debugging (and explanations don't have to be regenerated)
        np.random.seed(42)
        torch.manual_seed(42)
        perturbations = get_pertubed_text(detector, document, N_PERTURBATIONS)
        rows = [(detector_name, document, perturbation) for perturbation in perturbations]
        # append all rows of one document in a single write: the resume check above treats a
        # document as finished as soon as one row exists, so a partially written set would
        # never be completed on a later run
        pd.DataFrame(rows, columns=columns).to_csv("./continuity.csv", mode="a", encoding="UTF-8", index=False, header=False)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Generating perturbations: 100%|██████████| 305/305 [00:00<00:00, 1054.40it/s]
Generating perturbations: 100%|██████████| 305/305 [00:00<00:00, 1212.50it/s]


In [14]:

# if DEBUG:
#     # only keep instances with cached explanations
#     explainers = [explainer_calss(detector) for explainer_calss in explainer_classes]
#     documents_and_perturbations = [(original, gl, perturbations) for original, gl, perturbations in documents_and_perturbations if all([explainer.does_explanation_exist(original) for explainer in explainers]) and all([
#     x
#     for xs in [[explainer.does_explanation_exist(perturbation) for perturbation in perturbations] for explainer in explainers]
#     for x in xs
# ])]

In [15]:
# Reload the perturbation table from disk so that rows appended by the generation loop are included.
df = pd.read_csv("./continuity.csv")
# bare expression: shows the frame as the cell output
df

Unnamed: 0,Detector,Original,Perturbation
0,DetectorGuo,I've heard of handyman type people making a li...,I've heard of handyman type people making a li...
1,DetectorGuo,I've heard of handyman type people making a li...,I've heard of handyman type people making a li...
2,DetectorGuo,I've heard of handyman type people making a li...,I've heard of handyman type people making a li...
3,DetectorGuo,I've heard of handyman type people making a li...,I've heard of handyman type people making a li...
4,DetectorGuo,I've heard of handyman type people making a li...,I've heard of handyman type people making a li...
...,...,...,...
3045,DetectorRadford,"In financial markets, the terms ""bid"" and ""ask...","In financial markets, the terms ""bid"" and ""ask..."
3046,DetectorRadford,"In financial markets, the terms ""bid"" and ""ask...","In financial markets, the terms ""bid"" and ""ask..."
3047,DetectorRadford,"In financial markets, the terms ""bid"" and ""ask...","In financial markets, the terms ""bid"" and ""ask..."
3048,DetectorRadford,"In financial markets, the terms ""bid"" and ""ask...","In financial markets, the terms ""bid"" and ""ask..."


In [16]:
# # generate all explanations
# for detector_class in detector_classes:
#     detector = detector_class()
#     print(detector.__class__.__name__)
#     for explainer_class in explainer_classes:
        
#         explainer = explainer_class(detector)
#         print(explainer.__class__.__name__)
#         for original, perturbation in tqdm([(o, p) for o,p in zip(df["Original"], df["Perturbation"]) if not explainer.is_cached(o) or not explainer.is_cached(p) or not all([explainer.is_cached(o, alt="alt_{}_".format(i)) for i in range(1,5)])]):
#             explainer.get_explanation_cached(original)
#             explainer.get_explanation_cached(perturbation)
#             for i in range(1,4):
#                 explainer.get_explanation_cached(original, alt="alt_{}_".format(i)) 

In [17]:
import krippendorff


In [18]:
from collections import OrderedDict, defaultdict

In [19]:
# returns a list of tokens in the document, in an encoding that allows for treating explanations as observations in an experiment

# in this experiment, words are treated as variables and fi-scores as responses in a coding task

# this function therefore transforms a string "An example is an example"
# into a list of pairs:
# [("An", 0), ("example", 0), ("is", 0), ("an", 0), ("example",1)]
# "An example is not an example"
# [("An", 0), ("example", 0), ("is", 0), ("not", 0), ("an", 0), ("example",1)]

# when calculating the reliability measure, replaced and/or missing words are treated as unobserved in the other explanations

def get_tokens_with_pos(explainer, document):
    """Pair every token of `document` with its occurrence index.

    The document is tokenized with `explainer.tokenize`. The n-th appearance of a token
    (counting from 0) is returned as `(token, n)`, so repeated words stay distinguishable.

    Returns:
        list of `(token, occurrence)` tuples in document order.
    """
    occurrences = {}
    pairs = []
    for word in explainer.tokenize(document):
        nth = occurrences.get(word, 0)
        pairs.append((word, nth))
        occurrences[word] = nth + 1
    return pairs

In [20]:
# as above, but also includes the position in the original document given by enumerate(explainer.tokenize(document))
# this is useful for indexing the original explanation (a dict)
def get_tokens_with_pos_and_id(explainer, document):
    """Like `get_tokens_with_pos`, but each entry also carries the token's index in the document.

    Returns:
        list of `(token, occurrence, index)` tuples, where `occurrence` counts earlier
        appearances of the same token and `index` is the position given by
        `enumerate(explainer.tokenize(document))`.
    """
    occurrences = {}
    triples = []
    for index, word in enumerate(explainer.tokenize(document)):
        nth = occurrences.get(word, 0)
        triples.append((word, nth, index))
        occurrences[word] = nth + 1
    return triples

In [21]:
from sklearn.preprocessing import normalize

In [22]:
from scipy.stats import rankdata

In [23]:
def experiment_to_cannonical_form(experiment, explainer):
    """Arrange the explanations of several documents as a (documents x items) matrix.

    Each distinct `(token, occurrence)` pair found in any document of `experiment` is one
    column; each document is one row. A cell holds the feature-importance score the
    explanation assigns to that token; cells for tokens that do not occur in a document
    (or have no score) stay NaN, i.e. unobserved.

    Columns are ordered by first appearance across `experiment`, so the matrix and the
    labels are reproducible between runs.

    NOTE(review): a later cell defines a function with the same name that returns only a
    matrix; that later definition is the one in effect after running the notebook top to bottom.

    Parameters:
        experiment: list of documents (original first, then its perturbations).
        explainer: object providing `tokenize(document)` and `get_fi_scores(document, fill=True)`.

    Returns:
        tuple `(cannonical_form, labels)`: the float matrix and one "<token>_<occurrence>"
        label per column.
    """
    # map every (token, occurrence) pair to its column, in order of first appearance
    column_of = {}
    for document in experiment:
        for token_with_pos in get_tokens_with_pos(explainer, document):
            column_of.setdefault(token_with_pos, len(column_of))

    # what is not filled below is treated as unobserved
    cannonical_form = np.full((len(experiment), len(column_of)), np.nan)

    for n_experiment, document in enumerate(experiment):
        # [0]: only consider the explanation towards class "machine" for simplicity
        fi_scores_dict = dict(explainer.get_fi_scores(document, fill=True)[0])
        for token, pos, id_doc in get_tokens_with_pos_and_id(explainer, document):
            # only fill the cell if the explanation provides a score for this position
            if id_doc in fi_scores_dict:
                cannonical_form[n_experiment, column_of[(token, pos)]] = fi_scores_dict[id_doc]

    labels = [token + "_" + str(pos) for token, pos in column_of]

    return cannonical_form, labels

In [24]:
def experiment_to_cannonical_form(experiment, explainer):
    """Build the reliability matrix for `krippendorff.alpha` from aligned explanations.

    Every document in `experiment` is one row (observation). The columns are the tokens
    shared by all documents: the longest common prefix followed by the longest common
    suffix of their tokenizations. Tokens in between (the replaced part) are dropped.

    The suffix is limited so that prefix and suffix never overlap on the shortest document,
    and both scans are bounded by the shortest document's length, so identical documents
    yield the full token sequence instead of looping forever.

    Parameters:
        experiment: non-empty list of documents (original first, then its perturbations).
        explainer: object providing `tokenize(document)` and `get_fi_scores(document, fill=True)`;
            element [0] of the latter is read as a sequence of (key, score) pairs
            (fi scores towards label "machine").

    Returns:
        2-D numpy array of shape (len(experiment), n_common_tokens).
    """
    # each word is treated as an item, each explanation as an observation
    tokenized = [explainer.tokenize(d) for d in experiment]

    # fi scores towards label machine, one tuple of scores per document
    fi_scores = [tuple(zip(*explainer.get_fi_scores(d, fill=True)[0]))[1] for d in experiment]

    reference = tokenized[0]
    shortest = min(len(tokens) for tokens in tokenized)

    # length of the common part on the left
    n_left = 0
    while n_left < shortest and all(tokens[n_left] == reference[n_left] for tokens in tokenized):
        n_left += 1

    # length of the common part on the right, not reaching into the left part
    n_right = 0
    while n_right < shortest - n_left and all(
        tokens[-1 - n_right] == reference[-1 - n_right] for tokens in tokenized
    ):
        n_right += 1

    # this matrix will be passed to krippendorff.alpha as reliability_data
    left_part = np.vstack([scores[0:n_left] for scores in fi_scores])
    if n_right > 0:
        right_part = np.vstack([scores[-n_right:] for scores in fi_scores])
        cannonical_form = np.hstack([left_part, right_part])
    else: # if no tokens on the right part match
        cannonical_form = left_part

    return cannonical_form

In [25]:
# scratch check: two empty lists compare equal
# NOTE(review): presumably verifies the empty-slice comparison in experiment_to_cannonical_form - confirm or delete this cell
[] == []

True

In [26]:
# One Krippendorff's alpha per (explainer, detector, original document): agreement between the
# explanation of the original and the explanations of its perturbations.
results = []
for detector_class in detector_classes:
    detector = detector_class()
    # rows of the perturbation table that were generated with this detector
    df_detector = df.loc[df["Detector"] == detector.__class__.__name__]
    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)
        print(explainer.__class__.__name__, detector.__class__.__name__)
        grouped = df_detector.groupby("Original")
        for original, perturbations in tqdm(grouped, desc="Calculating agreement"):
            perturbed_documents = perturbations["Perturbation"].tolist()
            # only evaluate documents whose explanations are all cached already
            everything_cached = explainer.is_cached(original) and all(
                [explainer.is_cached(p) for p in perturbed_documents]
            )
            if not everything_cached:
                continue
            cannonical_form = experiment_to_cannonical_form([original] + perturbed_documents, explainer)
            alpha = krippendorff.alpha(reliability_data=cannonical_form, level_of_measurement="interval")
            results.append((
                explainer.__class__.__name__,
                explainer.detector.__class__.__name__,
                alpha,
            ))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


LIME_Explainer DetectorGuo


Calculating agreement:   0%|          | 0/305 [00:00<?, ?it/s]

In [None]:
# One row per evaluated (explainer, detector, original document) with its Krippendorff's alpha.
df_results = pd.DataFrame(results, columns=["Explainer", "Detector", "alpha",]) # an "alpha rerun" column is currently disabled

In [None]:
# bare expression: shows the per-document alpha values as the cell output
df_results

Unnamed: 0,Explainer,Detector,alpha
0,LIME_Explainer,DetectorGuo,0.337869
1,LIME_Explainer,DetectorGuo,0.496127
2,LIME_Explainer,DetectorGuo,0.391412
3,LIME_Explainer,DetectorGuo,0.613203
4,LIME_Explainer,DetectorGuo,0.554270
...,...,...,...
1082,SHAP_Explainer,DetectorRadford,0.927417
1083,SHAP_Explainer,DetectorRadford,0.899557
1084,SHAP_Explainer,DetectorRadford,0.654781
1085,SHAP_Explainer,DetectorRadford,0.898848


In [None]:
# mean alpha for every explainer/detector combination
df_results.groupby(["Explainer", "Detector"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,alpha
Explainer,Detector,Unnamed: 2_level_1
LIME_Explainer,DetectorGuo,0.478115
LIME_Explainer,DetectorRadford,0.439034
SHAP_Explainer,DetectorGuo,0.896235
SHAP_Explainer,DetectorRadford,0.824662


In [None]:
# Mean alpha per explainer, exported as a LaTeX longtable.
# environment / convert_css / clines / hrules are options of Styler.to_latex, not of
# DataFrame.to_latex, so the aggregated frame is exported through its .style accessor.
df_results.set_index(["Explainer", "Detector"]).groupby(["Explainer"]).mean()\
    .style.to_latex(environment="longtable", convert_css=True, clines="all;data", hrules=True, caption="Results aggregated by explainer", label="continuity-results-explainer")

Unnamed: 0_level_0,alpha
Explainer,Unnamed: 1_level_1
LIME_Explainer,0.458575
SHAP_Explainer,0.870427


In [None]:
# `styled_df` was not defined in any cell of this notebook, so this cell failed on a fresh kernel.
# NOTE(review): the definition below is inferred from the caption ("aggregated by explainer") -
# replace it if a different styled table was intended.
styled_df = df_results.set_index(["Explainer", "Detector"]).groupby(["Explainer"]).mean().style
styled_df.to_latex(environment="longtable", convert_css=True, clines="all;data", hrules=True, caption="Results aggregated by explainer", label="continuity-results-explainer")

In [None]:
from IPython.core.display import HTML