In [None]:
# Debug switch: when True (and N_DEBUG > 0), only the first N_DEBUG documents are processed.
DEBUG = False
# Number of documents kept when DEBUG is enabled.
N_DEBUG = 100

In [None]:
import pandas as pd
import numpy as np
import transformers
import torch
import re
import sklearn
import spacy
# spaCy English pipeline; used in the perturbation loop to split each document into tokens.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Load the test split from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source.
test = pd.read_pickle("./dataset_test.pkl")

# Keep only the rows whose author is "human_answers", so `documents` holds human-written answers only.
test = test[test["author"] == "human_answers"]
print("len(test_human)", len(test))
documents = test["answer"]
# After the filter above every entry of this mask is True.
gold_labels = test["author"] == "human_answers" # convention: 0: machine, 1: human, see detector.py




#from gpt2outputdataset.detector_radford import DetectorRadford
#from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
# Candidate detector classes; the cells shown in this notebook instantiate a detector directly instead of iterating this list.
detector_classes = [DetectorGuo]#,DetectorRadford,DetectorDetectGPT]

from explainer_wrappers import LIME_Explainer, SHAP_Explainer
# Explainer classes that the evaluation loop instantiates one after another.
explainer_classes = [LIME_Explainer,SHAP_Explainer]

In [None]:
from gpt2outputdataset.detector_radford import DetectorRadford


# Detector used by the perturbation and explanation cells below.
# NOTE(review): `detector_classes` lists DetectorGuo, while this cell uses DetectorRadford -- confirm which one is intended.
detector = DetectorRadford()

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the notebook still executes.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Matches placeholder tokens of the form "<extra_id_N>"; not used by the cells shown in this notebook.
pattern = re.compile(r"<extra_id_\d+>")

# Causal language model used to generate continuations of truncated documents.
base_model_name="facebook/opt-350m"
openai_model = False

# Directory handed to from_pretrained as cache_dir for the model and tokenizer.
cache_dir="./.cache"
# mask_model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model, cache_dir=cache_dir).to(DEVICE)
# mask_tokenizer = transformers.AutoTokenizer.from_pretrained(model, model_max_length=mask_model.config.n_positions, cache_dir=cache_dir)#.to(DEVICE)
# Sampling switches; both are off and are not read by the cells shown in this notebook.
do_top_k= False
do_top_p= False

In [None]:
# Extra from_pretrained arguments that depend on which base model was selected.
base_model_kwargs = {}
uses_gpt_j = 'gpt-j' in base_model_name
if uses_gpt_j or 'neox' in base_model_name:
    # These checkpoints are loaded in half precision.
    base_model_kwargs["torch_dtype"] = torch.float16
if uses_gpt_j:
    base_model_kwargs["revision"] = 'float16'
base_model = transformers.AutoModelForCausalLM.from_pretrained(
    base_model_name,
    **base_model_kwargs,
    cache_dir=cache_dir,
).to(DEVICE)


# Tokenizer options: registers one additional special token.
optional_tok_kwargs = dict(additional_special_tokens=["<|loris|>"])
# if "facebook/opt-" in base_model_name:
#     print("Using non-fast tokenizer for OPT")
#     optional_tok_kwargs['fast'] = False

# The tokenizer pads on the left (padding_side='left').
base_tokenizer = transformers.AutoTokenizer.from_pretrained(
    base_model_name,
    **optional_tok_kwargs,
    cache_dir=cache_dir,
    padding_side='left',
)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably needed so batched prompts can be padded when the tokenizer has no pad token -- confirm.
base_tokenizer.pad_token_id = base_tokenizer.eos_token_id # TODO WHY???


In [None]:
# Placeholder; the perturbation loop overwrites this with the detector's labels for each generated batch.
predictions = None

In [None]:
# Placeholder; the perturbation loop overwrites this with the decoded texts of each generated batch.
decoded = None

In [None]:
# In debug mode, cut the workload down to the first N_DEBUG documents.
# list(...) accepts both the original pandas Series and an already-truncated list, so this cell
# can be re-run safely (calling `.iloc` on the list left by a previous run raised AttributeError).
if DEBUG and N_DEBUG > 0:
    documents = list(documents)[:N_DEBUG]

In [None]:
from tqdm import tqdm

In [None]:
# For every document, search for a generated continuation of a truncated prefix that changes the detector's label.
# Each hit is stored as (original document, prompt prefix, generated text that flipped the label).
label_flip_pairs = []

batch_size = 25       # number of prompts passed to one base_model.generate call
sampling_kwargs = {}  # extra sampling options forwarded to generate (none at the moment)

for document in tqdm(documents, desc="Generating perturbations"):
    # Re-seed for every document so its generations do not depend on the documents processed before it.
    np.random.seed(42)
    torch.manual_seed(42)
    doc = nlp(document)
    # Only the token count is needed (to bound the generation length), so no tensors or device transfer are required.
    n_tokens_original = len(base_tokenizer(document).input_ids)
    original_prediction = detector.predict_label([document])[0]
    # Prefixes of the document with 1, 2, ... trailing spaCy tokens removed (longest prefix first, down to one token).
    # The bound must be the token count len(doc): iterating up to the character count len(document)
    # appended an empty prompt for every surplus step, wasting generation batches on empty prompts.
    substrings = [''.join(token.text_with_ws for token in doc[:-i]) for i in range(1, len(doc))]

    for batch in sklearn.utils.gen_batches(len(substrings), batch_size):
        prompts = substrings[batch]
        encoded = base_tokenizer(prompts, return_tensors="pt", padding=True).to(DEVICE)

        # Generate texts of roughly the original length (+/- 5 tokens); the lower bound is clamped
        # because very short documents would otherwise request a negative minimum length.
        outputs = base_model.generate(
            **encoded,
            min_length=max(n_tokens_original - 5, 0),
            max_length=n_tokens_original + 5,
            do_sample=True,
            **sampling_kwargs,
            pad_token_id=base_tokenizer.eos_token_id,
            eos_token_id=base_tokenizer.eos_token_id,
        )
        decoded = base_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        # asarray keeps the element-wise comparison below valid even if predict_label returns a plain list.
        predictions = np.asarray(detector.predict_label(decoded))

        flipped = predictions != original_prediction
        if flipped.any():
            # argmax on a boolean array yields the index of the first True entry.
            first_new_label = int(flipped.argmax(axis=0))

            #                        original  prompt                     first instance that flips label
            label_flip_pairs.append((document, prompts[first_new_label], decoded[first_new_label]))
            # One flip per document is enough; move on to the next document.
            break


In [None]:
# Display the collected (original document, prompt, label-flipping text) triples.
label_flip_pairs

In [None]:
# Explainer used for the highlighted-text preview; the evaluation loop later rebinds `explainer` for each explainer class.
explainer = SHAP_Explainer(detector)

In [None]:
from IPython.core.display import HTML

In [None]:
# Render the explainer's highlighting for one label-flip triple: the original text, the shared prompt,
# and the generated text that flipped the label.
# Index 2 is the preferred example; fall back to the last available triple when fewer than three flips
# were found (e.g. in a short DEBUG run) instead of failing with an IndexError.
if label_flip_pairs:
    example_index = min(2, len(label_flip_pairs) - 1)
    original, prompt, label_flip_example = label_flip_pairs[example_index]
    print(prompt)
    for example_text in (original, prompt, label_flip_example):
        display(HTML(explainer.get_highlighted_text_HTML(example_text)))
else:
    print("No label flips were found; nothing to display.")

In [None]:
# Basic idea: check that exp(original)[original - prompt] is substantially different from exp(label_flip_example)[label_flip_example - prompt],
# i.e. that the new/changed section is attributed to the opposite label (TODO hard coded: "machine") more often.

In [None]:
# Display the collected label-flip triples again (same expression as the earlier display cell).
label_flip_pairs

In [None]:
import krippendorff

In [19]:
# For every explainer and every label-flip pair, compare the explanation of the original document
# with the explanation of the generated text that flipped the label.
# Each result row is (explainer name, Krippendorff's alpha on the prompt, mean score of the original
# remainder, mean score of the generated remainder).
results = []
for explainer_class in explainer_classes:
    explainer = explainer_class(detector)
    for original, prompt, label_flip_example in label_flip_pairs:
        # Number of leading explainer tokens that belong to the shared prompt.
        n_prompt_tokens = len(explainer.tokenize(prompt))

        # Explain each text once and slice the result, instead of recomputing the explanation for every slice.
        # setting fill=True returns all features (not just the top_k)
        # NOTE(review): the slicing assumes the returned scores are ordered by token position -- confirm.
        fi_original = explainer.get_fi_scores(original, fill=True)[0] # TODO hard coded: "machine"
        fi_label_flip = explainer.get_fi_scores(label_flip_example, fill=True)[0]

        # Part after the prompt: the original ending vs. the generated ending.
        exp_original_minus_prompt = fi_original[n_prompt_tokens:]
        exp_label_flip_minus_prompt = fi_label_flip[n_prompt_tokens:]

        # Part both texts have in common: the prompt itself.
        exp_original_prompt_only = fi_original[0:n_prompt_tokens]
        exp_label_flip_prompt_only = fi_label_flip[0:n_prompt_tokens]

        mean_fi_machine_label_flip = np.mean([fi_score for _, fi_score in exp_label_flip_minus_prompt])
        mean_fi_original = np.mean([fi_score for _, fi_score in exp_original_minus_prompt])

        # Reliability data for Krippendorff's alpha: one row per "rater" (the two explanations) and
        # one column per prompt token. Only the scores are stacked; stacking the (feature, score)
        # pairs themselves produced a (2n, 2) matrix that mixed feature identifiers with scores.
        cannonical_form = np.vstack([
            [fi_score for _, fi_score in exp_original_prompt_only],
            [fi_score for _, fi_score in exp_label_flip_prompt_only],
        ])
        k_alpha = krippendorff.alpha(cannonical_form, level_of_measurement="interval")
        results.append((explainer.__class__.__name__, k_alpha, mean_fi_original, mean_fi_machine_label_flip,))


regen If your counterparty sent money to a correspondent account at another bank, then it is completely up to the other bank what to do with the money. If the wire transfer completed, then the account is not closed. If I were your business partner, I would immediately contact the bank to which the transfer was made and explain the situation and hopefully they will transfer the money back. Whenever a wire transfer is made, the recipients name, address, and account number are included.   I would not trust the money transfer company.
I don't want to send the money to a new bank, so this means I cannot be the target of a bank subpoena. The bank is on paper with the account number and all the details of the transfer, and


In [None]:
# Collect the measurements into a table: one row per (explainer, label-flip pair).
df = pd.DataFrame(
    data=results,
    columns=[
        "Explainer",
        "Krippendorf's alpha on prompt",
        "E[original - prompt]",
        "E[lf - prompt]",
    ],
)

In [None]:
# Relative difference between the two means, in percent of the original mean:
# (E[original - prompt] - E[lf - prompt]) / E[original - prompt] * 100.
# NOTE(review): the value is positive when the label-flip mean is lower than the original mean, and
# the sign inverts when the original mean is negative -- confirm this is the intended reading.
df["Change original -> lf [%]"] = (
    df["E[original - prompt]"]
    .sub(df["E[lf - prompt]"])
    .div(df["E[original - prompt]"])
    .mul(100)
)
df

In [None]:
# Average every metric column per explainer class.
df.groupby("Explainer").mean()

In [None]:
# Scratch check: mean score of the generated remainder.
# Relies on `exp_label_flip_minus_prompt` as left behind by the last iteration of the evaluation loop.
np.mean([fi_score for _, fi_score in exp_label_flip_minus_prompt])

In [None]:
# Scratch check: raw entries of the generated remainder from the last iteration of the evaluation loop.
exp_label_flip_minus_prompt