In [None]:
# CSV cache of (original, edited) document pairs: read below if it exists,
# otherwise created with a header row and appended to by the generation loop.
RESULTS_PATH = "./results/contrastivity/pairs.csv"

In [None]:
import pandas as pd
import numpy as np

# Accumulator for result rows. NOTE(review): it is reset to an empty list again
# right before the experiment loop, so this early initialisation looks redundant.
results = []


In [None]:
import transformers

from tqdm import tqdm
import itertools
import os

import torch
import re
import sklearn
import spacy


# spaCy English pipeline; used below to split documents into token prefixes
# (the generation prompts) and to count tokens for the plots.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Test split of the dataset.
# NOTE(review): unpickling executes arbitrary code; only load files you trust.
test = pd.read_pickle("./dataset_test.pkl")

# Texts that are perturbed and explained below.
documents = test["answer"]

# Boolean gold labels (True where the author is "human_answers").
gold_labels = test["author"] == "human_answers" # convention: 0: machine, 1: human, see detector.py


from detector_detectgpt import DetectorDetectGPT
from detector_radford import DetectorRadford
from detector_guo import DetectorGuo
# Detector classes; each one is instantiated without arguments in the loops below.
detector_classes = [DetectorRadford,DetectorGuo, DetectorDetectGPT]

from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Random_Explainer
# Random_Explainer appears 10 times so the experiment runs it with 10 different seeds per detector.
explainer_classes = [SHAP_Explainer,LIME_Explainer] + [Random_Explainer] * 10

In [None]:
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# when the model is moved to the device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Matches placeholders of the form "<extra_id_N>".
# NOTE(review): not referenced by any cell visible in this notebook.
pattern = re.compile(r"<extra_id_\d+>")

# model used to generate perturbations
base_model_name="facebook/opt-350m"

In [None]:
# Causal language model that writes the continuations of the truncated documents;
# weights are cached in ./.cache and the model is moved to DEVICE.
base_model = transformers.AutoModelForCausalLM.from_pretrained(base_model_name, cache_dir="./.cache").to(DEVICE)
# Matching tokenizer. padding_side='left' — presumably so that, in a padded batch,
# the generated tokens directly follow each prompt; TODO confirm.
base_tokenizer = transformers.AutoTokenizer.from_pretrained(base_model_name, cache_dir="./.cache", padding_side='left',)

# Generate/Load documents
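Load previously generated document pairs from `RESULTS_PATH` if the file exists; otherwise create it with a header row and generate the pairs below.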

In [None]:
# Column layout of the pairs CSV: detector name, original document, kept prompt
# (common prefix), edited document, and the detector's labels for both documents.
columns=["Detector", "Original", "Prompt", "Edited", "f(Original)", "f(Edited)"]

In [None]:
if os.path.isfile(RESULTS_PATH):
    # Resume: reuse the pairs that a previous run already generated.
    df = pd.read_csv(RESULTS_PATH)
else:
    # First run: the target directory may not exist yet, in which case
    # to_csv would raise. Create it before writing the header row.
    results_dir = os.path.dirname(RESULTS_PATH)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    df = pd.DataFrame([], columns=columns)
    # write headers (mode != "a")
    df.to_csv(RESULTS_PATH, encoding="UTF-8", index=False)
df

In [None]:
# Non-null entries per column for each detector among the cached rows.
df.groupby("Detector").count()

In [None]:
# NOTE(review): the frame was already displayed by the loading cell and has not
# been modified since; this cell shows the same content again.
df

In [None]:
# For every detector and every test document, look for an edited version of the
# document that flips the detector's label:
#   1. cut the last 1, 2, ... spaCy tokens off the document to obtain prompts,
#   2. let the base model sample continuations of roughly the original length,
#   3. keep the first continuation whose predicted label differs from the original's.
# One row per (detector, document) is appended to RESULTS_PATH; when no flip is
# found the row carries empty Prompt/Edited/f(Edited) fields so the document is
# still marked as processed. All documents are considered, whatever their
# original prediction is.
n_generations_per_lenght = 5 # sampled continuations per prompt length
batch_size = 20 # adjust so it fits in your GPU memory

for detector_class in detector_classes:
    detector = detector_class()
    detector_name = detector.__class__.__name__
    # Documents already stored for this detector in the CSV cache. The match must
    # be exact: a substring test would also skip short documents that merely
    # occur inside a longer, already processed one.
    cached_originals = set(df.loc[df["Detector"] == detector_name, "Original"])

    for document in tqdm(documents, desc="Generating perturbations"):
        if document in cached_originals: # check if document is in csv, if yes, skip
            continue

        # Re-seed per document so the sampled continuations do not depend on how
        # many documents were skipped or processed before this one.
        np.random.seed(42)
        torch.manual_seed(42)

        doc = nlp(document)
        # Length of the document in base-model tokens; used to bound the generation length.
        n_tokens_original = len(base_tokenizer(document).input_ids)

        # Prompts, longest first. Iterate over spaCy tokens (len(doc)), not over
        # characters (len(document)): the latter only adds empty prefixes.
        substrings = [''.join(token.text_with_ws for token in doc[:-i]) for i in range(1, len(doc))]
        substrings = [substring for substring in substrings if substring != ""] # at least one token
        substrings = list(itertools.chain.from_iterable(itertools.repeat(s, n_generations_per_lenght) for s in substrings))

        prediction_original = detector.predict_label([document])[0]
        row = (detector_name, document, None, None, prediction_original, None) # to mark the document in the csv cache if no pair is found
        for batch in sklearn.utils.gen_batches(len(substrings), batch_size):
            encoded = base_tokenizer(substrings[batch], return_tensors="pt", padding=True).to(DEVICE)

            outputs = base_model.generate(
                **encoded,
                min_length=n_tokens_original-5,
                max_length=n_tokens_original+5,
                do_sample=True,
                pad_token_id=base_tokenizer.eos_token_id,
                eos_token_id=base_tokenizer.eos_token_id,
            )
            decoded = base_tokenizer.batch_decode(outputs, skip_special_tokens=True)

            predictions = detector.predict_label(decoded)
            flipped = predictions != prediction_original

            if any(flipped):
                # argmax over a boolean mask returns the position of its first True entry
                first_new_label = flipped.argmax(axis=0)
                assert decoded[first_new_label] != document
                assert predictions[first_new_label] != prediction_original
                #                     original  prompt                              first instance that flips label
                row = (detector_name, document, substrings[batch][first_new_label], decoded[first_new_label], prediction_original, predictions[first_new_label])
                break
        # Append immediately so an interrupted run can be resumed from the CSV.
        pd.DataFrame([row], columns=columns).to_csv(RESULTS_PATH, mode="a", encoding="UTF-8", index=False, header=False)


In [None]:
# Reload the CSV so the frame also contains the rows appended by the generation loop.
df = pd.read_csv(RESULTS_PATH)

In [None]:
# Rows written for documents without a label-flipping edit have empty
# Prompt/Edited/f(Edited) fields; dropping rows with missing values removes them.
df = df.dropna()
df

In [None]:
# Number of remaining pairs per detector and label of the edited document.
df.groupby(["Detector", "f(Edited)"]).count()

## Plot metrics of perturbed documents

In [None]:
def _without_prompt(text, prompt):
    """Return `text` with the shared leading `prompt` removed.

    Only the prefix is stripped. Removing every occurrence of the prompt (as a
    plain `str.replace` does) would also delete later repetitions of a short
    prompt and under-count the tokens of the remaining part. If `text` does not
    start with `prompt`, the first occurrence is removed instead.
    """
    if text.startswith(prompt):
        return text[len(prompt):]
    return text.replace(prompt, "", 1)


def _n_tokens(text):
    """Number of spaCy tokens in `text`."""
    return len(nlp(text))


# Token counts (spaCy tokens) of the generated part, the cut part, the prompt
# and the whole original document.
df["Tokens New Part"] = df.apply(lambda row: _n_tokens(_without_prompt(row["Edited"], row["Prompt"])), axis=1)
df["Tokens Original Part"] = df.apply(lambda row: _n_tokens(_without_prompt(row["Original"], row["Prompt"])), axis=1)
df["Tokens Prompt"] = df.apply(lambda row: _n_tokens(row["Prompt"]), axis=1)
df["Tokens Document"] = df.apply(lambda row: _n_tokens(row["Original"]), axis=1)

In [None]:
# Tokens of the original that were not kept as prompt, as an absolute count and
# as a share of the document length.
tokens_edited = df["Tokens Document"] - df["Tokens Prompt"]
df["Tokens Edited"] = tokens_edited
df["Tokens Edited Proportion"] = tokens_edited / df["Tokens Document"]

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from matplotlib.backends.backend_pgf import FigureCanvasPgf
# Route 'pdf' output through the PGF canvas and render all text with LaTeX (pdflatex).
matplotlib.backend_bases.register_backend('pdf', FigureCanvasPgf)
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'sans-serif',
    "font.sans-serif": "Helvetica",
    'text.usetex': True,
    'pgf.rcfonts': False,
})
# Four-colour "husl" palette for the plots below.
sns.set_palette(sns.color_palette("husl", 4))

In [None]:
# Histogram of the edited proportion per detector, one panel per flip direction.
f, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(5,5))

# (axis, label of the original document, panel title). The titles are raw
# strings because they contain LaTeX: "\O" is not a valid escape sequence in a
# normal string literal and triggers a warning on current Python versions.
panels = (
    (ax1, 1, r"$f(d_i)$ = human → machine = $f(d_i^\Omega)$"),
    (ax2, 0, r"$f(d_i)$ = machine → human = $f(d_i^\Omega)$"),
)
for ax, original_label, title in panels:
    df[df["f(Original)"] == original_label].groupby(["Detector"])["Tokens Edited Proportion"].plot.hist(alpha=0.5, bins=20, ax=ax)
    ax.legend(loc="upper right")
    ax.set_title(title)

ax2.set_xlabel("")
f.tight_layout()
# Make sure the output directory exists before saving the figure.
os.makedirs('./figures', exist_ok=True)
plt.savefig('./figures/contrastivity-label-flip.pgf')
plt.show()

In [None]:
# Summary statistics of the absolute number of edited tokens per detector,
# restricted to pairs where at most half of the document was edited.
df[df["Tokens Edited Proportion"] <= 0.50].groupby("Detector")["Tokens Edited"].describe()

In [None]:
# Same restriction (at most half of the document edited), summarising the
# edited proportion per detector.
df[df["Tokens Edited Proportion"] <= 0.50].groupby("Detector")["Tokens Edited Proportion"].describe()

# Run Experiment/Calculate Scores

In [None]:
# The experiment only uses pairs where at most half of the document was edited
# (the same 0.5 threshold as in the summary cells above).
df = df[df["Tokens Edited Proportion"] <= 0.5]

In [None]:
import krippendorff

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Column layout of the experiment results; one tuple per (explainer, detector, pair)
# is built in this order by the experiment loop.
# NOTE(review): this overwrites the `columns` list of the pairs CSV defined further
# up. Re-running the generation cell after this one would use the wrong column list.
columns = ["Explainer",
          "Detector",
          "f(d) → f(m)",
          "Mean FI Original Cut Part",
          "Mean FI Edited New Part",
          "Mean FI Original Common Part",
          "Mean FI Edited Common Part",
          "[Score 1] cos sim",
          "[Score 1] Krippendorff",
          "[Score 4]",
          "[Score 2]",
          "[Score 3]",
          "Tokens Edited Proportion"]

In [None]:
# For every detector, explainer and (original, edited) pair, compare the feature
# importance (FI) scores of the part both documents share (the prompt) with the
# scores of the part that differs, and append one result tuple per pair.
results = []

for detector_class in detector_classes:
    detector = detector_class()
    detector_name = detector.__class__.__name__
    detector_pairs = df[df["Detector"] == detector_name]
    random_run = 1
    for explainer_class in explainer_classes:
        if explainer_class == Random_Explainer:
            explainer = explainer_class(detector, seed=random_run) # 10 random runs per detector (Random_Explainer is 10 times in explainer_classes)
            random_run += 1
        else:
            explainer = explainer_class(detector)
        explainer_name = explainer.__class__.__name__

        # Columns are read by name, so adding or reordering columns of `df`
        # cannot silently shift the values.
        for _, pair in tqdm(list(detector_pairs.iterrows()), desc="Gathering results"):
            original = pair["Original"]
            prompt = pair["Prompt"]
            edited = pair["Edited"]
            target_label = pair["f(Edited)"]
            tokens_edited_proportion = pair["Tokens Edited Proportion"]

            # The generation strategy above uses spaCy's tokenizer (where punctuation chars end up in separate tokens).
            # The explanation methods have their own tokenizers and FI scores are reported with respect to those tokens.
            # For LIME, multiple punctuation chars can end up in the same token, e.g.: "!)" is one token, "!" too, "#+#++..,++##" as well.
            # This is problematic when comparing explanations, i.e. the tokenized prompt is not
            # necessarily a common prefix of both tokenized documents.
            #
            # For this experiment, it is only important to separate the "prompt", which is the common part between
            # the two documents, from the generated/cut parts. Determining the bounds of the prompt has to be done
            # AFTER tokenization. Strategy: shrink the prefix until both documents share it and each document has
            # at least one token left after it.
            tokens_original = explainer.tokenize(original)
            tokens_edited = explainer.tokenize(edited)
            # Starting point; this would suffice for SHAP, but not for LIME (because it sometimes collapses punctuation chars into one token).
            n_prompt_tokens = len(explainer.tokenize(prompt))
            # `n_prompt_tokens > 0` stops the search at the empty prefix instead of running into negative slice bounds.
            while n_prompt_tokens > 0 and (
                tokens_original[:n_prompt_tokens] != tokens_edited[:n_prompt_tokens]
                or len(tokens_original[n_prompt_tokens:]) == 0
                or len(tokens_edited[n_prompt_tokens:]) == 0
            ):
                n_prompt_tokens -= 1
            # If the first word is followed by a punctuation char, e.g., "Example! Is a sentence." and the prompt is just
            # "Example", the loop above ends at 0. This happens as LIME tokenizes this to ['Example!', 'Is', 'a', 'sentence.'].
            n_prompt_tokens = max(n_prompt_tokens, 1)
            assert tokens_original[1:n_prompt_tokens] == tokens_edited[1:n_prompt_tokens]

            # FI scores towards the label of the edited document; setting fill=True returns all features (not just the top_k).
            # Each document is explained once, so its common and differing parts always stem from the same explanation.
            scores_original = explainer.get_fi_scores(original, fill=True)[target_label]
            scores_edited = explainer.get_fi_scores(edited, fill=True)[target_label]

            # cut (original) / new (edited) parts after the common prefix
            fi_scores_exp_original_cut_part = np.array([fi_score for _, fi_score in scores_original[n_prompt_tokens:]])
            fi_scores_exp_edited_new_part = np.array([fi_score for _, fi_score in scores_edited[n_prompt_tokens:]])

            # common part
            fi_scores_exp_original_common_part = np.array([fi_score for _, fi_score in scores_original[:n_prompt_tokens]])
            fi_scores_exp_edited_common_part = np.array([fi_score for _, fi_score in scores_edited[:n_prompt_tokens]])

            # Check non-emptiness BEFORE computing statistics: max() of an empty
            # array raises and mean() of an empty array is NaN.
            assert len(fi_scores_exp_original_cut_part) > 0
            assert len(fi_scores_exp_edited_new_part) > 0
            assert len(fi_scores_exp_original_common_part) > 0
            assert len(fi_scores_exp_edited_common_part) > 0

            mean_original_cut_part = np.mean(fi_scores_exp_original_cut_part)
            mean_edited_new_part = np.mean(fi_scores_exp_edited_new_part)
            mean_original_common_part = np.mean(fi_scores_exp_original_common_part)
            mean_edited_common_part = np.mean(fi_scores_exp_edited_common_part)

            score_2_new_and_cut_parts_opposite = mean_edited_new_part > mean_original_cut_part
            score_3_new_average_higher_than_common = mean_edited_new_part > mean_edited_common_part
            score_4_max_fi_in_new_part = fi_scores_exp_edited_new_part.max() > fi_scores_exp_edited_common_part.max()

            # Two "raters" (original and edited explanation) scoring the common tokens.
            paired_common_scores = np.vstack([fi_scores_exp_original_common_part, fi_scores_exp_edited_common_part])
            if np.unique(paired_common_scores).size <= 1:
                # The krippendorff library requires items to not all be the same;
                # identical constant explanations agree perfectly.
                score_1_k_alpha = 1
            else:
                score_1_k_alpha = krippendorff.alpha(paired_common_scores, level_of_measurement="interval")

            score_1_cos_sim = cosine_similarity(
                fi_scores_exp_original_common_part.reshape(1, -1),
                fi_scores_exp_edited_common_part.reshape(1, -1),
            )[0, 0]

            # build result row (same order as `columns`)
            row = (
                explainer_name,
                detector_name,
                "m → h" if target_label else "h → m",
                mean_original_cut_part,
                mean_edited_new_part,
                mean_original_common_part,
                mean_edited_common_part,
                score_1_cos_sim,
                score_1_k_alpha,
                score_4_max_fi_in_new_part,
                score_2_new_and_cut_parts_opposite,
                score_3_new_average_higher_than_common,
                tokens_edited_proportion,
            )
            results.append(row)



In [None]:
# Collect the result tuples in a frame and shorten the explainer class names by
# dropping the "_Explainer" part.
dff = pd.DataFrame(results, columns=columns)
dff = dff.assign(Explainer=dff["Explainer"].str.replace("_Explainer", ""))


In [None]:
from scipy.stats.mstats import ttest_1samp

In [None]:
# Score columns of the result frame that are aggregated for the exported tables.
export_cols = [
          "[Score 1] cos sim",
          "[Score 1] Krippendorff",          
          "[Score 2]",
          "[Score 3]",
          "[Score 4]",
]

In [None]:
def df_to_latex(styled_df, caption="TODO", label="TODO", environment="table"):
    """Render a styled table as LaTeX source.

    Args:
        styled_df: object offering `format` and `to_latex` (a pandas Styler in
            this notebook). `format` is called on it with a precision of 3.
        caption: caption forwarded to `to_latex`.
        label: label forwarded to `to_latex`.
        environment: LaTeX environment forwarded to `to_latex`.

    Returns:
        Whatever `to_latex` returns (the LaTeX source of the table).
    """
    formatted = styled_df.format(precision=3)
    latex_options = {
        "environment": environment,
        "convert_css": True,
        "clines": "all;data",
        "hrules": True,
        "caption": caption,
        "label": label,
    }
    return formatted.to_latex(**latex_options)

In [None]:
# Display the per-pair result frame.
dff

In [None]:
# style_bold = dff.set_index(["Explainer", "Detector","f(d) → f(m)"])[export_cols].groupby(["Detector","Explainer",  "f(d) → f(m)"]).mean().style.highlight_max(props="font-weight: bold;")

In [None]:
def style_dff(dff, groupby):
    """Aggregate the per-pair scores and return them as a pandas Styler.

    Args:
        dff: result frame with the columns "Explainer", "Detector",
            "f(d) → f(m)" and every column listed in the notebook-level
            `export_cols`.
        groupby: name(s) of the index levels to group by, taken from the
            three columns named above.

    Returns:
        Styler over the group means, with the group size as first column "n",
        the score columns renamed to "(1) cosine", "(1) Krippendorff", "(2)"
        and "(3)", rows sorted by "(2)" in descending order and the
        "[Score 4]" column hidden.
    """
    key_columns = ["Explainer", "Detector", "f(d) → f(m)"]
    aggregations = {
        "[Score 1] cos sim": ["count", "mean"],
        "[Score 1] Krippendorff": "mean",
        "[Score 2]": "mean",
        "[Score 3]": "mean",
        "[Score 4]": "mean",
    }
    summary = dff.set_index(key_columns)[export_cols].groupby(groupby).agg(aggregations)

    # The count of the cosine column doubles as the group size "n".
    count_column = ('[Score 1] cos sim', 'count')
    summary["n"] = summary[count_column]
    summary = summary.drop([count_column], axis=1)

    # "n" was appended last: move it to the front, then flatten the two-level
    # column header to its first level.
    current_order = list(summary.columns)
    summary = summary[current_order[-1:] + current_order[:-1]]
    summary.columns = [top_level for top_level, _ in summary.columns]

    short_names = {
        '[Score 1] cos sim': '(1) cosine',
        '[Score 1] Krippendorff': '(1) Krippendorff',
        '[Score 2]': '(2)',
        '[Score 3]': '(3)',
    }
    summary = summary.rename(columns=short_names)
    summary = summary.sort_values(by=["(2)"], ascending=False)

    styled = summary.style
    styled.hide("[Score 4]", axis=1)
    return styled


In [None]:
# Table 1: scores per detector, explainer and direction of the label change.
p_results_detector_level = style_dff(dff, groupby=["Detector","Explainer",  "f(d) → f(m)"])
display(p_results_detector_level)
# Table 2: scores per explainer over all detectors; "n", "[Score 4]" and "(1) cosine" are hidden.
p_results_aggregate_level = style_dff(dff, groupby=["Explainer"]).hide(subset=["n","[Score 4]", "(1) cosine"], axis=1)
display(p_results_aggregate_level)

In [None]:
# Write both result tables as LaTeX files.
# Each entry: (styled table, keyword arguments for df_to_latex, target file).
latex_exports = [
    (
        p_results_detector_level,
        dict(
            label="contrastivity-explainer-detector-direction",
            caption="Scores per detector, explainer and direction of change. For scores 2 and 3, higher values are better",
            environment="longtable",
        ),
        "figures/tables_contrastivity_detector.tex",
    ),
    (
        p_results_aggregate_level,
        dict(
            label="contrastivity-aggregate",
            caption="Average scores per method. For scores 2 and 3, higher values are better",
        ),
        "figures/tables_contrastivity_explainer.tex",
    ),
]
for styled_table, latex_kwargs, target_file in latex_exports:
    out = df_to_latex(styled_table, **latex_kwargs)
    with open(target_file, "w", encoding="UTF-8") as text_file:
        text_file.write(out)

In [None]:
# Display the aggregate (per-explainer) table once more.
p_results_aggregate_level