In [None]:
# When True, a detector whose result CSV already exists under RESULTS_PATH is skipped instead of being re-run
USE_CACHE = True
RESULTS_PATH = "./results/token_removal" # path to cache results to so that plots can be adjusted without re-running the experiment

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm


In [None]:
# Load the evaluation split.
# NOTE(review): read_pickle unpickles arbitrary Python objects; only point this at a trusted file.
test = pd.read_pickle("./dataset_test.pkl")


# Texts to explain, and a boolean label per text: True where the author column is "human_answers"
documents = test["answer"]
gold_labels = test["author"] == "human_answers"

from detector_radford import DetectorRadford
from detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
# Detector classes (not instances); the experiment loop instantiates them one at a time
detector_classes = [DetectorRadford, DetectorGuo, DetectorDetectGPT]

from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Random_Explainer, Anchor_Explainer
# Explanation methods under test. Random_Explainer is not listed here: run_experiment adds it separately as the baseline
explainer_classes = [Anchor_Explainer, LIME_Explainer, SHAP_Explainer]

In [None]:
from abc import ABC, abstractmethod
class Experiment(ABC):
    """Base class for experiments that modify one document based on feature-importance (fi) scores.

    Subclasses decide which tokens to remove by implementing ``modified_document``;
    this class stores the fi scores and provides the actual token removal.
    """
    @abstractmethod
    def __init__(self, explainer, document, gt):
        """Stores the inputs and fetches the fi scores of the document from the explainer.

        Args:
            explainer: Explanation method wrapper, also used for (un)tokenizing the document
            document: The original document
            gt: Ground truth of the document, truthy for human-written text
        """
        self.document = document
        self.gt = gt
        self.explainer = explainer
        # get_fi_scores returns one score list per class: index 0 is kept as "machine", index 1 as "human"
        self.fi_scores_machine  = self.explainer.get_fi_scores(document, fill=True)[0]
        self.fi_scores_human = self.explainer.get_fi_scores(document, fill=True)[1]
    @property
    @abstractmethod
    def modified_document(self):
        """The document after the experiment's modification (implemented by subclasses)."""
        pass
    # as in Arras et al. 2016: "The target class is the true document class,[...]"
    def get_fi_scores_target(self):
        """Returns the fi scores of the target class, i.e. the class given by the ground truth."""
        if self.gt:
            return self.fi_scores_human
        else:
            return self.fi_scores_machine
    def remove_features(self, id_fi_tuples_list, mask=True):
        """Removes the given tokens from the document.

        Args:
            id_fi_tuples_list: List of (token index, fi score) tuples; only the indices are used
            mask: If True the tokens are replaced with the detector's pad token, otherwise they are dropped

        Returns:
            The untokenized modified document
        """
        tokenized_modified_document = self.explainer.tokenize(self.document)
        pad_token = self.explainer.detector.get_pad_token()

        ### assert right words are removed
        # NOTE(review): this check runs before any token has been replaced, so both sides of the zip
        # are the unmodified tokenization and it can only trigger for a pad token that is already
        # part of the document. It therefore does not validate the masking below. Kept unchanged on
        # purpose; moving it after the masking step should be confirmed against the explainers first.
        top_words = [word for word, fi in self.explainer.as_list(self.explainer.get_explanation_cached(self.document), label=self.gt) if fi > 0]
        for a, b in zip(tokenized_modified_document, self.explainer.tokenize(self.document)):
            if a == pad_token:
                assert b in top_words, "Masking strategy faulty"
        ###

        # An empty list (e.g. only positive or only negative fi scores) simply removes nothing.
        # A set gives constant-time lookups in the delete branch below.
        ids_tokens_to_remove = {idx for idx, _ in id_fi_tuples_list}
        if mask:
            # replace with pad token (all detectors support partial input)
            for t in ids_tokens_to_remove:
                tokenized_modified_document[t] = pad_token
        else:
            tokenized_modified_document = [t for i, t in enumerate(tokenized_modified_document) if i not in ids_tokens_to_remove]
        return self.explainer.untokenize(tokenized_modified_document)

In [None]:
class Experiment_Delete_n_Highest(Experiment):
    """Removes up to n tokens with the highest fi score for the target class."""
    def __init__(self, explainer, document, gt, include_zero_scores=False, n=10, mask=True):
        super().__init__(explainer, document, gt)
        self.include_zero_scores = include_zero_scores
        self.mask = mask
        self.n = n
    @property
    def modified_document(self):
        """The document without its top-n tokens, or None if no token qualifies for removal.

        The n highest-scoring tokens are taken first and filtered afterwards, so fewer
        than n tokens are removed when some of them have a negative (or excluded zero) score.
        """
        # nothing to remove: the document stays untouched
        if self.n == 0:
            return self.document

        ranked = sorted(self.get_fi_scores_target(), key=lambda pair: pair[1], reverse=True)
        candidates = ranked[0:self.n]

        def qualifies(fi):
            # negative scores never qualify; zero scores only if include_zero_scores is set
            return (fi >= 0) and (self.include_zero_scores or (fi != 0))

        selected = [(idx, fi) for idx, fi in candidates if qualifies(fi)]
        if len(selected) == 0:
            return None
        return self.remove_features(selected, mask=self.mask)


In [None]:
# Memo of detector outputs: the original document is the same in all experiments,
# so each (detector class, document) pair only has to be predicted once
prediction_cache = {}
def prediction_cached(detector, document):
    """Returns the detector's prediction for one document, reusing an earlier result if available.

    Args:
        detector: Instance of the detector to use; entries are keyed by its class name
        document: The document to predict

    Returns:
        The first (and only) entry of detector.predict_proba([document])
    """
    cache_key = (detector.__class__.__name__, document)
    if cache_key in prediction_cache:
        return prediction_cache[cache_key]
    prediction = detector.predict_proba([document])[0]
    prediction_cache[cache_key] = prediction
    return prediction


In [None]:
def get_results_row(document, gt,  explainer, experiment_class, detector, n, mask):
    """Runs the experiment at the specified n for one detector and explanation method. Used to create a pandas df

    Args:
        document: The original document
        gt: Ground truth of the original document
        explainer: Instance of the explanation method to use
        experiment_class: A class that extends Experiment
        detector: Instance of the detector to use
        n: How many tokens to remove
        mask: If False, tokens are deleted instead of masked with the tokenizer's mask/pad token 

    Returns:
        A row of the df as a list of 10 entries: explainer name, detector name, n,
        p_machine_original, p_human_original, y_original, p_machine_modified,
        p_human_modified, y_modified, gt. If the experiment yields no modified
        document, everything after n is None.
    """
    experiment = experiment_class(explainer, document, gt, n=n, mask=mask)
    # modified_document is a property: for the experiments in this notebook every access rebuilds
    # the document, so it is read exactly once and reused below
    modified_document = experiment.modified_document

    explainer_name = explainer.__class__.__name__
    detector_name = explainer.detector.__class__.__name__

    # None means the experiment had nothing to remove (Experiment_Delete_n_Highest returns it when
    # no token passes its score filter). Skip the predictions to speed up the calculation.
    if modified_document is None:
        return [explainer_name, detector_name, n] + [None] * 7

    p_machine_original, p_human_original  = prediction_cached(detector, experiment.document)
    y_original = p_human_original >= p_machine_original

    p_machine_modified, p_human_modified = prediction_cached(detector, modified_document)
    y_modified = p_human_modified >= p_machine_modified
    return [
                    explainer_name,
                    detector_name,
                    n,
                    p_machine_original,
                    p_human_original,
                    y_original,
                    p_machine_modified,
                    p_human_modified,
                    y_modified,
                    gt]

In [None]:
def run_experiment(experiment_class, detector, n=10, mask=True):
    """Builds the results df for one detector at one n, covering all explanation methods plus the random baseline

    Args:
        experiment_class: A class that extends Experiment
        detector:  Instance of the detector to use
        n: How many tokens to remove. Defaults to 10.
        mask: If False, tokens are deleted instead of masked with the tokenizer's mask/pad token. Defaults to True.

    Returns:
        A pandas df with one get_results_row row per (explainer, document) and the
        derived columns p_target_original, p_target_modified and drop_target
    """
    columns=["Explainer", "Detector", "n", "p_machine_original", "p_human_original", "y_original", "p_machine_modified", "p_human_modified", "y_modified", "gt"]
    rows = []

    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)
        # LIME is skipped when more tokens are requested than it has features
        if isinstance(explainer, LIME_Explainer) and n > explainer.num_features:
            print("skip lime",n, explainer.num_features)
            continue
        rows.extend(
            get_results_row(document, gt, explainer, experiment_class, detector, n, mask)
            for document, gt in zip(documents, gold_labels)
        )

    # random baseline: five runs, each with its own seed, labelled by run number
    for run in range(0,5):
        random_explainer = Random_Explainer(detector, seed=42-run)
        for document, gt in zip(documents, gold_labels):
            row = get_results_row(document, gt, random_explainer, experiment_class, detector, n, mask)
            row[0] = "Random Run "+str(run)
            rows.append(row)

    df = pd.DataFrame(rows, columns=columns)

    # probability assigned to the target class (= ground truth) before and after the modification
    def target_original(row):
        return row["p_human_original"] if row["gt"] else row["p_machine_original"]

    def target_modified(row):
        return row["p_human_modified"] if row["gt"] else row["p_machine_modified"]

    df["p_target_original"] = df.apply(target_original, axis=1)
    df["p_target_modified"] = df.apply(target_modified, axis=1)
    df["drop_target"] = df["p_target_original"] - df["p_target_modified"]

    return df

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def get_acc_by_initial_prediction_right_wrong(df, n, mask):
    """Calculates, per explainer, the accuracy after modification for the two cases "initially right" and "initially wrong"

    Args:
        df: Output from run_experiment
        n: How many tokens to remove
        mask: If False, tokens are deleted instead of masked with the tokenizer's mask/pad token 

    Returns:
        A df with one row per explainer and two accuracy scores (None where a case has no documents)
    """
    def accuracy_after_modification(rows):
        # accuracy of the prediction on the modified documents against the ground truth
        if len(rows) > 0:
            return accuracy_score(rows["gt"].astype(int), rows["y_modified"].astype(int))
        return None

    records = []
    for explainer_name, group in df.groupby("Explainer"):
        # split by whether the prediction on the original document matched the ground truth
        initially_right = group[group["y_original"] == group["gt"]]
        initially_wrong = group[group["y_original"] != group["gt"]]

        acc_right = accuracy_after_modification(initially_right)
        acc_wrong = accuracy_after_modification(initially_wrong)
        records.append((explainer_name, acc_right, acc_wrong, n, mask))
    return pd.DataFrame(records, columns=["Explainer", "acc_initial_right", "acc_initial_wrong", "n", "mask"])

In [None]:
import os

In [None]:
# Numbers of tokens to remove: every value from 0 to 10, then coarser steps up to 50
n_deleted_words = list(range(0,10+1)) + [20,30,40,50]

# to_csv does not create missing directories, so make sure the cache folder exists before the long run
os.makedirs(RESULTS_PATH, exist_ok=True)

for detector_class in detector_classes:
    # Derive the cache path from the class itself, so that a cached detector is skipped
    # without instantiating it first
    detector_name = detector_class.__name__
    path = os.path.join(RESULTS_PATH, detector_name+".csv")
    if USE_CACHE and os.path.isfile(path):
        continue

    detector = detector_class()

    # One results df per n, first with masking and then with deletion of the tokens
    dfs_at_n_mask = [(run_experiment(Experiment_Delete_n_Highest,detector, n=n, mask=True), n, True) for n in tqdm(n_deleted_words, desc="Running mask experiment for "+detector_name)]
    dfs_at_n_delete = [(run_experiment(Experiment_Delete_n_Highest,detector, mask=False, n=n), n, False) for n in tqdm(n_deleted_words, desc="Running delete experiment for "+detector_name)]
    dfs_at_n = dfs_at_n_mask + dfs_at_n_delete
    # rows of documents that were skipped by get_results_row contain None and are dropped here
    dfs_at_n = [(df.dropna(),n,mask) for df,n, mask in dfs_at_n]
    df_accuracy_scores = pd.concat([get_acc_by_initial_prediction_right_wrong(df,n, mask) for df, n, mask in dfs_at_n])

    # All "Random Run i" rows share one name; they stay separate rows in the CSV
    df_accuracy_scores.loc[df_accuracy_scores["Explainer"].str.startswith("Random"), "Explainer"] = "Random"

    df_accuracy_scores["Explainer"] = df_accuracy_scores["Explainer"].str.replace("_Explainer","")
    df_accuracy_scores["Detector"] = detector_name

    df_accuracy_scores.to_csv(path, encoding="UTF-8", index=False)


## Plot
Accuracy after token removal, plotted as in Arras et al. (2016).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def plot_acc(detector_name, df_accuracy_scores, mask):
    """Plots accuracy over n for one detector, with one panel per initial-prediction case

    Args:
        detector_name: Name of the detector, used in the figure title
        df_accuracy_scores: Accuracy scores with the columns n, Explainer, mask, acc_initial_right and acc_initial_wrong
        mask: Which rows to plot: True for the masking results, False for the deletion results
    """
    # rows with n > 10 are only kept for SHAP and Random
    keep = (df_accuracy_scores["n"] <= 10) | (df_accuracy_scores["Explainer"] == "SHAP") | (df_accuracy_scores["Explainer"] == "Random")
    df_accuracy_scores = df_accuracy_scores[keep]

    f, (ax_right, ax_wrong) = plt.subplots(1,2, sharey=True, figsize=(18,4))
    # dashed reference line at an accuracy of 0.5
    for ax in (ax_right, ax_wrong):
        ax.axhline(y=0.5,linestyle='--', lw=0.5, color="red")

    selected = df_accuracy_scores[df_accuracy_scores["mask"]== mask]
    for column, ax in (("acc_initial_right", ax_right), ("acc_initial_wrong", ax_wrong)):
        sns.lineplot(data=selected, x="n", y=column, hue="Explainer", ax=ax)

    for ax in (ax_wrong, ax_right):
        ax.set_xlim(0,30)
    plt.suptitle(("Mask" if mask else "Delete") + " " + detector_name)
    f.tight_layout()

In [None]:
# Masking results: one figure per cached detector CSV (sorted for a stable figure order)
for filename in sorted(os.listdir(RESULTS_PATH)):
    detector_name, extension = os.path.splitext(filename)
    if extension != ".csv":
        # listdir also returns entries that are not result files (e.g. checkpoint folders or OS metadata)
        continue
    df_accuracy_scores = pd.read_csv(os.path.join(RESULTS_PATH, filename))
    plot_acc(detector_name, df_accuracy_scores, mask=True)

In [None]:
# Deletion results: one figure per cached detector CSV (sorted for a stable figure order)
for filename in sorted(os.listdir(RESULTS_PATH)):
    detector_name, extension = os.path.splitext(filename)
    if extension != ".csv":
        # listdir also returns entries that are not result files (e.g. checkpoint folders or OS metadata)
        continue
    df_accuracy_scores = pd.read_csv(os.path.join(RESULTS_PATH, filename))
    plot_acc(detector_name, df_accuracy_scores, mask=False)

In [None]:
# Combine the cached results of all detectors. Only *.csv entries are read, since listdir also
# returns anything else stored in the folder; sorting keeps the row order independent of the filesystem.
df_accuracy_scores = pd.concat([
    pd.read_csv(os.path.join(RESULTS_PATH, f))
    for f in sorted(os.listdir(RESULTS_PATH))
    if f.endswith(".csv")
]).reset_index(drop=True)
# rows with n > 10 are only kept for SHAP and Random
df_accuracy_scores = df_accuracy_scores[(df_accuracy_scores["n"] <= 10) | (df_accuracy_scores["Explainer"] == "SHAP") | (df_accuracy_scores["Explainer"] == "Random") ]


In [None]:
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.backends.backend_pgf import FigureCanvasPgf
# Use the PGF canvas for 'pdf' output
matplotlib.backend_bases.register_backend("pdf", FigureCanvasPgf)

# Figure text is typeset by LaTeX (pdflatex) in a sans-serif Helvetica setup
matplotlib.rcParams["pgf.texsystem"] = "pdflatex"
matplotlib.rcParams["font.family"] = "sans-serif"
matplotlib.rcParams["font.sans-serif"] = "Helvetica"
matplotlib.rcParams["text.usetex"] = True
matplotlib.rcParams["pgf.rcfonts"] = False

In [None]:
# savefig does not create missing directories
os.makedirs("./figures", exist_ok=True)

# One figure per removal mode (delete, then mask), pooled over all detectors.
# Left panel: documents that were initially classified right, right panel: initially wrong.
for mask in [False, True]:
    scores_for_mode = df_accuracy_scores[df_accuracy_scores["mask"]== mask]
    f, (ax_right, ax_wrong) = plt.subplots(1,2, sharey=True, figsize=(10,5))
    sns.lineplot(data=scores_for_mode, x="n", y="acc_initial_right", hue="Explainer", ax=ax_right, markers=True, marker='o', errorbar="sd")
    sns.lineplot(data=scores_for_mode, x="n", y="acc_initial_wrong", hue="Explainer", ax=ax_wrong, markers=True, marker='o',errorbar="sd")
    # dashed reference line at an accuracy of 0.5
    ax_right.axhline(y=0.5,linestyle='--', lw=0.5, color="black")
    ax_wrong.axhline(y=0.5,linestyle='--', lw=0.5, color="black")
    ax_right.set_ylabel("Accuracy")
    plt.tight_layout()
    plt.savefig('./figures/token_removal_mask.pgf' if mask else './figures/token_removal_delete.pgf')
    plt.show()