In [None]:
DEBUG = False
N_DEBUG = 50
N_RANDOM_RUNS = 100
OUTPUT_DIR = "./pointing_game_datasets/"

In [None]:
from detector_radford import DetectorRadford
from detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
detector_classes = [DetectorGuo, DetectorRadford,DetectorDetectGPT]

from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer, Random_Explainer
explainer_classes =  [Random_Explainer] * N_RANDOM_RUNS +[LIME_Explainer, SHAP_Explainer, Anchor_Explainer ]
explainer_classes

In [None]:
import pointing_game_util

In [None]:
import os
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('sentencizer')

In [None]:
test = pd.read_pickle("./dataset_test.pkl")
test = test # always load the full dataset! (np.random.shuffle(tokenized_sentences)). slice the actual hybrid_documents if debugging!
documents = test["answer"]
gold_labels = test["author"] == "human_answers" # convention: 0: machine, 1: human, see detector.py

In [None]:
# pd.Series((len(list(nlp(d).sents)) for d in documents)).describe()

In [None]:
# keep a copy of the hybrid documents for the assert in the loop
ref_assert_hybrid_documents, _, _ = pointing_game_util.hybrid(documents.to_list(), gold_labels.to_list(), word_tokenizer=LIME_Explainer(DetectorRadford()).tokenize)


In [None]:
# pd.Series((len(list(nlp(d).sents)) for d in ref_assert_hybrid_documents)).describe()

In [None]:
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

In [10]:
results = []
for detector_class in detector_classes:  
    detector = detector_class()
    predictions_hybrid = None
    for i, explainer_class in enumerate(explainer_classes):
        if explainer_class == Random_Explainer:
            explainer = explainer_class(detector,seed=i)
        else:
            explainer = explainer_class(detector)
        if explainer_class == Anchor_Explainer and detector_class == DetectorDetectGPT:
            continue

        hybrid_documents, tokenized_hybrid_documents, GT = pointing_game_util.hybrid(documents.to_list(), gold_labels.to_list(), word_tokenizer=explainer.tokenize)
        assert (all([a==b for a,b in zip(ref_assert_hybrid_documents,hybrid_documents)])), "(full) Hybrid documents don't match" # tokenized_hybrid_documents differ by design to make the calculation of the pointing game accuracy easier
        
        # write csv (for debug purposes)
        pd.DataFrame(zip(hybrid_documents, tokenized_hybrid_documents, GT), columns=["Hybrid Document", "Tokenized Hybrid Document", "GT"]).to_csv(os.path.join(OUTPUT_DIR, detector.__class__.__name__+ "-"+explainer.__class__.__name__+".csv"),index=False)
        if predictions_hybrid is None:
            predictions_hybrid = detector.predict_label(hybrid_documents) # the assert above guarantees that the documents are the same across explainers

        pointing_game_scores = pointing_game_util.get_pointing_game_scores(hybrid_documents, explainer, predictions_hybrid, GT)
        results.extend([(explainer.__class__.__name__, detector.__class__.__name__, pointing_game_score) for pointing_game_score in pointing_game_scores])
    
        




Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 6181.41it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 6823.36it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 7847.61it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 7481.45it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8061.85it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 7091.56it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8312.72it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8189.34it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8446.02it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8718.16it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8443.76it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 7931.45it/s]
Generating explanations: 100%|██████████| 271/271 [00:00<00:00, 8436.11it/s]

In [11]:
from scipy.stats.mstats import ttest_1samp

In [12]:
# some formatting functions
def get_p_asterisks(group):
    val =  group.mean()
   # print(group.name)
    _, p = ttest_1samp(group, popmean=0.5)
    if p <= 0.001:
        return "{:.2f}\\textsuperscript{{***}}".format(val)
    if p <= 0.01:
        return "{:.2f}\\textsuperscript{{**}}".format(val)
    if p <= 0.05:
        return "{:.2f}\\textsuperscript{{*}}".format(val)
    if p > 0.05:
        return "{:.2f}\\textsuperscript{{ns}}".format(val)

def highlight_max(col):
    vals = col.str.extract(r"(-*\d*\.\d*)").astype(float).values.flatten()
    max_val = vals.max()
    return ["font-weight: bold;" if c == max_val else "" for c in vals ]
def df_to_latex(styled_df, caption="TODO", label="TODO"):
    return styled_df.to_latex(environment="table", convert_css=True, clines="all;data", hrules=True, caption=caption, label=label)

In [13]:
def style_dff(dff, groupby):
    dff["Explainer"] = dff["Explainer"].str.replace("_Explainer", "")
    p_results = dff.groupby(groupby).agg(
    {
          "Pointing Game Scores": get_p_asterisks,
        }
    )
    p_results = p_results.style.apply(highlight_max, subset=p_results.columns)
    return p_results


In [14]:
dff = pd.DataFrame(results, columns=["Explainer", "Detector", "Pointing Game Scores"])
dff

Unnamed: 0,Explainer,Detector,Pointing Game Scores
0,Random_Explainer,DetectorGuo,0.0
1,Random_Explainer,DetectorGuo,0.0
2,Random_Explainer,DetectorGuo,1.0
3,Random_Explainer,DetectorGuo,0.0
4,Random_Explainer,DetectorGuo,0.0
...,...,...,...
83463,SHAP_Explainer,DetectorDetectGPT,0.0
83464,SHAP_Explainer,DetectorDetectGPT,0.0
83465,SHAP_Explainer,DetectorDetectGPT,0.0
83466,SHAP_Explainer,DetectorDetectGPT,0.0


In [15]:
p_results_aggregate_level = style_dff(dff, groupby=["Explainer"])
display(p_results_aggregate_level)

Unnamed: 0_level_0,Pointing Game Scores
Explainer,Unnamed: 1_level_1
Anchor,0.59\textsuperscript{***}
LIME,0.55\textsuperscript{**}
Random,0.57\textsuperscript{***}
SHAP,0.69\textsuperscript{***}


In [16]:
p_results = style_dff(dff, groupby=["Explainer", "Detector"])
display(p_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pointing Game Scores
Explainer,Detector,Unnamed: 2_level_1
Anchor,DetectorGuo,0.68\textsuperscript{***}
Anchor,DetectorRadford,0.49\textsuperscript{ns}
LIME,DetectorDetectGPT,0.63\textsuperscript{***}
LIME,DetectorGuo,0.61\textsuperscript{***}
LIME,DetectorRadford,0.40\textsuperscript{**}
Random,DetectorDetectGPT,0.58\textsuperscript{***}
Random,DetectorGuo,0.64\textsuperscript{***}
Random,DetectorRadford,0.48\textsuperscript{***}
SHAP,DetectorDetectGPT,0.63\textsuperscript{***}
SHAP,DetectorGuo,0.81\textsuperscript{***}


In [17]:
out = df_to_latex(p_results, label="pointing-game-explainer-detector", caption="Scores per detector and explainer")
out += (df_to_latex(p_results_aggregate_level, label="pointing-game-explainer", caption="Scores per explainer"))
with open("figures/tables_pointing_game.tex", "w", encoding="UTF-8") as text_file:
    text_file.write(out)

In [18]:

# for hybrid_document in hybrid_documents:

#     explainer = LIME_Explainer(detector)
#     explainer.get_explanation_cached(hybrid_document).show_in_notebook()

#     explainer = SHAP_Explainer(detector)
#     shap.text_plot(explainer.get_explanation_cached(hybrid_document))
        

In [41]:
import numpy as np
pd.read_csv("pointing_game_datasets/DetectorDetectGPT-Random_Explainer.csv")["GT"].agg(lambda x: x.count("False")).mean()

using <function <lambda> at 0x00000256EC2C3560> in Series.agg cannot aggregate and has been deprecated. Use Series.transform to keep behavior unchanged.


70.63468634686348

In [42]:
pd.read_csv("pointing_game_datasets/DetectorDetectGPT-Random_Explainer.csv")["GT"].agg(lambda x: x.count("True")).mean()

using <function <lambda> at 0x00000256EBCBFEC0> in Series.agg cannot aggregate and has been deprecated. Use Series.transform to keep behavior unchanged.


52.67158671586716