In [1]:
# Debug switch: when True, the main loop keeps only the first N_DEBUG hybrid documents.
DEBUG = True
# Number of hybrid documents kept per detector/explainer pair when DEBUG is on.
N_DEBUG = 163

# Directory that receives one CSV per detector/explainer pair.
OUTPUT_DIR = "./pointing_game_datasets/"

In [2]:
from gpt2outputdataset.detector_radford import DetectorRadford
from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
# Detector classes to evaluate. DetectorDetectGPT is imported above but
# currently left out of the list (see the trailing comment).
detector_classes = [DetectorGuo, DetectorRadford]#,DetectorDetectGPT]

from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer
# Explainer classes to evaluate; the main loop instantiates each one once per detector.
explainer_classes = [Anchor_Explainer, LIME_Explainer, SHAP_Explainer ]

In [3]:
import pointing_game_util

In [4]:
import os
import pandas as pd
import spacy
# spaCy pipeline used below to count the sentences of each document.
nlp = spacy.load("en_core_web_lg")
# Add the 'sentencizer' component to the loaded pipeline.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x202027a20d0>

In [5]:
# SECURITY: pd.read_pickle unpickles the file and can execute arbitrary code;
# only load dataset files that come from a trusted source.
#
# Always load the full dataset here (np.random.shuffle(tokenized_sentences)) --
# presumably because building the hybrid documents depends on the complete input.
# When debugging, slice the resulting hybrid documents instead of this frame.
test = pd.read_pickle("./dataset_test.pkl")
documents = test["answer"]
gold_labels = test["author"] == "human_answers" # convention: 0: machine, 1: human, see detector.py

In [6]:
# Summary statistics of the number of sentences per original document.
pd.Series([sum(1 for _ in nlp(text).sents) for text in documents]).describe()

count    305.000000
mean       5.708197
std        2.109681
min        1.000000
25%        4.000000
50%        5.000000
75%        7.000000
max       16.000000
dtype: float64

In [7]:
# Reference copy of the hybrid documents: the main loop asserts that the
# documents it regenerates for every explainer are equal to these.
ref_assert_hybrid_documents, _, _ = pointing_game_util.hybrid(
    documents.to_list(),
    gold_labels.to_list(),
    word_tokenizer=LIME_Explainer(DetectorRadford()).tokenize,
)


In [8]:
# Summary statistics of the number of sentences per hybrid document.
pd.Series([sum(1 for _ in nlp(text).sents) for text in ref_assert_hybrid_documents]).describe()

count    271.000000
mean       6.409594
std        1.163544
min        3.000000
25%        6.000000
50%        6.000000
75%        7.000000
max       15.000000
dtype: float64

In [9]:
# Create the output directory, including missing parent directories.
# exist_ok=True replaces the separate exists() check and its check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [10]:
# Collects one (explainer name, detector name, pointing game score) tuple per score.
results = []
for detector_class in detector_classes:
    detector = detector_class()
    detector_name = detector.__class__.__name__
    print("Initialized " + detector_name)
    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)
        explainer_name = explainer.__class__.__name__
        print("Initialized " + explainer_name)

        print("Indexing hybrid documents for " + explainer_name)
        hybrid_documents, tokenized_hybrid_documents, GT = pointing_game_util.hybrid(documents.to_list(), gold_labels.to_list(), word_tokenizer=explainer.tokenize)
        # The (full) hybrid documents must be the same for every explainer.
        # zip() stops at the shorter input, so the lengths are compared explicitly;
        # otherwise a truncated or extended document list would pass unnoticed.
        # tokenized_hybrid_documents differ by design to make the calculation of the pointing game accuracy easier
        assert len(ref_assert_hybrid_documents) == len(hybrid_documents) and all(
            a == b for a, b in zip(ref_assert_hybrid_documents, hybrid_documents)
        ), "(full) Hybrid documents don't match"

        if DEBUG:
            # Slice only after the full-set assert above, so the check always covers all documents.
            hybrid_documents = hybrid_documents[0:N_DEBUG]
            tokenized_hybrid_documents = tokenized_hybrid_documents[0:N_DEBUG]
            GT = GT[0:N_DEBUG]

        # write csv (for debug purposes)
        csv_path = os.path.join(OUTPUT_DIR, detector_name + "-" + explainer_name + ".csv")
        pd.DataFrame(zip(hybrid_documents, tokenized_hybrid_documents, GT), columns=["Hybrid Document", "Tokenized Hybrid Document", "GT"]).to_csv(csv_path, index=False)

        print("Predicting hybrid documents")
        predictions_hybrid = detector.predict_label(hybrid_documents)

        print("Obtaining explanations on hybrid documents and calculating pointing game accuracy")
        pointing_game_scores = pointing_game_util.get_pointing_game_scores(hybrid_documents, explainer, predictions_hybrid, GT)
        results.extend((explainer_name, detector_name, pointing_game_score) for pointing_game_score in pointing_game_scores)
    
        




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initialized DetectorGuo
Initialized Anchor_Explainer
Indexing hybrid documents for Anchor_Explainer
Predicting hybrid documents
Generating explanations on hybrid documents and calculating pointing game accuracy


100%|██████████| 163/163 [00:04<00:00, 32.96it/s]


Initialized LIME_Explainer
Indexing hybrid documents for LIME_Explainer
Predicting hybrid documents
Generating explanations on hybrid documents and calculating pointing game accuracy


Generating explanations: 100%|██████████| 163/163 [00:01<00:00, 89.15it/s] 


Initialized SHAP_Explainer
Indexing hybrid documents for SHAP_Explainer
Predicting hybrid documents
Generating explanations on hybrid documents and calculating pointing game accuracy


Generating explanations: 100%|██████████| 163/163 [00:01<00:00, 118.93it/s]


Initialized DetectorRadford
Initialized Anchor_Explainer
Indexing hybrid documents for Anchor_Explainer
Predicting hybrid documents
Generating explanations on hybrid documents and calculating pointing game accuracy


100%|██████████| 163/163 [00:06<00:00, 24.52it/s]


Initialized LIME_Explainer
Indexing hybrid documents for LIME_Explainer
Predicting hybrid documents
Generating explanations on hybrid documents and calculating pointing game accuracy


Generating explanations: 100%|██████████| 163/163 [00:02<00:00, 58.22it/s]


Initialized SHAP_Explainer
Indexing hybrid documents for SHAP_Explainer
Predicting hybrid documents
Generating explanations on hybrid documents and calculating pointing game accuracy


Generating explanations: 100%|██████████| 163/163 [00:01<00:00, 114.63it/s]


In [11]:
from scipy.stats.mstats import ttest_1samp

In [12]:
# some formatting functions
def get_p_asterisks(group):
    """Format the mean of ``group`` with a LaTeX significance marker.

    A one-sample t-test of ``group`` against a population mean of 0.5 is run
    and the resulting p-value is mapped to a superscript marker:
    ``***`` (p <= 0.001), ``**`` (p <= 0.01), ``*`` (p <= 0.05), ``ns`` (p > 0.05).

    Args:
        group: numeric values of one group (e.g. a pandas Series handed in by
            ``groupby(...).agg``).

    Returns:
        str: the mean formatted with two decimals, followed by
        ``\\textsuperscript{<marker>}``. If none of the comparisons holds
        (e.g. the p-value is NaN or masked because the test is undefined for
        the group), only the formatted mean is returned instead of ``None``.
    """
    val = group.mean()
    _, p = ttest_1samp(group, popmean=0.5)
    # Thresholds are checked from strictest to loosest; the first match wins.
    for threshold, marker in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p <= threshold:
            return "{:.2f}\\textsuperscript{{{}}}".format(val, marker)
    if p > 0.05:
        return "{:.2f}\\textsuperscript{{ns}}".format(val)
    # Every comparison was false: p is not a usable number. Still return a string
    # so downstream string processing of the aggregated column keeps working.
    return "{:.2f}".format(val)

def highlight_max(col):
    """Return CSS that bolds the cell(s) holding the largest leading number.

    Each cell of ``col`` is expected to be a string containing a decimal number
    (e.g. ``"0.73\\textsuperscript{***}"``); the first match of the pattern is
    parsed as a float. Cells without a match are treated as missing and are
    never highlighted.

    Args:
        col: pandas Series of strings (one column of the styled frame).

    Returns:
        list[str]: one CSS string per cell, ``"font-weight: bold;"`` for every
        cell equal to the column maximum and ``""`` otherwise.
    """
    # expand=False yields a Series (single capture group) instead of a one-column frame.
    vals = col.str.extract(r"(-*\d*\.\d*)", expand=False).astype(float)
    # Series.max() skips NaN, so one unparsable cell no longer disables highlighting
    # for the whole column; an empty column yields NaN instead of raising.
    max_val = vals.max()
    return ["font-weight: bold;" if c == max_val else "" for c in vals]
def df_to_latex(styled_df, caption="TODO", label="TODO"):
    """Render ``styled_df`` as LaTeX by calling its ``to_latex`` method.

    Args:
        styled_df: object exposing ``to_latex`` (the notebook passes the styled
            frames returned by ``style_dff``).
        caption: caption forwarded to ``to_latex``.
        label: label forwarded to ``to_latex``.

    Returns:
        Whatever ``styled_df.to_latex`` returns for a ``table`` environment with
        CSS conversion, ``"all;data"`` clines and hrules enabled.
    """
    latex_options = dict(
        environment="table",
        convert_css=True,
        clines="all;data",
        hrules=True,
        caption=caption,
        label=label,
    )
    return styled_df.to_latex(**latex_options)

In [13]:
def style_dff(dff, groupby):
    """Aggregate pointing game scores per group and style the result.

    The ``"_Explainer"`` suffix is stripped from the ``"Explainer"`` column,
    the frame is grouped by ``groupby``, and each group's
    ``"Pointing Game Scores"`` are reduced with ``get_p_asterisks``. The column
    maximum is then highlighted via ``highlight_max``.

    Args:
        dff: DataFrame with at least the columns ``"Explainer"`` and
            ``"Pointing Game Scores"`` plus every column named in ``groupby``.
            It is not modified.
        groupby: column name(s) to group by.

    Returns:
        The styled aggregation (``DataFrame.style`` with ``highlight_max`` applied).
    """
    # assign() works on a copy, so the caller's frame keeps its original
    # explainer names; regex=False makes the suffix removal a literal replace.
    cleaned = dff.assign(
        Explainer=dff["Explainer"].str.replace("_Explainer", "", regex=False)
    )
    p_results = cleaned.groupby(groupby).agg(
        {
            "Pointing Game Scores": get_p_asterisks,
        }
    )
    return p_results.style.apply(highlight_max, subset=p_results.columns)


In [14]:
# One row per collected score, labelled with the explainer and detector that produced it.
dff = pd.DataFrame(
    results,
    columns=["Explainer", "Detector", "Pointing Game Scores"],
)
dff

Unnamed: 0,Explainer,Detector,Pointing Game Scores
0,Anchor_Explainer,DetectorGuo,0.0
1,Anchor_Explainer,DetectorGuo,1.0
2,Anchor_Explainer,DetectorGuo,1.0
3,Anchor_Explainer,DetectorGuo,0.1
4,Anchor_Explainer,DetectorGuo,0.7
...,...,...,...
973,SHAP_Explainer,DetectorRadford,0.0
974,SHAP_Explainer,DetectorRadford,0.0
975,SHAP_Explainer,DetectorRadford,0.0
976,SHAP_Explainer,DetectorRadford,1.0


In [15]:
# Scores aggregated per explainer (across all detectors), with significance markers.
p_results_aggregate_level = style_dff(dff, groupby=["Explainer"])
display(p_results_aggregate_level)

Unnamed: 0_level_0,Pointing Game Scores
Explainer,Unnamed: 1_level_1
Anchor,0.58\textsuperscript{***}
LIME,0.50\textsuperscript{ns}
SHAP,0.73\textsuperscript{***}


In [16]:
# Scores aggregated per explainer/detector pair, with significance markers.
p_results = style_dff(dff, groupby=["Explainer", "Detector"])
display(p_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pointing Game Scores
Explainer,Detector,Unnamed: 2_level_1
Anchor,DetectorGuo,0.68\textsuperscript{***}
Anchor,DetectorRadford,0.49\textsuperscript{ns}
LIME,DetectorGuo,0.62\textsuperscript{**}
LIME,DetectorRadford,0.39\textsuperscript{**}
SHAP,DetectorGuo,0.82\textsuperscript{***}
SHAP,DetectorRadford,0.64\textsuperscript{***}


In [17]:
# Print both styled tables as LaTeX source.
print(df_to_latex(p_results, label="pointing-game-explainer-detector", caption="Scores per detector and explainer"))
print(df_to_latex(p_results_aggregate_level, label="pointing-game-explainer", caption="Scores per explainer"))

\begin{table}
\caption{Scores per detector and explainer}
\label{pointing-game-explainer-detector}
\begin{tabular}{lll}
\toprule
 &  & Pointing Game Scores \\
Explainer & Detector &  \\
\midrule
\multirow[c]{2}{*}{Anchor} & DetectorGuo & 0.68\textsuperscript{***} \\
\cline{2-3}
 & DetectorRadford & 0.49\textsuperscript{ns} \\
\cline{1-3} \cline{2-3}
\multirow[c]{2}{*}{LIME} & DetectorGuo & 0.62\textsuperscript{**} \\
\cline{2-3}
 & DetectorRadford & 0.39\textsuperscript{**} \\
\cline{1-3} \cline{2-3}
\multirow[c]{2}{*}{SHAP} & DetectorGuo & \bfseries 0.82\textsuperscript{***} \\
\cline{2-3}
 & DetectorRadford & 0.64\textsuperscript{***} \\
\cline{1-3} \cline{2-3}
\bottomrule
\end{tabular}
\end{table}

\begin{table}
\caption{Scores per explainer}
\label{pointing-game-explainer}
\begin{tabular}{ll}
\toprule
 & Pointing Game Scores \\
Explainer &  \\
\midrule
Anchor & 0.58\textsuperscript{***} \\
\cline{1-2}
LIME & 0.50\textsuperscript{ns} \\
\cline{1-2}
SHAP & \bfseries 0.73\textsuperscr

In [18]:

# for hybrid_document in hybrid_documents:

#     explainer = LIME_Explainer(detector)
#     explainer.get_explanation_cached(hybrid_document).show_in_notebook()

#     explainer = SHAP_Explainer(detector)
#     shap.text_plot(explainer.get_explanation_cached(hybrid_document))
        