In [1]:
DEBUG = False
N_DEBUG = 275

OUTPUT_DIR = "./pointing_game_datasets/"

In [2]:
from detector_radford import DetectorRadford
from detector_detectgpt import DetectorDetectGPT
from detector_guo import DetectorGuo
detector_classes = [DetectorDetectGPT]

from explainer_wrappers import SHAP_Explainer, LIME_Explainer, Anchor_Explainer
explainer_classes = [SHAP_ExplainerAnchor_Explainer]

In [3]:
import pointing_game_util

In [4]:
import os
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x16235d23690>

In [5]:
test = pd.read_pickle("./dataset_test.pkl")
test = test # always load the full dataset! (np.random.shuffle(tokenized_sentences)). slice the actual hybrid_documents if debugging!
documents = test["answer"]
gold_labels = test["author"] == "human_answers" # convention: 0: machine, 1: human, see detector.py

In [6]:
# pd.Series((len(list(nlp(d).sents)) for d in documents)).describe()

In [7]:
# keep a copy of the hybrid documents for the assert in the loop
ref_assert_hybrid_documents, _, _ = pointing_game_util.hybrid(documents.to_list(), gold_labels.to_list(), word_tokenizer=LIME_Explainer(DetectorRadford()).tokenize)


In [8]:
# pd.Series((len(list(nlp(d).sents)) for d in ref_assert_hybrid_documents)).describe()

In [9]:
if not os.path.exists(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

In [10]:
results = []
for detector_class in detector_classes:
    detector = detector_class()
    print("Initialized " + detector.__class__.__name__)
    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)
        print("Initialized " + explainer.__class__.__name__)

        print("Indexing hybrid documents for " + explainer.__class__.__name__)
        hybrid_documents, tokenized_hybrid_documents, GT = pointing_game_util.hybrid(documents.to_list(), gold_labels.to_list(), word_tokenizer=explainer.tokenize)
        assert (all([a==b for a,b in zip(ref_assert_hybrid_documents,hybrid_documents)])), "(full) Hybrid documents don't match" # tokenized_hybrid_documents differ by design to make the calculation of the pointing game accuracy easier

        if DEBUG:
            hybrid_documents = hybrid_documents[200:N_DEBUG]
            tokenized_hybrid_documents = tokenized_hybrid_documents[200:N_DEBUG]
            GT = GT[200:N_DEBUG]
        
        # write csv (for debug purposes)
        pd.DataFrame(zip(hybrid_documents, tokenized_hybrid_documents, GT), columns=["Hybrid Document", "Tokenized Hybrid Document", "GT"]).to_csv(os.path.join(OUTPUT_DIR, detector.__class__.__name__+ "-"+explainer.__class__.__name__+".csv"),index=False)
        print("Predicting hybrid documents")
        predictions_hybrid = detector.predict_label(hybrid_documents)

        print("Obtaining explanations on hybrid documents and calculating pointing game accuracy")

        
       # pointing_game_acc = pointing_game_util.get_pointing_game_acc(hybrid_documents, explainer, predictions_hybrid, GT)
        pointing_game_scores = pointing_game_util.get_pointing_game_scores(hybrid_documents, explainer, predictions_hybrid, GT)
      #  print("Pointing game accuracy for {} | {}: {}".format(explainer.__class__.__name__, detector.__class__.__name__, pointing_game_acc))
        results.extend([(explainer.__class__.__name__, detector.__class__.__name__, pointing_game_score) for pointing_game_score in pointing_game_scores])
    
        




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Initialized DetectorDetectGPT
Initialized Anchor_Explainer
Indexing hybrid documents for Anchor_Explainer
Predicting hybrid documents
Obtaining explanations on hybrid documents and calculating pointing game accuracy


  0%|          | 0/271 [00:00<?, ?it/s]

Best: 109 (mean:1.0000000000, n: 1, n_total 110 lb:0.0000) Worst: 0 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 54 (mean:1.0000000000, n: 1, n_total 120 lb:0.0000) Worst: 45 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 64 (mean:1.0000000000, n: 1, n_total 130 lb:0.0000) Worst: 24 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 19 (mean:1.0000000000, n: 1, n_total 140 lb:0.0000) Worst: 1 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 65 (mean:1.0000000000, n: 1, n_total 150 lb:0.0000) Worst: 96 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 20 (mean:1.0000000000, n: 1, n_total 160 lb:0.0000) Worst: 3 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 63 (mean:1.0000000000, n: 1, n_total 170 lb:0.0000) Worst: 43 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 63 (mean:1.0000000000, n: 6, n_total 180 lb:0.0000) Worst: 80 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 18 (mean:1.0000000000, n: 1, n_total 190 lb:0.0000) Worst: 32 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 66 (mean:1.0000000000, n

  0%|          | 1/271 [04:43<21:17:46, 283.95s/it]

while: (21,) mean = 0.80 lb = 0.70 ub = 0.90 coverage: 0.00 n: 116
Found eligible anchor  (21,) Coverage: 0.0 Is best? True
Best: 165 (mean:1.0000000000, n: 1, n_total 166 lb:0.0000) Worst: 94 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 82 (mean:1.0000000000, n: 1, n_total 176 lb:0.0000) Worst: 146 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 0 (mean:1.0000000000, n: 1, n_total 186 lb:0.0000) Worst: 93 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best: 16 (mean:1.0000000000, n: 1, n_total 196 lb:0.0000) Worst: 42 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best of size  1 :
40 1.0 0 1


  1%|          | 2/271 [10:22<23:38:05, 316.30s/it]

while: (40,) mean = 0.89 lb = 0.71 ub = 1.00 coverage: 0.00 n: 36
Found eligible anchor  (40,) Coverage: 0.0 Is best? True
Best: 153 (mean:1.0000000000, n: 1, n_total 194 lb:0.0000) Worst: 0 (mean:0.0000, n: 1, ub:1.0000) B = 1.00
Best of size  1 :
45 1.0 0 1
while: (45,) mean = 0.45 lb = 0.13 ub = 0.78 coverage: 0.00 n: 11


In [None]:
from scipy.stats.mstats import ttest_1samp

In [None]:
# some formatting functions
def get_p_asterisks(group):
    val =  group.mean()
   # print(group.name)
    _, p = ttest_1samp(group, popmean=0.5)
    if p <= 0.001:
        return "{:.2f}\\textsuperscript{{***}}".format(val)
    if p <= 0.01:
        return "{:.2f}\\textsuperscript{{**}}".format(val)
    if p <= 0.05:
        return "{:.2f}\\textsuperscript{{*}}".format(val)
    if p > 0.05:
        return "{:.2f}\\textsuperscript{{ns}}".format(val)

def highlight_max(col):
    vals = col.str.extract(r"(-*\d*\.\d*)").astype(float).values.flatten()
    max_val = vals.max()
    return ["font-weight: bold;" if c == max_val else "" for c in vals ]
def df_to_latex(styled_df, caption="TODO", label="TODO"):
    return styled_df.to_latex(environment="table", convert_css=True, clines="all;data", hrules=True, caption=caption, label=label)

In [None]:
def style_dff(dff, groupby):
    dff["Explainer"] = dff["Explainer"].str.replace("_Explainer", "")
    p_results = dff.groupby(groupby).agg(
    {
          "Pointing Game Scores": get_p_asterisks,
        }
    )
    p_results = p_results.style.apply(highlight_max, subset=p_results.columns)
    return p_results


In [None]:
dff = pd.DataFrame(results, columns=["Explainer", "Detector", "Pointing Game Scores"])
dff

Unnamed: 0,Explainer,Detector,Pointing Game Scores
0,SHAP_Explainer,DetectorDetectGPT,1
1,SHAP_Explainer,DetectorDetectGPT,1
2,SHAP_Explainer,DetectorDetectGPT,1
3,SHAP_Explainer,DetectorDetectGPT,1
4,SHAP_Explainer,DetectorDetectGPT,1
5,SHAP_Explainer,DetectorDetectGPT,1
6,SHAP_Explainer,DetectorDetectGPT,1
7,SHAP_Explainer,DetectorDetectGPT,1
8,SHAP_Explainer,DetectorDetectGPT,1
9,SHAP_Explainer,DetectorDetectGPT,1


In [None]:
p_results_aggregate_level = style_dff(dff, groupby=["Explainer"])
display(p_results_aggregate_level)

Unnamed: 0_level_0,Pointing Game Scores
Explainer,Unnamed: 1_level_1
LIME,0.62\textsuperscript{ns}
SHAP,0.72\textsuperscript{*}


In [None]:
p_results = style_dff(dff, groupby=["Explainer", "Detector"])
display(p_results)

Unnamed: 0_level_0,Unnamed: 1_level_0,Pointing Game Scores
Explainer,Detector,Unnamed: 2_level_1
LIME,DetectorDetectGPT,0.62\textsuperscript{ns}
SHAP,DetectorDetectGPT,0.72\textsuperscript{*}


In [None]:
print(df_to_latex(p_results, label="pointing-game-explainer-detector", caption="Scores per detector and explainer"))
print(df_to_latex(p_results_aggregate_level, label="pointing-game-explainer", caption="Scores per explainer"))

\begin{table}
\caption{Scores per detector and explainer}
\label{pointing-game-explainer-detector}
\begin{tabular}{lll}
\toprule
 &  & Pointing Game Scores \\
Explainer & Detector &  \\
\midrule
LIME & DetectorDetectGPT & 0.62\textsuperscript{ns} \\
\cline{1-3} \cline{2-3}
SHAP & DetectorDetectGPT & \bfseries 0.72\textsuperscript{*} \\
\cline{1-3} \cline{2-3}
\bottomrule
\end{tabular}
\end{table}

\begin{table}
\caption{Scores per explainer}
\label{pointing-game-explainer}
\begin{tabular}{ll}
\toprule
 & Pointing Game Scores \\
Explainer &  \\
\midrule
LIME & 0.62\textsuperscript{ns} \\
\cline{1-2}
SHAP & \bfseries 0.72\textsuperscript{*} \\
\cline{1-2}
\bottomrule
\end{tabular}
\end{table}



In [None]:

# for hybrid_document in hybrid_documents:

#     explainer = LIME_Explainer(detector)
#     explainer.get_explanation_cached(hybrid_document).show_in_notebook()

#     explainer = SHAP_Explainer(detector)
#     shap.text_plot(explainer.get_explanation_cached(hybrid_document))
        