In [1]:
SURVEY_SUBMODULE_DIR = "../survey" # this notebook will create a folder "explanations" there
SAVE_PATH = "./dataset_user_study_18.csv" # created by document_selection_user_study.ipynb

This notebook exports the user-study csv from `SAVE_PATH` to the UI. The UI uses a JSON file for each document and explainer-detector pair. The explanations are stored as HTML files. For LIME and SHAP, javascript is bundled, for Anchor, it must be imported due to bundle size. 

In [2]:
from gpt2outputdataset.detector_radford import DetectorRadford
from detectgpt.detector_detectgpt import DetectorDetectGPT
from detector_dummy import DetectorDummy
from detector_guo import DetectorGuo
from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer
from IPython.core.display import display, HTML
from tqdm import tqdm

Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display


In [3]:
explainer_classes = [LIME_Explainer, SHAP_Explainer, Anchor_Explainer]
detector_classes = [ DetectorGuo, DetectorRadford, DetectorDetectGPT]

In [4]:
import pandas as pd
import json
import os

In [5]:
df = pd.read_csv(SAVE_PATH)


In [6]:
df.groupby(["Detector"]).count()

Unnamed: 0_level_0,Unnamed: 0,Explainer,Documents Phases 1+3,Documents Phases 2+4,f(a),f(b),GT a,GT b,idx a,idx b,Spacy Similarity,Jaccard Similarity,Cosine Similarity tfidf,hash a,hash b
Detector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
DetectorDetectGPT,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18
DetectorGuo,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18
DetectorRadford,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18


In [7]:
if not os.path.exists(os.path.join(SURVEY_SUBMODULE_DIR)): 
    raise FileNotFoundError("Make sure you cloned the survey submodule")
if not os.path.exists(os.path.join(SURVEY_SUBMODULE_DIR, "explanations")): 
    os.makedirs(os.path.join(SURVEY_SUBMODULE_DIR, "explanations"))
if not os.path.exists(os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "html")): 
    os.makedirs(os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "html"))
if not os.path.exists(os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "data")): 
    os.makedirs(os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "data"))

While the explanations are cached, this will still take some time as `detector.predict_proba` must be called.

In [8]:

for detector_name, group in df.groupby("Detector"):
    if detector_name == "DetectorDetectGPT":
        detector = DetectorDetectGPT()
    if detector_name == "DetectorRadford":
        detector = DetectorRadford()
    if detector_name == "DetectorGuo":
        detector = DetectorGuo()

    documents_group = list(group.reset_index().iterrows()) # reset_index: count from 0 in each group
    
    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)
        for index, row in tqdm(documents_group, "Exporting " + detector.__class__.__name__ + " " + explainer.__class__.__name__):
            a = row["Documents Phases 1+3"]
            b = row["Documents Phases 2+4"]
            # Phase 1 + 3: 
            path_explanation_html = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "html", explainer.get_hash(a)+".html")
            with open(path_explanation_html, "w", encoding="UTF-8") as text_file:
                text_file.write(explainer.get_HTML(a, bundle=False))
            # print(row)
            # display(HTML(explainer.get_HTML(a)))
            # display(HTML(explainer.get_HTML(b)))
            # print("----------------")
            path_explanation_json = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "data", explainer.get_hash(a)+".json")
            with open(path_explanation_json, "w", encoding="UTF-8") as text_file:
                explanation_data = { # do not include explanation
                    "document_nr": index,
                    "detector" : detector.__class__.__name__,
                    "explainer" : explainer.__class__.__name__,
                    "ground_truth" : row["GT a"],
                    "detector_label" : int(detector.predict_label([row["Documents Phases 1+3"]])[0]),
                    "detector_p_machine": float(detector.predict_proba([row["Documents Phases 1+3"]])[0][0]),
                    "detector_p_human": float(detector.predict_proba([row["Documents Phases 1+3"]])[0][1]),
                    "document": row["Documents Phases 1+3"],
                    "explanation_filename": explainer.get_hash(a),               
                }
                text_file.write(json.dumps(explanation_data))
            # Phase 2+4: Just string
            path_explanation_html = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "html", explainer.get_hash(b)+".html")
            #  display(HTML(explainer.get_HTML(b)))
            path_explanation_json = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "data", explainer.get_hash(b)+".json")
            with open(path_explanation_json, "w", encoding="UTF-8") as text_file:
                explanation_data = { # do not include explanation
                    "document_nr": index,
                    "detector" : detector.__class__.__name__,
                    "explainer" : explainer.__class__.__name__,
                    "document": row["Documents Phases 2+4"]                
                }
                text_file.write(json.dumps(explanation_data))


Using cache dir ./.cache
Loading BASE model EleutherAI/pythia-70m...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


MOVING BASE MODEL TO GPU...DONE (0.21s)
DONE (0.09s)


Exporting DetectorDetectGPT LIME_Explainer: 100%|██████████| 18/18 [00:52<00:00,  2.92s/it]
Exporting DetectorDetectGPT SHAP_Explainer: 100%|██████████| 18/18 [00:54<00:00,  3.01s/it]
Exporting DetectorDetectGPT Anchor_Explainer: 100%|██████████| 18/18 [00:54<00:00,  3.04s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Exporting DetectorGuo LIME_Explainer: 100%|██████████| 18/18 [00:01<00:00, 10.24it/s]
Exporting DetectorGuo SHAP_Explainer: 100%|██████████| 18/18 [00:01<00:00,  9.76it/s]
Exporting DetectorGuo Anchor_Explainer: 100%|██████████| 18/18 [00:02<00:00,  6.32it/s]
Exporting DetectorRadford LIME_Explainer: 100%|██████████| 18/18 [00:01<00:00,  9.39it/s]
Exporting DetectorRadford SHAP_Explainer: 100%|██████████| 18/18 [00:01<00:00, 10.10it/s]
Exporting DetectorRadford Anchor_Explainer: 100%|██████████| 18/18 [00:02<00:00,  6.36it/s]
