In [1]:
# Root of the survey submodule; this notebook creates an "explanations" folder inside it.
SURVEY_SUBMODULE_DIR = "../survey"

# CSV with the selected document pairs, created by document_selection_user_study.ipynb.
SAVE_PATH = "./dataset_user_study.csv"

This notebook exports the user-study CSV at `SAVE_PATH` to the UI. The UI uses one JSON file for each document and explainer-detector pair. The explanations are stored as HTML files. For LIME and SHAP, the JavaScript is bundled; for Anchor, it must be imported due to its bundle size.

In [2]:
from detector_radford import DetectorRadford
from detector_detectgpt import DetectorDetectGPT
from detector_dummy import DetectorDummy
from detector_guo import DetectorGuo
from explainer_wrappers import LIME_Explainer, SHAP_Explainer, Anchor_Explainer
from IPython.core.display import display, HTML
from tqdm import tqdm

Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display


In [3]:
# Explanation methods; iterated when rendering one review page per
# (detector, displayed explainer) group further below.
explainer_classes = [
    LIME_Explainer,
    SHAP_Explainer,
    Anchor_Explainer,
]

# Detectors covered by the user study.
detector_classes = [
    DetectorGuo,
    DetectorRadford,
    DetectorDetectGPT,
]

In [4]:
import pandas as pd
import json
import os

In [5]:
# Load the user-study dataset: one row per document pair
# ("Documents Phases 1+3" / "Documents Phases 2+4") with its detector and explainer.
df = pd.read_csv(SAVE_PATH)


In [6]:
# Sanity check: count the non-null entries of every column per detector.
df.groupby(["Detector"]).count()

Unnamed: 0_level_0,Unnamed: 0,Explainer,Documents Phases 1+3,Documents Phases 2+4,f(a),f(b),GT a,GT b,idx a,idx b,Spacy Similarity,Jaccard Similarity,Cosine Similarity tfidf,hash a,hash b
Detector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
DetectorDetectGPT,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18
DetectorGuo,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18
DetectorRadford,18,18,18,18,18,18,18,18,18,18,18,18,18,18,18


In [7]:
# Display the loaded table for visual inspection.
df

Unnamed: 0.1,Unnamed: 0,Detector,Explainer,Documents Phases 1+3,Documents Phases 2+4,f(a),f(b),GT a,GT b,idx a,idx b,Spacy Similarity,Jaccard Similarity,Cosine Similarity tfidf,hash a,hash b
0,0,DetectorDetectGPT,Anchor_Explainer,"Like anything else, the price/value of gold is...","The way I would use it is, every trade done by...",1,1,True,True,41,203,0.827313,0.112782,0.061295,61e5376ff4746da16bf0659eaabebbf5037c7c69d552f5...,7107855c70ca6021b3690796048a3da0067b5e6abf98d6...
1,1,DetectorDetectGPT,Anchor_Explainer,You'll likely see several more scary market ev...,No. Securities brokers/dealers in the United S...,1,1,True,True,173,1,0.802838,0.083333,0.056487,7bd59b306ee4f86d66ea0c02c7c6ad2f0c42df557f3731...,fe43a432243c718cfa3fa2bae6294e8c9514f4d931d2b5...
2,2,DetectorDetectGPT,Anchor_Explainer,Got any creative for a 10 year old’s birthday?...,Both are correct depending on what you are rea...,1,1,False,True,12,277,0.907527,0.137681,0.077867,cba2fc2710696f28b3c9bda38caaff1013b835fdb2f8e5...,edebbd11c843b985aa543b539f521ef0b87b05f2cffc95...
3,3,DetectorDetectGPT,Anchor_Explainer,Both are saying essentially the same thing. T...,"Assuming you live in the US, it is quite norma...",0,0,True,True,52,238,0.867257,0.176101,0.15379,714b04dd8923e09ea3f370b93660441d792104140d13d3...,60b992dfcad293c2fbe76d7842a4e469ba041ced7c5270...
4,4,DetectorDetectGPT,Anchor_Explainer,"The ""open"" price is the price at which the fir...","In general, the date of offer refers to the da...",0,0,False,False,123,291,0.899291,0.196078,0.092865,fcd143af324a2542069f9d2669f6cf9cd0f3b0b8245f94...,6dec874873d092e717f684d6bb283e1976d6c7ea8ec3d8...
5,5,DetectorDetectGPT,Anchor_Explainer,It is generally not recommended to give any me...,It is important to follow the recommendations ...,0,0,False,False,118,146,0.977747,0.242991,0.297141,44c99778341c5e80abb1e16320744e7def9185ab134d4b...,8e0f457e411931fbae8b13a36d7680bd93fc87f08242ca...
6,6,DetectorDetectGPT,LIME_Explainer,Exchange traded options are issued in a way th...,Traditionally options expired on the 3rd Wedne...,1,1,True,True,34,172,0.838525,0.106796,0.155825,bc1ac544b523873961b9f7584e55d8d4cde629ea895fa1...,8e42aeed132b74f34fa79d2c96ed8a54104a343073a89f...
7,7,DetectorDetectGPT,LIME_Explainer,You can't. Even as a technical trader you shou...,Depends on how you measure liquidity. There's...,1,1,True,True,10,30,0.884345,0.130137,0.146908,a342eb9f18bd17d57a51ac8d2a82af3705f3761337de56...,48ccd95896b29bef9ffd5a14addf3235e368318b8a7f66...
8,8,DetectorDetectGPT,LIME_Explainer,Limit books are managed by exchanges. If an or...,Reuters has a service you can subscribe to tha...,1,1,True,True,191,249,0.94361,0.137931,0.072292,cb18fc73b58feeb14211103bba236c3a96f4bc881f323c...,e6b72b7969bbad53385251e1baf30d9f72a75067fc1707...
9,9,DetectorDetectGPT,LIME_Explainer,Multivariate statistics is a branch of statist...,"There are many potential causes of fever, shiv...",0,0,False,False,180,301,0.916336,0.108108,0.039615,4c1cbef644e5ad6a64c2209098c975c58e5f80d93e25f5...,0e65e1bbbf8133d14e940529817a55398d439910f1376e...


In [8]:
# Prepare the export folders inside the survey submodule:
#   <SURVEY_SUBMODULE_DIR>/explanations/html  and  <SURVEY_SUBMODULE_DIR>/explanations/data
# The submodule itself is never created here; a missing checkout is a setup error.
if not os.path.isdir(SURVEY_SUBMODULE_DIR):
    raise FileNotFoundError("Make sure you cloned the survey submodule")

# makedirs creates the intermediate "explanations" folder as well, and exist_ok=True
# makes the cell safe to re-run (no separate exists() check, no check-then-create race).
for subfolder in ("html", "data"):
    os.makedirs(os.path.join(SURVEY_SUBMODULE_DIR, "explanations", subfolder), exist_ok=True)

Although the explanations are cached, the export will still take some time, as `detector.predict_proba` must be called.

# Export for user-study

In [9]:

# NOTE: this export cell is disabled (everything below is commented out).
# When re-enabled it writes, for every detector and explainer, into
# SURVEY_SUBMODULE_DIR/explanations:
#   - "html/<hash>.html": the explanation of the Phase 1+3 document
#   - "data/<hash>.json": metadata for the Phase 1+3 and the Phase 2+4 document
# NOTE(review): in the Phase 2+4 part, path_explanation_html is computed but no HTML
# file is written; this looks intentional ("Just string") - confirm before re-enabling.

# for detector_name, group in df.groupby("Detector"):
#     if detector_name == "DetectorDetectGPT":
#         detector = DetectorDetectGPT()
#     if detector_name == "DetectorRadford":
#         detector = DetectorRadford()
#     if detector_name == "DetectorGuo":
#         detector = DetectorGuo()

#     documents_group = list(group.reset_index().iterrows()) # reset_index: count from 0 in each group
    
#     for explainer_class in explainer_classes:
#         explainer = explainer_class(detector)
#         for index, row in tqdm(documents_group, "Exporting " + detector.__class__.__name__ + " " + explainer.__class__.__name__):
#             a = row["Documents Phases 1+3"]
#             b = row["Documents Phases 2+4"]
#             # Phase 1 + 3: 
#             path_explanation_html = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "html", explainer.get_hash(a)+".html")
#             with open(path_explanation_html, "w", encoding="UTF-8") as text_file:
#                 text_file.write(explainer.get_HTML(a, bundle=False))
#             path_explanation_json = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "data", explainer.get_hash(a)+".json")
#             with open(path_explanation_json, "w", encoding="UTF-8") as text_file:
#                 explanation_data = { # do not include explanation
#                     "document_nr": index,
#                     "detector" : detector.__class__.__name__,
#                     "explainer" : explainer.__class__.__name__,
#                     "ground_truth" : row["GT a"],
#                     "detector_label" : int(detector.predict_label([row["Documents Phases 1+3"]])[0]),
#                     "detector_p_machine": float(detector.predict_proba([row["Documents Phases 1+3"]])[0][0]),
#                     "detector_p_human": float(detector.predict_proba([row["Documents Phases 1+3"]])[0][1]),
#                     "document": row["Documents Phases 1+3"],
#                     "explanation_filename": explainer.get_hash(a),               
#                 }
#                 text_file.write(json.dumps(explanation_data))
#             # Phase 2+4: Just string
#             path_explanation_html = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "html", explainer.get_hash(b)+".html")
#             #  display(HTML(explainer.get_HTML(b)))
#             path_explanation_json = os.path.join(SURVEY_SUBMODULE_DIR, "explanations", "data", explainer.get_hash(b)+".json")
#             with open(path_explanation_json, "w", encoding="UTF-8") as text_file:
#                 explanation_data = { # do not include explanation
#                     "document_nr": index,
#                     "detector" : detector.__class__.__name__,
#                     "explainer" : explainer.__class__.__name__,
#                     "document": row["Documents Phases 2+4"]                
#                 }
#                 text_file.write(json.dumps(explanation_data))


# Export as HTML for review

## One file for the selecting combinations

In [10]:
# Render every document pair, grouped by the (detector, explainer) combination stored
# in the dataset, into ONE self-contained HTML page for manual review.

review_path = "rendered_datasets_user_study/dataset.html"
# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(review_path), exist_ok=True)

# Explicit name -> class tables: an unknown name in the CSV now fails loudly with a
# KeyError instead of silently reusing the detector/explainer of the previous group.
detectors_by_name = {
    "DetectorDetectGPT": DetectorDetectGPT,
    "DetectorRadford": DetectorRadford,
    "DetectorGuo": DetectorGuo,
}
explainers_by_name = {
    "LIME_Explainer": LIME_Explainer,
    "SHAP_Explainer": SHAP_Explainer,
    "Anchor_Explainer": Anchor_Explainer,
}

# Per-document header; defined once instead of once per row.
template = """
        Document: {document_nr}<br/>
        Detector: {detector}<br/>
        Explainer: {explainer}<br/>
        Ground Truth: {ground_truth}<br/>
        f(x): {detector_label}
        """

# (suffix of the "idx"/"GT" columns, column holding the document text)
pair_columns = (
    ("a", "Documents Phases 1+3"),
    ("b", "Documents Phases 2+4"),
)

# Collect fragments in a list and join once instead of growing a string with +=.
html_parts = ["<html><body>", "<h1>Pairs as generated in the notebook</h1>"]

loaded_detector_name = None
for (detector_name, explainer_name), group in df.groupby(["Detector", "Explainer"]):
    html_parts.append("<h2>{}</h2>".format(detector_name))

    # Only build a detector when the name changes between consecutive groups, instead
    # of re-creating it for every explainer group.
    if detector_name != loaded_detector_name:
        detector = detectors_by_name[detector_name]()
        loaded_detector_name = detector_name
    explainer = explainers_by_name[explainer_name](detector)

    documents_group = list(group.reset_index().iterrows())  # reset_index: count from 0 in each group

    for index, row in tqdm(documents_group, "Exporting " + detector.__class__.__name__ + " " + explainer.__class__.__name__):
        html_parts.append("<h3>Pair {}</h3>".format(index))

        # Both documents of a pair are rendered the same way: header, then explanation.
        for suffix, document_column in pair_columns:
            document = row[document_column]
            explanation_data = {
                "document_nr": row["idx " + suffix],
                "detector": detector.__class__.__name__,
                "explainer": explainer.__class__.__name__,
                "ground_truth": row["GT " + suffix],
                "detector_label": int(detector.predict_label([document])[0]),
            }
            html_parts.append(template.format(**explanation_data))
            html_parts.append(explainer.get_HTML(document, bundle=True))
        html_parts.append("<hr/>")

html_parts.append("</body></html>")
out = "".join(html_parts)
with open(review_path, "w", encoding="UTF-8") as text_file:
    text_file.write(out)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Exporting DetectorDetectGPT Anchor_Explainer: 100%|██████████| 6/6 [00:12<00:00,  2.14s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Exporting DetectorDetectGPT LIME_Explainer: 100%|██████████| 6/6 [00:12<00:00,  2.01s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Exporting DetectorDetectGPT SHAP_Explainer: 100%|██████████| 6/6 [00:14<00:00,  2.35s/it]
Special tokens have 

## And one per group

In [11]:


# Render one HTML review page per (detector, displayed explainer) group. Each page
# lists all pairs of that detector, explained with the displayed explainer; the
# "Selecting Explainer" line shows the dataset's "Explainer" column for that row.

# open(..., "w") does not create missing folders, so make sure the target exists.
os.makedirs("rendered_datasets_user_study/groups", exist_ok=True)

# Explicit name -> class table: an unknown detector name in the CSV now fails loudly
# with a KeyError instead of writing a file that is named after the unknown detector
# but rendered with the detector of the previous group.
detectors_by_name = {
    "DetectorDetectGPT": DetectorDetectGPT,
    "DetectorRadford": DetectorRadford,
    "DetectorGuo": DetectorGuo,
}

# Per-document header; defined once instead of once per row.
template = """
            Document: {document_nr}<br/>
            Displayed Explainer: {explainer}<br/>
            Selecting Explainer: {explainer_s}<br/>
            Ground Truth: {ground_truth}<br/>
            f(x): {detector_label}
            """

# (suffix of the "idx"/"GT" columns, column holding the document text)
pair_columns = (
    ("a", "Documents Phases 1+3"),
    ("b", "Documents Phases 2+4"),
)

for detector_name, group in df.groupby("Detector"):
    detector = detectors_by_name[detector_name]()

    documents_group = list(group.reset_index().iterrows())  # reset_index: count from 0 in each group
    for explainer_class in explainer_classes:
        explainer = explainer_class(detector)

        # Collect fragments in a list and join once instead of growing a string with +=.
        html_parts = [
            "<html><body>",
            "<h1>Pairs as displayed to this group</h1>",
            "<p>Users only see the first explanation of each pair.</p>",
        ]
        for index, row in tqdm(documents_group, "Exporting " + detector.__class__.__name__ + " " + explainer.__class__.__name__):
            html_parts.append("<h3>Pair {}</h3>".format(index))

            # Both documents of a pair are rendered the same way: header, then explanation.
            for suffix, document_column in pair_columns:
                document = row[document_column]
                explanation_data = {
                    "document_nr": row["idx " + suffix],
                    "explainer": explainer.__class__.__name__,
                    "explainer_s": row["Explainer"],
                    "ground_truth": row["GT " + suffix],
                    "detector_label": int(detector.predict_label([document])[0]),
                }
                html_parts.append(template.format(**explanation_data))
                html_parts.append(explainer.get_HTML(document, bundle=True))
            html_parts.append("<hr/>")
        html_parts.append("</body></html>")

        out = "".join(html_parts)
        with open("rendered_datasets_user_study/groups/{}-{}.html".format(detector_name, explainer.__class__.__name__), "w", encoding="UTF-8") as text_file:
            text_file.write(out)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Exporting DetectorDetectGPT LIME_Explainer: 100%|██████████| 18/18 [00:35<00:00,  1.97s/it]
Exporting DetectorDetectGPT SHAP_Explainer: 100%|██████████| 18/18 [00:33<00:00,  1.87s/it]
Exporting DetectorDetectGPT Anchor_Explainer: 100%|██████████| 18/18 [00:36<00:00,  2.03s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Exporting DetectorGuo LIME_Explainer: 100%|██████████| 18/18 [00:03<00:00,  4.82it/s]
Exporting DetectorGuo SHAP_Explainer: 100%|██████████| 18/18 [00:01<00:00, 11.27it/s]
Exporting DetectorGuo Anchor_Explainer: 100%|██████████| 18/18 [00:05<00:00,  3.38it/s]
Exporting DetectorRadford LIME_Explainer: 100%|██████████| 18/18 [00:03<00:00,  5.94it/s]
Exporting Detector