This notebook contains some tests to decide on an experimental design for the simulatability experiment.

In [1]:
import krippendorff


In [2]:
from scipy.stats import ttest_ind

In [3]:

import numpy as np 
import scipy.stats as stats 
import matplotlib.pyplot as plt


In [4]:
import pandas as pd
import numpy as np

In [5]:
import random

## Hase et al. 2020.

This is a within-subject* design with 4 phases: (1) Predictions only, (2) Pre-learn test, (3) Teaching: Predictions + Explanations, (4) Eval.

Phase 1 and 3 share a set of documents as do 2 and 4.

Result: Report **average change** in user accuracy per explanation method (phase 2 vs. 4), CI and p values of mean

Additional parameters in Hase et al.:
- Balance data "by model correctness" so random guessing can't succeed: *"we ensure that true positives, false positives,
true negatives, and false negatives are equally represented in the inputs. [...] We confirm user understanding of the data
balancing in our screening test"*
- Forced choice, to not "favor overly niche explanations" (like in Ribeiro et al.)
- Separate teach and test phases
- Pre prediction phase to obtain a baseline
- **All users see the same examples**


*: One explanation method per user, some users repeat the experiment with a new dataset!?!



### Feasability

In [6]:
columns_experiment = ["user_id", "document_id", "user_label"]


In [7]:
def guess(detector_label,p):
    return detector_label if bool(np.random.choice([0,1],p=[1-p, p])) else not detector_label

In [8]:
import sqlite3
import pandas as pd
# Create your connection.
connection = sqlite3.connect("../survey/db.db")

user_df = pd.read_sql_query("SELECT * FROM users", connection)

In [9]:

n_learn = 16
n_eval = 16
n_users = 10

mu_got_it_right_pre=0.5
sigma_got_it_right_pre=0.05
mu_gain = 0.2
sigma_gain = 0.1

In [10]:
import requests
import json

In [11]:
url = "http://localhost:3002"

In [12]:
documentNr = 1
label = 0

In [13]:
user_df

Unnamed: 0,ID,access_token,current_phase,detector,explainer,document_order_a,document_order_b
0,1,TPKJTF,4,DetectorGuo,SHAP_Explainer,"[3,4,11,0,16,14,13,1,6,12,2,5,17,7,9,8,10,15]","[15,3,4,5,2,11,17,7,10,6,8,16,14,1,9,0,12,13]"
1,2,VAUMMS,4,DetectorRadford,SHAP_Explainer,"[6,15,2,10,7,13,4,1,3,0,17,5,9,14,8,16,11,12]","[14,2,0,17,11,10,6,1,12,16,4,3,5,8,7,9,15,13]"
2,3,FMHKEQ,4,DetectorDetectGPT,SHAP_Explainer,"[14,13,3,6,15,9,7,5,2,8,17,16,4,1,0,12,11,10]","[16,1,0,13,5,15,3,4,12,11,8,6,17,7,10,2,9,14]"
3,4,KDNIWG,4,DetectorGuo,SHAP_Explainer,"[4,3,7,15,17,16,2,9,10,12,6,5,1,8,11,14,0,13]","[6,10,1,15,0,11,17,13,3,2,12,14,16,8,4,7,9,5]"
4,5,DZRFHU,4,DetectorDetectGPT,SHAP_Explainer,"[14,0,17,16,9,1,5,15,2,8,7,13,4,6,12,3,10,11]","[3,7,17,11,14,13,16,10,0,4,1,2,5,15,9,8,6,12]"
5,6,PPGWGL,4,DetectorGuo,LIME_Explainer,"[3,7,11,10,13,16,2,5,17,1,8,15,14,12,9,4,0,6]","[12,13,4,7,14,10,16,15,3,2,17,9,8,1,0,11,5,6]"
6,7,YECUUS,4,DetectorGuo,LIME_Explainer,"[15,6,1,14,9,10,3,4,0,2,13,16,17,11,8,7,5,12]","[5,17,13,0,1,8,10,15,11,2,3,16,4,6,14,9,7,12]"
7,8,NFJIFZ,4,DetectorGuo,LIME_Explainer,"[0,1,8,16,2,17,5,11,3,10,13,15,6,4,14,7,9,12]","[4,10,16,0,9,5,15,3,1,8,11,6,14,2,17,13,7,12]"
8,9,HHVIFV,4,DetectorRadford,LIME_Explainer,"[17,2,6,11,16,5,14,15,13,9,1,7,4,0,3,10,8,12]","[0,5,13,9,3,7,15,16,17,12,4,1,8,6,10,14,2,11]"
9,10,BDUGPI,4,DetectorRadford,SHAP_Explainer,"[12,14,0,1,2,3,17,4,10,5,15,6,13,7,11,8,9,16]","[3,14,11,6,1,7,16,8,15,13,12,5,0,4,10,17,2,9]"


In [14]:
df_user_study = pd.read_pickle("./dataset_user_study_new.pkl")

In [15]:
users = []
user_dist_without = lambda : np.clip(np.random.normal(mu_got_it_right_pre, sigma_got_it_right_pre, 1)[0], 0,1)
user_dist_gain = lambda : np.clip(np.random.normal(mu_gain, sigma_gain, 1)[0], -1,1)
for idx, user in user_df.iterrows():
    res = requests.get(url+"/auth/"+ user["access_token"])
    auth_token = json.loads(res.text)
    headers = {'Content-Type': 'application/json','Authorization': "Bearer "+auth_token, "Content-Type": "application/json",}

    # go to phase 2
    requests.get(url+"/completeCurrentPhase", headers=headers)
    requests.get(url+"/completeCurrentPhase", headers=headers)
    requests.get(url+"/completeCurrentPhase", headers=headers)

    df_user_documents = df_user_study.loc[df_user_study.groupby("Detector").groups[user["detector"]],:].reset_index(drop=True)
    for doc_nr, row in df_user_documents.iterrows():
        p_without = user_dist_without()
        requests.post(url+"/submitPhase2", json={"ID": doc_nr, "label": guess(row["f(b)"], p_without)}, headers=headers)
    requests.get(url+"/completeCurrentPhase", headers=headers)
    requests.get(url+"/completeCurrentPhase", headers=headers)
    for doc_nr, row in df_user_documents.iterrows():
        p_with = np.clip(p_without + user_dist_gain(), 0,1)
        requests.post(url+"/submitPhase4", json={"ID": doc_nr, "label": guess(row["f(b)"], p_with)}, headers=headers)

In [16]:
def user_metrics(df_user_responses, df_user_study):
    detector = df_user_responses.iloc[0]["detector"]
    # explainer = df_user_responses.iloc[0]["explainer"]
    df_user_documents = df_user_study.loc[df_user_study.groupby("Detector").groups[detector],:].reset_index(drop=True)
    detector_predictions = df_user_documents["f(b)"].astype(bool)

    user_responses = df_user_responses.loc[df_user_responses.groupby("document_nr")["timestamp"].idxmax()].set_index("document_nr")["label"].astype(bool) # only keep most recent response
    # display(user_responses)
    # display(detector_predictions)
    TP = ((detector_predictions) & (user_responses)).sum()
    FP = ((~detector_predictions) & (user_responses)).sum()

    TN = ((~detector_predictions) & (~user_responses)).sum()
    FN = ((detector_predictions) & (~user_responses)).sum()

    acc = (TP+TN) / (TP+FP+TN+FN)
    # print("acc", acc)
    # print("TP", TP)
    # print("FP", FP)
    # print("TN", TN)
    # print("FN", FN)


    assert sum([TP,FP,TN,FN]) == len(detector_predictions), "Check that input is bool"
    assert (acc ==(user_responses == detector_predictions).sum() / len(detector_predictions)), "Check that input is bool: acc"

    return pd.DataFrame([(TP,TN,FP,FN, acc)], columns=["TP","TN","FP","FN", "User Accuracy"])

In [19]:
u = user_df.set_index("ID").rename_axis("user_id")[["explainer", "detector"]]


In [27]:
df_phase_2 = pd.read_sql_query("SELECT responses_phase_2.*, users.detector, users.explainer FROM responses_phase_2 INNER JOIN users ON responses_phase_2.user_id = users.ID", connection)
df_phase_4 = pd.read_sql_query("SELECT responses_phase_4.*, users.detector, users.explainer FROM responses_phase_4 INNER JOIN users ON responses_phase_4.user_id = users.ID", connection)

metrics_phase_4 = df_phase_4.groupby(["user_id"]).apply(lambda df_user_responses : user_metrics(df_user_responses,df_user_study))
metrics_phase_2 = df_phase_2.groupby(["user_id"]).apply(lambda df_user_responses : user_metrics(df_user_responses,df_user_study))


difference = metrics_phase_4 - metrics_phase_2 


In [28]:
difference.join(u).groupby(["detector"])["User Accuracy"].mean()

detector
DetectorDetectGPT    0.259259
DetectorGuo          0.154321
DetectorRadford      0.234568
Name: User Accuracy, dtype: float64

In [29]:
difference.join(u).groupby(["explainer"])["User Accuracy"].mean()

explainer
Anchor_Explainer    0.222222
LIME_Explainer      0.228395
SHAP_Explainer      0.197531
Name: User Accuracy, dtype: float64

In [30]:
difference.join(u).groupby(["detector", "explainer"])["User Accuracy"].mean()

detector           explainer       
DetectorDetectGPT  Anchor_Explainer    0.296296
                   LIME_Explainer      0.203704
                   SHAP_Explainer      0.277778
DetectorGuo        Anchor_Explainer    0.148148
                   LIME_Explainer      0.259259
                   SHAP_Explainer      0.055556
DetectorRadford    Anchor_Explainer    0.222222
                   LIME_Explainer      0.222222
                   SHAP_Explainer      0.259259
Name: User Accuracy, dtype: float64

In [98]:
metrics_phase_4.join(u).index.get_level_values(0)

Index([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27],
      dtype='int64', name='user_id')

In [135]:
def highlight_significant(row, props=''):
  #  display(s)
    styles = [''] * len(row)
    styles[0] = 'font-weight: bold' if row["p value"] <= 0.05 else ''
    return styles

In [142]:
latex_output = []

Unnamed: 0_level_0,Before
detector,Unnamed: 1_level_1
DetectorDetectGPT,0.746914
DetectorGuo,0.635802
DetectorRadford,0.691358


In [166]:
def get_aggregate_results(groupby, label, caption):
    tvalues = []
    pvalues = []
    for name, group_2 in metrics_phase_2.join(u).groupby(groupby):
        group_4 =  metrics_phase_4.join(u)[np.all(metrics_phase_4.join(u)[groupby].values == name, axis=1)]
        tvalue, pvalue = ttest_ind(group_2["User Accuracy"],group_4["User Accuracy"])
        tvalues.append(tvalue)
        pvalues.append(pvalue)

    df_aggregate_results = pd.DataFrame(difference.join(u).groupby(groupby)["User Accuracy"].mean())

    df_aggregate_results = df_aggregate_results.join(pd.DataFrame(metrics_phase_2.join(u).groupby(groupby)["User Accuracy"].mean()).rename({"User Accuracy":"Before"}, axis=1))
    df_aggregate_results = df_aggregate_results.join(pd.DataFrame(metrics_phase_4.join(u).groupby(groupby)["User Accuracy"].mean()).rename({"User Accuracy":"After"}, axis=1))

    df_aggregate_results["t value"] = tvalues
    df_aggregate_results["p value"] = pvalues

    df_aggregate_results.rename(columns={"User Accuracy":"Increase in User Accuracy"}, inplace=True)
    result = df_aggregate_results.style.apply(highlight_significant, axis=1)\
        .map_index(lambda v: "rotatebox:{45}--rwrap;", level=0, axis=1).format(precision=2)
    latex_output.append(result.to_latex(environment="longtable", 
                                        convert_css=True, 
                                        clines="all;data", 
                                        hrules=True, 
                                        caption=caption, 
                                        label=label))
    return result

In [167]:
get_aggregate_results(["detector"], "resultsuserstudydetector", "Results aggregated by detector")

Unnamed: 0_level_0,Increase in User Accuracy,Before,After,t value,p value
detector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DetectorDetectGPT,0.26,0.49,0.75,-3.56,0.0
DetectorGuo,0.15,0.48,0.64,-3.04,0.01
DetectorRadford,0.23,0.46,0.69,-4.17,0.0


In [164]:
get_aggregate_results(["explainer"], "resultsuserstudyexplainer", "Results aggregated by explainer")

Unnamed: 0_level_0,Increase in User Accuracy,Before,t value,p value
explainer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anchor_Explainer,0.22,0.7,-4.18,0.0
LIME_Explainer,0.23,0.73,-3.66,0.0
SHAP_Explainer,0.2,0.64,-2.96,0.01


In [146]:
get_aggregate_results(["explainer", "detector"], "resultsuserstudyexplainerdetector", "Results aggregated by explainer and detector pairing")

Unnamed: 0_level_0,Unnamed: 1_level_0,Increase in User Accuracy,t value,p value
explainer,detector,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Anchor_Explainer,DetectorDetectGPT,0.3,-3.02,0.04
Anchor_Explainer,DetectorGuo,0.15,-1.71,0.16
Anchor_Explainer,DetectorRadford,0.22,-1.92,0.13
LIME_Explainer,DetectorDetectGPT,0.2,-1.4,0.23
LIME_Explainer,DetectorGuo,0.26,-2.8,0.05
LIME_Explainer,DetectorRadford,0.22,-2.83,0.05
SHAP_Explainer,DetectorDetectGPT,0.28,-1.85,0.14
SHAP_Explainer,DetectorGuo,0.06,-1.06,0.35
SHAP_Explainer,DetectorRadford,0.26,-2.04,0.11


In [147]:
for  l in latex_output:
    print(l)

\begin{longtable}{lrrr}
\caption{Results aggregated by detector} \label{resultsuserstudydetector} \\
\toprule
 & \rotatebox{45}{Increase in User Accuracy} & \rotatebox{45}{t value} & \rotatebox{45}{p value} \\
detector &  &  &  \\
\midrule
\endfirsthead
\caption[]{Results aggregated by detector} \\
\toprule
 & \rotatebox{45}{Increase in User Accuracy} & \rotatebox{45}{t value} & \rotatebox{45}{p value} \\
detector &  &  &  \\
\midrule
\endhead
\midrule
\multicolumn{4}{r}{Continued on next page} \\
\midrule
\endfoot
\bottomrule
\endlastfoot
DetectorDetectGPT & \bfseries 0.26 & -3.56 & 0.00 \\
\cline{1-4}
DetectorGuo & \bfseries 0.15 & -3.04 & 0.01 \\
\cline{1-4}
DetectorRadford & \bfseries 0.23 & -4.17 & 0.00 \\
\cline{1-4}
\end{longtable}

\begin{longtable}{lrrr}
\caption{Results aggregated by explainer} \label{resultsuserstudyexplainer} \\
\toprule
 & \rotatebox{45}{Increase in User Accuracy} & \rotatebox{45}{t value} & \rotatebox{45}{p value} \\
explainer &  &  &  \\
\midrule
\endfir

In [None]:




# # "Reproducing" information from Table 1/2 in Hase et al.:


# user_metrics_eval = df_phase_4.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study)) # TODO
# df_change = user_metrics_eval - user_metrics_pre
# df_change = df_change.rename(columns={"User Accuracy": "Change in User Accuracy"})

# user_acc_col = df_change["Change in User Accuracy"] # for convenience

# # use student t for low number of samples
# lower, upper = stats.t.interval(
# confidence=0.95, 
# df=len(user_acc_col)-1, # degrees of freedom = # samples - 1 for mean
#             loc=user_acc_col.mean(), 
#             scale=stats.sem(user_acc_col)
#             ) 

# p_val = ttest_ind(user_metrics_eval["User Accuracy"],user_metrics_pre["User Accuracy"]).pvalue

# k_alpha = krippendorff.alpha(reliability_data=df_phase_4.groupby(["user_id"]).apply(lambda df : user_responses.astype(int).to_list()).to_list())



# lower_b, upper_b = stats.bootstrap((user_acc_col,), np.mean, confidence_level=0.95,).confidence_interval

# # print results
# #  print("Mean change in acc",user_acc_col.mean())
# # print("CI for mean change: [{},{}]".format(lower,upper))

# ##    print("CI by bootstrap: [{},{}]".format(lower_b, upper_b))


# #  print("p=%.10f" % p_val, "significant (< 0.05)" if p_val < 0.05 else "NOT significant (> 0.05)")
# #   print("Krippendorff between users: {}".format(k_alpha))
# return p_val

In [None]:
# def evaluate_user_study(df_user_study, df_phase_2, df_phase_4):
#     # "Reproducing" information from Table 1/2 in Hase et al.:
#     user_metrics_pre = df_phase_2.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study))

#     user_metrics_eval = df_phase_4.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study)) # TODO
#     df_change = user_metrics_eval - user_metrics_pre
#     df_change = df_change.rename(columns={"User Accuracy": "Change in User Accuracy"})

#     user_acc_col = df_change["Change in User Accuracy"] # for convenience

#     # use student t for low number of samples
#     lower, upper = stats.t.interval(
#     confidence=0.95, 
#     df=len(user_acc_col)-1, # degrees of freedom = # samples - 1 for mean
#               loc=user_acc_col.mean(), 
#               scale=stats.sem(user_acc_col)
#               ) 
    
#     p_val = ttest_ind(user_metrics_eval["User Accuracy"],user_metrics_pre["User Accuracy"]).pvalue

#     k_alpha = krippendorff.alpha(reliability_data=df_phase_4.groupby(["user_id"]).apply(lambda df : user_responses.astype(int).to_list()).to_list())



#     lower_b, upper_b = stats.bootstrap((user_acc_col,), np.mean, confidence_level=0.95,).confidence_interval
    
#     # print results
#   #  print("Mean change in acc",user_acc_col.mean())
#    # print("CI for mean change: [{},{}]".format(lower,upper))

# ##    print("CI by bootstrap: [{},{}]".format(lower_b, upper_b))

    
#   #  print("p=%.10f" % p_val, "significant (< 0.05)" if p_val < 0.05 else "NOT significant (> 0.05)")
#  #   print("Krippendorff between users: {}".format(k_alpha))
#     return p_val
    

In [None]:
# def simulate_hase(

#         n_learn = 16,
#         n_eval = 16,
#         n_users = 10,

#         mu_got_it_right_pre=0.5,
#         sigma_got_it_right_pre=0.05,

#         mu_gain = 0.1,
#         sigma_gain = 0.1,


# ):
#     users = []
#     user_dist_without = lambda : np.clip(np.random.normal(mu_got_it_right_pre, sigma_got_it_right_pre, 1)[0], 0,1)
#     user_dist_gain = lambda : np.clip(np.random.normal(mu_gain, sigma_gain, 1)[0], -1,1)
#     for i in range(1, n_users+1):
#         p_without = user_dist_without()
#         p_with = np.clip(p_without + user_dist_gain(), 0,1)
#         users.append(("u_%s" % i, p_without ,p_with))
#     documents_learn_1_2 = ["l_%s" % i for i in range(1,n_learn+1)]
#     documents_pre_eval = ["e_%s" % i for i in range(1,n_eval+1)]

#     df_detector_output = mock_detector_responses(documents_pre_eval)

#     responses_pre, responses_eval = mock_user_responses(df_detector_output, documents_pre_eval, users)
#     df_pre =pd.DataFrame(responses_pre, columns=columns_experiment)
#     df_eval =pd.DataFrame(responses_eval, columns=columns_experiment)

#  #   print("# responses pre", len(responses_pre))
#   #  print("# responses pre per method", len(responses_pre)/3)
#    # print("Each user saw {} instances. ".format(2*n_learn + 2*n_eval) )
#    # print("Used {} unique documents. A set of {} in phase 1 and 3; and a set of {} in phase 2 and 4.".format(n_learn + n_eval,n_learn, n_eval))

# #    print("Results based on {} unique eval documents.".format(n_eval))

# #    print("Results based on {} datapoints.".format(len(responses_eval)))
#     p_value = evaluate_user_study(df_detector_output, df_pre, df_eval)
#     return p_value