This notebook contains some tests to decide on an experimental design for the simulatability experiment.

In [1]:
import krippendorff


In [2]:
from scipy.stats import ttest_ind

In [3]:

import numpy as np 
import scipy.stats as stats 
import matplotlib.pyplot as plt


In [4]:
import pandas as pd
import numpy as np

In [5]:
import random

## Hase et al. 2020.

This is a within-subject* design with 4 phases: (1) Predictions only, (2) Pre-learn test, (3) Teaching: Predictions + Explanations, (4) Eval.

Phases 1 and 3 share one set of documents, as do phases 2 and 4.

Result: Report the **average change** in user accuracy per explanation method (phase 2 vs. phase 4), together with a confidence interval and a p-value for the mean change.

Additional parameters in Hase et al.:
- Balance data "by model correctness" so random guessing can't succeed: *"we ensure that true positives, false positives,
true negatives, and false negatives are equally represented in the inputs. [...] We confirm user understanding of the data
balancing in our screening test"*
- Forced choice, to not "favor overly niche explanations" (like in Ribeiro et al.)
- Separate teach and test phases
- Pre prediction phase to obtain a baseline
- **All users see the same examples**


*: One explanation method per user, some users repeat the experiment with a new dataset!?!



### Feasibility

In [6]:
# Column names for the simulated response frames. In the visible notebook this
# list is only referenced by the commented-out simulate_hase sketch.
columns_experiment = ["user_id", "document_id", "user_label"]


In [7]:
def guess(detector_label, p):
    """Simulate one user answer for a document.

    With probability ``p`` the answer equals ``detector_label`` (returned
    unchanged); otherwise the logical negation ``not detector_label`` is
    returned. Exactly one draw is taken from NumPy's global random state.
    """
    agrees_with_detector = bool(np.random.choice([0, 1], p=[1 - p, p]))
    if agrees_with_detector:
        return detector_label
    return not detector_label

In [8]:
def evaluate_user_study(df_user_study, df_phase_2, df_phase_4):
    """Compare per-user accuracy before (phase 2) and after (phase 4) teaching.

    "Reproduces" the information from Table 1/2 in Hase et al.

    Parameters
    ----------
    df_user_study
        Passed unchanged to ``user_metrics`` as the detector predictions.
    df_phase_2, df_phase_4
        Response frames. They are grouped by ``"user_id"`` and each group is
        handed to ``user_metrics``, which reads the ``"document_nr"``,
        ``"timestamp"`` and ``"label"`` columns.

    Returns
    -------
    float
        p-value of ``ttest_ind`` between the phase-4 and phase-2 per-user
        accuracies. The confidence intervals and Krippendorff's alpha are
        computed but only used by the (commented-out) reporting lines.
    """
    user_metrics_pre = df_phase_2.groupby(["user_id"]).apply(lambda x: user_metrics(x, df_user_study))
    user_metrics_eval = df_phase_4.groupby(["user_id"]).apply(lambda x: user_metrics(x, df_user_study))  # TODO

    df_change = (user_metrics_eval - user_metrics_pre).rename(
        columns={"User Accuracy": "Change in User Accuracy"}
    )
    user_acc_col = df_change["Change in User Accuracy"]  # for convenience

    # Student t interval, because the number of users is small.
    lower, upper = stats.t.interval(
        confidence=0.95,
        df=len(user_acc_col) - 1,  # degrees of freedom = # samples - 1 for the mean
        loc=user_acc_col.mean(),
        scale=stats.sem(user_acc_col),
    )

    # NOTE(review): both samples come from the same users, so the observations
    # are paired; ttest_ind treats them as independent. Kept as-is so the
    # returned value does not change -- confirm whether ttest_rel is intended.
    p_val = ttest_ind(user_metrics_eval["User Accuracy"], user_metrics_pre["User Accuracy"]).pvalue

    # Inter-user agreement in phase 4. The previous code read an undefined
    # name (`user_responses`) inside the groupby lambda and raised NameError.
    # Build the rater x document matrix from the data itself: keep each user's
    # most recent answer per document; documents a user did not answer become
    # NaN, which krippendorff.alpha treats as missing.
    latest_rows = df_phase_4.groupby(["user_id", "document_nr"])["timestamp"].idxmax()
    reliability = (
        df_phase_4.loc[latest_rows]
        .pivot(index="user_id", columns="document_nr", values="label")
        .astype(float)
    )
    k_alpha = krippendorff.alpha(reliability_data=reliability.to_numpy())

    lower_b, upper_b = stats.bootstrap((user_acc_col,), np.mean, confidence_level=0.95).confidence_interval

    # Optional reporting (disabled):
    # print("Mean change in acc", user_acc_col.mean())
    # print("CI for mean change: [{},{}]".format(lower, upper))
    # print("CI by bootstrap: [{},{}]".format(lower_b, upper_b))
    # print("p=%.10f" % p_val, "significant (< 0.05)" if p_val < 0.05 else "NOT significant (> 0.05)")
    # print("Krippendorff between users: {}".format(k_alpha))
    return p_val
    

In [9]:
import sqlite3
import pandas as pd
# Open the SQLite database at ../survey/db.db. The connection is left open
# because later cells read the responses_phase_2 / responses_phase_4 tables
# through it.
connection = sqlite3.connect("../survey/db.db")

# Load the complete users table; other cells read its ID, access_token,
# explainer and detector columns.
user_df = pd.read_sql_query("SELECT * FROM users", connection)

In [10]:

# Study-size settings. In the visible notebook they are only referenced by the
# commented-out simulate_hase sketch, not by any live cell.
n_learn = 16
n_eval = 16
n_users = 10

# Mean / standard deviation of the normal distribution from which the
# simulation cell samples the probability of a simulated user agreeing with
# the detector before teaching (the sample is clipped to [0, 1]).
mu_got_it_right_pre=0.5
sigma_got_it_right_pre=0.05
# Mean / standard deviation of the normal distribution for the simulated
# change of that probability after teaching (the sample is clipped to [-1, 1]).
mu_gain = 0.2
sigma_gain = 0.1

In [11]:
import requests
import json

In [12]:
# Base URL of the HTTP service the simulation cell talks to
# (/auth/<token>, /completeCurrentPhase, /submitPhase2, /submitPhase4).
url = "http://localhost:3002"

In [13]:
# NOTE(review): neither name is referenced by any other visible cell --
# presumably leftovers from manual API testing; confirm before removing.
documentNr = 1
label = 0

In [14]:
# Documents used in the user study. Other cells read the "f(b)" column as the
# detector's label per document and use the frame's index as the document number.
# NOTE: read_pickle can execute arbitrary code -- only load a file you trust.
df_user_study = pd.read_pickle("./dataset_user_study.pkl")

In [15]:
# Simulate every user in user_df answering phase 2 and phase 4 through the HTTP
# API, so the analysis cells can be exercised on synthetic responses.
users = []  # NOTE(review): never filled by this cell; kept because the name is module-level.


def user_dist_without():
    """Sample a user's pre-teaching probability of agreeing with the detector, clipped to [0, 1]."""
    return np.clip(np.random.normal(mu_got_it_right_pre, sigma_got_it_right_pre, 1)[0], 0, 1)


def user_dist_gain():
    """Sample the change of that probability after teaching, clipped to [-1, 1]."""
    return np.clip(np.random.normal(mu_gain, sigma_gain, 1)[0], -1, 1)


for idx, user in user_df.iterrows():
    res = requests.get(url + "/auth/" + user["access_token"])
    # Fail loudly here: without a valid token every later request is meaningless.
    res.raise_for_status()
    auth_token = json.loads(res.text)
    # "Content-Type" used to be listed twice in this literal; one entry is enough.
    headers = {"Content-Type": "application/json", "Authorization": "Bearer " + auth_token}

    # Draw the user's skill once per user, as in the commented-out simulate_hase
    # reference. Previously p_without was re-drawn for every document and the
    # phase-4 loop silently reused whatever the last phase-2 draw had been.
    p_without = user_dist_without()
    p_with = np.clip(p_without + user_dist_gain(), 0, 1)

    # go to phase 2
    for _ in range(3):
        requests.get(url + "/completeCurrentPhase", headers=headers)
    for doc_nr, row in df_user_study.iterrows():
        requests.post(
            url + "/submitPhase2",
            json={"ID": doc_nr, "label": guess(row["f(b)"], p_without)},
            headers=headers,
        )

    # two more phase completions before the phase-4 submissions
    for _ in range(2):
        requests.get(url + "/completeCurrentPhase", headers=headers)
    for doc_nr, row in df_user_study.iterrows():
        requests.post(
            url + "/submitPhase4",
            json={"ID": doc_nr, "label": guess(row["f(b)"], p_with)},
            headers=headers,
        )

In [16]:
def user_metrics(df_user_responses, detector_predictions):
    """Summarise how well one set of user answers matches the detector.

    For every ``document_nr`` only the row with the largest ``timestamp`` is
    kept, i.e. the most recent answer. The ``label`` column of those rows and
    ``detector_predictions`` are both cast to bool and compared, with the
    detector output as the reference (TP = detector True and user True).

    Returns a one-row DataFrame with the columns
    ``TP``, ``TN``, ``FP``, ``FN`` and ``User Accuracy``.

    Raises ``AssertionError`` when the four counts do not add up to the number
    of detector predictions, or when the accuracy disagrees with a direct
    element-wise comparison.
    """
    model_says = detector_predictions.astype(bool)

    # only keep most recent response
    newest_row_per_doc = df_user_responses.groupby("document_nr")["timestamp"].idxmax()
    newest_answers = df_user_responses.loc[newest_row_per_doc]
    user_says = newest_answers.set_index("document_nr")["label"].astype(bool)

    model_negative = ~model_says
    user_negative = ~user_says

    TP = (model_says & user_says).sum()
    FP = (model_negative & user_says).sum()
    TN = (model_negative & user_negative).sum()
    FN = (model_says & user_negative).sum()

    acc = (TP + TN) / (TP + FP + TN + FN)

    n_predictions = len(model_says)
    assert sum([TP, FP, TN, FN]) == n_predictions, "Check that input is bool"
    assert (acc == (user_says == model_says).sum() / n_predictions), "Check that input is bool: acc"

    summary_columns = ["TP", "TN", "FP", "FN", "User Accuracy"]
    return pd.DataFrame([(TP, TN, FP, FN, acc)], columns=summary_columns)

In [17]:
# Per-user lookup of the users table's explainer and detector columns, indexed
# by user_id (the table's ID column) so it can be joined onto per-user metrics.
u = user_df.set_index("ID").rename_axis("user_id")[["explainer", "detector"]]


In [18]:
# Load the recorded answers of the pre-teaching test (phase 2) and the
# evaluation (phase 4).
df_phase_2 = pd.read_sql_query("SELECT * FROM responses_phase_2", connection)
df_phase_4 = pd.read_sql_query("SELECT * FROM responses_phase_4", connection)


def _metrics_per_user(df_responses):
    """Apply user_metrics to each user's answers, using the "f(b)" column as detector labels."""
    detector_labels = df_user_study["f(b)"].rename_axis("document_nr")
    return df_responses.groupby(["user_id"]).apply(
        lambda answers_of_user: user_metrics(answers_of_user, detector_labels)
    )


metrics_phase_2 = _metrics_per_user(df_phase_2)
metrics_phase_4 = _metrics_per_user(df_phase_4)

# Per-user change from phase 2 to phase 4 (positive = higher after teaching).
difference = metrics_phase_4 - metrics_phase_2


In [27]:
# Mean phase-4 minus phase-2 change in user accuracy, per detector.
difference.join(u).groupby(["detector"])["User Accuracy"].mean()

detector
DetectorDetectGPT    0.203704
DetectorGuo          0.209877
DetectorRadford      0.185185
Name: User Accuracy, dtype: float64

In [28]:
# Mean phase-4 minus phase-2 change in user accuracy, per explainer.
difference.join(u).groupby(["explainer"])["User Accuracy"].mean()

explainer
Anchor_Explainer    0.265432
LIME_Explainer      0.179012
SHAP_Explainer      0.154321
Name: User Accuracy, dtype: float64

In [29]:
# Mean phase-4 minus phase-2 change in user accuracy, per detector/explainer pair.
difference.join(u).groupby(["detector", "explainer"])["User Accuracy"].mean()

detector           explainer       
DetectorDetectGPT  Anchor_Explainer    0.314815
                   LIME_Explainer      0.222222
                   SHAP_Explainer      0.074074
DetectorGuo        Anchor_Explainer    0.259259
                   LIME_Explainer      0.166667
                   SHAP_Explainer      0.203704
DetectorRadford    Anchor_Explainer    0.222222
                   LIME_Explainer      0.148148
                   SHAP_Explainer      0.185185
Name: User Accuracy, dtype: float64

In [22]:
# Per detector: group size and p-value of phase-2 vs. phase-4 user accuracy.
# Grouping by the scalar key "detector" (not the list ["detector"]) makes the
# group key a plain value on every pandas version; the former `name[0]` relied
# on the key being a 1-tuple and would otherwise pick the first character.
# NOTE(review): the two samples are the same users (paired); ttest_ind treats
# them as independent -- confirm whether a paired test is intended.
phase_2_with_design = metrics_phase_2.join(u)
phase_4_with_design = metrics_phase_4.join(u)  # joined once instead of twice per iteration
for detector, group_2 in phase_2_with_design.groupby("detector"):
    group_4 = phase_4_with_design[phase_4_with_design["detector"] == detector]
    print(len(group_2))
    print(ttest_ind(group_2["User Accuracy"], group_4["User Accuracy"]).pvalue)

9
0.001177719019692292
9
0.0005498607824233039
9
0.002406518577369974


In [23]:
# Per explainer: p-value of phase-2 vs. phase-4 user accuracy.
# Grouping by the scalar key "explainer" (not the list ["explainer"]) makes the
# group key a plain value on every pandas version; the former `name[0]` relied
# on the key being a 1-tuple and would otherwise pick the first character.
# NOTE(review): the two samples are the same users (paired); ttest_ind treats
# them as independent -- confirm whether a paired test is intended.
phase_2_with_design = metrics_phase_2.join(u)
phase_4_with_design = metrics_phase_4.join(u)  # joined once instead of twice per iteration
for explainer, group_2 in phase_2_with_design.groupby("explainer"):
    group_4 = phase_4_with_design[phase_4_with_design["explainer"] == explainer]

    print(ttest_ind(group_2["User Accuracy"], group_4["User Accuracy"]).pvalue)

3.39455232925049e-05
0.002275787115565749
0.004757523483762692


In [24]:




# # "Reproducing" information from Table 1/2 in Hase et al.:


# user_metrics_eval = df_phase_4.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study)) # TODO
# df_change = user_metrics_eval - user_metrics_pre
# df_change = df_change.rename(columns={"User Accuracy": "Change in User Accuracy"})

# user_acc_col = df_change["Change in User Accuracy"] # for convenience

# # use student t for low number of samples
# lower, upper = stats.t.interval(
# confidence=0.95, 
# df=len(user_acc_col)-1, # degrees of freedom = # samples - 1 for mean
#             loc=user_acc_col.mean(), 
#             scale=stats.sem(user_acc_col)
#             ) 

# p_val = ttest_ind(user_metrics_eval["User Accuracy"],user_metrics_pre["User Accuracy"]).pvalue

# k_alpha = krippendorff.alpha(reliability_data=df_phase_4.groupby(["user_id"]).apply(lambda df : user_responses.astype(int).to_list()).to_list())



# lower_b, upper_b = stats.bootstrap((user_acc_col,), np.mean, confidence_level=0.95,).confidence_interval

# # print results
# #  print("Mean change in acc",user_acc_col.mean())
# # print("CI for mean change: [{},{}]".format(lower,upper))

# ##    print("CI by bootstrap: [{},{}]".format(lower_b, upper_b))


# #  print("p=%.10f" % p_val, "significant (< 0.05)" if p_val < 0.05 else "NOT significant (> 0.05)")
# #   print("Krippendorff between users: {}".format(k_alpha))
# return p_val

In [25]:
# def simulate_hase(

#         n_learn = 16,
#         n_eval = 16,
#         n_users = 10,

#         mu_got_it_right_pre=0.5,
#         sigma_got_it_right_pre=0.05,

#         mu_gain = 0.1,
#         sigma_gain = 0.1,


# ):
#     users = []
#     user_dist_without = lambda : np.clip(np.random.normal(mu_got_it_right_pre, sigma_got_it_right_pre, 1)[0], 0,1)
#     user_dist_gain = lambda : np.clip(np.random.normal(mu_gain, sigma_gain, 1)[0], -1,1)
#     for i in range(1, n_users+1):
#         p_without = user_dist_without()
#         p_with = np.clip(p_without + user_dist_gain(), 0,1)
#         users.append(("u_%s" % i, p_without ,p_with))
#     documents_learn_1_2 = ["l_%s" % i for i in range(1,n_learn+1)]
#     documents_pre_eval = ["e_%s" % i for i in range(1,n_eval+1)]

#     df_detector_output = mock_detector_responses(documents_pre_eval)

#     responses_pre, responses_eval = mock_user_responses(df_detector_output, documents_pre_eval, users)
#     df_pre =pd.DataFrame(responses_pre, columns=columns_experiment)
#     df_eval =pd.DataFrame(responses_eval, columns=columns_experiment)

#  #   print("# responses pre", len(responses_pre))
#   #  print("# responses pre per method", len(responses_pre)/3)
#    # print("Each user saw {} instances. ".format(2*n_learn + 2*n_eval) )
#    # print("Used {} unique documents. A set of {} in phase 1 and 3; and a set of {} in phase 2 and 4.".format(n_learn + n_eval,n_learn, n_eval))

# #    print("Results based on {} unique eval documents.".format(n_eval))

# #    print("Results based on {} datapoints.".format(len(responses_eval)))
#     p_value = evaluate_user_study(df_detector_output, df_pre, df_eval)
#     return p_value