This notebook evaluates the results form the user study. Set `FILL_DATABASE` to True to create a mock database. Set it to False to use the results from an existing database.

If mocking data, this notebook expects the server to run on `server_url` with a database with no entries, as set up by running `node setup.mjs`. 

In [1]:
USER_STUDY_CSV = "./dataset_user_study.csv"
SQLITE_DB = "../survey/db.db"
FILL_DATABASE = False # if True, data is mocked, THIS CALLS THE APIs

server_url = "http://localhost:3002" # server to mock data on


In [2]:

import krippendorff
from scipy.stats import ttest_rel
from scipy.stats import ttest_1samp
import numpy as np 
import scipy.stats as stats 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Adapted from Hase et al. 2020. Main points:

This is a within-subject* design with 4 phases: (1) Predictions only, (2) Pre-learn test, (3) Teaching: Predictions + Explanations, (4) Eval.

Phase 1 and 3 share a set of documents as do 2 and 4.

Result Hase et al.: Report **average change** in user accuracy per explanation method (phase 2 vs. 4), CI and p values of mean

Additional details by Hase et al.:
- Balance data "by model correctness" so random guessing can't succeed: *"we ensure that true positives, false positives,
true negatives, and false negatives are equally represented in the inputs. [...] We confirm user understanding of the data
balancing in our screening test"*
- Forced choice, to not "favor overly niche explanations" (like in Ribeiro et al.)
- Separate teach and test phases
- Pre prediction phase to obtain a baseline



In [3]:
import sqlite3
import pandas as pd
connection = sqlite3.connect(SQLITE_DB)

user_df = pd.read_sql_query("SELECT * FROM users", connection)

In [4]:
df_user_study = pd.read_csv(USER_STUDY_CSV)

## Fill Database
If `FILL_DATABASE` is True

In [5]:
%%writefile run_user.py

def run_user(idx, user, url, df_user_study):
    n_learn = 16
    n_eval = 16
    n_users = 10

    mu_got_it_right_pre=0.5
    sigma_got_it_right_pre=0.05
    mu_gain = 0.2
    sigma_gain = 0.1

    def guess(detector_label,p):
        return detector_label if bool(np.random.choice([0,1],p=[1-p, p])) else not detector_label

    import requests
    import json
    import numpy as np
    import pandas as pd
    user_dist_without = lambda : np.clip(np.random.normal(mu_got_it_right_pre, sigma_got_it_right_pre, 1)[0], 0,1)
    user_dist_gain = lambda : np.clip(np.random.normal(mu_gain, sigma_gain, 1)[0], -1,1)
  
    res = requests.get(url+"/auth/"+ user["access_token"])

    print(res.text)
    auth_token = json.loads(res.text)
    headers = {'Content-Type': 'application/json','Authorization': "Bearer "+auth_token, "Content-Type": "application/json",}

    requests.post(url+"/api/submitParticipantInfo", json={
    "has_seen_explanation_methods_before": "yes",
    "has_seen_OTHERS_before": "yes",
    "level_of_expertise": "is-researcher-explainability",
    "familiarity_with_chatgpt": "occasional-use",
    "prefers_monochromatic_methods": "yes" if idx % 20 == 0 else "no"
    }, headers=headers)
    # go to phase 2
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 0}, headers=headers)
    # if idx % 8 == 0:
    #     return
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 1}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 2}, headers=headers)

    res = requests.get(url+"/api/state", headers=headers)
    state = json.loads(res.text)

    # user_df = pd.read_sql_query("SELECT * FROM users", connection) # update as group is assigned now
    # user = user_df.iloc[idx]
   # print(user[["detector", "explainer"]])
    # return state
    df_user_documents = df_user_study.loc[df_user_study.groupby("Detector").groups[state["detector"]],:].reset_index(drop=True)
    for doc_nr, row in df_user_documents.iterrows():
        p_without = user_dist_without()
        requests.post(url+"/api/submitPhase2", json={"ID": doc_nr, "label": guess(row["f(b)"], p_without)}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 3}, headers=headers)

    for doc_nr, row in df_user_documents.iterrows():
        json_ = {"lickert-q{}-{}".format(question_nr, doc_nr): str(np.random.choice([1,2,3,4,5], p=[0.1,0.2,0.1,0.4,0.2])) for question_nr in range(1,4)}
        json_["document_nr"] = doc_nr
        requests.post(url+"/api/submitPhase3", json=json_, headers=headers)

    requests.post(url+"/api/completeCurrentPhase", json={"expected": 4}, headers=headers)
    for doc_nr, row in df_user_documents.iterrows():
        p_with = np.clip(p_without + user_dist_gain(), 0,1)
        requests.post(url+"/api/submitPhase4", json={"ID": doc_nr, "label": guess(row["f(b)"], p_with)}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 5}, headers=headers)


Overwriting run_user.py


In [6]:
from tqdm import tqdm
from multiprocess import Pool
from run_user import run_user


if FILL_DATABASE:
    max_pool = 10
    mock_user_data = [(idx, user, server_url, df_user_study) for idx, user in user_df.iterrows() if idx < 27]

    with Pool(max_pool) as p:
        pool_outputs = list(tqdm(p.starmap(run_user,mock_user_data),total=len(mock_user_data)))    
    print(pool_outputs)


In [7]:
user_df = pd.read_sql_query("SELECT * FROM users where current_phase = 5", connection) # update df from database

In [8]:
user_df.groupby(["detector", "explainer"])["ID"].count()

detector           explainer     
DetectorDetectGPT  SHAP_Explainer    1
Name: ID, dtype: int64

## Evaluation

In [25]:
def user_metrics(df_user_responses, df_user_study):
    detector = df_user_responses.iloc[0]["detector"]
    # explainer = df_user_responses.iloc[0]["explainer"]
    df_user_documents = df_user_study.loc[df_user_study.groupby("Detector").groups[detector],:].reset_index(drop=True)
    detector_predictions = df_user_documents["f(b)"].astype(bool)

    user_responses = df_user_responses.loc[df_user_responses.groupby("document_nr")["timestamp"].idxmax()].set_index("document_nr")["label"].astype(bool) # only keep most recent response
    TP = ((detector_predictions) & (user_responses)).sum()
    FP = ((~detector_predictions) & (user_responses)).sum()

    TN = ((~detector_predictions) & (~user_responses)).sum()
    FN = ((detector_predictions) & (~user_responses)).sum()

    acc = (TP+TN) / (TP+FP+TN+FN)
    print("acc", acc)
    print("TP", TP)
    print("FP", FP)
    print("TN", TN)
    print("FN", FN)


    assert sum([TP,FP,TN,FN]) == len(detector_predictions), "Check that input is bool"
    assert (acc ==(user_responses == detector_predictions).sum() / len(detector_predictions)), "Check that input is bool: acc"

    return pd.DataFrame([(TP,TN,FP,FN, acc)], columns=["TP","TN","FP","FN", "User Accuracy"])

In [26]:
u = user_df.set_index("ID").rename_axis("user_id")[["explainer", "detector"]]


In [27]:
df_phase_2 = pd.read_sql_query("SELECT responses_phase_2.*, users.detector, users.explainer FROM responses_phase_2 INNER JOIN users ON responses_phase_2.user_id = users.ID", connection)
df_phase_4 = pd.read_sql_query("SELECT responses_phase_4.*, users.detector, users.explainer FROM responses_phase_4 INNER JOIN users ON responses_phase_4.user_id = users.ID", connection)

metrics_phase_4 = df_phase_4.groupby(["user_id"]).apply(lambda df_user_responses : user_metrics(df_user_responses,df_user_study))
metrics_phase_2 = df_phase_2.groupby(["user_id"]).apply(lambda df_user_responses : user_metrics(df_user_responses,df_user_study))


difference = metrics_phase_4 - metrics_phase_2 


acc 0.8333333333333334
TP 9
FP 3
TN 6
FN 0
acc 0.8333333333333334
TP 9
FP 3
TN 6
FN 0


In [30]:
df_phase_2

Unnamed: 0,ID,timestamp,label,user_id,document_nr,detector,explainer
0,1,2024-03-20 13:04:26,1,1,15,DetectorDetectGPT,SHAP_Explainer
1,2,2024-03-20 13:05:08,1,1,14,DetectorDetectGPT,SHAP_Explainer
2,3,2024-03-20 13:05:30,1,1,0,DetectorDetectGPT,SHAP_Explainer
3,4,2024-03-20 13:06:02,0,1,10,DetectorDetectGPT,SHAP_Explainer
4,5,2024-03-20 13:06:08,0,1,9,DetectorDetectGPT,SHAP_Explainer
5,6,2024-03-20 13:06:46,1,1,7,DetectorDetectGPT,SHAP_Explainer
6,7,2024-03-20 13:07:01,1,1,13,DetectorDetectGPT,SHAP_Explainer
7,8,2024-03-20 13:07:12,1,1,8,DetectorDetectGPT,SHAP_Explainer
8,9,2024-03-20 13:07:16,0,1,3,DetectorDetectGPT,SHAP_Explainer
9,10,2024-03-20 13:07:32,1,1,5,DetectorDetectGPT,SHAP_Explainer


In [31]:
df_phase_4

Unnamed: 0,ID,timestamp,label,user_id,document_nr,detector,explainer
0,1,2024-03-20 13:22:49,1,1,15,DetectorDetectGPT,SHAP_Explainer
1,2,2024-03-20 13:23:05,1,1,14,DetectorDetectGPT,SHAP_Explainer
2,3,2024-03-20 13:23:28,1,1,0,DetectorDetectGPT,SHAP_Explainer
3,4,2024-03-20 13:23:46,0,1,10,DetectorDetectGPT,SHAP_Explainer
4,5,2024-03-20 13:23:47,0,1,9,DetectorDetectGPT,SHAP_Explainer
5,6,2024-03-20 13:23:51,1,1,7,DetectorDetectGPT,SHAP_Explainer
6,7,2024-03-20 13:24:11,1,1,13,DetectorDetectGPT,SHAP_Explainer
7,8,2024-03-20 13:24:14,1,1,8,DetectorDetectGPT,SHAP_Explainer
8,9,2024-03-20 13:24:18,0,1,3,DetectorDetectGPT,SHAP_Explainer
9,10,2024-03-20 13:24:24,1,1,5,DetectorDetectGPT,SHAP_Explainer


In [28]:
difference.join(u).groupby(["detector"])["User Accuracy"].mean()

detector
DetectorDetectGPT    0.0
Name: User Accuracy, dtype: float64

In [29]:
difference.join(u).groupby(["explainer"])["User Accuracy"].mean()

explainer
SHAP_Explainer    0.0
Name: User Accuracy, dtype: float64

In [14]:
difference.join(u).groupby(["detector", "explainer"])["User Accuracy"].mean()

detector           explainer     
DetectorDetectGPT  SHAP_Explainer    0.0
Name: User Accuracy, dtype: float64

In [15]:
metrics_phase_4.join(u).index.get_level_values(0)

Index([1], dtype='int64', name='user_id')

In [16]:
def highlight_significant(row, props=''):
  #  display(s)
    styles = [''] * len(row)
    styles[2] = 'font-weight: bold' if row["p value"] <= 0.05 else ''
    return styles

In [17]:
latex_output = []

In [18]:
# per group
groupby = ["explainer", "detector"]
tvalues = []
pvalues = []
for name, group_2 in metrics_phase_2.join(u).groupby(groupby):
    group_4 =  metrics_phase_4.join(u)[np.all(metrics_phase_4.join(u)[groupby].values == name, axis=1)]
    tvalue, pvalue = ttest_rel(group_2["User Accuracy"],group_4["User Accuracy"], alternative="less")
    tvalues.append(tvalue)
    pvalues.append(pvalue)
    display(group_2)

df_aggregate_results = pd.DataFrame(difference.join(u).groupby(groupby)["User Accuracy"].mean())

df_aggregate_results = df_aggregate_results.join(pd.DataFrame(metrics_phase_2.join(u).groupby(groupby)["User Accuracy"].mean()).rename({"User Accuracy":"Before"}, axis=1))
df_aggregate_results = df_aggregate_results.join(pd.DataFrame(metrics_phase_4.join(u).groupby(groupby)["User Accuracy"].mean()).rename({"User Accuracy":"After"}, axis=1))

df_aggregate_results["t value"] = tvalues
df_aggregate_results["p value"] = pvalues

df_aggregate_results.rename(columns={"User Accuracy":"Increase in User Accuracy"}, inplace=True)
df_aggregate_results = df_aggregate_results.reindex(sorted(df_aggregate_results.columns), axis=1)
result = df_aggregate_results.style\
    .map_index(lambda v: "rotatebox:{45}--rwrap;", level=0, axis=1).format(precision=2).hide(["t value"], axis=1).format_index(escape="latex", axis=0)\
    .apply(highlight_significant, axis=1)
latex_output.append(result.to_latex(environment="longtable", 
                                    convert_css=True, 
                                    clines="all;data", 
                                    hrules=True, 
                                    caption="Results per group", 
                                    label="user-study-per-group"))
result

  var *= np.divide(n, n-ddof)  # to avoid error on division by zero
  var *= np.divide(n, n-ddof)  # to avoid error on division by zero


Unnamed: 0_level_0,Unnamed: 1_level_0,TP,TN,FP,FN,User Accuracy,explainer,detector
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,9,6,3,0,0.833333,SHAP_Explainer,DetectorDetectGPT


Unnamed: 0_level_0,Unnamed: 1_level_0,After,Before,Increase in User Accuracy,p value
explainer,detector,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SHAP\_Explainer,DetectorDetectGPT,0.83,0.83,0.0,


In [19]:
from scipy.stats import combine_pvalues

In [20]:
pd.DataFrame(df_aggregate_results.groupby("explainer").apply(lambda group: combine_pvalues(group["p value"], method="stouffer")[1])).rename(columns={"0":"Increase in User Accuracy"}, inplace=True)

## Lickert Items

In [21]:
df_phase_3 = pd.read_sql_query("SELECT responses_phase_3.*, users.detector, users.explainer FROM responses_phase_3 INNER JOIN users ON responses_phase_3.user_id = users.ID", connection)
df_phase_3

Unnamed: 0,ID,timestamp,label,user_id,document_nr,question_nr,detector,explainer
0,1,2024-03-20 13:12:44,1,1,12,1,DetectorDetectGPT,SHAP_Explainer
1,2,2024-03-20 13:12:45,1,1,12,1,DetectorDetectGPT,SHAP_Explainer
2,3,2024-03-20 13:12:45,1,1,12,2,DetectorDetectGPT,SHAP_Explainer
3,4,2024-03-20 13:12:49,1,1,12,1,DetectorDetectGPT,SHAP_Explainer
4,5,2024-03-20 13:12:49,1,1,12,2,DetectorDetectGPT,SHAP_Explainer
...,...,...,...,...,...,...,...,...
112,113,2024-03-20 13:21:45,2,1,10,1,DetectorDetectGPT,SHAP_Explainer
113,114,2024-03-20 13:21:45,3,1,10,2,DetectorDetectGPT,SHAP_Explainer
114,115,2024-03-20 13:21:45,3,1,10,2,DetectorDetectGPT,SHAP_Explainer
115,116,2024-03-20 13:21:45,2,1,10,1,DetectorDetectGPT,SHAP_Explainer


In [22]:
user_responses = df_phase_3.loc[df_phase_3.groupby(["user_id", "question_nr", "document_nr"])["timestamp"].idxmax()].set_index(["user_id", "document_nr", "question_nr"]).drop(["timestamp", "ID"], axis=1)
user_responses

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label,detector,explainer
user_id,document_nr,question_nr,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,1,2,DetectorDetectGPT,SHAP_Explainer
1,1,1,3,DetectorDetectGPT,SHAP_Explainer
1,2,1,4,DetectorDetectGPT,SHAP_Explainer
1,3,1,4,DetectorDetectGPT,SHAP_Explainer
1,4,1,3,DetectorDetectGPT,SHAP_Explainer
1,5,1,2,DetectorDetectGPT,SHAP_Explainer
1,6,1,3,DetectorDetectGPT,SHAP_Explainer
1,7,1,2,DetectorDetectGPT,SHAP_Explainer
1,8,1,3,DetectorDetectGPT,SHAP_Explainer
1,9,1,3,DetectorDetectGPT,SHAP_Explainer


In [23]:
user_responses.groupby(["detector", "explainer", "question_nr"]).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,label
detector,explainer,question_nr,Unnamed: 3_level_1
DetectorDetectGPT,SHAP_Explainer,1,2.944444
DetectorDetectGPT,SHAP_Explainer,2,3.055556
DetectorDetectGPT,SHAP_Explainer,3,3.111111


In [24]:
groupby = [ "explainer", "document_nr"]
alpha_values = []
for name, group in user_responses.groupby(groupby):
    cannonical_form = group.reset_index().pivot(columns=["question_nr"], values=["label"], index=["user_id", "document_nr"])
    # print(cannonical_form)
    # print()
    alpha_values.append((*name, krippendorff.alpha(cannonical_form, level_of_measurement="ordinal")))
df_krippendorff_alpha = pd.DataFrame(alpha_values, columns=groupby + ["alpha"])
df_krippendorff_alpha.groupby(groupby[0:-1]).describe()["alpha"]


ValueError: There has to be at least one unit with values assigned by at least two coders.

In [None]:
def highlight_significant(row, props=''):
  #  display(s)
    styles = [''] * len(row)
    styles[0] = 'font-weight: bold' if row["p value"] <= 0.05 else ''
    return styles

In [None]:
def get_aggregate_results_lickert(groupby, label, caption):
    tvalues = []
    pvalues = []
    for name, group in user_responses.groupby(groupby):
        tvalue, pvalue = ttest_rel(group["label"], popmean=3)
        tvalues.append(tvalue)
        pvalues.append(pvalue)
        
        

    df_aggregate_results = pd.DataFrame(user_responses.groupby(groupby)["label"].mean())


    df_aggregate_results["t value"] = tvalues
    df_aggregate_results["p value"] = pvalues

    df_aggregate_results = df_aggregate_results.reindex(sorted(df_aggregate_results.columns), axis=1)
    result = df_aggregate_results.style.apply(highlight_significant, axis=1)\
        .map_index(lambda v: "rotatebox:{45}--rwrap;", level=0, axis=1).format(precision=2).hide(["t value"], axis=1).format_index(escape="latex", axis=0)
    latex_output.append(result.to_latex(environment="longtable", 
                                        convert_css=True, 
                                        clines="all;data", 
                                        hrules=True, 
                                        caption=caption, 
                                        label=label))

    
    return result


In [None]:
get_aggregate_results_lickert(["detector", "explainer", "question_nr"], "lickert-detector-explainer","Lickert Scale Items on detector-explainer level")

TypeError: ttest_rel() got an unexpected keyword argument 'popmean'

In [None]:
get_aggregate_results_lickert([ "explainer", "question_nr"], "lickert-explainer","Lickert Scale Items on explainer level")

Unnamed: 0_level_0,Unnamed: 1_level_0,label,p value
explainer,question_nr,Unnamed: 2_level_1,Unnamed: 3_level_1
Anchor\_Explainer,1,3.54,0.0
Anchor\_Explainer,2,3.43,0.0
Anchor\_Explainer,3,3.38,0.0
LIME\_Explainer,1,3.46,0.0
LIME\_Explainer,2,3.26,0.01
LIME\_Explainer,3,3.42,0.0
SHAP\_Explainer,1,3.37,0.0
SHAP\_Explainer,2,3.33,0.0
SHAP\_Explainer,3,3.32,0.0


In [None]:
with open("figures/tables_user_study.tex", "w", encoding="UTF-8") as text_file:
    text_file.write("\n".join(latex_output))

In [None]:




# # "Reproducing" information from Table 1/2 in Hase et al.:


# user_metrics_eval = df_phase_4.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study)) # TODO
# df_change = user_metrics_eval - user_metrics_pre
# df_change = df_change.rename(columns={"User Accuracy": "Change in User Accuracy"})

# user_acc_col = df_change["Change in User Accuracy"] # for convenience

# # use student t for low number of samples
# lower, upper = stats.t.interval(
# confidence=0.95, 
# df=len(user_acc_col)-1, # degrees of freedom = # samples - 1 for mean
#             loc=user_acc_col.mean(), 
#             scale=stats.sem(user_acc_col)
#             ) 

# p_val = ttest_ind(user_metrics_eval["User Accuracy"],user_metrics_pre["User Accuracy"]).pvalue

# k_alpha = krippendorff.alpha(reliability_data=df_phase_4.groupby(["user_id"]).apply(lambda df : user_responses.astype(int).to_list()).to_list())



# lower_b, upper_b = stats.bootstrap((user_acc_col,), np.mean, confidence_level=0.95,).confidence_interval

# # print results
# #  print("Mean change in acc",user_acc_col.mean())
# # print("CI for mean change: [{},{}]".format(lower,upper))

# ##    print("CI by bootstrap: [{},{}]".format(lower_b, upper_b))


# #  print("p=%.10f" % p_val, "significant (< 0.05)" if p_val < 0.05 else "NOT significant (> 0.05)")
# #   print("Krippendorff between users: {}".format(k_alpha))
# return p_val

In [None]:
# def evaluate_user_study(df_user_study, df_phase_2, df_phase_4):
#     # "Reproducing" information from Table 1/2 in Hase et al.:
#     user_metrics_pre = df_phase_2.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study))

#     user_metrics_eval = df_phase_4.groupby(["user_id"]).apply(lambda x : user_metrics(x,df_user_study)) # TODO
#     df_change = user_metrics_eval - user_metrics_pre
#     df_change = df_change.rename(columns={"User Accuracy": "Change in User Accuracy"})

#     user_acc_col = df_change["Change in User Accuracy"] # for convenience

#     # use student t for low number of samples
#     lower, upper = stats.t.interval(
#     confidence=0.95, 
#     df=len(user_acc_col)-1, # degrees of freedom = # samples - 1 for mean
#               loc=user_acc_col.mean(), 
#               scale=stats.sem(user_acc_col)
#               ) 
    
#     p_val = ttest_ind(user_metrics_eval["User Accuracy"],user_metrics_pre["User Accuracy"]).pvalue

#     k_alpha = krippendorff.alpha(reliability_data=df_phase_4.groupby(["user_id"]).apply(lambda df : user_responses.astype(int).to_list()).to_list())



#     lower_b, upper_b = stats.bootstrap((user_acc_col,), np.mean, confidence_level=0.95,).confidence_interval
    
#     # print results
#   #  print("Mean change in acc",user_acc_col.mean())
#    # print("CI for mean change: [{},{}]".format(lower,upper))

# ##    print("CI by bootstrap: [{},{}]".format(lower_b, upper_b))

    
#   #  print("p=%.10f" % p_val, "significant (< 0.05)" if p_val < 0.05 else "NOT significant (> 0.05)")
#  #   print("Krippendorff between users: {}".format(k_alpha))
#     return p_val
    

In [None]:
# def simulate_hase(

#         n_learn = 16,
#         n_eval = 16,
#         n_users = 10,

#         mu_got_it_right_pre=0.5,
#         sigma_got_it_right_pre=0.05,

#         mu_gain = 0.1,
#         sigma_gain = 0.1,


# ):
#     users = []
#     user_dist_without = lambda : np.clip(np.random.normal(mu_got_it_right_pre, sigma_got_it_right_pre, 1)[0], 0,1)
#     user_dist_gain = lambda : np.clip(np.random.normal(mu_gain, sigma_gain, 1)[0], -1,1)
#     for i in range(1, n_users+1):
#         p_without = user_dist_without()
#         p_with = np.clip(p_without + user_dist_gain(), 0,1)
#         users.append(("u_%s" % i, p_without ,p_with))
#     documents_learn_1_2 = ["l_%s" % i for i in range(1,n_learn+1)]
#     documents_pre_eval = ["e_%s" % i for i in range(1,n_eval+1)]

#     df_detector_output = mock_detector_responses(documents_pre_eval)

#     responses_pre, responses_eval = mock_user_responses(df_detector_output, documents_pre_eval, users)
#     df_pre =pd.DataFrame(responses_pre, columns=columns_experiment)
#     df_eval =pd.DataFrame(responses_eval, columns=columns_experiment)

#  #   print("# responses pre", len(responses_pre))
#   #  print("# responses pre per method", len(responses_pre)/3)
#    # print("Each user saw {} instances. ".format(2*n_learn + 2*n_eval) )
#    # print("Used {} unique documents. A set of {} in phase 1 and 3; and a set of {} in phase 2 and 4.".format(n_learn + n_eval,n_learn, n_eval))

# #    print("Results based on {} unique eval documents.".format(n_eval))

# #    print("Results based on {} datapoints.".format(len(responses_eval)))
#     p_value = evaluate_user_study(df_detector_output, df_pre, df_eval)
#     return p_value