This notebook evaluates the results from the user study. Set `FILL_DATABASE` to True to create a mock database. Set it to False to use the results from an existing database.

If mocking data, this notebook expects the server to run on `server_url` with a database with no entries, as set up by running `node setup.mjs`. 

In [None]:
# CSV that is loaded into `df_user_study` (the documents used in the user study).
USER_STUDY_CSV = "./results/user_study/selection.csv"
# SQLite database that holds the users and their responses.
SQLITE_DB = "../survey/db_final.db"
FILL_DATABASE = False # if True, data is mocked, THIS CALLS THE APIs

server_url = "http://localhost:3002" # server to mock data on


In [None]:

import krippendorff
from scipy.stats import ttest_rel
from scipy.stats import ttest_1samp
import numpy as np 
import scipy.stats as stats 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
import sqlite3
import pandas as pd
# Shared connection used by every query in this notebook.
# NOTE(review): the connection is not closed in the cells shown here.
connection = sqlite3.connect(SQLITE_DB)

# All rows of the users table; `user_df` is re-queried with filters in the evaluation section.
user_df = pd.read_sql_query("SELECT * FROM users", connection)

In [None]:
# Documents of the user study; the cells below read its "Detector" and "f(b)" columns.
df_user_study = pd.read_csv(USER_STUDY_CSV)

## Fill Database
If `FILL_DATABASE` is True, the cells below fill the database with mock responses by calling the server's API.

In [None]:
%%writefile run_user.py

def run_user(idx, user, url, df_user_study):
    """Simulate one study participant by driving the survey server's HTTP API.

    The simulated participant authenticates, submits the participant info,
    advances through the phases and submits mocked answers for phase 2
    (guesses without explanation), phase 3 (Likert ratings) and phase 4
    (guesses with explanation).

    Args:
        idx: Position of the user; every 20th user (``idx % 20 == 0``) answers
            "yes" to ``prefers_monochromatic_methods``, all others "no".
        user: Row/mapping that provides the user's ``"access_token"``.
        url: Base URL of the survey server.
        df_user_study: Study documents with the columns ``"Detector"`` and
            ``"f(b)"`` (the detector decision the participant has to guess).

    Returns:
        None. All effects happen on the server.
    """
    # Imports are local because this cell is written to run_user.py and the
    # function is executed in worker processes.
    import json

    import numpy as np
    import requests

    # Parameters of the simulated participant: probability of guessing the
    # detector's decision without explanations, and the gain with explanations.
    mu_got_it_right_pre = 0.5
    sigma_got_it_right_pre = 0.05
    mu_gain = 0.2
    sigma_gain = 0.1

    # Use a generator that is freshly seeded on every call. Worker processes
    # created by fork inherit a copy of NumPy's global RNG state, so the global
    # np.random.* functions would replay the same draws in every worker.
    rng = np.random.default_rng()

    def guess(detector_label, p):
        """Return `detector_label` with probability `p`, otherwise its negation."""
        return detector_label if rng.random() < p else not detector_label

    def user_dist_without():
        """Draw the probability of a correct guess without explanations, clipped to [0, 1]."""
        return np.clip(rng.normal(mu_got_it_right_pre, sigma_got_it_right_pre), 0, 1)

    def user_dist_gain():
        """Draw the change in that probability caused by explanations, clipped to [-1, 1]."""
        return np.clip(rng.normal(mu_gain, sigma_gain), -1, 1)

    res = requests.get(url+"/auth/"+ user["access_token"])

    print(res.text)
    auth_token = json.loads(res.text)
    headers = {'Content-Type': 'application/json', 'Authorization': "Bearer "+auth_token}

    requests.post(url+"/api/submitParticipantInfo", json={
        "has_seen_explanation_methods_before": "yes",
        "has_seen_OTHERS_before": "yes",
        "level_of_expertise": "is-researcher-explainability",
        "familiarity_with_chatgpt": "occasional-use",
        "prefers_monochromatic_methods": "yes" if idx % 20 == 0 else "no"
    }, headers=headers)

    # go to phase 2
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 0}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 1}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 2}, headers=headers)

    # The state is read only now because it contains the detector this user works with.
    res = requests.get(url+"/api/state", headers=headers)
    state = json.loads(res.text)

    # Documents (and detector decisions) differ by detector.
    df_user_documents = df_user_study.loc[df_user_study.groupby("Detector").groups[state["detector"]],:].reset_index(drop=True)

    # Phase 2: guesses without explanation.
    for doc_nr, row in df_user_documents.iterrows():
        p_without = user_dist_without()
        requests.post(url+"/api/submitPhase2", json={"ID": doc_nr, "label": guess(row["f(b)"], p_without)}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 3}, headers=headers)

    # Phase 3: three Likert ratings (1-5) per document.
    for doc_nr, row in df_user_documents.iterrows():
        json_ = {"lickert-q{}-{}".format(question_nr, doc_nr): str(rng.choice([1,2,3,4,5], p=[0.1,0.2,0.1,0.4,0.2])) for question_nr in range(1,4)}
        json_["document_nr"] = doc_nr
        requests.post(url+"/api/submitPhase3", json=json_, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 4}, headers=headers)

    # Phase 4: guesses with explanation.
    # NOTE(review): `p_without` is the value drawn for the LAST phase-2 document,
    # not a per-document value - confirm that this is intended.
    for doc_nr, row in df_user_documents.iterrows():
        p_with = np.clip(p_without + user_dist_gain(), 0,1)
        requests.post(url+"/api/submitPhase4", json={"ID": doc_nr, "label": guess(row["f(b)"], p_with)}, headers=headers)
    requests.post(url+"/api/completeCurrentPhase", json={"expected": 5}, headers=headers)


In [None]:
from tqdm import tqdm
from multiprocess import Pool
from run_user import run_user


# Simulate participants in parallel; skipped entirely unless FILL_DATABASE is True.
if FILL_DATABASE:
    max_pool = 10  # number of worker processes
    # One argument tuple per simulated participant; only rows of user_df with index < 27 are used.
    mock_user_data = [(idx, user, server_url, df_user_study) for idx, user in user_df.iterrows() if idx < 27]

    with Pool(max_pool) as p:
        # NOTE(review): starmap only returns once every task has finished, so tqdm
        # wraps an already complete list and cannot show live progress.
        pool_outputs = list(tqdm(p.starmap(run_user,mock_user_data),total=len(mock_user_data)))    
    print(pool_outputs)


## Evaluation
### Participant Info

In [None]:
# Participant info of all users with current_phase = 5, together with their detector and explainer.
# NOTE(review): unlike the later queries, users 76 and 45 are NOT excluded here - confirm this is intended.
participant_info = pd.read_sql_query("SELECT participant_info.*, users.detector, users.explainer FROM participant_info INNER JOIN users ON participant_info.user_id = users.ID where users.current_phase = 5", connection)
participant_info.head(1)

In [None]:
def stat(col_name):
    """Display the relative frequency of each answer in a `participant_info` column.

    Args:
        col_name: Name of the column of `participant_info` to summarise.

    The frequencies are normalised to sum to 1 and rounded to two decimals.
    """
    display(participant_info[col_name].value_counts(normalize=True).round(2))

In [None]:
# Share of participants per answer to "has_seen_explanation_methods_before".
stat("has_seen_explanation_methods_before")

In [None]:
# Share of participants per answer to "level_of_expertise".
stat("level_of_expertise")

In [None]:
# Share of participants per answer to "familiarity_with_chatgpt".
stat("familiarity_with_chatgpt")

In [None]:
# NOTE(review): this repeats an earlier cell with the same column - possibly a
# different participant_info column was meant here; confirm or remove.
stat("has_seen_explanation_methods_before")

### Forward Simulation

There are two more users for Guo+SHAP and Radford+Anchor:

In [None]:
# All users with current_phase = 5; nothing is excluded yet at this point.
user_df = pd.read_sql_query("SELECT * FROM users where current_phase = 5", connection)
# Number of users per (detector, explainer) combination.
user_df.groupby(["detector", "explainer"])["ID"].count()

Exclude the last two:

In [None]:
# Same query as before, but without the users with id 76 and 45 (the two surplus participants).
user_df = pd.read_sql_query("SELECT * FROM users where current_phase = 5 and users.id != 76 and users.id != 45", connection) # remove the two last 

In [None]:
# Number of users per (detector, explainer) combination after the exclusion.
user_df.groupby(["detector", "explainer"])["ID"].count()

In [None]:
# Preview of the users that enter the evaluation.
user_df.head()

In [None]:
def get_is_correct(df_user_responses, df_user_study):
    """Check, per document, whether the user guessed the detector's decision.

    Args:
        df_user_responses: Responses from the user study for one user; the
            detector is taken from the first row. Needs the columns
            "detector", "document_nr", "timestamp" and "label".
        df_user_study: Documents for the user study with the columns
            "Detector" and "f(b)".

    Returns:
        Boolean Series indexed by document number: True where the most recent
        response for a document equals the detector's decision.
    """
    assigned_detector = df_user_responses["detector"].iloc[0]

    # Decisions differ by detector, so only this detector's documents are used.
    rows_of_detector = df_user_study.groupby("Detector").groups[assigned_detector]
    documents = df_user_study.loc[rows_of_detector, :].reset_index(drop=True)
    predictions = documents["f(b)"].astype(bool)  # set B

    # A document can have several responses: only the most recent one counts.
    latest_rows = df_user_responses.groupby("document_nr")["timestamp"].idxmax()
    answers = df_user_responses.loc[latest_rows].set_index("document_nr")["label"].astype(bool)

    return answers == predictions

In [None]:
# Explainer and detector of every user in `user_df`, indexed by user_id so it can be joined below.
u = user_df.set_index("ID").rename_axis("user_id")[["explainer", "detector"]] # rename for join

# get from DB
# NOTE(review): these two queries are not restricted to the users in `u`;
# the restriction only happens later through the left joins `u.join(...)`.
df_phase_2 = pd.read_sql_query("SELECT responses_phase_2.*, users.detector, users.explainer FROM responses_phase_2 INNER JOIN users ON responses_phase_2.user_id = users.ID", connection)
df_phase_4 = pd.read_sql_query("SELECT responses_phase_4.*, users.detector, users.explainer FROM responses_phase_4 INNER JOIN users ON responses_phase_4.user_id = users.ID", connection)

# Per user: was the detector's decision guessed correctly (see get_is_correct),
# once for phase 4 and once for phase 2.
is_correct_phase_4 = df_phase_4.groupby(["user_id"]).apply(lambda df_user_responses : get_is_correct(df_user_responses,df_user_study))
is_correct_phase_2 = df_phase_2.groupby(["user_id"]).apply(lambda df_user_responses : get_is_correct(df_user_responses,df_user_study))

In [None]:
from statsmodels.stats.contingency_tables import mcnemar, SquareTable 

In [None]:
# Collects the LaTeX tables produced by the cells below; written to disk in the last cell.
# NOTE(review): re-running a table cell without re-running this one appends a duplicate table.
latex_output = []

In [None]:
# method level
# For every explainer: accuracy of the users' guesses in phase 2 and phase 4,
# the relative change between them, and McNemar's exact test on the paired guesses.

# The joins do not depend on the explainer, so they are computed once instead
# of several times per loop iteration.
phase_2_by_user = u.join(is_correct_phase_2)
phase_4_by_user = u.join(is_correct_phase_4)

results = []
for explainer, _ in u.groupby("explainer"):
    # get contingency table by explainer
    phase_2 = phase_2_by_user.loc[phase_2_by_user["explainer"] == explainer].set_index(["explainer", "detector"])
    phase_4 = phase_4_by_user.loc[phase_4_by_user["explainer"] == explainer].set_index(["explainer", "detector"])

    # melt() flattens the per-user/per-document frames into one column each;
    # both frames share the row order of `u`, so the two columns stay paired.
    # Rows of the table: phase 2, columns: phase 4.
    contingency_table = SquareTable.from_data(pd.concat([phase_2.melt()["value"], phase_4.melt()["value"]], axis=1))

    marginal_row_prob, marginal_col_prob = contingency_table.marginal_probabilities

    # Share of correct guesses (True) in each phase.
    user_accuracy_4 = marginal_col_prob[True]
    user_accuracy_2 = marginal_row_prob[True]

    m = mcnemar(contingency_table.table, exact=True) # use binominal distribution, p value already multiplied by 2

    results.append((explainer.replace("_Explainer", ""),
                    user_accuracy_2,
                    user_accuracy_4,
                    ((user_accuracy_4 / user_accuracy_2) - 1.0),  # relative change
                    m.pvalue
                    ))

# Styled result table, sorted by the relative change (largest gain first).
df = pd.DataFrame(results, columns=[
    "",
    "User Acc without",
    "User Acc with",
    "Change",
    "p"]).sort_values(by=["Change"], ascending=False).style.format({
    "User Acc without": "{:.3f}".format,
    "User Acc with": "{:.3f}".format,
    "Change": "{:.2%}".format,
    "p": "{:.3f}".format,
}).hide(axis="index")
latex_output.append(df.to_latex(environment="table",
                                convert_css=True,
                                clines="all;data",
                                hrules=True,
                                caption="Forward simulation experiment by method",
                                label="user-study-per-method"))
df

In [None]:
# group level, (redundant but with different groupby)
# For every (explainer, detector) group: accuracy of the users' guesses in
# phase 2 and phase 4, the relative change, and McNemar's exact test.

# The joins do not depend on the group, so they are computed once instead of
# several times per loop iteration.
phase_2_by_user = u.join(is_correct_phase_2)
phase_4_by_user = u.join(is_correct_phase_4)

results = []
for (explainer, detector), _ in u.groupby(["explainer", "detector"]):
    # get contingency table by group
    phase_2 = phase_2_by_user.loc[(phase_2_by_user["explainer"] == explainer) & (phase_2_by_user["detector"] == detector)].set_index(["explainer", "detector"])
    # The detector mask is now built from the phase-4 frame itself (it was
    # previously taken from the phase-2 join, a copy-paste slip).
    phase_4 = phase_4_by_user.loc[(phase_4_by_user["explainer"] == explainer) & (phase_4_by_user["detector"] == detector)].set_index(["explainer", "detector"])

    # melt() flattens the per-user/per-document frames into one column each;
    # both frames share the row order of `u`, so the two columns stay paired.
    # Rows of the table: phase 2, columns: phase 4.
    contingency_table = SquareTable.from_data(pd.concat([phase_2.melt()["value"], phase_4.melt()["value"]], axis=1))

    marginal_row_prob, marginal_col_prob = contingency_table.marginal_probabilities

    # Share of correct guesses (True) in each phase.
    user_accuracy_4 = marginal_col_prob[True]
    user_accuracy_2 = marginal_row_prob[True]

    m = mcnemar(contingency_table.table, exact=True) # use binominal distribution, p value already multiplied by 2

    results.append((explainer.replace("_Explainer", ""), detector.replace("Detector", ""),
                    user_accuracy_2,
                    user_accuracy_4,
                    ((user_accuracy_4 / user_accuracy_2) - 1.0),  # relative change
                    m.pvalue
                    ))

# Styled result table, sorted by the relative change (largest gain first).
df = pd.DataFrame(results, columns=[
    "",
    "",
    "User Acc without",
    "User Acc with",
    "Change",
    "p"]).sort_values(by=["Change"], ascending=False).style.format({
    "User Acc without": "{:.3f}".format,
    "User Acc with": "{:.3f}".format,
    "Change": "{:.2%}".format,
    "p": "{:.3f}".format,
}).hide(axis="index")
latex_output.append(df.to_latex(environment="table",
                                convert_css=True,
                                clines="all;data",
                                hrules=True,
                                caption="Forward simulation experiment by group",
                                label="user-study-per-group"))
df

### Rating Task

In [None]:
# Phase-3 (rating task) responses of all users with current_phase = 5,
# together with their detector and explainer.
df_phase_3 = pd.read_sql_query("SELECT responses_phase_3.*, users.detector, users.explainer FROM responses_phase_3 INNER JOIN users ON responses_phase_3.user_id = users.ID where users.current_phase = 5 and users.id != 76 and users.id != 45", connection)
# remove the two additional participants (see above)

In [None]:
# only keep most recent responses
# For every (user, question, document) the row with the largest timestamp is kept;
# the result is indexed by (user_id, document_nr, question_nr).
user_responses = df_phase_3.loc[df_phase_3.groupby(["user_id", "question_nr", "document_nr"])["timestamp"].idxmax()].set_index(["user_id", "document_nr", "question_nr"]).drop(["timestamp", "ID"], axis=1)
user_responses

In [None]:
# Sanity check on the number of ratings.
# presumably 36 participants x 18 documents x 3 questions - TODO confirm
assert len(user_responses) == 36*18*3

In [None]:
# Column means per explainer.
# NOTE(review): `question_nr` is a regular column at this point, so it is averaged as well - confirm that is wanted.
user_responses.reset_index().set_index(["detector", "user_id", "document_nr"]).groupby(["explainer"]).mean()

In [None]:
def highlight_significant(row, props=''):
    """Build per-cell CSS for one table row, bolding the first cell if significant.

    Args:
        row: Row of a results table; must provide a "p value" entry.
        props: Not used by this function.

    Returns:
        List with one CSS string per entry of `row`. The first entry is
        'font-weight: bold' when the row's p value is at most 0.05; all other
        entries are empty strings.
    """
    n_cells = len(row)
    is_significant = row["p value"] <= 0.05
    return [
        'font-weight: bold' if (position == 0 and is_significant) else ''
        for position in range(n_cells)
    ]

In [None]:
import seaborn as sns

In [None]:
def get_aggregate_results_lickert(groupby, label, caption):
    """Aggregate the ratings per group, plot them and register a LaTeX table.

    Reads the notebook-level `user_responses` frame and appends the rendered
    table to the notebook-level `latex_output` list.

    Args:
        groupby: Names to group `user_responses` by, e.g.
            ["detector", "explainer", "question_nr"]. "question_nr" is shown
            as "Question"; it no longer has to be the last entry.
        label: LaTeX label of the table.
        caption: LaTeX caption of the table; also used as the plot title.

    Returns:
        Styler holding the mean of the "label" column per group, formatted
        with two decimals.
    """
    groupby = list(groupby)
    df_aggregate_results = user_responses.groupby(groupby)["label"].mean().to_frame().reset_index()

    # Shorten the names for presentation; each column is only touched if it is a group key.
    if "explainer" in groupby:
        df_aggregate_results["explainer"] = df_aggregate_results["explainer"].str.replace("_Explainer", "")
    if "detector" in groupby:
        df_aggregate_results["detector"] = df_aggregate_results["detector"].str.replace("Detector", "")

    df_aggregate_results = df_aggregate_results.rename({"question_nr": "Question"}, axis=1)
    # Same index as before for the existing calls (question_nr last), but
    # independent of the position of "question_nr" in `groupby`.
    index_columns = ["Question" if column == "question_nr" else column for column in groupby]
    df_aggregate_results = df_aggregate_results.set_index(index_columns)

    # A figure has to stand on its own: give it a title and a y-axis label.
    ax = df_aggregate_results.plot.bar(title=caption)
    ax.set_ylabel("Mean rating")

    result = df_aggregate_results.style.format(precision=2).format_index(escape="latex", axis=0)
    latex_output.append(result.to_latex(environment="table",
                                        convert_css=True,
                                        clines="all;data",
                                        hrules=True,
                                        caption=caption,
                                        label=label))

    return result


## Evaluation

In [None]:
# Mean rating per detector, explainer and question.
get_aggregate_results_lickert(["detector", "explainer", "question_nr"], "rating-group","Rating task at the group level")

In [None]:
# Mean rating per explainer and question.
# NOTE(review): the LaTeX label prefix ("lickert-") differs from the group-level table ("rating-") - confirm the references in the paper.
get_aggregate_results_lickert([ "explainer", "question_nr"], "lickert-explainer","Rating task on the method level")

In [None]:
# Write all collected tables into one file. '%' starts a comment in LaTeX, so
# every '%' (e.g. from the percentage-formatted "Change" column) is escaped.
# A raw string is used because "\%" is an invalid escape sequence in a normal
# string literal and triggers a warning on recent Python versions.
with open("figures/tables_user_study.tex", "w", encoding="UTF-8") as text_file:
    text_file.write("\n".join(latex_output).replace("%", r"\%"))