# Evaluating the Human Study

## Settings

In [None]:
!pip install -q --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib torchmetrics

In [None]:
import ast
import json
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torchmetrics
from google.oauth2 import service_account
from googleapiclient.discovery import build
from matplotlib import pyplot as plt
from torchmetrics import Accuracy, F1Score, MetricCollection, Precision, Recall
from tqdm import tqdm


## Study

**Note:** Set `mode` (first line of the following cell) to `binary`, `multi`, or `open` to run this notebook for the binary, multi-choice, or open-ended study, respectively.

In [None]:
# Study variant to evaluate: "binary", "multi", or "open".
mode = "open"
study_file = Path(f"../../generated-dataset-30_000/human-study/{mode}/study_forms_{mode}.csv")

# All other artefacts live next to the forms file and differ only in their file-name prefix.
ground_truth_file, form_results, final_results, identification_results = (
    study_file.with_name(study_file.name.replace("study_forms_", prefix))
    for prefix in ("study_data_", "form_responses_", "final_results_", "identification_results_")
)
form_results

In [None]:
# Load the form layout. `study_entry_ids` is stored as JSON text per row; it is parsed,
# expanded to one row per contained entry, and then used as the index.
study_setup = (
    pd.read_csv(study_file)
    .assign(study_entry_ids=lambda frame: frame["study_entry_ids"].apply(json.loads))
    .explode("study_entry_ids")
    .set_index("study_entry_ids")
)
study_setup

In [None]:
# Read-only scopes: the notebook only reads form definitions and their responses.
SCOPES = [
    "https://www.googleapis.com/auth/forms.body.readonly",
    "https://www.googleapis.com/auth/forms.responses.readonly",
]
SERVICE_ACCOUNT_FILE = "../../research-430307-ad3438ad46d0.json"

# Authenticate with the service-account key file and create the Google Forms API client.
credentials = service_account.Credentials.from_service_account_file(
    filename=SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)
forms_service = build(serviceName="forms", version="v1", credentials=credentials)

### For binary and multi

In [None]:
# Download the answers of every form of the binary / multi-choice study and store them
# as one table (one row per participant answer) in `form_results`.
collected_results = {}

for form_id in tqdm(study_setup["form_id"].unique()):
    form = forms_service.forms().get(formId=form_id).execute()
    responses = forms_service.forms().responses().list(formId=form_id).execute()

    # `.get` also covers a payload that has no "responses" key at all.
    submissions = (responses or {}).get("responses")
    if not submissions:
        print(f"Form {form_id} has no responses")
        continue

    # A "Question #<n>" title item is directly followed by the item holding the actual
    # question; map that question's ID to <n>. Walking adjacent pairs avoids the
    # quadratic (and first-match-only) `list.index` lookup.
    items = form["items"]
    question_mapping = {}
    for title_item, next_item in zip(items, items[1:]):
        if "title" in title_item and "Question #" in title_item["title"]:
            question_number = int(title_item["title"].split("#")[-1])
            if "questionItem" in next_item:
                question_id = next_item["questionItem"]["question"]["questionId"]
                question_mapping[question_id] = question_number

    # Only the first submission of a form is evaluated.
    # Question "00999999" carries the participant's Prolific ID and is marked with -1.
    results = [
        {
            "study_entry_ids": (
                -1 if "00999999" == elem["questionId"] else question_mapping[elem["questionId"]]
            ),
            "answer": elem["textAnswers"]["answers"][0]["value"],
        }
        for elem in submissions[0]["answers"].values()
    ]
    study_entry_df = pd.DataFrame(results)

    is_prolific_row = study_entry_df["study_entry_ids"] == -1
    # `.item()` raises unless there is exactly one Prolific ID answer.
    prolific_id = study_entry_df.loc[is_prolific_row, "answer"].item()

    # Copy the filtered rows so the in-place arithmetic below does not operate on a
    # slice of the original frame (SettingWithCopyWarning).
    study_entry_df = study_entry_df.loc[~is_prolific_row].copy()
    study_entry_df["study_entry_ids"] -= 1  # Undo the offset for nice presentation
    study_entry_df = study_entry_df.set_index("study_entry_ids")

    collected_results[prolific_id] = study_entry_df

all_results = pd.concat(
    collected_results.values(), keys=collected_results.keys(), names=["prolific_id"]
).reset_index(level=0)
all_results.to_csv(form_results)

### For open

In [None]:
# Download the answers of every form of the open-ended study. Each study entry has a
# grid of action-identification questions followed by one free-text answer; the two
# kinds of answers are written to `identification_results` and `form_results`.
collected_results = {}
collected_results_identification = {}

for form_id in tqdm(study_setup["form_id"].unique()):
    form = forms_service.forms().get(formId=form_id).execute()
    responses = forms_service.forms().responses().list(formId=form_id).execute()

    # `.get` also covers a payload that has no "responses" key at all.
    submissions = (responses or {}).get("responses")
    if not submissions:
        print(f"Form {form_id} has no responses")
        continue

    # A "Question #<n>" title item is followed by the identification grid and then by
    # the free-text question. Walking adjacent triples avoids the quadratic
    # `list.index` lookup and the reuse of the outer loop variable in the inner loop.
    items = form["items"]
    questionid_to_answer = {}
    questionid_to_identification_answer = {}
    for title_item, item_identify, item_answer in zip(items, items[1:], items[2:]):
        if not ("title" in title_item and "Question #" in title_item["title"]):
            continue
        question_number = int(title_item["title"].split("#")[-1])
        if "questionGroupItem" in item_identify:
            for action_index, question in enumerate(item_identify["questionGroupItem"]["questions"]):
                questionid_to_identification_answer[question["questionId"]] = (
                    question_number,
                    action_index,
                )
        if "questionItem" in item_answer:
            question_id = item_answer["questionItem"]["question"]["questionId"]
            questionid_to_answer[question_id] = question_number

    # Reset per form: otherwise a submission without the Prolific ID answer would be
    # stored under (and overwrite) the previous participant.
    prolific_id = None
    results = []
    results_identification = []
    # Only the first submission of a form is evaluated.
    for elem in submissions[0]["answers"].values():
        q_id = elem["questionId"]
        value = elem["textAnswers"]["answers"][0]["value"]
        if q_id == "00999999":  # question that asks for the Prolific ID
            prolific_id = value
        elif q_id in questionid_to_answer:
            results.append(
                {
                    "study_entry_ids": questionid_to_answer[q_id],
                    "answer": value,
                }
            )
        elif q_id in questionid_to_identification_answer:
            question_number, action_index = questionid_to_identification_answer[q_id]
            results_identification.append(
                {
                    "study_entry_ids": question_number,
                    "action_index": action_index,
                    "answer": value,
                }
            )
        else:
            raise ValueError(f"Question ID {q_id} not found in the mapping")

    if prolific_id is None:
        raise ValueError(f"Form {form_id} has a response without a Prolific ID answer")

    # Explicit columns keep the frames well-formed even when a list is empty.
    study_entry_df = pd.DataFrame(results, columns=["study_entry_ids", "answer"])
    study_entry_df["study_entry_ids"] -= 1  # Undo the offset for nice presentation
    collected_results[prolific_id] = study_entry_df.set_index("study_entry_ids")

    id_study_entry_df = pd.DataFrame(
        results_identification, columns=["study_entry_ids", "action_index", "answer"]
    )
    id_study_entry_df["study_entry_ids"] -= 1  # Undo the offset for nice presentation
    collected_results_identification[prolific_id] = id_study_entry_df.set_index("study_entry_ids")

all_results = pd.concat(
    collected_results.values(), keys=collected_results.keys(), names=["prolific_id"]
).reset_index(level=0)
all_results = all_results.sort_values(by=["study_entry_ids"])
all_results.to_csv(form_results)

all_results_identification = pd.concat(
    collected_results_identification.values(),
    keys=collected_results_identification.keys(),
    names=["prolific_id"],
).reset_index(level=0)
all_results_identification = all_results_identification.sort_values(by=["study_entry_ids", "action_index"])
all_results_identification.to_csv(identification_results)

In [None]:
# Reload the collected answers from the CSV written by the collection cell.
all_results = pd.read_csv(form_results, index_col=0)

# Print every participant's answers for a quick manual inspection.
# f-strings instead of `+`: read_csv may parse an all-digit ID or answer as a number
# (or an empty answer as NaN), which would make string concatenation raise TypeError.
for pid, group in all_results.groupby("prolific_id"):
    print(f"{pid}:")
    for answer in group["answer"]:
        print(f"    {answer}")

all_results

In [None]:
# Load the merged results CSV (`final_results`). That file is written by a later cell
# (`df.to_csv(final_results)`), so it must already exist when this cell is run.
all_results_open = pd.read_csv(final_results, index_col=0)
all_results_open

In [None]:
# Show the responder URL of the first row so that one form can be opened and inspected.
print(all_results_open["responder_uri"].iloc[0])

In [None]:
# Keep only study entries that have a collected answer (inner join on the shared index)
# and make explicit that the `answer` column holds the participant's answer.
participant_results = study_setup.join(all_results, how="inner")
participant_results = participant_results.rename(columns={"answer": "participant_answer"})
participant_results

In [None]:
# Load the study data file (`study_data_<mode>.csv`), using its first column as the index.
# Later cells rely on it having `question`, `answer`, and `sample_id` columns.
ground_truth = pd.read_csv(ground_truth_file, index_col=0)
ground_truth

In [None]:
# Attach the study data to every participant answer, matching on `study_entry_ids`.
# The inner join drops answers that have no matching row in `ground_truth`.
df = participant_results.join(ground_truth, on="study_entry_ids", how="inner")
df

In [None]:
# Persist the merged table so the evaluation below can be re-run from disk.
df.to_csv(final_results)

In [None]:
# Reload the merged table from `final_results`, using its first column as the index.
df = pd.read_csv(final_results, index_col=0)

In [None]:
# Number of distinct forms (responder URIs) that contributed answers.
df["responder_uri"].nunique()

In [None]:
# Number of rows per form (responder URI), smallest first.
df["responder_uri"].value_counts().sort_values()

In [None]:
# Number of distinct participants (Prolific IDs) that contributed answers.
df["prolific_id"].nunique()

In [None]:
# Number of rows per participant (Prolific ID).
df["prolific_id"].value_counts()

In [None]:
# Macro-averaged classification metrics of the participants' answers against the
# ground truth. Only meaningful for the binary ("yes"/"no") and multi-choice
# ("a"/"b"/"c") studies, whose answers are covered by `to_idx`.
num_classes = len(df["answer"].value_counts())
task = "multiclass"
avg = "macro"

metrics = MetricCollection(
    {
        "accuracy": Accuracy(num_classes=num_classes, task=task, average=avg),
        "precision": Precision(num_classes=num_classes, task=task, average=avg),
        "recall": Recall(num_classes=num_classes, task=task, average=avg),
        "f1": F1Score(num_classes=num_classes, task=task, average=avg),
    }
)

to_idx = {
    "yes": 1,
    "no": 0,
    "a": 0,
    "b": 1,
    "c": 2,
}

# `answer` comes from the ground-truth table, `participant_answer` from the forms.
target_idx = df["answer"].str.lower().map(to_idx)
pred_idx = df["participant_answer"].str.lower().map(to_idx)

# Answers outside `to_idx` map to NaN; fail loudly instead of feeding NaN to the metrics.
unmapped = target_idx.isna() | pred_idx.isna()
if unmapped.any():
    raise ValueError(
        f"{int(unmapped.sum())} rows have answers that are not covered by to_idx; "
        "this cell only supports the binary and multi-choice studies"
    )

# torchmetrics expects (preds, target): the participants' answers are the predictions.
# Passing them the other way round swaps precision and recall.
metrics(
    torch.tensor(pred_idx.to_numpy(dtype=np.int64)),
    torch.tensor(target_idx.to_numpy(dtype=np.int64)),
)

#### Can humans identify the actions?

In [None]:
# Load the action-identification answers; the participant's choice is renamed so it
# cannot be confused with the `answer` column of the study data.
df_id_data = pd.read_csv(identification_results, index_col=0)
df_id_data = df_id_data.rename(columns={"answer": "action_classification"})
df_id_data

In [None]:
# Load `test_data.csv` from the dataset root (two directories above the study file's
# folder) and keep only the first row per `sample_id`, which then becomes the index.
all_test_data = (
    pd.read_csv(study_file.parents[2] / "test_data.csv", index_col=0)
    .reset_index()
    .drop_duplicates("sample_id")
    .set_index("sample_id")
)
all_test_data

In [None]:
# Build one row per (sample_id, action_index) that pairs the participant's action label
# (`action_classification`) with the sample's `action_sequence` from the test data.
# The study data's `question`/`answer` columns are dropped before joining the test data.
df_id = (
    df_id_data.reset_index()
    .join(ground_truth, on="study_entry_ids", how="inner")
    .set_index("sample_id")
    .drop(columns=["question", "answer"])
    .join(all_test_data, on="sample_id", how="inner")
    .reset_index()
    .set_index(["sample_id", "action_index"])[["action_classification", "action_sequence"]]
)
# `action_sequence` is read from CSV as text; parse the literal back into a Python sequence.
df_id["action_sequence"] = df_id["action_sequence"].apply(ast.literal_eval)
# `row.name` is the (sample_id, action_index) index tuple, so `row.name[1]` selects the
# single action of the sequence that this identification answer refers to.
df_id["action_sequence"] = df_id.apply(lambda row: row["action_sequence"][row.name[1]], axis=1)
df_id

In [None]:
# Assign a class index to every distinct action in `action_sequence`, in order of first appearance.
distinct_actions = df_id["action_sequence"].explode().unique()
action_to_int = dict(zip(distinct_actions, range(len(distinct_actions))))
action_to_int

In [None]:
# Exclude answers where the participant chose "<not recognized>" and report how many were dropped.
is_recognized = df_id["action_classification"] != "<not recognized>"
before = len(df_id)
df_id = df_id[is_recognized]
after = len(df_id)
f"Removed {before - after} entries with <not recognized> out of {before} ({(before - after) / before:.2%})"

In [None]:
def get_metrics(df: pd.DataFrame) -> dict[str, float]:
    """Score the participants' action labels against the expected actions.

    Args:
        df: Frame with an ``action_classification`` column (participant label) and an
            ``action_sequence`` column (expected action). Both are mapped to class
            indices through the module-level ``action_to_int``.

    Returns:
        Plain floats keyed by ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and ``"F1"``.

    Note:
        Only F1 sets ``average="micro"`` explicitly; the other metrics use the
        torchmetrics default averaging (presumably also micro - TODO confirm).
    """
    num_classes = len(action_to_int)
    collection = MetricCollection(
        {
            "Accuracy": torchmetrics.Accuracy(task="multiclass", num_classes=num_classes),
            "Precision": torchmetrics.Precision(task="multiclass", num_classes=num_classes),
            "Recall": torchmetrics.Recall(task="multiclass", num_classes=num_classes),
            "F1": torchmetrics.F1Score(task="multiclass", num_classes=num_classes, average="micro"),
        }
    )

    # The participant labels are the predictions, the expected actions the targets.
    predicted = torch.from_numpy(df["action_classification"].map(action_to_int).to_numpy(np.int32))
    expected = torch.from_numpy(df["action_sequence"].map(action_to_int).to_numpy(np.int32))

    scores = collection(predicted, expected)
    return {name: score.item() for name, score in scores.items()}

In [None]:
# Overall identification metrics across all remaining answers.
get_metrics(df_id)


In [None]:
# Metrics and sample count per expected action: one `get_metrics` call per group.
res = {
    correct_action: {**get_metrics(data), "count": data.shape[0]}
    for correct_action, data in df_id.groupby("action_sequence")
}

# One row per action; the scores are converted to percent, the count stays an integer.
df_individual_actions = pd.DataFrame(res).T.sort_index().astype({"count": int})
score_columns = ["Accuracy", "Precision", "Recall", "F1"]
df_individual_actions[score_columns] *= 100
df_individual_actions

In [None]:
sns.set_theme(style="whitegrid")
sns.set_context("talk", font_scale=1)

# Actions ordered from highest to lowest F1; one long label is abbreviated.
plot_data = df_individual_actions.reset_index().sort_values(by="F1", ascending=False)
plot_data = plot_data.replace(
    {"index": {"picking something up with both hands": "picking [...] both hands"}}
)
bar_colors = sns.color_palette("crest_r", n_colors=len(df_individual_actions))

plt.figure(figsize=(10, 6))
sns.barplot(data=plot_data, x="F1", y="index", hue="F1", palette=bar_colors, legend=False)
plt.xlabel("F1 Score [%]")
plt.ylabel(None)
plt.xlim(0, 100)
sns.despine(left=True, bottom=True)

# Save next to the notebook, one file per study mode, before showing the figure.
plt.savefig(f"action_classification_{mode}.pdf", bbox_inches="tight", pad_inches=0)

plt.show()

In [None]:
# Which labels (in percent) did participants give when the expected action was "catching a ball"?
is_catching = df_id["action_sequence"] == "catching a ball"
df_id.loc[is_catching, "action_classification"].value_counts(normalize=True) * 100

## Collect all participant IDs that contributed

In [None]:
def path_to_responses(mode):
    """Return the path of the collected form responses CSV of the given study mode."""
    return Path(f"../../generated-dataset-30_000/human-study/{mode}/form_responses_{mode}.csv")


# Distinct Prolific IDs per study mode; the study mode becomes the outer index level.
participant_ids = pd.concat(
    {
        study: pd.read_csv(path_to_responses(study), index_col=0)["prolific_id"].drop_duplicates()
        for study in ("binary", "multi", "open")
    }
)

# Number of participants per study mode.
participant_ids.groupby(level=0).count()

## Demographics

In [None]:
# Combine the Prolific demographics exports of all three studies.
dems_binary = pd.read_csv(
    "../../generated-dataset-30_000/human-study/binary/prolific_demographics_export.csv"
)
dems_multi = pd.read_csv("../../generated-dataset-30_000/human-study/multi/prolific_demographics_export.csv")
dems_open = pd.read_csv("../../generated-dataset-30_000/human-study/open/prolific_demographics_export.csv")
dems = pd.concat([dems_binary, dems_multi, dems_open])

# dems = dems[dems["Age"] != "CONSENT_REVOKED"]
# dems = dems[dems["Status"].isin(["AWAITING REVIEW", "APPROVED"])]

# Keep only participants whose answers were actually collected. `.copy()` makes the
# result an independent frame, so the `Age` assignment below does not write to a
# slice of the concatenated frame (SettingWithCopyWarning).
dems = dems[dems["Participant id"].isin(participant_ids)].copy()
# NOTE(review): astype(int) raises if any remaining Age is a non-numeric placeholder
# (the disabled filter above suggests "CONSENT_REVOKED" can occur) - confirm.
dems["Age"] = dems["Age"].astype(int)
dems

In [None]:
# Print a summary of the filtered demographics table.
dems.info()

In [None]:
# Global seaborn settings (context, style, palette) applied before drawing the demographics figure.
sns.set_context("talk")
sns.set_style("whitegrid")
sns.set_palette("crest")

In [None]:
def _collapse_small_shares(shares, limit, other_label="Other"):
    """Merge all shares below `limit` into one trailing `other_label` entry.

    The merged entry is kept even when its total is below `limit`, so the shares still
    sum to what they summed to before; it is only omitted when nothing was merged.
    """
    small_total = shares[shares < limit].sum()
    kept = shares[shares >= limit].copy()
    if small_total > 0:
        kept[other_label] = small_total
    return kept


def _draw_stacked_bar(ax, bar_label, row, shares, fontsize, rotate_below=None):
    """Draw one horizontal stacked bar with a centred white label per segment.

    `row` is the y position of the bar on the categorical axis. Segments whose share
    is at most `rotate_below` get a vertical label; `None` keeps all labels horizontal.
    """
    palette = sns.color_palette("crest", n_colors=len(shares))
    left = 0
    for color, (name, share) in zip(palette, shares.items()):
        ax.barh(bar_label, share, left=left, label=name, color=color)
        is_narrow = rotate_below is not None and share <= rotate_below
        ax.text(
            left + share / 2,
            row,
            name,
            ha="center",
            va="center",
            color="white",
            fontsize=fontsize,
            rotation=90 if is_narrow else 0,
        )
        left += share


fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 4), squeeze=False)
axes = axes[0, :]

# Abbreviate country names so they fit into the bar segments.
dems_plot = dems.replace(
    {
        "Country of residence": {
            "United Kingdom": "UK",
            "United States": "US",
            "Canada": "CA",
            "Germany": "DE",
            "Portugal": "PT",
            "Mexico": "MX",
            "Spain": "ES",
            "New Zealand": "NZ",
        }
    }
)

# Percentage shares per category; expired ethnicity entries are excluded.
eth = dems_plot["Ethnicity simplified"]
eth = eth[eth != "DATA_EXPIRED"]
ethnicity_counts = eth.value_counts(normalize=True) * 100
sex_counts = dems_plot["Sex"].value_counts(normalize=True) * 100
origin_counts = dems_plot["Country of residence"].value_counts(normalize=True) * 100

# Aggregate the smallest ones in origin_counts into "Other". Previously "Other" was
# added before filtering, so it was dropped again whenever its total was below the
# limit and the bar no longer covered 100%.
limit = 5
origin_counts = _collapse_small_shares(origin_counts, limit)

fontsize = 14

# Left panel: one stacked bar per attribute; the list order defines the row positions.
stacked_bars = [
    ("Simplified\nEthnicity", ethnicity_counts, 8),
    ("Country of\nResidence", origin_counts, None),
    ("Sex", sex_counts, None),
]
for row, (bar_label, shares, rotate_below) in enumerate(stacked_bars):
    _draw_stacked_bar(axes[0], bar_label, row, shares, fontsize, rotate_below)

# Build the tick labels from the tick values instead of reading them back from the
# tick artists, whose text depends on whether the figure has been drawn yet.
x_ticks = [0, 20, 40, 60, 80, 100]
axes[0].set_xlim(0, 100)
axes[0].set_xticks(x_ticks)
axes[0].set_xticklabels([f"{tick}%" for tick in x_ticks])
sns.despine(ax=axes[0], left=True, bottom=True)

# Right panel: age distribution in 5-year bins.
sns.histplot(
    x=dems["Age"],
    stat="percent",
    ax=axes[1],
    element="bars",
    bins=[15, 20, 25, 30, 35, 40, 45, 50, 55],
)
y_ticks = [5, 10, 15, 20, 25, 30]
axes[1].xaxis.grid(False)
axes[1].set_ylabel(None)
axes[1].set_yticks(y_ticks)
axes[1].set_yticklabels([f"{tick}%" for tick in y_ticks])
sns.despine(ax=axes[1], left=True)

plt.tight_layout(w_pad=1, pad=0)

# Saved into the `human-study` directory (one level above the per-mode folder).
plt.savefig(study_file.absolute().parents[1] / "human_evaluation-demographics.pdf")
plt.show()