In [1]:
import pandas as pd
import scipy.stats as st
import os

In [2]:
# Column -> dtype mapping handed to pd.read_csv via its `dtype` argument.
# Nullable pandas dtypes ("Int64", "string") are used throughout; the three
# timestamp columns are declared as "str" here and parsed as dates on load.
type_dict = {
    # submission / participant metadata
    "id": "Int64",
    "submitdate": "str",
    "lastpage": "string",
    "startlanguage": "string",
    "seed": "string",
    "token": "string",
    "startdate": "str",
    "datestamp": "str",
    "refurl": "string",
    "group": "string",
    "tasks": "Int64",
    "UUID": "string",
    "age": "string",
    "gender_mf": "string",
    "gender_other": "string",
    "education": "string",
    # questionnaire items, named "<construct>[SQ001]" .. "<construct>[SQ0NN]",
    # all stored as nullable integers
    **{
        f"{construct}[SQ{item:03d}]": "Int64"
        for construct, n_items in (
            ("usefulness", 6),
            ("ease_of_use", 6),
            ("self_efficacy", 10),
            ("load", 6),
        )
        for item in range(1, n_items + 1)
    },
}
# Columns that are split off into the metadata frame instead of the answer frame.
meta_cols = [
    "submitdate", "startdate", "datestamp",  # timestamps (parsed as dates on load)
    "tasks", "age", "gender_mf", "education", "UUID",  # remaining non-questionnaire columns
]

# Maps each questionnaire column name (same keys as the item entries in type_dict)
# to the text shown for that item. It is later passed to DataFrame.rename(columns=...)
# so that result tables carry readable labels instead of "load[SQ00x]" codes.
# NOTE(review): the item texts are kept verbatim; a few look like typos
# ("enable me", "in my Job") — confirm against the survey instrument before changing,
# since these strings double as column labels.
question_dict = {
    # usefulness items
    "usefulness[SQ001]": "Using the chatbot for learning enable me to accomplish tasks more quickly.",
    "usefulness[SQ002]": "Using the chatbot would improve my learning performance.",
    "usefulness[SQ003]": "Using the chatbot in my Job would increase my learning outcomes.",
    "usefulness[SQ004]": "Using the chatbot would enhance my effectiveness when learning.",
    "usefulness[SQ005]": "Using the chatbot would make it easier to learn maths.",
    "usefulness[SQ006]": "I would find the chatbot useful for learning.",
    # ease-of-use items
    "ease_of_use[SQ001]": "Learning to operate the chatbot would be easy for me.",
    "ease_of_use[SQ002]": "I would find it easy to get the chatbot to do what I want.",
    "ease_of_use[SQ003]": "My interaction with the chatbot would be clear and understandable.",
    "ease_of_use[SQ004]": "I would find the chatbot to be flexible to interact with.",
    "ease_of_use[SQ005]": "It would be easy for me to become skillful at using the chatbot.",
    "ease_of_use[SQ006]": "I would find the chatbot easy to use.",
    # self-efficacy items; each text continues a common sentence stem that is
    # not part of this file (hence the leading "...")
    "self_efficacy[SQ001]": "...if there was no one around to tell me what to do as I go.",
    "self_efficacy[SQ002]": "...if I had never used a tool like it before.",
    "self_efficacy[SQ003]": "...if I had only the software manuals for reference.",
    "self_efficacy[SQ004]": "...if I had seen someone else using it before trying it myself.",
    "self_efficacy[SQ005]": "...if I could call someone for help if I got stuck.",
    "self_efficacy[SQ006]": "...if someone else had helped me get started.",
    "self_efficacy[SQ007]": "...if I had a lot of time to solve the problems for which the tool was provided.",
    "self_efficacy[SQ008]": "...if I had just the built-in help facility for assistance.",
    "self_efficacy[SQ009]": "...if someone showed me how to do it first.",
    "self_efficacy[SQ010]": "...if I had used a similar tool before this one to solve the same problems.",
    # workload subscales (evaluated as NASA TLX further down in the notebook)
    "load[SQ001]": "Mental Demand",
    "load[SQ002]": "Physical Demand",
    "load[SQ003]": "Temporal Demand",
    "load[SQ004]": "Performance",
    "load[SQ005]": "Effort",
    "load[SQ006]": "Frustration",
}

In [3]:
def get_tam_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return the TAM part of a survey frame.

    Keeps, in their original order, every column whose name contains
    "usefulness" or "ease_of_use"; all rows are retained.
    """
    tam_markers = ("usefulness", "ease_of_use")
    selected = [
        name for name in df.columns if any(marker in name for marker in tam_markers)
    ]
    return df[selected]

def get_self_efficacy_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the columns whose name contains "self_efficacy".

    Column order and all rows are preserved.
    """
    selected = list(filter(lambda name: "self_efficacy" in name, df.columns))
    return df[selected]

def get_load_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return only the columns whose name contains "load".

    Column order and all rows are preserved.
    """
    load_columns = []
    for name in df.columns:
        if "load" in name:
            load_columns.append(name)
    return df[load_columns]

In [4]:
# Read the survey export: "id" becomes the index, column dtypes come from
# type_dict, and the three timestamp columns are parsed with an explicit format.
raw_res = pd.read_csv(
    "./data/survey_results.csv",
    index_col="id",
    dtype=type_dict,
    parse_dates=["submitdate", "startdate", "datestamp"],
    date_format="%Y-%m-%dT%H:%M:%S%z",
)

# Load question 4 ("Performance") has to be reversed to match the direction of
# the other load items; the value is mirrored as 20 - x.
# NOTE(review): this presumes the scale tops out at 20 — confirm against the survey.
raw_res["load[SQ004]"] = 20 - raw_res["load[SQ004]"]

In [5]:
# Remove the columns that are not used anywhere in the analysis.
working_df = raw_res.drop(
    columns=["lastpage", "startlanguage", "gender_other", "seed", "token", "refurl"]
)

# Separate the metadata columns from everything else. "group" is not listed in
# meta_cols, so it remains part of data_df alongside the questionnaire answers.
data_df = working_df.drop(columns=meta_cols)
meta_df = working_df[meta_cols]

# sanity check df format
# data_df

In [6]:
# Write the rows of each study group ("E" and "B") to a separate CSV file.
# The masks are built from raw_res; data_df was derived from it by dropping
# columns only, so both frames share the same rows and index.
in_expl = raw_res["group"] == "E"
in_base = raw_res["group"] == "B"
expl_df = data_df[in_expl]
base_df = data_df[in_base]

# make sure the output folder exists before writing into it
os.makedirs("./out", exist_ok=True)

for group_frame, csv_path in (
    (expl_df, "./out/survey_results_expl.csv"),
    (base_df, "./out/survey_results_base.csv"),
):
    group_frame.to_csv(csv_path)

### Evaluation

In [7]:
# One sub-frame per questionnaire construct, each selected by column name.
tam_df, self_efficacy_df, load_df = (
    get_tam_df(data_df),
    get_self_efficacy_df(data_df),
    get_load_df(data_df),
)

Evaluate NASA TLX results as “raw TLX” as per https://doi.org/10.1177/154193120605000909. Since the given task was not time-constrained and did not involve physical effort, the corresponding subscales are ignored in the evaluation.

In [8]:
# Subscales that enter the evaluation: mental demand, performance, effort, frustration.
# Physical and temporal demand (load[SQ002], load[SQ003]) are deliberately left out.
cols = [question_dict["load[SQ001]"], question_dict["load[SQ004]"], question_dict["load[SQ005]"], question_dict["load[SQ006]"]]

# rename columns according to question_dict for easier interpretation
load_df = load_df.rename(columns=question_dict)

# Split the answers by study group. The boolean masks are built from raw_res,
# which is safe because load_df was derived from raw_res by selecting columns
# only, so both frames share the same "id" index and row order.
base_group = load_df[raw_res["group"] == "B"]
expl_group = load_df[raw_res["group"] == "E"]

results = {}
for col in cols:
    # The load columns are nullable Int64. Drop unanswered items and convert to
    # plain floats once, so that the means and the t-test are computed on the
    # same sample and scipy never receives pd.NA (which it cannot handle).
    base_scores = base_group[col].dropna().astype("float64")
    expl_scores = expl_group[col].dropna().astype("float64")

    # Welch's t-test (unequal variances), one-sided:
    # the alternative hypothesis is mean(base) > mean(expl).
    t_stat, p_val = st.ttest_ind(
        base_scores, expl_scores, alternative="greater", equal_var=False
    )
    results[col] = {
        "base_mean": base_scores.mean(),
        "expl_mean": expl_scores.mean(),
        "t_stat": t_stat,
        "p_val": p_val,
    }

# Report group means and test statistics per subscale.
for col, res in results.items():
    print(f"Column: {col}")
    print(f"  Base Mean: {res['base_mean']:.2f}")
    print(f"  Expl Mean: {res['expl_mean']:.2f}")
    print(f"  t-statistic: {res['t_stat']:.4f}")
    print(f"  p-value: {res['p_val']:.4f}")

Column: Mental Demand
  Base Mean: 12.33
  Expl Mean: 10.71
  t-statistic: 0.5069
  p-value: 0.3111
Column: Performance
  Base Mean: 7.00
  Expl Mean: 3.86
  t-statistic: 1.1371
  p-value: 0.1480
Column: Effort
  Base Mean: 10.67
  Expl Mean: 9.43
  t-statistic: 0.3811
  p-value: 0.3557
Column: Frustration
  Base Mean: 10.50
  Expl Mean: 2.57
  t-statistic: 2.9174
  p-value: 0.0105
