In [1]:
import pandas as pd
import scipy.stats as stats
import os

In [2]:
from constants import meta_cols, question_dict, type_dict

In [3]:
def get_tam_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the TAM questionnaire columns of ``df``.

    A column is kept when its name contains ``"usefulness"`` or
    ``"ease_of_use"``. The original column order is preserved.
    """
    tam_markers = ("usefulness", "ease_of_use")
    selected = [name for name in df.columns if any(marker in name for marker in tam_markers)]
    return df[selected]

def get_self_efficacy_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the columns of ``df`` whose name contains ``"self_efficacy"``.

    The original column order is preserved.
    """
    selected = [name for name in df.columns if "self_efficacy" in name]
    return df[selected]

def get_load_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the columns of ``df`` whose name contains ``"load"``.

    The original column order is preserved.
    """
    selected = [name for name in df.columns if "load" in name]
    return df[selected]

In [4]:
# Read the survey export, indexed by the "id" column. The three timestamp columns
# are parsed with an explicit format that includes a UTC offset (%z).
timestamp_cols = ["submitdate", "startdate", "datestamp"]
raw_res = pd.read_csv(
    "./data/survey_results.csv",
    index_col="id",
    dtype=type_dict,
    parse_dates=timestamp_cols,
    date_format="%Y-%m-%dT%H:%M:%S%z",
)

# Load question 4 has to be reversed before it can be analysed with the other items.
# NOTE(review): subtracting from 20 presumes a 0-20 answer scale - confirm against the survey.
raw_res["load[SQ004]"] = 20 - raw_res["load[SQ004]"]

In [5]:
# Columns that are dropped before any further processing.
unused_cols = ["lastpage", "startlanguage", "gender_other", "seed", "token", "refurl"]
working_df = raw_res.drop(columns=unused_cols)

# Separate the metadata columns (meta_cols) from the remaining answer columns.
meta_df = working_df[meta_cols]
data_df = working_df.drop(columns=meta_cols)

# Sanity check: display the columns left in the answer frame.
data_df.columns

Index(['group', 'usefulness[SQ001]', 'usefulness[SQ002]', 'usefulness[SQ003]',
       'usefulness[SQ004]', 'usefulness[SQ005]', 'usefulness[SQ006]',
       'ease_of_use[SQ001]', 'ease_of_use[SQ002]', 'ease_of_use[SQ003]',
       'ease_of_use[SQ004]', 'ease_of_use[SQ005]', 'ease_of_use[SQ006]',
       'self_efficacy[SQ001]', 'self_efficacy[SQ002]', 'self_efficacy[SQ003]',
       'self_efficacy[SQ004]', 'self_efficacy[SQ005]', 'self_efficacy[SQ006]',
       'self_efficacy[SQ007]', 'self_efficacy[SQ008]', 'self_efficacy[SQ009]',
       'self_efficacy[SQ010]', 'load[SQ001]', 'load[SQ002]', 'load[SQ003]',
       'load[SQ004]', 'load[SQ005]', 'load[SQ006]'],
      dtype='object')

In [6]:
# Write one CSV per study group (group codes "E" and "B") into ./out.
# The masks come from raw_res and are aligned to data_df through the shared index.
in_expl_group = raw_res["group"] == "E"
in_base_group = raw_res["group"] == "B"
expl_df = data_df.loc[in_expl_group]
base_df = data_df.loc[in_base_group]

os.makedirs("./out", exist_ok=True)

for group_df, csv_path in (
    (expl_df, "./out/survey_results_expl.csv"),
    (base_df, "./out/survey_results_base.csv"),
):
    group_df.to_csv(csv_path)

### Evaluation

The evaluation is performed using simple statistical tests to compare the different groups. It is split by the questionnaires used in the study (TAM, Self-Efficacy (CSE), NASA TLX): TAM is split into its two subscales (Perceived Usefulness and Perceived Ease of Use), CSE is evaluated as a whole, and NASA TLX is evaluated for four of its six subscales (Mental Demand, Performance, Effort, Frustration).

The statistical significance of the group differences is assessed using Mann-Whitney U tests and (Welch's) t-tests for all comparisons. Since the small sample size does not allow for a reliable assessment (or assumption) of normality, the non-parametric Mann-Whitney U test is preferred. However, for completeness, t-tests are also reported.

Statistical tests are performed using the `scipy.stats` library, and visualizations are created using `seaborn` and `matplotlib`.

In [9]:
# One frame per construct, each selected from data_df by column name.
tam_df, self_efficacy_df, load_df = (
    get_tam_df(data_df),
    get_self_efficacy_df(data_df),
    get_load_df(data_df),
)

Evaluate NASA TLX results as “raw TLX” as per https://doi.org/10.1177/154193120605000909. Since the given task was not time-constrained and did not involve physical effort, the corresponding subscales are ignored in the evaluation.

In [None]:
# Display names of the evaluated subscales: mental demand, performance, effort, frustration.
cols = [
    question_dict[item]
    for item in ("load[SQ001]", "load[SQ004]", "load[SQ005]", "load[SQ006]")
]

# Replace the raw item codes by their question_dict names for easier interpretation.
load_df = load_df.rename(columns=question_dict)

# Split by group. The boolean masks come from raw_res and are aligned to load_df
# through the shared index, so selecting rows this way is safe.
in_base_group = raw_res["group"] == "B"
in_expl_group = raw_res["group"] == "E"
base_group = load_df.loc[in_base_group]
expl_group = load_df.loc[in_expl_group]

# Both tests are one-sided: the alternative hypothesis is "B" greater than "E".
one_sided = {"alternative": "greater"}

for col in cols:
    b, e = base_group[col], expl_group[col]

    # Non-parametric Mann-Whitney U test.
    u_res = stats.mannwhitneyu(b, e, **one_sided)
    # Welch's t-test (no equal-variance assumption).
    t_res = stats.ttest_ind(b, e, equal_var=False, **one_sided)

    u_stat, u_pval = u_res.statistic, u_res.pvalue
    t_stat, t_pval = t_res.statistic, t_res.pvalue

    print(f"{col}:")
    print(f"  Mann-Whitney U: stat={u_stat:.3f}, p={u_pval:.3f}")
    print(f"  t-test: stat={t_stat:.3f}, p={t_pval:.3f}")

Mental Demand:
  Mann-Whitney U: stat=24.500, p=0.332
  t-test: stat=0.507, p=0.311
Performance:
  Mann-Whitney U: stat=27.000, p=0.213
  t-test: stat=1.137, p=0.148
Effort:
  Mann-Whitney U: stat=21.500, p=0.500
  t-test: stat=0.381, p=0.356
Frustration:
  Mann-Whitney U: stat=37.500, p=0.010
  t-test: stat=2.917, p=0.011
