In [34]:
import pandas as pd
import scipy.stats as stats
import os

In [35]:
from constants import meta_cols, question_dict, type_dict, pu_cols, peou_cols, se_cols, load_cols

In [36]:
def get_pu_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the columns listed in ``pu_cols`` together with ``group``.

    Args:
        df: Frame containing every column named in ``pu_cols`` and a
            ``group`` column.

    Returns:
        A frame restricted to those columns, in that order.
    """
    wanted = pu_cols + ["group"]
    return df[wanted]

def get_peou_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the columns listed in ``peou_cols`` together with ``group``.

    Args:
        df: Frame containing every column named in ``peou_cols`` and a
            ``group`` column.

    Returns:
        A frame restricted to those columns, in that order.
    """
    wanted = peou_cols + ["group"]
    return df[wanted]

def get_self_efficacy_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the columns listed in ``se_cols`` together with ``group``.

    Args:
        df: Frame containing every column named in ``se_cols`` and a
            ``group`` column.

    Returns:
        A frame restricted to those columns, in that order.
    """
    wanted = se_cols + ["group"]
    return df[wanted]

def get_load_df(df: pd.DataFrame) -> pd.DataFrame:
    """Select the columns listed in ``load_cols`` together with ``group``.

    Args:
        df: Frame containing every column named in ``load_cols`` and a
            ``group`` column.

    Returns:
        A frame restricted to those columns, in that order.
    """
    wanted = load_cols + ["group"]
    return df[wanted]

In [37]:
# Load the raw survey export, indexed by respondent id. Column dtypes come
# from `type_dict`; the three timestamp columns are parsed as datetimes.
raw_res = pd.read_csv(
    "./data/survey_results.csv",
    index_col="id",
    dtype=type_dict,
    parse_dates=["submitdate", "startdate", "datestamp"],
    date_format="%Y-%m-%dT%H:%M:%S%z",
)

# Load question 4 is scored in the opposite direction to the other load
# items, so flip it.
# NOTE(review): the constant 20 presumably is the scale maximum (0-20) - confirm.
raw_res["load[SQ004]"] = 20 - raw_res["load[SQ004]"]

In [38]:
# columns removed up front, before the frame is split
dropped_cols = ["lastpage", "startlanguage", "gender_other", "seed", "token", "refurl"]
working_df = raw_res.drop(columns=dropped_cols)

# columns listed in `meta_cols` go to `meta_df`, everything else to `data_df`
meta_df = working_df[meta_cols]
data_df = working_df.drop(columns=meta_cols)

# sanity check df format
# data_df

In [39]:
# Write one CSV per study group ("E" and "B").
# The masks come from raw_res; they line up with data_df through the shared index.
is_expl = raw_res["group"] == "E"
is_base = raw_res["group"] == "B"

expl_df = data_df.loc[is_expl]
base_df = data_df.loc[is_base]

# the output directory may not exist on a fresh checkout
os.makedirs("./out", exist_ok=True)

expl_df.to_csv("./out/survey_results_expl.csv")
base_df.to_csv("./out/survey_results_base.csv")

### Evaluation

The evaluation is performed using simple statistical tests to compare the different groups. It is split by the questionnaires used in the study: TAM, Computer Self-Efficacy (CSE), and NASA TLX. TAM is split into its two subscales (Perceived Usefulness and Perceived Ease of Use), CSE is evaluated as a whole, and NASA TLX is evaluated for four of its six subscales (Mental Demand, Performance, Effort, Frustration).

Group differences are assessed for statistical significance using Mann-Whitney U tests and (Welch's) t-tests for all comparisons. Since the small sample size does not allow for a reliable assessment (or assumption) of normality, the non-parametric Mann-Whitney U test is preferred. However, for completeness, t-tests are also reported.

Statistical tests are performed using the `scipy.stats` library, and visualizations are created using `seaborn` and `matplotlib`.

In [None]:
# one dict per comparison is appended to this list by the cells below
results = []

# split data into constructs; every frame keeps the "group" column
load_df = get_load_df(data_df)
se_df = get_self_efficacy_df(data_df)
peou_df = get_peou_df(data_df)
pu_df = get_pu_df(data_df)

TAM is evaluated using the Perceived Usefulness (PU) and Perceived Ease of Use (PEOU) subscales. Each subscale consists of multiple items, which are averaged to obtain a single score for each participant.

In [None]:
# label under which this comparison is stored in `results`
construct = "Perceived Usefulness"

# separate the two groups; the "group" label itself is not a questionnaire item
pu_base_df = pu_df.loc[pu_df["group"] == "B"].drop(columns=["group"])
pu_expl_df = pu_df.loc[pu_df["group"] == "E"].drop(columns=["group"])

# per-participant score: mean over the subscale items
pu_b = pu_base_df.mean(axis=1)
pu_e = pu_expl_df.mean(axis=1)

# one-sided tests: alternative="less" asks whether the base group scores
# lower than the explanation group
u_res = stats.mannwhitneyu(pu_b, pu_e, alternative="less")
u_pval = u_res.pvalue
u_stat = u_res.statistic

# equal_var=False selects Welch's t-test (no equal-variance assumption)
t_res = stats.ttest_ind(pu_b, pu_e, alternative="less", equal_var=False)
t_pval = t_res.pvalue
t_stat = t_res.statistic

# `results` is created in another cell, so re-running this cell would
# otherwise append a duplicate row; drop any row left by a previous run first.
results = [row for row in results if row["const"] != construct]
results.append(
    {
        "const": construct,
        "b_mean": pu_b.mean(),
        "e_mean": pu_e.mean(),
        "mannwhitneyu_stat": u_stat,
        "mannwhitneyu_p": u_pval,
        "ttest_stat": t_stat,
        "ttest_p": t_pval,
    }
)

In [None]:
# label under which this comparison is stored in `results`
construct = "Perceived Ease of Use"

# separate the two groups; the "group" label itself is not a questionnaire item
peou_base_df = peou_df.loc[peou_df["group"] == "B"].drop(columns=["group"])
peou_expl_df = peou_df.loc[peou_df["group"] == "E"].drop(columns=["group"])

# per-participant score: mean over the subscale items
peou_b = peou_base_df.mean(axis=1)
peou_e = peou_expl_df.mean(axis=1)

# one-sided tests: alternative="less" asks whether the base group scores
# lower than the explanation group
u_res = stats.mannwhitneyu(peou_b, peou_e, alternative="less")
u_pval = u_res.pvalue
u_stat = u_res.statistic

# equal_var=False selects Welch's t-test (no equal-variance assumption)
t_res = stats.ttest_ind(peou_b, peou_e, alternative="less", equal_var=False)
t_pval = t_res.pvalue
t_stat = t_res.statistic

# `results` is created in another cell, so re-running this cell would
# otherwise append a duplicate row; drop any row left by a previous run first.
results = [row for row in results if row["const"] != construct]
results.append(
    {
        "const": construct,
        "b_mean": peou_b.mean(),
        "e_mean": peou_e.mean(),
        "mannwhitneyu_stat": u_stat,
        "mannwhitneyu_p": u_pval,
        "ttest_stat": t_stat,
        "ttest_p": t_pval,
    }
)

Computer Self-Efficacy (CSE) is evaluated as a whole, using all items from the CSE questionnaire. Similar to TAM, the items are averaged to obtain a single score for each participant.

In [None]:
# label under which this comparison is stored in `results`
construct = "Self-Efficacy"

# separate the two groups; the "group" label itself is not a questionnaire item
se_base_df = se_df.loc[se_df["group"] == "B"].drop(columns=["group"])
se_expl_df = se_df.loc[se_df["group"] == "E"].drop(columns=["group"])

# per-participant score: mean over all questionnaire items
se_b = se_base_df.mean(axis=1)
se_e = se_expl_df.mean(axis=1)

# one-sided tests: alternative="less" asks whether the base group scores
# lower than the explanation group
u_res = stats.mannwhitneyu(se_b, se_e, alternative="less")
u_pval = u_res.pvalue
u_stat = u_res.statistic

# equal_var=False selects Welch's t-test (no equal-variance assumption)
t_res = stats.ttest_ind(se_b, se_e, alternative="less", equal_var=False)
t_pval = t_res.pvalue
t_stat = t_res.statistic

# `results` is created in another cell, so re-running this cell would
# otherwise append a duplicate row; drop any row left by a previous run first.
results = [row for row in results if row["const"] != construct]
results.append(
    {
        "const": construct,
        "b_mean": se_b.mean(),
        "e_mean": se_e.mean(),
        "mannwhitneyu_stat": u_stat,
        "mannwhitneyu_p": u_pval,
        "ttest_stat": t_stat,
        "ttest_p": t_pval,
    }
)

Evaluate NASA TLX results as “raw TLX” as per https://doi.org/10.1177/154193120605000909. Since the given task was not time-constrained and did not involve physical effort, the corresponding subscales are ignored in the evaluation.

In [None]:
# columns for mental demand, performance, effort, frustration
cols = [
    question_dict["load[SQ001]"],
    question_dict["load[SQ004]"],
    question_dict["load[SQ005]"],
    question_dict["load[SQ006]"],
]

# rename columns according to question_dict for easier interpretation
# (rename ignores keys that are not present, so running this cell again is harmless)
load_df = load_df.rename(columns=question_dict)

# split groups
load_base_group = load_df.loc[load_df["group"] == "B"].drop(columns=["group"])
load_expl_group = load_df.loc[load_df["group"] == "E"].drop(columns=["group"])

# `results` is created in another cell, so re-running this cell would
# otherwise append every subscale a second time; drop rows left by a previous run.
results = [row for row in results if row["const"] not in cols]

for col in cols:
    b = load_base_group[col]
    e = load_expl_group[col]

    # one-sided tests: alternative="greater" asks whether the base group
    # reports higher values than the explanation group
    u_res = stats.mannwhitneyu(b, e, alternative="greater")
    u_pval = u_res.pvalue
    u_stat = u_res.statistic

    # equal_var=False selects Welch's t-test (no equal-variance assumption)
    t_res = stats.ttest_ind(b, e, alternative="greater", equal_var=False)
    t_pval = t_res.pvalue
    t_stat = t_res.statistic

    results.append(
        {
            "const": col,
            "b_mean": b.mean(),
            "e_mean": e.mean(),
            "mannwhitneyu_stat": u_stat,
            "mannwhitneyu_p": u_pval,
            "ttest_stat": t_stat,
            "ttest_p": t_pval,
        }
    )

In [None]:
# readable headers for the exported / displayed table
column_labels = {
    "const": "Construct",
    "b_mean": "Base Mean",
    "e_mean": "Explanation Mean",
    "mannwhitneyu_stat": "Mann-Whitney U Stat",
    "mannwhitneyu_p": "Mann-Whitney U p-value",
    "ttest_stat": "t-test Stat",
    "ttest_p": "t-test p-value",
}

# one row per entry collected in `results`
results_df = pd.DataFrame(results).rename(columns=column_labels)

# persist without the positional index, then show the table as the cell output
results_df.to_csv("./out/statistical_test_results.csv", index=False)

results_df

Unnamed: 0,Construct,Base Mean,Explanation Mean,Mann-Whitney U Stat,Mann-Whitney U p-value,t-test Stat,t-test p-value
0,Perceived Usefulness,5.25,5.904762,11.0,0.084082,-1.726694,0.05614
1,Perceived Ease of Use,6.166667,5.928571,24.0,0.693676,0.566684,0.707922
2,Self-Efficacy,9.35,8.214286,26.0,0.785899,1.542875,0.91548
3,Mental Demand,12.333333,10.714286,24.5,0.331933,0.506887,0.31112
4,Performance,7.0,3.857143,27.0,0.213129,1.137086,0.147984
5,Effort,10.666667,9.428571,21.5,0.5,0.381143,0.35567
6,Frustration,10.5,2.571429,37.5,0.010499,2.917351,0.010525
