In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import plot_likert as likert

from IPython import display

import scipy as sp
import math

In [None]:
# Import answers
# Maps the questionnaire's full question texts to short column names.
# Questions that are currently not analysed stay listed but commented out.
COLUMN_RENAMES = {
    "Please provide your ID that was assigned to you by your presenter:": "ID",
    "Please provide the variant you were testing:": "Variant",
    # Perceived Speed
    "How long do you think it took you to select the correct template in the first subtask (in seconds)?": "PS_Task1",
    "How long do you think it took you to select the correct template in the second subtask (in seconds)?": "PS_Task2",
    #"How long do you think it took you to select the correct template in the third subtask (in seconds)?": "PS_Task3",
    # NASA
    #"How mentally demanding was the task?": "NASA_MentalDemand",
    #"How physically demanding was the task?": "NASA_PhysicalDemand",
    #"How hurried or rushed was the pace of the task?": "NASA_Rush",
    #"How successful were you in accomplishing what you were asked to do?": "NASA_Success",
    #"How hard did you have to work to accomplish your level of performance?": "NASA_Hard",
    #"How insecure, discouraged, irritated, stressed and annoyed were you?": "NASA_Stress",
    # SUS
    "I think that I would like to use this system frequently.": "SUS_Frequency",
    "I found the system unnecessarily complex.": "SUS_Complexity",
    "I thought the system was easy to use.": "SUS_Ease",
    "I think that I would need the support of a technical person to be able to use this system.": "SUS_Support",
    "I found the various functions in this system were well integrated.": "SUS_Integration",
    "I thought there was too much inconsistency in this system.": "SUS_Inconsistency",
    "I would imagine that most people would learn to use this system very quickly .": "SUS_Learnability",
    "I found the system very cumbersome to use.": "SUS_Cumbersome",
    "I felt very confident using the system.": "SUS_Confidence",
    "I needed to learn a lot of things before I could get going with this system.": "SUS_Start",
}

# Built as a single chain (no in-place mutation), so re-running this cell
# always starts again from the CSV on disk.
data = (
    pd.read_csv("Group Calendar Questionnaire (Responses).csv")
    # Rename columns to more sensible names
    .rename(columns=COLUMN_RENAMES)
    # Disregard timestamps
    .drop(columns=["Timestamp"])
)

# Output preview of data
# A form export may carry an "Email Address" column (personal data); it is left
# out of the rendered preview only -- `data` itself is not changed.
# errors="ignore" keeps the preview working when no such column exists.
print("Preview of the Data:")
print(data.drop(columns=["Email Address"], errors="ignore").head(3))

In [None]:
# Analyze Perceived Speed
# For every perceived-speed column (and their sum) this cell prints descriptive
# statistics per variant, draws a mean +/- SD bar plot, checks normality
# (Shapiro-Wilk) and variance homogeneity (Levene), and then runs a paired
# t-test or a Wilcoxon signed-rank test depending on those checks.

# PS_Total is derived below, so it is excluded from the filter; the column list
# therefore stays the same when the cell is re-run on the already extended frame.
columns = [name for name in data.columns if name.startswith("PS") and name != "PS_Total"]
data["PS_Total"] = data["PS_Task1"] + data["PS_Task2"]

columns = columns + ["PS_Total"]
titles = ["Perceived Speed (Task 1)", "Perceived Speed (Task 2)", "Perceived Speed"]

# Significance level used for the assumption checks and for the final decision
alpha = 0.05

# Set font size
plt.rc('axes', titlesize=15)     # fontsize of the plot title
plt.rc('axes', labelsize=15)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=10)    # fontsize of the tick labels
plt.rc('ytick', labelsize=10)    # fontsize of the tick labels

# For each column
for column, title in zip(columns, titles):
    print("Analyzing column " + column + ":")
    # Extract relevant part
    temp = data[["ID", "Variant", column]].copy()
    # ttest_rel and wilcoxon pair the two samples element by element, so both
    # variants are brought into the same participant (ID) order first instead of
    # relying on the row order of the CSV. The order-independent statistics
    # (mean, median, SD, Shapiro-Wilk, Levene) are unaffected by the sort.
    tempA = temp.loc[temp["Variant"] == "A"].sort_values("ID", kind="stable")
    tempB = temp.loc[temp["Variant"] == "B"].sort_values("ID", kind="stable")
    if tempA["ID"].tolist() != tempB["ID"].tolist():
        print("WARNING: participant IDs of variant A and B do not match; the paired tests below pair rows by position.")
    valuesA, valuesB = tempA[column], tempB[column]

    # Print Average and Standard Deviation
    avgA, avgB = valuesA.mean(), valuesB.mean()
    medianA, medianB = valuesA.median(), valuesB.median()
    stdA, stdB = valuesA.std(), valuesB.std()
    print("Average (A): " + str(avgA))
    print("Median (A): " + str(medianA))
    print("Standard Deviation (A): " + str(stdA))
    print("Average (B): " + str(avgB))
    print("Median (B): " + str(medianB))
    print("Standard Deviation (B): " + str(stdB))

    # Bar Plot with standard deviation
    plt.figure(figsize=(2, 5))
    sns.barplot(x='Variant', y=column, data=temp, capsize=.1, errorbar='sd', hue="Variant", order=["A", "B"], estimator="mean")
    plt.title(title)
    plt.ylabel("Perceived Speed (seconds)")
    plt.ylim(0, 20)
    plt.show()

    # Check Normality & Equal Variance:
    _, shapiroA = sp.stats.shapiro(valuesA)
    _, shapiroB = sp.stats.shapiro(valuesB)
    print("Shapiro-Wilk (A/B):\t" + str(shapiroA) + "\t/\t" + str(shapiroB))
    print("Shapiro-Wilk Results (A/B):\t" + ("Normally distributed" if shapiroA > alpha else "Not Normally distributed") + "\t/\t" + ("Normally distributed" if shapiroB > alpha else "Not Normally distributed"))
    # The former `center="median" if column == "Errors"` switch could never
    # trigger for the PS_* columns handled here, so the mean is used directly.
    _, levene = sp.stats.levene(valuesA, valuesB, center="mean")
    print("Levene: " + str(levene))
    print("Levene Result: " + ("Same Variance" if levene > alpha else "Different Variance") + "\n")

    # Use either a Parametric or Non-Parametric test depending on the results of the previous tests
    pValue = None
    if shapiroA > alpha and shapiroB > alpha and levene > alpha:
        # Normally Distributed and Equal Variance, Can use a parametric test
        res = sp.stats.ttest_rel(valuesA, valuesB, nan_policy="raise")
        pValue = res.pvalue
        print("T-Test (Parametric) Degrees Of Freedom: " + str(res.df)) # Note: This considers only the differences, e.g. looking at 2n values will give dof n-1 instead of 2(n-1).
        print("T-Test (Parametric) T-Value: " + str(res.statistic))
        print("T-Test (Parametric) Confidence Interval: " + str(res.confidence_interval(0.95)))
        # Mean difference divided by the average of the two standard deviations
        cohen = (avgA - avgB)/(pd.Series([stdA, stdB]).mean())
        print("Effect Size (Cohen's d): " + str(cohen))
    else:
        res = sp.stats.wilcoxon(valuesA, valuesB, nan_policy="raise", method="approx")
        pValue = res.pvalue
        zValue = res.zstatistic
        print("Wilcoxon (Non-Parametric) Z-Value: " + str(zValue))
        # NOTE(review): 12 is hard-coded; presumably the number of observations
        # at the time of writing -- confirm it still matches the loaded data.
        effect_size = zValue / math.sqrt(12)
        print("Effect Size (Wilcoxon): " + str(effect_size))
    print("\nP-Value: " + str(pValue))

    # Interpret results:
    if pValue <= alpha:
        print("REJECT the null hypothesis, difference is statistically significant")
    else:
        print("FAIL TO REJECT the null hypothesis, difference is not statistically significant")
    print("\n\n")

In [None]:
# Analyze Usability
# For each of the ten SUS items and the combined SUS score this cell prints
# descriptive statistics per variant and draws a median bar plot with SD bars.

# SUS_TOTAL is derived below, so it is excluded from the filter; the column
# list therefore stays the same when the cell is re-run on the extended frame.
columns = [name for name in data.columns if name.startswith("SUS") and name != "SUS_TOTAL"]

# SUS scoring: positively worded items contribute (answer - 1), negatively
# worded items contribute (5 - answer); the sum is scaled by 2.5.
positive_items = data["SUS_Frequency"] + data["SUS_Ease"] + data["SUS_Integration"] + data["SUS_Learnability"] + data["SUS_Confidence"] - 5
negative_items = 25 - data["SUS_Complexity"] - data["SUS_Support"] - data["SUS_Inconsistency"] - data["SUS_Cumbersome"] - data["SUS_Start"]
data["SUS_TOTAL"] = 2.5 * (positive_items + negative_items)

columns = columns + ["SUS_TOTAL"]
titles = ["How frequently would you use the system?", "How complex was the system?", "How easy was the system to use?", 
          "How much support would you need to use the system?", "How well integrated were the functions of the system?", 
          "How inconsistent was the system?", "How easy was it to learn to use the system?", "How cumbersome was the system?", 
          "How confident did you feel using the system?", "How much did you need to learn before you could get going with the system?", "Total SUS Score"] # Give more meaningful names

# Upper y-axis limit of each plot, in the same order as `titles`
y_ticks_limit =  [10, 8, 10, 5, 10, 5, 10, 5, 12, 5, 150]

print(columns)

# Set font size
plt.rc('axes', titlesize=15)     # fontsize of the plot title
plt.rc('xtick', labelsize=10)   # fontsize of the tick labels
plt.rc('ytick', labelsize=10)    # fontsize of the tick labels

# For each column
for column, title, y_limit in zip(columns, titles, y_ticks_limit):
    is_total = column == "SUS_TOTAL"
    # fontsize of the x and y labels (the long Likert label needs a smaller font)
    plt.rc('axes', labelsize=15 if is_total else 12)
    print("Analyzing column " + column + ":")
    # Extract relevant part
    temp = data[["ID", "Variant", column]].copy()
    tempA = temp.loc[temp["Variant"] == "A"]
    tempB = temp.loc[temp["Variant"] == "B"]
    # Print Average and Standard Deviation
    avgA, avgB = tempA[column].mean(), tempB[column].mean()
    medianA, medianB = tempA[column].median(), tempB[column].median()
    stdA, stdB = tempA[column].std(), tempB[column].std()
    print("Average (A): " + str(avgA))
    print("Median (A): " + str(medianA))
    print("Standard Deviation (A): " + str(stdA))
    print("Average (B): " + str(avgB))
    print("Median (B): " + str(medianB))
    print("Standard Deviation (B): " + str(stdB))
    # Bar Plot with standard deviation
    plt.figure(figsize=(2, 5))
    sns.barplot(x='Variant', y=column, data=temp, capsize=.1, errorbar='sd', hue="Variant", order=["A", "B"], estimator="median")
    plt.title(title)
    plt.ylabel("SUS Score" if is_total else "1: Strongly Disagree, 5: Strongly Agree")
    plt.ylim(0, y_limit)
    plt.show()
    # Perform Wilcoxon test (currently disabled; the effect size now reads
    # res.zstatistic, because the earlier draft used an undefined `zValue`)
    # res = sp.stats.wilcoxon(tempA[column], tempB[column], nan_policy="raise", method="approx")
    # pValue = res.pvalue
    # print("Wilcoxon (Non-Parametric) Z-Value: " + str(res.zstatistic))
    # print("\nP-Value: " + str(pValue))
    # # Interpret results:
    # if pValue <= 0.05:
    #     print("REJECT the null hypothesis, difference is statistically significant")
    # else:
    #     print("FAIL TO REJECT the null hypothesis, difference is not statistically significant")
    # effect_size = res.zstatistic / math.sqrt(12)
    # print("Effect Size (Wilcoxon): " + str(effect_size))
    # print("\n\n")

In [None]:
# Represent data with likert scale
# Draws one Likert chart per variant from the ten SUS item answers.

# Modify column names
likert_names = {
    "SUS_Frequency": "Frequency",
    "SUS_Complexity": "Complexity",
    "SUS_Ease": "Ease",
    "SUS_Support": "Support",
    "SUS_Integration": "Integration",
    "SUS_Inconsistency": "Inconsistency",
    "SUS_Learnability": "Learnability",
    "SUS_Cumbersome": "Cumbersome",
    "SUS_Confidence": "Confidence",
    "SUS_Start": "Start"
}
modifiedData = data.rename(columns=likert_names)

# The ten item columns, kept in the order in which they appear in the frame
item_columns = [name for name in modifiedData.columns if name in likert_names.values()]

# Replace numeric values to likert scale
scale = ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"]
code_to_label = dict(zip([1, 2, 3, 4, 5], scale))

# Only the SUS item columns are recoded. Recoding the whole frame would also
# turn IDs or perceived seconds that happen to be 1-5 into labels, and selecting
# the items by name avoids a drop list that must name every other column
# (including ones that only exist after earlier analysis cells have run).
modifiedData[item_columns] = modifiedData[item_columns].replace(code_to_label)

for variant in ("A", "B"):
    responses = modifiedData.loc[modifiedData["Variant"] == variant, item_columns]
    plot = likert.plot_likert(responses, scale)
    plot.set_title("Likert Scale (Variant " + variant + ")")
    # Legend laid out as a single row of five entries
    plot.legend(bbox_to_anchor=(-0.205, 0, 1.2, -0.15), loc="upper right", mode="expand", borderaxespad=0, ncol=5)


In [None]:
# Analyze Mental Load (Not relevant anymore)
# The whole analysis below is wrapped in a bare triple-quoted string, so none of
# it is executed. It mirrors the other analysis cells: per-variant descriptive
# statistics, a histogram, a Wilcoxon signed-rank test and an effect size for
# every column whose name starts with "NASA".
# The "NASA_*" renames in the data-loading cell are commented out as well, so
# the column filter would select nothing unless those renames are restored.
# NOTE(review): as the last expression of the cell, the string is presumably
# echoed as cell output in a notebook -- confirm, or delete the cell.
"""
columns = list(filter(lambda x: x.startswith("NASA"), data.columns.values))
# TODO: Find raw TLX index in case intdermediate TLX results are interesting.

# For each column
for column in columns:
    print("Analyzing column " + column + ":")
    # Extract relevant part
    temp = data[["ID", "Variant", column]].copy()
    tempA = temp.loc[temp["Variant"] == "A"]
    tempB = temp.loc[temp["Variant"] == "B"]
    # Print Average and Standard Deviation
    avgA, avgB = tempA[column].mean(), tempB[column].mean()
    medianA, medianB = tempA[column].median(), tempB[column].median()
    stdA, stdB = tempA[column].std(), tempB[column].std()
    print("Average (A): " + str(avgA))
    print("Median (A): " + str(medianA))
    print("Standard Deviation (A): " + str(stdA))
    print("Average (B): " + str(avgB))
    print("Median (B): " + str(medianB))
    print("Standard Deviation (B): " + str(stdB))
    # Plot
    plot = sns.histplot(data=temp, x=column, hue="Variant", bins=[0.5,1.5,2.5,3.5,4.5,5.5,6.5,7.5])
    display.display(plt.gcf())
    plt.clf()
    # Perform Wilcoxon test
    res = sp.stats.wilcoxon(tempA[column], tempB[column], nan_policy="raise", method="approx")
    pValue = res.pvalue
    print("Wilcoxon (Non-Parametric) Z-Value: " + str(res.zstatistic))
    print("\nP-Value: " + str(pValue))
    # Interpret results:
    if pValue <= 0.05:
        print("REJECT the null hypothesis, difference is statistically significant")
    else:
        print("FAIL TO REJECT the null hypothesis, difference is not statistically significant")
    cohen = (avgA - avgB)/(pd.Series([stdA, stdB]).mean())
    print("Effect Size (Cohen's d): " + str(cohen))
    print("\n\n")
"""