In [1]:
import pandas as pd
import json
from pathlib import Path
import scipy.stats
# Raise pandas' display limits so wide / long tables are rendered without truncation.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
pd.options.display.width = 1000

%cd evaluation/ustdy_1_need

/Users/yasith/projects/streaminghub/evaluation/ustdy_1_need


In [2]:
# Load every answer sheet (one JSON file each) from ./data into a single table:
# rows are the question keys, columns are the file stems.
data_path = Path.cwd() / "data"
stats_path = Path("stats")
# DataFrame.to_csv does not create missing directories
stats_path.mkdir(parents=True, exist_ok=True)

data = {}
poll = ["N", "Y"]
# glob() yields files in arbitrary order; sorting keeps the column order of the
# exported CSV files stable between runs and machines
for path in sorted(data_path.glob("*.json")):
    with open(path) as f:
        answer = json.load(f)
    # encode poll answers as N -> 0 and Y -> 1 (per the original note these are
    # b4, b5 and c2); every other answer is parsed as an integer
    data[path.stem] = {k: poll.index(v) if v in poll else int(v) for k, v in answer.items()}

df_raw = pd.DataFrame(data)
df_raw.to_csv(stats_path / "raw.csv")
# per-question mean and standard deviation across all answer sheets
df_raw.agg(["mean", "std"], axis=1).to_csv(stats_path / "agg.csv")

In [3]:
from matplotlib import pyplot as plt

# NOTE(review): looks like a figure aspect ratio; it is not referenced by the cells visible here
r = 9 / 16

# Display title for every question key (used as figure titles and axis labels).
titles = {
    "a1": "Documenting Datasets",
    "a2": "Pre-Analysis Data Curation",
    "a3": "Post-Collection Data Curation",
    "a4": "Data Loading",
    "b1": "Understanding Experiment Variables",
    "b2": "Understanding Data Fields",
    "b3": "Mimic Real Workload",
    "b4": "B/M Peak WL",
    "b5": "B/M Real WL",
    "c1": "Connecting Data Streams with Algorithms",
    "c2": "Data Access",
    "c3_1": "Curator",
    "c3_2": " DataMux",
}

# Condition labels per question. They are assigned to the rows of a question by
# position, so their order has to match the row order of the raw data.
_cf_levels = [c + f for c in ("C+", "C-") for f in ("F+", "F-")]
_dcf_levels = [d + cf for d in ("D+", "D-") for cf in _cf_levels]
indices = {
    "a1": ["Variables", "Metadata", "Fields", "Units"],
    "a2": list(_cf_levels),
    "a3": list(_cf_levels),
    "a4": ["Reusing", "Scratch"],
    "b1": list(_dcf_levels),
    "b2": list(_dcf_levels),
    "b3": list(_dcf_levels),
    "c1": ["Scratch"],
}

# Question groups, by the kind of answer they hold.
multi_cond_qs = ("a1", "a2", "a3", "a4", "b1", "b2", "b3", "c1")
bnary_cond_qs = ("b4", "b5", "c2")
ratng_cond_qs = ("c3",)

# One bar chart per multi-condition question: mean difficulty and mean time
# demand for every condition, with the standard deviation as error bars.
# NOTE make sure A1 is interpreted as four different questions
Path("figures").mkdir(parents=True, exist_ok=True)  # savefig does not create missing directories
for q in multi_cond_qs:
    qdata = df_raw.loc[df_raw.index.str.startswith(q)]
    # rows ending in "_d" hold difficulty ratings, rows ending in "_t" time-demand ratings
    qdata_d = qdata.loc[qdata.index.str.endswith("_d")]
    qdata_t = qdata.loc[qdata.index.str.endswith("_t")]
    # cleanup index names: both tables must share one index so they line up in the
    # combined frame below. Check for custom labels *before* looking them up;
    # without labels fall back to the raw row names minus their "_d" suffix.
    if q in indices:
        idx = pd.Index(indices[q])
    else:
        idx = qdata_d.index.str[:-2]
    qdata_d.index = idx
    qdata_t.index = idx
    qdf_mean = pd.DataFrame({"difficulty": qdata_d.mean(1), "time demand": qdata_t.mean(1)})
    qdf_std = pd.DataFrame({"difficulty": qdata_d.std(1), "time demand": qdata_t.std(1)})
    # the "b" questions are drawn as stacked bars, hence the doubled y-range
    # (presumably ratings are on a 1-5 scale - TODO confirm)
    is_b = q[0] == 'b'
    ax = qdf_mean.plot.bar(title=f"Q{q.upper()} - {titles[q]}", yerr=qdf_std, rot=0, stacked=is_b)
    ax.set_ylim(1, 10 if is_b else 5)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(f"figures/q_{q}.pdf")
    # close explicitly so figures do not pile up across iterations
    plt.close(fig)

In [4]:
# visualize questions without a difficulty / time demand split.
# Each entry: (row prefixes, figure title, draw std error bars?, y-range, output file)
summary_plots = (
    (bnary_cond_qs, "Need for Automation Tools", False, (0.75, 1), "figures/q_binary.pdf"),
    (ratng_cond_qs, "Potential Impact", True, (4, 5), "figures/q_rating.pdf"),
)
for prefixes, fig_title, with_err, y_range, out_file in summary_plots:
    # mean / std over all columns for the rows whose key starts with one of the prefixes
    qdata = df_raw.loc[df_raw.index.str.startswith(prefixes)].agg(["mean", "std"], axis=1)
    # show the question titles instead of the raw keys on the x axis
    qdata.index = qdata.index.map(titles.get)
    err_kwargs = {"yerr": qdata["std"]} if with_err else {}
    qdata["mean"].plot.bar(title=fig_title, rot=0, **err_kwargs)
    plt.ylim(*y_range)
    plt.tight_layout()
    plt.savefig(out_file)
    plt.close()

In [10]:
# Shared keyword arguments for scipy.stats.ttest_ind: one-sided tests without
# the equal-variance assumption (equal_var=False), in either direction.
al: dict = {"alternative": "less", "equal_var": False}
ag: dict = {"alternative": "greater", "equal_var": False}

In [16]:
# Question a1: is time demand rated higher than difficulty?
a1_answers = df_raw.loc[df_raw.index.str.startswith("a1")]
idx = pd.Index(indices["a1"])
# split into difficulty ("_d") and time-demand ("_t") rows and give both the same item labels
qdf_d = a1_answers.loc[a1_answers.index.str.endswith("_d")]
qdf_t = a1_answers.loc[a1_answers.index.str.endswith("_t")]
qdf_d.index = qdf_t.index = idx
# casewise test: one test per item (labels are cut to four characters in the printout)
for item in idx:
    stat, pval = scipy.stats.ttest_ind(qdf_t.loc[item], qdf_d.loc[item], **ag)
    print(f"(t>d; {item[:4]},a1) : t={stat:.4f}, p={pval:.4f}")
# marginal test: compare the per-column means taken over all items;
# the printed p-value is floored at 1e-4
stat, pval = scipy.stats.ttest_ind(qdf_t.mean(0), qdf_d.mean(0), **ag)
print(f"(t>d; Mean,a1) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
print()

(t>d; Vari,a1) : t=1.4387, p=0.0793
(t>d; Meta,a1) : t=0.7419, p=0.2314
(t>d; Fiel,a1) : t=1.2583, p=0.1080
(t>d; Unit,a1) : t=0.6326, p=0.2654
(t>d; Mean,a1) : t=1.2181, p=0.1154



In [17]:
# Questions a2 and a3: compare the conditions with (+) and without (-) each of the factors C and F
for q in ["a2", "a3"]:
    answers = df_raw.loc[df_raw.index.str.startswith(q)]
    idx = pd.Index(indices[q])
    # rating tables keyed by the name used in the printout, relabelled with the condition names
    ratings = {}
    for measure, suffix in (("diff", "_d"), ("time", "_t")):
        table = answers.loc[answers.index.str.endswith(suffix)]
        table.index = idx
        ratings[measure] = table
    # per factor: (rows with the factor, rows without it)
    levels = {
        "C": (idx.str.contains(r"C\+"), idx.str.contains(r"C\-")),
        "F": (idx.str.contains(r"F\+"), idx.str.contains(r"F\-")),
    }
    # condition-wise test: are ratings lower with the factor than without it?
    for measure, table in ratings.items():
        for factor, (pos, neg) in levels.items():
            stat, pval = scipy.stats.ttest_ind(table.loc[pos].mean(0), table.loc[neg].mean(0), **al)
            print(f"({factor}+<{factor}-; {measure},{q}) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
    print()
    # impact test: is the reduction attributed to C larger than the one attributed to F?
    # The "-" and "+" rows are subtracted by position, which pairs conditions that
    # differ only in the factor under test.
    (c_pos, c_neg), (f_pos, f_neg) = levels["C"], levels["F"]
    for measure, table in ratings.items():
        delta_c = (table.loc[c_neg].values - table.loc[c_pos].values).mean(0)
        delta_f = (table.loc[f_neg].values - table.loc[f_pos].values).mean(0)
        stat, pval = scipy.stats.ttest_ind(delta_c, delta_f, **ag)
        print(f"(ΔC>ΔF; {measure},{q}) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
    print()
print()

(C+<C-; diff,a2) : t=-7.8889, p=0.0001
(F+<F-; diff,a2) : t=-4.5128, p=0.0001
(C+<C-; time,a2) : t=-5.7292, p=0.0001
(F+<F-; time,a2) : t=-4.6453, p=0.0001

(ΔC>ΔF; diff,a2) : t=3.2187, p=0.0013
(ΔC>ΔF; time,a2) : t=1.3465, p=0.0933

(C+<C-; diff,a3) : t=-4.2488, p=0.0001
(F+<F-; diff,a3) : t=-2.5731, p=0.0073
(C+<C-; time,a3) : t=-4.3995, p=0.0001
(F+<F-; time,a3) : t=-4.6143, p=0.0001

(ΔC>ΔF; diff,a3) : t=2.1192, p=0.0217
(ΔC>ΔF; time,a3) : t=-0.1840, p=0.5725




In [18]:
# comparing a2 to a3: are the a3 ratings lower than the a2 ratings at each factor level?
idx = pd.Index(indices["a2"])  # a2 and a3 have the same condition labels
# rating tables keyed by (question, measure), relabelled with the condition names
ratings = {}
for q in ("a2", "a3"):
    answers = df_raw.loc[df_raw.index.str.startswith(q)]
    for measure, suffix in (("diff", "_d"), ("time", "_t")):
        table = answers.loc[answers.index.str.endswith(suffix)]
        table.index = idx
        ratings[q, measure] = table
# row masks per factor level, keyed by the label used in the printout
level_rows = {
    "C+": idx.str.contains(r"C\+"),
    "C-": idx.str.contains(r"C\-"),
    "F+": idx.str.contains(r"F\+"),
    "F-": idx.str.contains(r"F\-"),
}
# condition-wise test on the per-column means over the selected rows
for measure in ("diff", "time"):
    for level, rows in level_rows.items():
        a3_means = ratings["a3", measure].loc[rows].mean(0)
        a2_means = ratings["a2", measure].loc[rows].mean(0)
        stat, pval = scipy.stats.ttest_ind(a3_means, a2_means, **al)
        print(f"(a3<a2; {measure},{level}) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
print()

(a3<a2; diff,C+) : t=-0.0986, p=0.4610
(a3<a2; diff,C-) : t=-1.7685, p=0.0435
(a3<a2; diff,F+) : t=-0.5247, p=0.3014
(a3<a2; diff,F-) : t=-1.5605, p=0.0636
(a3<a2; time,C+) : t=-0.4370, p=0.3323
(a3<a2; time,C-) : t=-1.8428, p=0.0366
(a3<a2; time,F+) : t=-1.2363, p=0.1120
(a3<a2; time,F-) : t=-1.2329, p=0.1126



In [23]:
# comparing a4 to c1
answers = df_raw.loc[df_raw.index.str.startswith(("a4", "c1"))]
# NOTE(review): the labels are assigned by position after sorting the row names, i.e.
# the two a4 rows followed by the c1 row - confirm that this matches DL_F, DL_S, PL_S
idx = pd.Index(["DL_F", "DL_S", "PL_S"])
ratings = {}
for measure, suffix in (("diff", "_d"), ("time", "_t")):
    table = answers.loc[answers.index.str.endswith(suffix)].sort_index()
    table.index = idx
    ratings[measure] = table
# impact test: is DL_F rated lower than each of the other two conditions?
for measure, table in ratings.items():
    for other in ("DL_S", "PL_S"):
        stat, pval = scipy.stats.ttest_ind(table.loc["DL_F"], table.loc[other], **al)
        print(f"(DL_F<{other}; {measure}) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
print()

(DL_F<DL_S; diff) : t=-3.5467, p=0.0005
(DL_F<PL_S; diff) : t=-3.9261, p=0.0002
(DL_F<DL_S; time) : t=-5.3203, p=0.0001
(DL_F<PL_S; time) : t=-8.3121, p=0.0001



In [22]:
# Questions b1-b3: compare the conditions with (+) and without (-) each of the factors D, C and F
dcf_qs = ["b1", "b2", "b3"]
for q in dcf_qs:
    qdata = df_raw.loc[df_raw.index.str.startswith(q)]
    # The factor masks below are derived from the condition labels, so the labels
    # are mandatory here; a question without an entry in `indices` fails with a
    # KeyError (the former `if q in indices` guard ran after the lookup and was dead).
    idx = pd.Index(indices[q])
    # rating tables keyed by the name used in the printout, relabelled with the condition names
    ratings = {}
    for measure, suffix in (("diff", "_d"), ("time", "_t")):
        table = qdata.loc[qdata.index.str.endswith(suffix)]
        table.index = idx
        ratings[measure] = table
    # per factor: (rows with the factor, rows without it)
    levels = {
        "D": (idx.str.contains(r"D\+"), idx.str.contains(r"D\-")),
        "C": (idx.str.contains(r"C\+"), idx.str.contains(r"C\-")),
        "F": (idx.str.contains(r"F\+"), idx.str.contains(r"F\-")),
    }
    # condition-wise test: are ratings lower with the factor than without it?
    for measure, table in ratings.items():
        for factor, (pos, neg) in levels.items():
            stat, pval = scipy.stats.ttest_ind(table.loc[pos].mean(0), table.loc[neg].mean(0), **al)
            print(f"({factor}+<{factor}-; {measure},{q}) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
    print()
    # impact test: is the reduction attributed to the first factor larger than the
    # one attributed to the second? The "-" and "+" rows are subtracted by position,
    # which pairs conditions that differ only in the factor under test.
    for first, second in (("D", "C"), ("C", "F")):
        first_pos, first_neg = levels[first]
        second_pos, second_neg = levels[second]
        for measure, table in ratings.items():
            delta_first = (table.loc[first_neg].values - table.loc[first_pos].values).mean(0)
            delta_second = (table.loc[second_neg].values - table.loc[second_pos].values).mean(0)
            stat, pval = scipy.stats.ttest_ind(delta_first, delta_second, **ag)
            print(f"(Δ{first}>Δ{second}; {measure},{q}) : t={stat:.4f}, p={max(pval, 1e-4):.4f}")
    print()

(D+<D-; diff,b1) : t=-10.4198, p=0.0001
(C+<C-; diff,b1) : t=-6.1363, p=0.0001
(F+<F-; diff,b1) : t=-3.1675, p=0.0018
(D+<D-; time,b1) : t=-8.2099, p=0.0001
(C+<C-; time,b1) : t=-6.9195, p=0.0001
(F+<F-; time,b1) : t=-4.4103, p=0.0001

(ΔD>ΔC; diff,b1) : t=3.5309, p=0.0006
(ΔD>ΔC; time,b1) : t=1.9298, p=0.0311
(ΔC>ΔF; diff,b1) : t=3.3216, p=0.0011
(ΔC>ΔF; time,b1) : t=1.9156, p=0.0318

(D+<D-; diff,b2) : t=-10.7346, p=0.0001
(C+<C-; diff,b2) : t=-5.2575, p=0.0001
(F+<F-; diff,b2) : t=-2.8497, p=0.0038
(D+<D-; time,b2) : t=-7.3462, p=0.0001
(C+<C-; time,b2) : t=-6.8925, p=0.0001
(F+<F-; time,b2) : t=-4.6988, p=0.0001

(ΔD>ΔC; diff,b2) : t=5.7915, p=0.0001
(ΔD>ΔC; time,b2) : t=1.7648, p=0.0448
(ΔC>ΔF; diff,b2) : t=3.2152, p=0.0015
(ΔC>ΔF; time,b2) : t=2.0353, p=0.0251

(D+<D-; diff,b3) : t=-8.5552, p=0.0001
(C+<C-; diff,b3) : t=-4.1679, p=0.0001
(F+<F-; diff,b3) : t=-2.1585, p=0.0190
(D+<D-; time,b3) : t=-6.8881, p=0.0001
(C+<C-; time,b3) : t=-4.9940, p=0.0001
(F+<F-; time,b3) : t=-2.716