In [1]:
import pandas as pd
import json
from pathlib import Path
# Raise pandas' display limits (column count, row count, output width) for this notebook.
for _option, _value in (
    ("display.max_columns", 500),
    ("display.max_rows", 500),
    ("display.width", 1000),
):
    pd.set_option(_option, _value)

# Move into the study folder: the cells below use paths relative to it
# (Path.cwd() / "data", "stats/...", "figures/...").
# NOTE(review): the target is a relative path, so this presumably only works from the
# repository root and on the first run after a kernel restart — confirm.
%cd evaluation/ustdy_1_need

/Users/yasith/projects/streaminghub/evaluation/ustdy_1_need


In [5]:
data_path = Path.cwd() / "data"
stats_path = Path("stats")
# to_csv does not create missing directories, so make sure the output folder exists
stats_path.mkdir(parents=True, exist_ok=True)

# poll answers, ordered so that the list position is the numeric code: "N" -> 0, "Y" -> 1
poll = ["N", "Y"]


def _encode_answer(value):
    """Return the numeric code of one answer.

    "N"/"Y" are replaced by their position in `poll` (0/1); every other
    value is coerced with int(), which raises ValueError/TypeError if the
    value is not integer-like.
    """
    return poll.index(value) if value in poll else int(value)


# one entry per JSON file, keyed by the file stem: {answer key -> numeric answer}
data = {}
# sorted() keeps the column order of the CSVs independent of directory listing order
for path in sorted(data_path.glob("*.json")):
    with open(path, encoding="utf-8") as f:
        answer = json.load(f)
    # build a fresh dict instead of rewriting `answer` while iterating over it
    data[path.stem] = {k: _encode_answer(v) for k, v in answer.items()}

# columns = files, rows = answer keys
df_raw = pd.DataFrame(data)
df_raw.to_csv(stats_path / "raw.csv")
# per-row mean and standard deviation across all files
df_raw.agg(["mean", "std"], axis=1).to_csv(stats_path / "agg.csv")

In [36]:
from matplotlib import pyplot as plt

# NOTE(review): presumably a 9:16 aspect ratio; not referenced in the visible cells.
r = 9 / 16

# Display label for each question id (used for figure titles and bar labels).
titles = {
    "a1": "Documenting Datasets",
    "a2": "Pre-Analysis Data Curation",
    "a3": "Post-Collection Data Curation",
    "a4": "Data Loading",
    "b1": "Understanding Experiment Variables",
    "b2": "Understanding Data Fields",
    "b3": "Mimic Real Workload",
    "b4": "B/M Peak WL",
    "b5": "B/M Real WL",
    "c1": "Connecting Data Streams with Algorithms",
    "c2": "Data Access",
    "c3_1": "Curator",
    "c3_2": " DataMux",
}

# Condition labels: every +/- combination of C and F, and the same again prefixed by D+/D-.
_cf_levels = [f"C{c}F{f}" for c in "+-" for f in "+-"]
_dcf_levels = [f"D{d}{cf}" for d in "+-" for cf in _cf_levels]

# Replacement row labels per question; each question gets its own list object.
indices = {
    "a1": ["Variables", "Metadata", "Fields", "Units"],
    "a2": list(_cf_levels),
    "a3": list(_cf_levels),
    "a4": ["Reusing", "Scratch"],
    "b1": list(_dcf_levels),
    "b2": list(_dcf_levels),
    "b3": list(_dcf_levels),
    "c1": [""],
}

# Question ids grouped by the kind of chart they are drawn with.
multi_cond_qs = ("a1", "a2", "a3", "a4", "b1", "b2", "b3", "c1")
bnary_cond_qs = ("b4", "b5", "c2")
ratng_cond_qs = ("c3",)

# NOTE: make sure A1 is interpreted as four different questions

# savefig does not create missing directories, so make sure the output folder exists
Path("figures").mkdir(parents=True, exist_ok=True)

# One bar chart per multi-condition question: mean difficulty and mean time demand
# for each condition, with the standard deviation as error bars.
for q in multi_cond_qs:
    # all rows whose id starts with this question id
    qdata = df_raw.loc[df_raw.index.str.startswith(q)]
    # "..._d" rows feed the difficulty series, "..._t" rows the time-demand series
    qdata_d_cols = qdata.index.str.endswith("_d")
    qdata_t_cols = qdata.index.str.endswith("_t")
    qdata_d = qdata.loc[qdata_d_cols].agg(["mean", "std"], axis=1)
    qdata_t = qdata.loc[qdata_t_cols].agg(["mean", "std"], axis=1)
    # cleanup index names; both series get the same labels so they align bar-for-bar
    # (pandas raises ValueError if the label count differs from the row count)
    if q in indices:
        qdata_d.index = pd.Index(indices[q])
        qdata_t.index = pd.Index(indices[q])
    qdf_mean = pd.DataFrame({"difficulty": qdata_d['mean'], "time demand": qdata_t['mean']})
    qdf_std = pd.DataFrame({"difficulty": qdata_d['std'], "time demand": qdata_t['std']})
    # b-questions are drawn stacked, so their y-axis spans up to 10 instead of 5
    is_b = q[0] == 'b'
    # work on the Axes/Figure returned by pandas rather than pyplot's implicit current figure
    ax = qdf_mean.plot.bar(title=f"Q{q.upper()} - {titles[q]}", yerr=qdf_std, rot=0, stacked=is_b)
    ax.set_ylim(1, 10 if is_b else 5)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(f"figures/q_{q}.pdf")
    # release the figure so a long loop does not accumulate open figures
    plt.close(fig)

In [41]:
def _save_summary_bars(prefixes, title, ylim, out_path, show_std=False):
    """Plot the per-row mean of the df_raw rows matching `prefixes` and save the figure.

    Args:
        prefixes: tuple of row-id prefixes, passed to `Index.str.startswith`.
        title: figure title.
        ylim: (lower, upper) limits of the y-axis.
        out_path: file the figure is written to.
        show_std: when True, draw the per-row standard deviation as error bars.

    Returns:
        The aggregated frame (columns "mean" and "std") whose index has been
        mapped through `titles`.
    """
    summary = df_raw.loc[df_raw.index.str.startswith(prefixes)].agg(["mean", "std"], axis=1)
    # swap the raw row ids for their display labels (ids missing from `titles` become None)
    summary.index = summary.index.map(titles.get)
    # only pass yerr when error bars are wanted, so the no-error-bar call is unchanged
    err_kwargs = {"yerr": summary["std"]} if show_std else {}
    ax = summary["mean"].plot.bar(title=title, rot=0, **err_kwargs)
    ax.set_ylim(*ylim)
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)
    return summary


# savefig does not create missing directories, so make sure the output folder exists
Path("figures").mkdir(parents=True, exist_ok=True)

# visualize the binary (0/1-coded) questions, which have no difficulty / time demand rows
qdata = _save_summary_bars(bnary_cond_qs, "Need for Automation Tools", (0.75, 1), "figures/q_binary.pdf")

# visualize the rating questions, with the standard deviation as error bars
qdata = _save_summary_bars(ratng_cond_qs, "Potential Impact", (4, 5), "figures/q_rating.pdf", show_std=True)