In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import analysis

In [None]:
# Build the analyser for the "ycsb" benchmark and fetch its data; every later
# cell derives its plotting frame from `all_data`.
analyser = analysis.Analyser("ycsb")
all_data = analyser.get_data()
# Bare expression so the notebook renders the loaded data as the cell output.
all_data

In [None]:
# Column names handed to the analysis helpers as `ignore_vars`.
# NOTE(review): presumably these are skipped when the helpers look for
# variables that differ between runs -- confirm in the `analysis` module.
ignore_vars = [
    "ledger_chunk_bytes",
    "snapshot_tx_interval",
    "sig_tx_interval",
    "sig_ms_interval",
    "nodes",
]

# Plots across all variables

In [None]:
# plot_data = all_data.copy(deep=False)
# plot_data = plot_data[plot_data["start_ms"] > 250]
# analyser.plot_scatter(plot_data, col="workload", ignore_vars=ignore_vars)

In [None]:
# ECDF plot, with `workload` passed as the `col` facet.
plot_data = all_data.copy(deep=False)
# Keep only rows whose start_ms is above 250.
# NOTE(review): presumably this drops a warm-up period -- confirm the intent
# and consider naming the threshold.
plot_data = plot_data[plot_data["start_ms"] > 250]
analyser.plot_ecdf(plot_data, col="workload", ignore_vars=ignore_vars)

In [None]:
# Percentile latency over time, with `workload` passed as the `col` facet.
plot_data = all_data.copy(deep=False)
p = analyser.plot_percentile_latency_over_time(
    plot_data, col="workload", ignore_vars=ignore_vars
)
# Label the axes on the object returned by the analyser helper.
p.set(xlabel="time (ms)", ylabel="latency (ms)")

In [None]:
# Achieved throughput over time, with `workload` passed as the `col` facet.
plot_data = all_data.copy(deep=False)
p = analyser.plot_throughput_over_time(
    plot_data, col="workload", ignore_vars=ignore_vars
)
# Label the axes on the object returned by the analyser helper.
p.set(xlabel="time (ms)", ylabel="achieved throughput (req/s)")

# Workload comparison

In [None]:
def _time_binned_latencies(data, interval):
    """Group the `latency_ms` values of `data` into bins of `start_s`.

    The bin edges are `np.arange(0, data["start_s"].max(), interval)` and the
    bins are right-closed, so a row with `start_s == 0` and any row later than
    the last edge fall outside every bin and are left out of the groups.
    """
    edges = np.arange(0, data["start_s"].max(), interval)
    bins = pd.cut(data["start_s"], edges)
    # observed=False keeps empty bins, so a gap shows up as NaN latency / zero
    # throughput whatever the installed pandas uses as its default.
    return data["latency_ms"].groupby(bins, observed=False)


def _bin_midpoints(index):
    """Return the midpoint of every interval in `index` as a float array."""
    return np.array([(iv.left + iv.right) / 2 for iv in index], dtype=float)


def plot_latency_and_throughput(
    datasets, labels, col_headers, ignore_vars, interval=1, percentile=0.99
):
    """Plot tail latency (top row) and achieved throughput (bottom row) over time.

    One column of subplots is drawn per entry of `datasets`; inside a column
    one line is drawn per frame, labelled with the matching entry of `labels`.

    Args:
        datasets: list of columns, each a list of DataFrames that have
            `start_s` and `latency_ms` columns.
        labels: legend labels, zipped with the frames of every column.
        col_headers: subplot titles for the top row, zipped with the columns.
        ignore_vars: forwarded to `analysis.condense_vars` for the sanity
            check that runs before plotting.
        interval: width of the time bins, in the unit of `start_s`.
        percentile: latency quantile (between 0 and 1) shown in the top row.

    Returns:
        `(figure, axis)` where `axis` is the 2 x len(datasets) array of Axes.

    Raises:
        AssertionError: if `analysis.condense_vars` returns a non-empty first
            element for any frame.
    """
    # squeeze=False keeps `axis` two-dimensional even for a single column.
    figure, axis = plt.subplots(
        2,
        len(datasets),
        sharex="col",
        sharey="row",
        figsize=(10, 4),
        squeeze=False,
    )

    for ax in axis.flat:
        ax.grid(True)

    # check that we don't have hidden variables grouped
    invariant_vars = None
    for dataset in datasets:
        for d in dataset:
            var, invariant_vars = analysis.condense_vars(d, ignore_vars)
            assert len(var) == 0, set(var)

    # Only the invariants of the last frame checked are reported.
    print("Invariants:", invariant_vars)

    # set titles on first row
    for ax, col in zip(axis[0], col_headers):
        ax.set_title(col)

    for dataset, latency_ax, throughput_ax in zip(datasets, axis[0], axis[1]):
        for d, label in zip(dataset, labels):
            binned = _time_binned_latencies(d, interval)

            tail_latency = binned.quantile(percentile)
            latency_ax.plot(
                _bin_midpoints(tail_latency.index),
                tail_latency.to_numpy(),
                label=label,
            )

            # Rows with a latency per bin, scaled to a per-unit-time rate.
            throughput = binned.count() / interval
            throughput_ax.plot(
                _bin_midpoints(throughput.index),
                throughput.to_numpy(),
                label=label,
            )

    # One legend for the whole figure, anchored just outside the last column.
    axis[0][-1].legend(bbox_to_anchor=(1.05, 1))

    axis[0][0].set_ylabel(f"Request latency (ms, {percentile * 100:g}%)")
    axis[1][0].set_ylabel("Achieved throughput (req/s)")
    # The x label goes on the middle column only, for any number of columns.
    axis[1][len(datasets) // 2].set_xlabel("Time (s)")

    return figure, axis

In [None]:
# Per-workload comparison of etcd against LSKV in its SGX and virtual enclave
# variants, restricted to rows with threads == 10.
plot_data = all_data[all_data["threads"] == 10]

etcd_data = plot_data[plot_data["store"] == "etcd"]
lskv_data = plot_data[plot_data["store"] == "lskv"]
sgx_data = lskv_data[lskv_data["enclave"] == "sgx"]
virtual_data = lskv_data[lskv_data["enclave"] == "virtual"]

# Workload E is deliberately not part of this figure.
workload_letters = ["a", "b", "c", "d", "f"]
systems = [etcd_data, sgx_data, virtual_data]

# One column per workload, one frame per system. `assign` builds new frames,
# so the filtered slices above are never mutated: start_ms is rebased to begin
# at zero and start_s is the rebased value in seconds.
datasets = [
    [
        system[system["workload"] == f"workload{letter}"].assign(
            start_ms=lambda run: run["start_ms"] - run["start_ms"].min(),
            start_s=lambda run: run["start_ms"] / 1000,
        )
        for system in systems
    ]
    for letter in workload_letters
]
col_headers = [f"Workload {letter.upper()}" for letter in workload_letters]
comparison_ignore_vars = ignore_vars + ["start_s", "operation"]

# The same figure is written twice, once labelled "CKVS" and once "LSKV".
# NOTE(review): the "anon" prefix presumably marks an anonymised version.
for file_prefix, store_name in [("anon", "CKVS"), ("final", "LSKV")]:
    fig, axes = plot_latency_and_throughput(
        datasets,
        ["etcd", f"SGX {store_name}", f"Virtual {store_name}"],
        col_headers,
        comparison_ignore_vars,
    )

    fig.tight_layout()
    fig.savefig(f"../plots/ycsb/{file_prefix}-workloads-comparison.png")

# Workload E (plotted separately because it is slow)

In [None]:
def plot_latency_cdf_single_workload(datasets, labels, ignore_vars):
    """Draw the empirical CDF of `latency_ms` for each frame on shared axes.

    Args:
        datasets: list of DataFrames with a `latency_ms` column.
        labels: legend labels, zipped with `datasets`.
        ignore_vars: forwarded to `analysis.condense_vars` for the sanity
            check that runs before plotting.

    Returns:
        The matplotlib figure holding the plot.

    Raises:
        AssertionError: if `analysis.condense_vars` returns a non-empty first
            element for any frame.
    """
    figure, ax = plt.subplots()
    ax.grid(True)

    # check that we don't have hidden variables grouped
    for frame in datasets:
        var, invariant_vars = analysis.condense_vars(frame, ignore_vars)
        assert len(var) == 0, set(var)

    # Only the invariants of the last frame checked are reported.
    print("Invariants:", invariant_vars)

    for frame, name in zip(datasets, labels):
        sns.ecdfplot(data=frame["latency_ms"], label=name, ax=ax)

    figure.legend(bbox_to_anchor=(0.9, 0.5))
    ax.set_xlabel("Request latency (ms)")
    ax.set_ylabel("Proportion of requests")

    return figure

In [None]:
# Latency CDF for workload E only, restricted to rows with threads == 10.
plot_data = all_data[all_data["threads"] == 10]

etcd_data = plot_data[plot_data["store"] == "etcd"]
lskv_data = plot_data[plot_data["store"] == "lskv"]
sgx_data = lskv_data[lskv_data["enclave"] == "sgx"]
virtual_data = lskv_data[lskv_data["enclave"] == "virtual"]

etcd_e_data = etcd_data[etcd_data["workload"] == "workloade"]
sgx_e_data = sgx_data[sgx_data["workload"] == "workloade"]
virtual_e_data = virtual_data[virtual_data["workload"] == "workloade"]

# `assign` builds new frames, so the filtered slices above are never mutated:
# start_ms is rebased to begin at zero and start_s is that value in seconds.
datasets = [
    run.assign(
        start_ms=lambda r: r["start_ms"] - r["start_ms"].min(),
        start_s=lambda r: r["start_ms"] / 1000,
    )
    for run in (etcd_e_data, sgx_e_data, virtual_e_data)
]
cdf_ignore_vars = ignore_vars + ["start_s", "operation"]

# The same figure is written twice, once labelled "CKVS" and once "LSKV".
# NOTE(review): the "anon" prefix presumably marks an anonymised version.
for file_prefix, store_name in [("anon", "CKVS"), ("final", "LSKV")]:
    fig = plot_latency_cdf_single_workload(
        datasets,
        ["etcd", f"SGX {store_name}", f"Virtual {store_name}"],
        cdf_ignore_vars,
    )

    fig.savefig(f"../plots/ycsb/{file_prefix}-workloade.png")

# Headline stat

In [None]:
def _latency_and_throughput(data, quantile):
    """Return `(latency quantile in ms, requests per second)` for `data`."""
    latency = data["latency_ms"].quantile(quantile)
    # NOTE(review): the run length is taken to be the largest end_ms, which
    # assumes end_ms is measured from the start of the run -- confirm.
    duration_s = data["end_ms"].max() / 1000
    throughput = data["latency_ms"].count() / duration_s
    return latency, throughput


def headline_stats(workload, enclave, debug=False, quantile=0.99):
    """Compare LSKV with etcd on one workload.

    Reads the notebook-level `all_data` and `ignore_vars`.

    Args:
        workload: value of the `workload` column to select.
        enclave: value of the `enclave` column used to select the LSKV rows.
        debug: when true, print the intermediate metrics.
        quantile: latency quantile (between 0 and 1) to compare.

    Returns:
        `(latency_ratio, throughput_ratio)`, both LSKV divided by etcd. A
        latency ratio below 1 means LSKV had the lower latency; a throughput
        ratio above 1 means LSKV achieved the higher throughput.

    Raises:
        AssertionError: if `analysis.condense_vars` returns a non-empty first
            element for either selection.
    """
    data = all_data[all_data["workload"] == workload]
    check_ignore_vars = ignore_vars + ["operation"]

    etcd_data = data[data["store"] == "etcd"]
    var, _ = analysis.condense_vars(etcd_data, check_ignore_vars)
    assert len(var) == 0, set(var)

    etcd_latency, etcd_throughput = _latency_and_throughput(etcd_data, quantile)
    if debug:
        print("etcd latency", etcd_latency)
        print("etcd throughput", etcd_throughput)

    lskv_data = data[(data["store"] == "lskv") & (data["enclave"] == enclave)]
    var, _ = analysis.condense_vars(lskv_data, check_ignore_vars)
    assert len(var) == 0, set(var)

    lskv_latency, lskv_throughput = _latency_and_throughput(lskv_data, quantile)
    if debug:
        print("lskv latency", lskv_latency)
        print("lskv throughput", lskv_throughput)

    latency_ratio = lskv_latency / etcd_latency
    throughput_ratio = lskv_throughput / etcd_throughput
    if debug:
        # Both values are LSKV / etcd ratios, so both are labelled as ratios.
        print("latency ratio", latency_ratio)
        print("throughput ratio", throughput_ratio)

    return latency_ratio, throughput_ratio

In [None]:
# For each enclave type, scan the workloads in order and keep the one whose
# LSKV/etcd ratios beat the running best on both latency (lower) and
# throughput (higher). Both bests start at 1, i.e. parity with etcd, so a
# workload is only reported if LSKV beats etcd on both metrics.
all_workloads = [f"workload{letter}" for letter in "abcdef"]

for enclave in ("sgx", "virtual"):
    best_latency = 1
    best_throughput = 1
    best_workload = ""
    for workload in all_workloads:
        lat, through = headline_stats(workload, enclave)
        beats_both = lat < best_latency and through > best_throughput
        if beats_both:
            best_latency, best_throughput, best_workload = lat, through, workload
    print(enclave, best_latency, best_throughput, best_workload)

In [None]:
# Bar chart of achieved throughput; "operation" is added to the ignored
# variables for this plot only.
plot_data = all_data.copy(deep=False)
analyser.plot_achieved_throughput_bar(
    plot_data, ignore_vars=ignore_vars + ["operation"]
)

In [None]:
# Throughput bar chart with `nodes` passed as the row facet and `operation`
# as the column facet.
# NOTE(review): "nodes" is also listed in ignore_vars -- confirm that the
# helper accepts a facet variable that is ignored at the same time.
plot_data = all_data.copy(deep=False)
analyser.plot_throughput_bar(
    plot_data, row="nodes", col="operation", ignore_vars=ignore_vars
)

In [None]:
# Target-throughput vs latency line plot with `nodes` passed as the column
# facet.
# NOTE(review): "nodes" is also listed in ignore_vars -- confirm that the
# helper accepts a facet variable that is ignored at the same time.
plot_data = all_data.copy(deep=False)
analyser.plot_target_throughput_latency_line(
    plot_data, col="nodes", ignore_vars=ignore_vars
)