In [None]:
# Load the watermark IPython extension so the %watermark magic used in a later
# cell is available.
%load_ext watermark


In [None]:
import os

from IPython.display import display
import pandas as pd
import polars as pl
import seaborn as sns
from teeplot import teeplot as tp


In [None]:
# Emit the watermark report for this session. Flag meanings are defined by the
# watermark extension; presumably this covers date/time, machine details, the
# Python version and versions of imported packages -- TODO confirm exact flags.
%watermark -diwmuv -iv


In [None]:
# Subdirectory name handed to every tp.teed call below via `teeplot_subdir=`.
# The NOTEBOOK_NAME environment variable, when set, overrides the fallback.
teeplot_subdir = os.getenv(
    "NOTEBOOK_NAME",
    default="2025-01-18-cpp-bench-memory",
)
teeplot_subdir


## Prep Data


In [None]:
# Download both benchmark result tables and stack them into a single frame.
df = pl.concat(
    [
        pl.read_csv("https://osf.io/m6wne/download"),
        pl.read_csv("https://osf.io/sb4zw/download"),
    ],
)

# Normalize dtypes, derive a per-item duration, and attach a human-readable
# `algorithm` label for each raw `algo_name`.
df = (
    df.cast(
        {
            "memory_bytes": pl.Int32,
            "num_items": pl.Int32,
            "num_sites": pl.Int32,
            "duration_s": pl.Float64,
            "replicate": pl.Int32,
        },
    )
    .with_columns(
        # seconds -> nanoseconds, divided by the number of items processed
        duration_per_item_ns=(
            pl.col("duration_s") * 1_000_000_000 / pl.col("num_items")
        )
    )
    .with_columns(
        # Plain dict lookup: an `algo_name` missing from this table makes the
        # cell fail instead of silently producing a null label.
        algorithm=pl.col("algo_name").map_elements(
            {
                "control_ring_algo": "control: ringbuf",
                "control_throwaway_algo": "control: discard",
                "dstream.steady_algo": "dstream steady",
                "dstream.stretched_algo": "dstream stretched",
                "dstream.tilted_algo": "dstream tilted",
                "naive_steady_algo": "naive steady",
                "zhao_steady_algo": "zhao steady",
                "zhao_tilted_algo": "zhao tilted",
            }.__getitem__,
            return_dtype=str,
        )
    )
)

# Show summary statistics plus the first and last rows as a sanity check.
display(df.describe(), df.head(), df.tail())


In [None]:
# Count the distinct memory_bytes readings inside every
# (num_items, num_sites, algorithm) group.
memory_readings_per_group = (
    df.group_by(["num_items", "num_sites", "algorithm"])
    .agg(pl.col("memory_bytes").n_unique())
    .to_pandas()
)

# Lift pandas' row/column display limits so the whole table is rendered.
with pd.option_context(
    "display.max_columns", None,
    "display.max_rows", None,
):
    display(memory_readings_per_group)


In [None]:
# Stack four copies of the measurements, one per "data type". The "bit" copy
# keeps memory_bytes as measured; every other copy adds
# `num_sites // 8 * factor` bytes on top of it.
# NOTE(review): presumably the raw measurements assume one bit per site, so an
# N-bit item costs (N - 1) extra bits per site -- TODO confirm.
# NOTE: this cell rebinds `df`, so re-running it without re-running the
# preparation cell stacks the copies again.
extra_bytes_factor_by_data_type = {
    "byte": 7,
    "double word": 31,
    "quad word": 63,
}
df = pl.concat(
    [
        df.with_columns(pl.lit("bit").alias("data type")),
        *[
            df.with_columns(
                pl.lit(data_type_label).alias("data type"),
                (
                    pl.col("memory_bytes")
                    + pl.col("num_sites") // 8 * extra_factor
                ).alias("memory_bytes"),
            )
            for data_type_label, extra_factor
            in extra_bytes_factor_by_data_type.items()
        ],
    ],
)


## Plot


## Memory Use


In [None]:
# Line plot of memory_bytes against num_sites, one line per algo_name, faceted
# by num_items (columns) and data type (rows). The ("pi", 100) error bar is a
# 100% percentile interval, i.e. it spans the full range of observations.
memory_relplot_kwargs = dict(
    data=df,
    x="num_sites",
    y="memory_bytes",
    hue="algo_name",
    col="num_items",
    row="data type",
    palette="muted",
    kind="line",
    errorbar=("pi", 100),
)
with tp.teed(
    sns.relplot,
    **memory_relplot_kwargs,
    teeplot_subdir=teeplot_subdir,
) as g:
    # Both axes are logarithmic.
    g.set(xscale="log", yscale="log")


In [None]:
# Plot memory use (log scale) for the "bit" data type: dstream steady vs. zhao
# steady, grouped by buffer size (num_sites).
# NOTE(review): zhao_steady_algo is shown under the label "naive" here, while
# the memory-savings cells use naive_steady_algo as their baseline -- confirm
# which baseline is intended.
with tp.teed(
    sns.catplot,
    data=df.filter(
        pl.col("data type") == "bit"
    ).filter(
        pl.col("algo_name").is_in(
            [
                "dstream.steady_algo",
                "zhao_steady_algo",
            ],
        ),
    ).with_columns(
        strategy=pl.col("algo_name").map_elements(
            {
                "dstream.steady_algo": "dstream",
                "zhao_steady_algo": "naive",
            }.__getitem__,
            return_dtype=str,
        ),
    ),
    x="num_sites",
    y="memory_bytes",
    hue="strategy",
    kind="bar",
    aspect=1.5,
    height=2,
    errorbar=("pi", 100),  # 100% percentile interval: full observed range
    palette="Set2",
    teeplot_subdir=teeplot_subdir,
) as g:
    # A log axis cannot start at zero: matplotlib warns about and ignores a
    # non-positive limit, so the lower bound is left to autoscaling instead of
    # requesting ylim=(0, None).
    g.set(yscale="log")
    g.set_xlabels("Buffer Capacity (item count)")
    g.set_ylabels("Memory (bytes B)")
    for ax in g.axes.flat:
        # First hue level: value printed just above the bar.
        ax.bar_label(
            ax.containers[0],
            fmt="%d B",
            label_type="edge",
            rotation=90,
            padding=3,
        )
        # Second hue level: value printed in white inside the bar.
        ax.bar_label(
            ax.containers[1],
            fmt="%d B",
            label_type="center",
            rotation=90,
            color="white",
        )
    # Put the legend above the plot, without a frame or title.
    sns.move_legend(
        g,
        "lower center",
        bbox_to_anchor=(0.5, 1),
        ncol=3,
        frameon=False,
        title=None,
    )


In [None]:
# Grid of small vertical bar charts comparing dstream steady with zhao steady:
# one row per data type, one column per buffer size (num_sites).
# NOTE(review): zhao_steady_algo is shown under the label "naive" here, while
# the memory-savings cells use naive_steady_algo as their baseline -- confirm
# which baseline is intended.
steady_strategy_labels = {
    "dstream.steady_algo": "dstream",
    "zhao_steady_algo": "naive",
}
steady_memory_by_data_type = df.filter(
    pl.col("algo_name").is_in(
        ["dstream.steady_algo", "zhao_steady_algo"],
    ),
).with_columns(
    # Break multi-word data type names across lines for the row titles.
    pl.col("data type").str.replace_all(" ", "\n"),
    strategy=pl.col("algo_name").map_elements(
        steady_strategy_labels.__getitem__,
        return_dtype=str,
    ),
)

with tp.teed(
    sns.catplot,
    data=steady_memory_by_data_type,
    col="num_sites",
    y="memory_bytes",
    hue="strategy",
    row="data type",
    kind="bar",
    errorbar=("pi", 100),
    margin_titles=True,
    aspect=0.4,
    height=1.3,
    palette="Set2",
    sharey=False,
    teeplot_subdir=teeplot_subdir,
) as g:
    g.fig.subplots_adjust(wspace=0, hspace=0)
    g.set(ylim=(0, None))
    g.set_titles(row_template="{row_name}", col_template="{col_name}\nsites")
    g.set_ylabels("Memory\n(bytes B)")

    for row_index, row_axes in enumerate(g.axes):
        # Label placement for the first bar container depends on the grid row:
        # rows 0-1 label above the bar, later rows label inside it.
        if row_index < 2:
            first_label_type, first_padding = "edge", 5
        elif row_index < 3:
            first_label_type, first_padding = "center", 3
        else:
            first_label_type, first_padding = "center", -2

        for ax in row_axes:
            ax.bar_label(
                ax.containers[0],
                fmt="  %d B",
                label_type=first_label_type,
                rotation=90,
                padding=first_padding,
            )
            # Second container is always labelled in white inside the bar.
            ax.bar_label(
                ax.containers[1],
                fmt=" %d B",
                label_type="center",
                rotation=90,
                color="white",
            )

    sns.move_legend(
        g,
        "lower center",
        bbox_to_anchor=(0.4, 0.95),
        ncol=3,
        frameon=False,
        title=None,
    )
    # Values are printed on the bars themselves, so the ticks are removed.
    g.set(yticks=[], xticks=[])


In [None]:
# Grid of horizontal bar charts of memory use, dstream steady vs. zhao steady:
# one row per buffer size (num_sites), one column per data type.
# NOTE(review): zhao_steady_algo is shown under the label "naive" here, while
# the memory-savings cells use naive_steady_algo as their baseline -- confirm
# which baseline is intended.
with tp.teed(
    sns.catplot,
    data=df.filter(
        pl.col("algo_name").is_in(
            [
                "dstream.steady_algo",
                "zhao_steady_algo",
            ],
        ),
    ).with_columns(
        strategy=pl.col("algo_name").map_elements(
            {
                "dstream.steady_algo": "dstream",
                "zhao_steady_algo": "naive",
            }.__getitem__,
            return_dtype=str,
        ),
    ),
    row="num_sites",
    x="memory_bytes",
    hue="strategy",
    col="data type",
    kind="bar",
    errorbar=("pi", 100),  # 100% percentile interval: full observed range
    margin_titles=True,
    aspect=4,
    height=0.5,
    palette="Set2",
    sharex=False,
    teeplot_subdir=teeplot_subdir,
) as g:
    g.fig.subplots_adjust(wspace=0, hspace=0)
    g.set(xlim=(0, None))
    g.set_titles(
        col_template="{col_name} data", row_template=""
    )
    g.set_xlabels("Memory (bytes B)")

    # Label each grid row with the num_sites value it actually shows. Reading
    # the values from the grid (rather than hard-coding them by index) keeps
    # the labels right if the set or order of buffer sizes changes.
    for row_name, row_axes in zip(g.row_names, g.axes):
        row_axes[0].set(ylabel=f"{row_name}")

    for row_axes in g.axes:
        for col, ax in enumerate(row_axes):
            # First container: label beyond the bar end in the first two
            # columns, inside the bar in later columns.
            ax.bar_label(
                ax.containers[0],
                fmt="%d B",
                label_type="edge" if col < 2 else "center",
                rotation=0,
                padding=5 if col < 2 else 3 if col < 3 else -2,
            )
            # Second container: always labelled in white inside the bar.
            ax.bar_label(
                ax.containers[1],
                fmt=" %d B",
                label_type="center",
                rotation=0,
                color="white",
            )
    g.fig.supylabel(
        "Buffer Size (num sites)", fontsize="medium", x=0.085
    )
    # Values are printed on the bars themselves, so the ticks are removed.
    g.set(yticks=[], xticks=[])


## Relative Memory Savings


In [None]:
# Relative savings: mean naive-steady memory divided by mean dstream-steady
# memory within each (num_items, num_sites, replicate) group.
# NOTE(review): `df` holds one copy of every row per "data type" at this point
# and "data type" is not a grouping key, so each mean pools all data types --
# confirm this is intended.
mean_naive_memory = (
    pl.col("memory_bytes")
    .filter(pl.col("algo_name") == "naive_steady_algo")
    .mean()
)
mean_dstream_memory = (
    pl.col("memory_bytes")
    .filter(pl.col("algo_name") == "dstream.steady_algo")
    .mean()
)
df_memory_savings = (
    df.filter(
        pl.col("algo_name").is_in(["dstream.steady_algo", "naive_steady_algo"])
    )
    .group_by(["num_items", "num_sites", "replicate"])
    .agg(memory_savings=mean_naive_memory / mean_dstream_memory)
)

# Show the per-buffer-size averages with pandas' display limits lifted.
with pd.option_context(
    "display.max_columns", None,
    "display.max_rows", None,
):
    display(df_memory_savings.group_by(["num_sites"]).mean().to_pandas())


# Bar chart of the relative savings ratio per buffer size. num_sites is cast
# to a string before plotting.
relative_savings_plot_kwargs = dict(
    data=df_memory_savings.cast({"num_sites": str}),
    x="num_sites",
    y="memory_savings",
    kind="bar",
    aspect=1.5,
    height=2,
)
with tp.teed(
    sns.catplot,
    **relative_savings_plot_kwargs,
    teeplot_subdir=teeplot_subdir,
) as g:
    g.set(ylim=(0, None))
    # Annotate every bar with its ratio rendered as an integer "N x" factor.
    for ax in g.axes.ravel():
        ax.bar_label(ax.containers[0], fmt="$%d\\times$")


## Absolute Memory Savings


In [None]:
# Absolute savings: mean naive-steady memory minus mean dstream-steady memory
# within each (num_items, num_sites, replicate) group. This rebinds
# `df_memory_savings`, replacing the relative-savings table.
mean_naive_memory = (
    pl.col("memory_bytes")
    .filter(pl.col("algo_name") == "naive_steady_algo")
    .mean()
)
mean_dstream_memory = (
    pl.col("memory_bytes")
    .filter(pl.col("algo_name") == "dstream.steady_algo")
    .mean()
)
df_memory_savings = (
    df.filter(
        pl.col("algo_name").is_in(["dstream.steady_algo", "naive_steady_algo"])
    )
    .group_by(["num_items", "num_sites", "replicate"])
    .agg(memory_savings=mean_naive_memory - mean_dstream_memory)
)

# Bar chart of the absolute savings in bytes per buffer size. num_sites is cast
# to a string before plotting.
absolute_savings_plot_kwargs = dict(
    data=df_memory_savings.cast({"num_sites": str}),
    x="num_sites",
    y="memory_savings",
    kind="bar",
    aspect=1.5,
    height=2,
)
with tp.teed(
    sns.catplot,
    **absolute_savings_plot_kwargs,
    teeplot_subdir=teeplot_subdir,
) as g:
    g.set(ylim=(0, None))
    # Annotate every bar with its saving as a whole number of bytes.
    for ax in g.axes.ravel():
        ax.bar_label(ax.containers[0], fmt="%d B")
