In [None]:
from iterpop import iterpop as ip
from keyname import keyname as kn
from matplotlib import pyplot as plt
from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from teeplot import teeplot as tp

In [None]:
from conduitpylib.utils import consolidate_merge

In [None]:
# Emit notebook metadata via nbmetalog.
# NOTE(review): presumably provenance/context details for this run -- confirm
# against the nbmetalog documentation.
nbm.print_metadata()

# Get Data

In [None]:
# Fetch the gzip-compressed inlet records and discard any row that has no
# process instance identifier.
df_inlet = (
    pd.read_csv('https://osf.io/2rdj6/download', compression='gzip')
    .dropna(subset=['Process Instance UUID'])
)
nbm.print_dataframe_summary(*eval(nbm.nvp_expr('df_inlet')))

In [None]:
# Fetch the gzip-compressed outlet records and discard any row that has no
# process instance identifier.
df_outlet = (
    pd.read_csv('https://osf.io/9utpr/download', compression='gzip')
    .dropna(subset=['Process Instance UUID'])
)
nbm.print_dataframe_summary(*eval(nbm.nvp_expr('df_outlet')))

In [None]:
# Outer-join inlet and outlet records on process instance and update.
# NOTE(review): the ' Inlet' / ' Outlet' suffixes presumably disambiguate
# columns present on both sides, as in pandas merge -- confirm against
# consolidate_merge.
df = consolidate_merge(
    df_inlet, df_outlet,
    how='outer',
    on=['Process Instance UUID', 'Update'],
    suffixes=(' Inlet', ' Outlet'),
)
nbm.print_dataframe_synopsis(*eval(nbm.nvp_expr('df')))

# Prep Data

In [None]:
# Coerce counter, identifier and job-configuration columns to 64-bit
# integers, and the execution blur flag to bool.
df = df.astype({
    **dict.fromkeys(
        [
            'Num Inlets',
            'Num Outlets',
            'Num Puts Attempted',
            'Num Try Puts Attempted',
            'Num Blocking Puts',
            'Num Try Puts That Succeeded',
            'Num Puts That Succeeded Eventually',
            'Num Blocking Puts That Succeeded Immediately',
            'Num Puts That Succeeded Immediately',
            'Num Puts That Blocked',
            'Num Dropped Puts',
            'Num Round Trip Touches Inlet',
            'Net Flux Through Duct',
            'proc',
            'Snapshot',
            'Replicate',
            'Async Mode',
            'Num Threads',
            'Num Processes',
            'SLURM_NNODES',
            'SLURM_NTASKS',
            'SLURM_CPUS_ON_NODE',
        ],
        'int64',
    ),
    'Has Execution Blur': 'bool',
})

In [None]:
def _hostname_of(row):
    """Return the '_hostname' entry unpacked from the row's inlet source file name."""
    return kn.unpack(row['Source File Inlet'])['_hostname']


df['Hostname'] = df.apply(_hostname_of, axis=1)

In [None]:
# --- job configuration aliases -------------------------------------------
df['Num Nodes'] = df['SLURM_NNODES']
df['Num Tasks'] = df['SLURM_NTASKS']
df['Num Simels Per Cpu'] = df['Num Simulation Elements Per Cpu']
df['Num Cpus'] = df['Num Threads'] * df['Num Processes']
df['Allocated Tasks Per Node'] = df['Num Tasks'] // df['Num Nodes']

# --- delivery time: attempts (less one) per completed round trip touch ----
df['Delivery Time Inlet'] = (
    (df['Num Puts Attempted'] - 1) / df['Num Round Trip Touches Inlet']
)
df['Delivery Time Outlet'] = (
    (df['Num Pulls Attempted'] - 1) / df['Num Round Trip Touches Outlet']
)

# fraction of pulls that were laden immediately, relative to whichever is
# smaller: the duct's net flux or the number of pulls attempted
df['Intermittancy'] = (
    df['Num Pulls That Were Laden Immediately']
    / df[['Net Flux Through Duct', 'Num Pulls Attempted']].min(axis=1)
)

# --- elapsed time scaled by the number of inlets / outlets ----------------
df['Inlet-Seconds Elapsed'] = (
    df['Num Inlets'] * df['Runtime Seconds Elapsed Inlet']
)
df['Outlet-Seconds Elapsed'] = (
    df['Num Outlets'] * df['Runtime Seconds Elapsed Outlet']
)

# --- latency in simsteps --------------------------------------------------
df['Latency Simsteps Inlet'] = df['Delivery Time Inlet']
# the outlet latency must come from the outlet-side delivery time
# (previously this mistakenly reused the inlet-side value)
df['Latency Simsteps Outlet'] = df['Delivery Time Outlet']

# --- simstep period and walltime latency ----------------------------------
df['Simstep Period Inlet (s)'] = (
    df['Inlet-Seconds Elapsed'] / df['Num Puts Attempted']
)
df['Simstep Period Outlet (s)'] = (
    df['Outlet-Seconds Elapsed'] / df['Num Pulls Attempted']
)
df['Latency Walltime Inlet (s)'] = (
    df['Latency Simsteps Inlet'] * df['Simstep Period Inlet (s)']
)
df['Latency Walltime Outlet (s)'] = (
    df['Latency Simsteps Outlet'] * df['Simstep Period Outlet (s)']
)

# base-4 logarithm of the process count
df['Log Num Processes'] = np.log(df['Num Processes']) / np.log(4)

# Prep DataFrame Variants

In [None]:
# Keep one row per process instance -- the one with the greatest 'Update':
# sort by 'Update' descending, then retain the first row seen for each UUID.
# adapted from https://stackoverflow.com/a/40629420
df_finalized_observations = (
    df
    .sort_values('Update', ascending=False)
    .drop_duplicates(['Process Instance UUID'])
)

In [None]:
# Select the rows whose execution blur flag is set.
df_blurry_snapshots = df.loc[df['Has Execution Blur'].astype(bool)]

In [None]:
# Sum the finalized per-process observations within each
# replicate / configuration combination.
df_world_sum = df_finalized_observations.groupby(
    [
        'Replicate',
        'Async Mode',
        'Num Processes',
        'Num Nodes',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
    ],
    as_index=False,
).sum()

# --- message delivery fractions -------------------------------------------
df_world_sum['Fraction Messages Utilized'] = (
    df_world_sum['Num Reads That Were Fresh']
    / df_world_sum['Num Try Puts Attempted']
)
df_world_sum['Fraction Messages Delivered'] = (
    df_world_sum['Num Try Puts That Succeeded']
    / df_world_sum['Num Try Puts Attempted']
)
df_world_sum['Delivery Failure Rate'] = (
    1.0 - df_world_sum['Fraction Messages Delivered']
)
df_world_sum['Fraction Messages Dropped'] = (
    df_world_sum['Delivery Failure Rate']
)
df_world_sum['Fraction Try Pulls That Were Laden'] = (
    df_world_sum['Num Try Pulls That Were Laden']
    / df_world_sum['Num Try Pulls Attempted']
)

# --- round trip touch rates -----------------------------------------------
df_world_sum['Round Trip Touches Per Attempted Pull'] = (
    df_world_sum['Num Round Trip Touches Outlet']
    / df_world_sum['Num Try Pulls Attempted']
)
df_world_sum['Round Trip Touches Per Attempted Put'] = (
    df_world_sum['Num Round Trip Touches Inlet']
    / df_world_sum['Num Try Puts Attempted']
)
df_world_sum['Num Inflight Messages'] = (
    2.0 / df_world_sum['Round Trip Touches Per Attempted Put'] - 1
)

# --- duct flux ------------------------------------------------------------
df_world_sum['Fraction Duct Flux Stepped Through'] = (
    df_world_sum['Num Revisions Pulled']
    / df_world_sum['Net Flux Through Duct']
)
df_world_sum['Fraction Duct Flux Jumped Over'] = (
    1.0 - df_world_sum['Fraction Duct Flux Stepped Through']
)
df_world_sum['Round Trip Touches Per Runtime Second'] = (
    df_world_sum['Num Round Trip Touches Inlet']
    / df_world_sum['Runtime Seconds Elapsed Inlet']
)

# --- latency in simsteps --------------------------------------------------
df_world_sum['Latency Simsteps Inlet'] = (
    (df_world_sum['Num Puts Attempted'] - 1)
    / df_world_sum['Num Round Trip Touches Inlet']
)
df_world_sum['Latency Simsteps Outlet'] = (
    (df_world_sum['Num Pulls Attempted'] - 1)
    / df_world_sum['Num Round Trip Touches Outlet']
)

# --- burstiness -----------------------------------------------------------
df_world_sum['Delivery Burstiness'] = (
    df_world_sum['Num Pulls That Were Laden Immediately']
    / df_world_sum[['Net Flux Through Duct', 'Num Pulls Attempted']].min(axis=1)
)
df_world_sum['Intermittancy'] = df_world_sum['Delivery Burstiness']

# --- simstep period and walltime latency ----------------------------------
df_world_sum['Simstep Period Inlet (s)'] = (
    df_world_sum['Inlet-Seconds Elapsed']
    / df_world_sum['Num Puts Attempted']
)
df_world_sum['Simstep Period Outlet (s)'] = (
    df_world_sum['Outlet-Seconds Elapsed']
    / df_world_sum['Num Pulls Attempted']
)
df_world_sum['Latency Walltime Inlet (s)'] = (
    df_world_sum['Latency Simsteps Inlet']
    * df_world_sum['Simstep Period Inlet (s)']
)
df_world_sum['Latency Walltime Outlet (s)'] = (
    df_world_sum['Latency Simsteps Outlet']
    * df_world_sum['Simstep Period Outlet (s)']
)

In [None]:
# Collapse each (process instance, snapshot) group of blurry rows into one
# row: counters and final timepoints become their peak-to-peak range
# (max - min) over the group, runtime seconds become the group mean.
df_snapshot_diffs = df_blurry_snapshots.groupby(
    [
        'Process Instance UUID',
        'Snapshot',
        # subsequent items aren't meaningful to groupby
        # but are just included so they pass through untouched
        'Async Mode',
        'Num Nodes',
        'Allocated Tasks Per Node',
        'Num Processes',
        'Log Num Processes',
        'Num Simels Per Cpu',
        'Replicate',
        'proc',
        'Hostname',
        'Num Inlets',
        'Num Outlets',
        'Execution Instance UUID',
        'Num Threads',
    ],
    as_index=False,
).aggregate({
    **dict.fromkeys(
        [
            'Num Puts Attempted',
            'Num Try Puts Attempted',
            'Num Blocking Puts',
            'Num Try Puts That Succeeded',
            'Num Puts That Succeeded Eventually',
            'Num Blocking Puts That Succeeded Immediately',
            'Num Puts That Succeeded Immediately',
            'Num Puts That Blocked',
            'Num Dropped Puts',
            'Num Reads Performed',
            'Num Reads That Were Fresh',
            'Num Reads That Were Stale',
            'Num Revisions Pulled',
            'Num Try Pulls Attempted',
            'Num Blocking Pulls',
            'Num Blocking Pulls That Blocked',
            'Num Revisions From Try Pulls',
            'Num Revisions From Blocking Pulls',
            'Num Pulls Attempted',
            'Num Pulls That Were Laden Eventually',
            'Num Blocking Pulls That Were Laden Immediately',
            'Num Blocking Pulls That Were Laden Eventually',
            'Num Pulls That Were Laden Immediately',
            'Num Try Pulls That Were Laden',
            'Num Try Pulls That Were Unladen',
            'Net Flux Through Duct',
            'Num Round Trip Touches Inlet',
            'Num Round Trip Touches Outlet',
            # why are these missing?
            #     'Row Initial Timepoint (ns) Inlet',
            #     'Row Initial Timepoint (ns) Outlet',
            'Row Final Timepoint (ns) Inlet',
            'Row Final Timepoint (ns) Outlet',
        ],
        np.ptp,
    ),
    'Runtime Seconds Elapsed Inlet': np.mean,
    'Runtime Seconds Elapsed Outlet': np.mean,
})

In [None]:
# --- message delivery fractions -------------------------------------------
df_snapshot_diffs['Fraction Messages Delivered'] = (
    df_snapshot_diffs['Num Try Puts That Succeeded']
    / df_snapshot_diffs['Num Try Puts Attempted']
)
# same quantity under a second name
df_snapshot_diffs['Delivery Success Rate'] = (
    df_snapshot_diffs['Fraction Messages Delivered']
)
df_snapshot_diffs['Delivery Failure Rate'] = (
    1 - df_snapshot_diffs['Delivery Success Rate']
)
df_snapshot_diffs['Fraction Messages Dropped'] = (
    df_snapshot_diffs['Delivery Failure Rate']
)
df_snapshot_diffs['Fraction Try Pulls That Were Laden'] = (
    df_snapshot_diffs['Num Try Pulls That Were Laden']
    / df_snapshot_diffs['Num Try Pulls Attempted']
)

# --- round trip touch rates -----------------------------------------------
df_snapshot_diffs['Round Trip Touches Per Attempted Put'] = (
    df_snapshot_diffs['Num Round Trip Touches Inlet']
    / df_snapshot_diffs['Num Try Puts Attempted']
)
df_snapshot_diffs['Round Trip Touches Per Attempted Pull'] = (
    df_snapshot_diffs['Num Round Trip Touches Outlet']
    / df_snapshot_diffs['Num Try Pulls Attempted']
)
df_snapshot_diffs['Round Trip Touches Per Runtime Nanosecond'] = (
    df_snapshot_diffs['Num Round Trip Touches Outlet']
    / df_snapshot_diffs['Row Final Timepoint (ns) Outlet']
)

# --- latency in simsteps --------------------------------------------------
df_snapshot_diffs['Latency Simsteps Inlet'] = (
    df_snapshot_diffs['Num Puts Attempted']
    / df_snapshot_diffs['Num Round Trip Touches Inlet']
)
df_snapshot_diffs['Latency Simsteps Outlet'] = (
    df_snapshot_diffs['Num Pulls Attempted']
    / df_snapshot_diffs['Num Round Trip Touches Outlet']
)

# --- burstiness -----------------------------------------------------------
df_snapshot_diffs['Delivery Burstiness'] = (
    df_snapshot_diffs['Num Pulls That Were Laden Immediately']
    / df_snapshot_diffs[
        ['Net Flux Through Duct', 'Num Pulls Attempted']
    ].min(axis=1)
)
df_snapshot_diffs['Intermittancy'] = df_snapshot_diffs['Delivery Burstiness']

# --- elapsed time scaled by the number of inlets / outlets ----------------
df_snapshot_diffs['Inlet-Nanoseconds Elapsed'] = (
    df_snapshot_diffs['Num Inlets']
    * df_snapshot_diffs['Row Final Timepoint (ns) Inlet']
)
df_snapshot_diffs['Outlet-Nanoseconds Elapsed'] = (
    df_snapshot_diffs['Num Outlets']
    * df_snapshot_diffs['Row Final Timepoint (ns) Outlet']
)

# --- simsteps elapsed and simstep period ----------------------------------
df_snapshot_diffs['Simsteps Elapsed Inlet'] = (
    df_snapshot_diffs['Num Puts Attempted']
    / df_snapshot_diffs['Num Inlets']
)
df_snapshot_diffs['Simsteps Elapsed Outlet'] = (
    df_snapshot_diffs['Num Pulls Attempted']
    / df_snapshot_diffs['Num Outlets']
)
df_snapshot_diffs['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs['Inlet-Nanoseconds Elapsed']
    / df_snapshot_diffs['Num Puts Attempted']
)
df_snapshot_diffs['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs['Outlet-Nanoseconds Elapsed']
    / df_snapshot_diffs['Num Pulls Attempted']
)

# --- walltime latency -----------------------------------------------------
df_snapshot_diffs['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs['Latency Simsteps Inlet']
    * df_snapshot_diffs['Simstep Period Inlet (ns)']
)
df_snapshot_diffs['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs['Latency Simsteps Outlet']
    * df_snapshot_diffs['Simstep Period Outlet (ns)']
)

In [None]:
# Coerce the pass-through grouping columns to 64-bit integers.
df_snapshot_diffs = df_snapshot_diffs.astype(
    dict.fromkeys(
        [
            'Num Inlets',
            'Num Outlets',
            'proc',
            'Snapshot',
            'Replicate',
            'Async Mode',
            'Num Threads',
            'Num Processes',
            'Allocated Tasks Per Node',
            'Num Nodes',
            'Num Simels Per Cpu',
        ],
        'int64',
    )
)

# End-state Data Analysis

This data appears to be skewed by ragged network launch/completion.

In [None]:
def facet_boxplot(*, data, col, row, x, y, showfliers=False):
    """Draw box plots of `y` against `x` on a grid faceted by `row` and `col`.

    Parameters
    ----------
    data : DataFrame holding the columns named below.
    col, row : column names used to split the grid's columns and rows.
    x, y : column names plotted on each panel's axes.
    showfliers : forwarded to seaborn's boxplot; whether outlier points are
        drawn. Defaults to False.
    """
    g = sns.FacetGrid(
        data,
        col=col,
        row=row,
        margin_titles=True,
    )
    # pass x/y by keyword: boxplot's leading positional slot is not reliably
    # `x` across seaborn versions and can collide with the `data` keyword
    # that map_dataframe supplies
    g.map_dataframe(
        sns.boxplot,
        x=x,
        y=y,
        showfliers=showfliers,
    )
    # label the axes with the columns actually plotted (the previous
    # hard-coded labels were unrelated to this data)
    g.set_axis_labels(x_var=x, y_var=y)

## Latency Walltime

In [None]:
# End-state inlet walltime latency versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Latency Walltime Inlet (s)',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

In [None]:
# End-state outlet walltime latency versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Latency Walltime Outlet (s)',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

## Latency Simsteps

In [None]:
# End-state inlet simstep latency versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Latency Simsteps Inlet',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

In [None]:
# End-state outlet simstep latency versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Latency Simsteps Outlet',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

## Delivery Failure Rate

In [None]:
# End-state delivery failure rate versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Delivery Failure Rate',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

## Delivery Burstiness

In [None]:
# End-state delivery burstiness versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Delivery Burstiness',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

## Simstep Period

In [None]:
# End-state inlet simstep period versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Simstep Period Inlet (s)',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

In [None]:
# End-state outlet simstep period versus process count (fliers shown).
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    col='Allocated Tasks Per Node',
    row='Num Simels Per Cpu',
    x='Num Processes',
    y='Simstep Period Outlet (s)',
    showfliers=True,
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)

# Live Snapshot Analysis

In [None]:
def facet_barplot(*, data, col, row, x, y, hue=None):
    """Draw bar plots of `y` against `x` on a grid faceted by `row` and `col`.

    Panels in the same grid row share a y axis. `hue`, when given, is
    forwarded to seaborn's barplot.
    """
    grid = sns.FacetGrid(
        data=data,
        row=row,
        col=col,
        sharey='row',
        margin_titles=True,
    )
    grid.map_dataframe(sns.barplot, x=x, y=y, hue=hue)

    # label axes with the plotted column names
    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

In [None]:
def facet_boxplot_withfliers(*, data, col, row, x, y, hue=None):
    """Draw box plots (outlier points shown) of `y` against `x` on a faceted grid.

    The grid is split by `row` and `col`; panels in the same grid row share
    a y axis. `hue`, when given, is forwarded to seaborn's boxplot.
    """
    grid = sns.FacetGrid(
        data=data,
        row=row,
        col=col,
        sharey='row',
        margin_titles=True,
    )
    grid.map_dataframe(sns.boxplot, x=x, y=y, hue=hue, showfliers=True)

    # label axes with the plotted column names
    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

In [None]:
def facet_boxplot_nofliers(*, data, col, row, x, y, hue=None):
    """Draw box plots (outlier points hidden) of `y` against `x` on a faceted grid.

    The grid is split by `row` and `col`; panels in the same grid row share
    a y axis. `hue`, when given, is forwarded to seaborn's boxplot.
    """
    grid = sns.FacetGrid(
        data=data,
        row=row,
        col=col,
        sharey='row',
        margin_titles=True,
    )
    grid.map_dataframe(sns.boxplot, x=x, y=y, hue=hue, showfliers=False)

    # label axes with the plotted column names
    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

## Latency Walltime

In [None]:
# Snapshot-window inlet walltime latency, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Latency Walltime Inlet (ns)',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# Snapshot-window outlet walltime latency, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Latency Walltime Outlet (ns)',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

## Latency Simsteps

In [None]:
# Snapshot-window inlet simstep latency, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Latency Simsteps Inlet',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# Snapshot-window outlet simstep latency, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Latency Simsteps Outlet',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

## Delivery Failure Rate

In [None]:
# Snapshot-window delivery failure rate, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Delivery Failure Rate',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

## Delivery Burstiness

In [None]:
# Snapshot-window delivery burstiness, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Delivery Burstiness',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

## Simstep Period

In [None]:
# Snapshot-window inlet simstep period, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Simstep Period Inlet (ns)',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# Snapshot-window outlet simstep period, rendered with each plot style.
for plotter in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        plotter,
        data=df_snapshot_diffs,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Num Processes',
        y='Simstep Period Outlet (ns)',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )

# Model Fits

In [None]:
def make_regression_row(*, data, dependent_variable, regression, row_filter):
    """Fit `dependent_variable` against 'Log Num Processes' and summarize the fit.

    Parameters
    ----------
    data : DataFrame; expected to be homogeneous in 'Allocated Tasks Per Node'
        and 'Num Simels Per Cpu' (both are read via `ip.pophomogeneous`).
    dependent_variable : name of the response column.
    regression : 'Ordinary Least Squares Regression' or 'Quantile Regression'.
    row_filter : source text of a one-argument row predicate; it is evaluated
        with `eval`, so it must only ever come from trusted code.

    Returns
    -------
    dict with slope/intercept estimates, their confidence bounds, R^2,
    slope p-value, sample size, and the fit summary.
    """
    keep_row = eval(row_filter)
    filtered_data = data[data.apply(keep_row, axis=1)]

    fitters = {
        'Ordinary Least Squares Regression' : smf.ols,
        'Quantile Regression' : smf.quantreg,
    }
    res = fitters[regression](
        f"Q('{dependent_variable}') ~ Q('Log Num Processes')",
        filtered_data,
    ).fit()

    slope_term = "Q('Log Num Processes')"
    bounds = res.conf_int()
    slope_ci_lb, slope_ci_ub = bounds.loc[slope_term].tolist()
    intercept_ci_lb, intercept_ci_ub = bounds.loc['Intercept'].tolist()

    row = {}
    row['Independent Variable'] = 'Log Num Processes'
    row['Dependent Variable'] = dependent_variable
    row['Allocated Tasks Per Node'] = ip.pophomogeneous(
        data['Allocated Tasks Per Node']
    )
    row['Num Simels Per Cpu'] = ip.pophomogeneous(data['Num Simels Per Cpu'])
    row['Slope Estimate'] = res.params[slope_term]
    row['Slope Estimate 95% CI Lower Bound'] = slope_ci_lb
    row['Slope Estimate 95% CI Upper Bound'] = slope_ci_ub
    row['Intercept Estimate'] = res.params['Intercept']
    row['Intercept Estimate 95% CI Lower Bound'] = intercept_ci_lb
    row['Intercept Estimate 95% CI Upper Bound'] = intercept_ci_ub
    row['R^2'] = res.rsquared
    row['p'] = res.pvalues.loc[slope_term]
    row['n'] = len(filtered_data)
    row['Filter'] = row_filter
    row['Regression Model'] = regression
    row['Summary'] = res.summary()
    return row

In [None]:
# response variables to regress against 'Log Num Processes'
dependent_variables = [
    'Latency Walltime Inlet (ns)',
    'Latency Walltime Outlet (ns)',
    'Latency Simsteps Inlet',
    'Latency Simsteps Outlet',
    'Delivery Failure Rate',
    'Delivery Burstiness',
    'Simstep Period Inlet (ns)',
    'Simstep Period Outlet (ns)',
]

# best-case approximation to replace infs/nans
# see listings of infs/nans below
# (each ratio is recomputed with its denominator clamped to at least 1)
df_snapshot_diffs_copy = df_snapshot_diffs.copy()
for column, numerator, denominator in (
    ('Latency Walltime Inlet (ns)',
        'Inlet-Nanoseconds Elapsed', 'Num Round Trip Touches Inlet'),
    ('Latency Walltime Outlet (ns)',
        'Outlet-Nanoseconds Elapsed', 'Num Round Trip Touches Outlet'),
    ('Latency Simsteps Inlet',
        'Num Puts Attempted', 'Num Round Trip Touches Inlet'),
    ('Latency Simsteps Outlet',
        'Num Pulls Attempted', 'Num Round Trip Touches Outlet'),
    ('Simstep Period Inlet (ns)',
        'Inlet-Nanoseconds Elapsed', 'Num Puts Attempted'),
    ('Simstep Period Outlet (ns)',
        'Outlet-Nanoseconds Elapsed', 'Num Pulls Attempted'),
):
    df_snapshot_diffs_copy[column] = (
        df_snapshot_diffs_copy[numerator]
        / np.maximum(df_snapshot_diffs_copy[denominator], 1)
    )

# One row per execution instance: the median over its snapshot rows.
# This is built from the patched copy -- previously the unpatched
# df_snapshot_diffs was used, so the inf/nan replacement above had no effect
# on the fits. numeric_only=True keeps the string-valued columns out of the
# median, which would otherwise raise on current pandas.
df_execution_medians = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

# both regression models are fit to the same per-execution medians
regression_data_tuples = [
    ('Ordinary Least Squares Regression', df_execution_medians),
    ('Quantile Regression', df_execution_medians),
]

# row predicates, kept as source text so they can be recorded verbatim in the
# 'Filter' column of the results; make_regression_row evaluates them
row_filters = [
    "lambda row: True",
    "lambda row: row['Num Processes'] in (16, 64)",
    "lambda row: row['Num Processes'] in (64, 256)",
]

# one fit per (filter, model, facet, dependent variable) combination
fit_results = pd.DataFrame.from_records([
    make_regression_row(
        data=data_subset,
        dependent_variable=dependent_variable,
        regression=regression,
        row_filter=row_filter,
    )
    for row_filter in row_filters
    for regression, data in regression_data_tuples
    for _, data_subset in data.groupby([
        'Allocated Tasks Per Node',
        'Num Simels Per Cpu',
    ])
    for dependent_variable in dependent_variables
])

In [None]:
# Display the table of regression fits built in the previous cell.
fit_results
# TODO save to csv

In [None]:
def quantile_regplot(fit_reg=True, color=None, *args, **kwargs):
    """Drop-in counterpart to `sns.regplot` that fits a median (q=0.5) quantile regression.

    `x`, `y` and `data` must be supplied as keyword arguments. The points are
    delegated to `sns.regplot` with its own fit disabled; when `fit_reg` is
    true, the quantile regression line and a shaded band are drawn on the
    current axes. The band spans the lines obtained from the lower and upper
    slope confidence bounds, pivoted about the fitted line's value at the
    midpoint of the observed x range.
    """
    x, y, data = kwargs['x'], kwargs['y'], kwargs['data']

    # scatter layer only; the fit itself is drawn by hand below
    sns.regplot(
        *args,
        **kwargs,
        fit_reg=False,
        color=color,
    )

    if not fit_reg:
        return

    slope_term = f"Q('{x}')"
    res = smf.quantreg(f"Q('{y}') ~ {slope_term}", data).fit(q=0.5)
    m = res.params[slope_term]
    b = res.params['Intercept']

    # only the slope's confidence bounds feed the band; the previously
    # computed intercept bounds were never used
    m_ci = res.conf_int().loc[slope_term].tolist()

    center_x = np.mean([data[x].min(), data[x].max()])
    center_y = m * center_x + b

    # evaluate at every observed x plus the pivot point
    xs = sorted(set(data[x]) | {center_x})
    ys = [m * x_ + b for x_ in xs]

    # candidate band values at each x, one per slope bound
    candidates = [
        [m_ * (x_ - center_x) + center_y for m_ in m_ci]
        for x_ in xs
    ]
    y1 = [min(values) for values in candidates]
    y2 = [max(values) for values in candidates]

    ax = plt.gca()
    ax.plot(xs, ys, color=color)
    ax.fill_between(xs, y1, y2, alpha=0.2, color=color)

In [None]:
def unsplit_regression(*args, regplot, **kwargs):
    """Draw black points, then a purple fit over all of the data.

    `regplot` is the plotting callable (regplot-like); the incoming 'color'
    keyword is discarded in favor of the fixed colors used here.
    """
    kwargs.pop('color')

    # layer 1: points only
    regplot(*args, color='black', fit_reg=False, **kwargs)
    # layer 2: fit only
    regplot(*args, color='purple', scatter=False, **kwargs)

def facet_unsplit_regression(*, data, col, row, x, y, regression, **kwargs):
    """Facet `unsplit_regression` over `row` and `col`, titled with the model name.

    `regression` selects the fitting backend: 'Ordinary Least Squares
    Regression' or 'Quantile Regression'. Extra keyword arguments are passed
    through to the plotting callable. Panels do not share a y axis.
    """
    grid = sns.FacetGrid(
        data=data,
        row=row,
        col=col,
        sharey=False,
        margin_titles=True,
    )

    backends = {
        'Ordinary Least Squares Regression' : sns.regplot,
        'Quantile Regression' : quantile_regplot,
    }
    grid.map_dataframe(
        unsplit_regression,
        regplot=backends[regression],
        x=x,
        y=y,
        **kwargs,
    )

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

    # leave headroom for the figure-level title
    # adapted from https://stackoverflow.com/a/29814281
    fig = plt.gcf()
    fig.subplots_adjust(top=0.9)
    fig.suptitle(regression)

In [None]:
def split_regression(*args, regplot, **kwargs):
    """Draw black points, then separate fits above and below 'Log Num Processes' == 3.

    The red fit uses rows with 'Log Num Processes' >= 3 and the blue fit uses
    rows with 'Log Num Processes' <= 3 (rows equal to 3 feed both). The
    incoming 'color' keyword is discarded.
    """
    kwargs.pop('color')

    # points only, over the full data
    regplot(*args, color='black', fit_reg=False, **kwargs)

    # from here on `data` is supplied explicitly per fit
    data = kwargs.pop('data')

    upper = data[ data['Log Num Processes'] >= 3 ]
    regplot(*args, color='red', data=upper, scatter=False, **kwargs)

    lower = data[ data['Log Num Processes'] <= 3 ]
    regplot(*args, color='blue', data=lower, scatter=False, **kwargs)

def facet_split_regression(*, data, col, row, x, y, regression, **kwargs):
    """Facet `split_regression` over `row` and `col`, titled with the model name.

    `regression` selects the fitting backend: 'Ordinary Least Squares
    Regression' or 'Quantile Regression'. Extra keyword arguments are passed
    through to the plotting callable. Panels do not share a y axis.
    """
    grid = sns.FacetGrid(
        data=data,
        row=row,
        col=col,
        sharey=False,
        margin_titles=True,
    )

    backends = {
        'Ordinary Least Squares Regression' : sns.regplot,
        'Quantile Regression' : quantile_regplot,
    }
    grid.map_dataframe(
        split_regression,
        x=x,
        y=y,
        regplot=backends[regression],
        **kwargs,
    )

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

    # leave headroom for the figure-level title
    # adapted from https://stackoverflow.com/a/29814281
    fig = plt.gcf()
    fig.subplots_adjust(top=0.9)
    fig.suptitle(regression)

In [None]:
def split_unsplit_regression(*args, regplot, **kwargs):
    """Draw black points, a purple overall fit, and red/blue fits split at 'Log Num Processes' == 3.

    The red fit uses rows with 'Log Num Processes' >= 3 and the blue fit uses
    rows with 'Log Num Processes' <= 3 (rows equal to 3 feed both). The
    incoming 'color' keyword is discarded.
    """
    kwargs.pop('color')

    # points only, then the fit over the full data
    regplot(*args, color='black', fit_reg=False, **kwargs)
    regplot(*args, color='purple', scatter=False, **kwargs)

    # from here on `data` is supplied explicitly per fit
    data = kwargs.pop('data')

    upper = data[ data['Log Num Processes'] >= 3 ]
    regplot(*args, color='red', data=upper, scatter=False, **kwargs)

    lower = data[ data['Log Num Processes'] <= 3 ]
    regplot(*args, color='blue', data=lower, scatter=False, **kwargs)

def facet_split_unsplit_regression(*, data, col, row, x, y, regression, **kwargs):
    """Facet `data` by `col`/`row` and draw `split_unsplit_regression` per facet.

    `regression` selects the fitting routine and doubles as the figure title;
    it must be 'Ordinary Least Squares Regression' or 'Quantile Regression'
    (any other value raises KeyError). Remaining keyword arguments are passed
    through `FacetGrid.map_dataframe` to `split_unsplit_regression`.
    """
    grid = sns.FacetGrid(
        data,
        col=col,
        row=row,
        margin_titles=True,
        sharey=False,
    )

    # regression name -> plotting callable
    regplot_by_name = {
        'Ordinary Least Squares Regression' : sns.regplot,
        'Quantile Regression' : quantile_regplot,
    }
    grid.map_dataframe(
        split_unsplit_regression,
        x=x,
        y=y,
        regplot=regplot_by_name[regression],
        **kwargs,
    )

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

    # adapted from https://stackoverflow.com/a/29814281
    # shrink the subplot area, then put the regression name in the figure title
    current_figure = plt.gcf()
    current_figure.subplots_adjust(top=0.9)
    plt.gcf().suptitle(regression)

## Latency Walltime Inlet (ns)

In [None]:
# Rows whose inlet walltime latency is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Walltime Inlet (ns)']),
    [
        'Latency Walltime Inlet (ns)',
        'Latency Walltime Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# Work on a copy so df_snapshot_diffs itself is left untouched.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 so the division never hits zero)
df_snapshot_diffs_copy['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)

# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Walltime Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# NOTE(review): this aggregates the raw df_snapshot_diffs (infs/nans not
# replaced), unlike the mean-based plot for this metric — confirm intended.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Walltime Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Latency Walltime Outlet (ns)

In [None]:
# Rows whose outlet walltime latency is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Walltime Outlet (ns)']),
    [
        'Latency Walltime Inlet (ns)',
        'Latency Walltime Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# Work on a copy so df_snapshot_diffs itself is left untouched.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 so the division never hits zero)
df_snapshot_diffs_copy['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)

# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Walltime Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# NOTE(review): this aggregates the raw df_snapshot_diffs (infs/nans not
# replaced), unlike the mean-based plot for this metric — confirm intended.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Walltime Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Latency Simsteps Inlet

In [None]:
# Rows whose inlet simstep latency is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Simsteps Inlet']),
    [
        'Latency Simsteps Inlet',
        'Latency Simsteps Outlet',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# Work on a copy so df_snapshot_diffs itself is left untouched.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 so the division never hits zero)
df_snapshot_diffs_copy['Latency Simsteps Inlet'] = (
    df_snapshot_diffs_copy['Num Puts Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)

# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Simsteps Inlet',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# NOTE(review): this aggregates the raw df_snapshot_diffs (infs/nans not
# replaced), unlike the mean-based plot for this metric — confirm intended.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Simsteps Inlet',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Latency Simsteps Outlet

In [None]:
# Rows whose outlet simstep latency is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Simsteps Outlet']),
    [
        'Latency Simsteps Inlet',
        'Latency Simsteps Outlet',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# Work on a copy so df_snapshot_diffs itself is left untouched.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 so the division never hits zero)
df_snapshot_diffs_copy['Latency Simsteps Outlet'] = (
    df_snapshot_diffs_copy['Num Pulls Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)

# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Simsteps Outlet',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# NOTE(review): this aggregates the raw df_snapshot_diffs (infs/nans not
# replaced), unlike the mean-based plot for this metric — confirm intended.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Latency Simsteps Outlet',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Delivery Failure Rate

In [None]:
# Rows whose delivery failure rate is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Delivery Failure Rate']),
    [
        'Delivery Failure Rate',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Delivery Failure Rate',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Delivery Failure Rate',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Delivery Burstiness

In [None]:
# Rows whose delivery burstiness is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Delivery Burstiness']),
    [
        'Delivery Burstiness',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Delivery Burstiness',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Delivery Burstiness',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Simstep Period Inlet (ns)

In [None]:
# Rows whose inlet simstep period is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Simstep Period Inlet (ns)']),
    [
        'Simstep Period Inlet (ns)',
        'Simstep Period Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# Work on a copy so df_snapshot_diffs itself is left untouched.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs
# see listing of infs above
# (the put count is clamped to at least 1 so the division never hits zero)
df_snapshot_diffs_copy['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Puts Attempted'], 1)
)

# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Simstep Period Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# NOTE(review): this aggregates the raw df_snapshot_diffs (infs not replaced),
# unlike the mean-based plot for this metric — confirm intended.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        y='Simstep Period Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


## Simstep Period Outlet (ns)

In [None]:
# Rows whose outlet simstep period is non-finite (inf or NaN), shown with the
# columns that identify the run each row belongs to.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Simstep Period Outlet (ns)']),
    [
        'Simstep Period Inlet (ns)',
        'Simstep Period Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Num Processes',
    ],
]

In [None]:
# Work on a copy so df_snapshot_diffs itself is left untouched.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs
# see listing of infs above
# (the pull count is clamped to at least 1 so the division never hits zero)
df_snapshot_diffs_copy['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Pulls Attempted'], 1)
)

# One row per execution instance (mean over its snapshot diffs).
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        # plot the column recomputed above for this section; the response was
        # previously 'Latency Walltime Inlet (ns)', a copy-paste leftover
        y='Simstep Period Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
    )

In [None]:
# One row per execution instance (median over its snapshot diffs).
# NOTE(review): this aggregates the raw df_snapshot_diffs (infs not replaced),
# unlike the mean-based plot for this metric — confirm intended.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError on
# non-numeric columns instead of silently dropping them from the reduction.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # the median may yield floats; restore integer dtype for the facet variables
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
})

for viz in (
    facet_split_regression,
    facet_split_unsplit_regression,
    facet_unsplit_regression,
):
    tp.tee(
        viz,
        data=data,
        col='Allocated Tasks Per Node',
        row='Num Simels Per Cpu',
        x='Log Num Processes',
        # plot this section's metric; the response was previously
        # 'Latency Walltime Inlet (ns)', a copy-paste leftover
        y='Simstep Period Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
    )


# Outlier Analysis

In [None]:
# Raise pandas' display limits before showing the outlier rows.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Snapshot diffs with 1 simel per CPU whose inlet simstep latency exceeds 50.
df_snapshot_diffs.loc[
    (df_snapshot_diffs['Num Simels Per Cpu'] == 1)
    & (df_snapshot_diffs['Latency Simsteps Inlet'] > 50)
]

In [None]:
# Raise pandas' display limits before showing the outlier rows.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Snapshot diffs with 2048 simels per CPU whose inlet simstep latency exceeds 50.
df_snapshot_diffs.loc[
    (df_snapshot_diffs['Num Simels Per Cpu'] == 2048)
    & (df_snapshot_diffs['Latency Simsteps Inlet'] > 50)
]