In [None]:
import warnings

from iterdub import iterdub as ib
from iterpop import iterpop as ip
from keyname import keyname as kn
from matplotlib import pyplot as plt
import matplotlib
from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
import pathlib
from scipy import stats
import seaborn as sns
from slugify import slugify
import statsmodels.api as sm
import statsmodels.formula.api as smf
from teeplot import teeplot as tp


In [None]:
from conduitpylib.utils import (
    consolidate_merge,
    count_outliers,
    count_nonoutliers,
    count_proportion_outliers,
)

from conduitpylib.viz import (
    performance_semantics_plot,
)


In [None]:
# Emit nbmetalog's metadata printout up front, before any data is touched.
nbm.print_metadata()


# Get Data


In [None]:
# Fetch the inlet table (gzip-compressed CSV hosted on OSF) and discard rows
# that carry no 'Process Instance UUID'.
df_inlet = pd.read_csv('https://osf.io/jgpnv/download', compression='gzip')
df_inlet = df_inlet.dropna(subset=['Process Instance UUID'])
nbm.print_dataframe_summary(
    *eval(nbm.nvp_expr('df_inlet')),
)


In [None]:
# Fetch the outlet table (gzip-compressed CSV hosted on OSF) and discard rows
# that carry no 'Process Instance UUID'.
df_outlet = pd.read_csv('https://osf.io/ncdfq/download', compression='gzip')
df_outlet = df_outlet.dropna(subset=['Process Instance UUID'])
nbm.print_dataframe_summary(
    *eval(nbm.nvp_expr('df_outlet')),
)


In [None]:
# Outer-join inlet and outlet records on (process instance, update) so that
# rows present on only one side are retained.
df = consolidate_merge(
    df_inlet,
    df_outlet,
    how='outer',
    on=['Process Instance UUID', 'Update'],
    suffixes=(' Inlet', ' Outlet'),
)
# NOTE(review): this element-wise comparison presumably relies on df_inlet and
# df_outlet being identically indexed -- confirm.
if (
    df_inlet['Runtime Seconds Elapsed'] == df_outlet['Runtime Seconds Elapsed']
).all():
    # Runtimes agree on both sides: publish the shared column under both
    # suffixed names.
    df['Runtime Seconds Elapsed Inlet'] = df['Runtime Seconds Elapsed']
    df['Runtime Seconds Elapsed Outlet'] = df['Runtime Seconds Elapsed']
nbm.print_dataframe_synopsis(
    *eval(nbm.nvp_expr('df')),
)


# Prep Data


In [None]:
# Pin dtypes: every listed counter / identifier column becomes int64 and the
# execution-blur flag becomes bool.
df = df.astype({
    **{
        column: 'int64'
        for column in (
            'Num Inlets',
            'Num Outlets',
            'Num Puts Attempted',
            'Num Try Puts Attempted',
            'Num Blocking Puts',
            'Num Try Puts That Succeeded',
            'Num Puts That Succeeded Eventually',
            'Num Blocking Puts That Succeeded Immediately',
            'Num Puts That Succeeded Immediately',
            'Num Puts That Blocked',
            'Num Dropped Puts',
            'Num Round Trip Touches Inlet',
            'Net Flux Through Duct',
            'proc',
            'Snapshot',
            'Replicate',
            'Async Mode',
            'Num Threads',
            'Num Processes',
            'SLURM_NNODES',
            'SLURM_NTASKS',
            'SLURM_CPUS_ON_NODE',
        )
    },
    'Has Execution Blur': 'bool',
})


In [None]:
# Recover the hostname from the keyname-encoded inlet source file name.
# Series.map visits only the one column that is needed (a row-wise
# DataFrame.apply builds a full Series per row) and still yields a Series
# when the frame is empty.
df['Hostname'] = df['Source File Inlet'].map(
    lambda source_file: kn.unpack(source_file)['_hostname'],
)


In [None]:
# Allocation descriptors under friendlier names.
df['Num Nodes'] = df['SLURM_NNODES']
df['Num Tasks'] = df['SLURM_NTASKS']
df['Num Simels Per Cpu'] = df['Num Simulation Elements Per Cpu']
df['Num Cpus'] = df['Num Threads'] * df['Num Processes']
df['Allocated Tasks Per Node'] = df['Num Tasks'] // df['Num Nodes']
df['Cpus Per Node'] = df['Allocated Tasks Per Node']

# Attempts per round-trip touch, computed separately for each side of the duct.
df['Delivery Time Inlet'] = (df['Num Puts Attempted'] - 1) / df['Num Round Trip Touches Inlet']
df['Delivery Time Outlet'] = (df['Num Pulls Attempted'] - 1) / df['Num Round Trip Touches Outlet']
# NOTE(review): df_world_sum and df_snapshot_diffs define 'Intermittancy' as
# 1.0 minus this ratio -- confirm which convention is intended here.
df['Intermittancy'] = df['Num Pulls That Were Laden Immediately'] / df[['Net Flux Through Duct', 'Num Pulls Attempted']].min(axis=1)
df['Inlet-Seconds Elapsed'] = df['Num Inlets'] * df['Runtime Seconds Elapsed Inlet']
df['Outlet-Seconds Elapsed'] = df['Num Outlets'] * df['Runtime Seconds Elapsed Outlet']

df['Latency Simsteps Inlet'] = df['Delivery Time Inlet']
# Outlet latency comes from the outlet-side delivery time (this previously
# copied the inlet value, a copy-paste slip).
df['Latency Simsteps Outlet'] = df['Delivery Time Outlet']
df['Simstep Period Inlet (s)'] = df['Inlet-Seconds Elapsed'] / df['Num Puts Attempted']
df['Simstep Period Outlet (s)'] =  df['Outlet-Seconds Elapsed'] / df['Num Pulls Attempted']
df['Latency Walltime Inlet (s)'] = df['Latency Simsteps Inlet'] * df['Simstep Period Inlet (s)']
df['Latency Walltime Outlet (s)'] = df['Latency Simsteps Outlet'] * df['Simstep Period Outlet (s)']
# log base 4 of the process count
df['Log Num Processes'] = np.log(df['Num Processes']) / np.log(4)


In [None]:
# NOTE(review): row_distiller is not used in this cell; kept in case a later
# cell relies on it.
row_distiller = lambda row: {k : v for k, v in row.items() if k in ('Num Nodes', 'Num Processes')}

# Number the distinct allocations in order of first appearance.
allocation_idx_mapper = {
    val: idx
    for idx, val in enumerate(df['Allocation'].unique())
}
# The column title doubles as a legend of the form "<idx> = <allocation> | ...".
allocation_idx_mapped_title = ' | '.join(
    f'{idx} = {val}' for val, idx in allocation_idx_mapper.items()
)
# Vectorised dict lookup instead of a row-wise apply; every value is a key of
# the mapper by construction.
df[allocation_idx_mapped_title] = df['Allocation'].map(allocation_idx_mapper)


# Prep DataFrame Variants


In [None]:
# https://stackoverflow.com/a/40629420
# For each process instance keep only the row with the highest 'Update'.
df_finalized_observations = (
    df
    .sort_values('Update', ascending=False)
    .drop_duplicates(subset=['Process Instance UUID'], keep='first')
)


In [None]:
# Rows recorded with execution blur, restricted to snapshots 0 through 5.
# The snapshot cap excludes excess, unintended snapshots from runs that took
# a while to shut down (i.e., from the 6 minute mark and beyond).
df_blurry_snapshots = df.loc[
    (df['Snapshot'] <= 5)
    & df['Has Execution Blur'].astype(bool)
]


In [None]:
# Sum each replicate's final observations over all of its processes.
# numeric_only=True keeps string/object columns (UUIDs, hostnames, source file
# names) out of the sum; without it pandas >= 2 tries to add those up as well.
df_world_sum = df_finalized_observations.groupby(
    [
        'Replicate',
        'Async Mode',
        'Num Processes',
        'Num Nodes',
        'Num Simels Per Cpu',
        'Allocated Tasks Per Node',
        'Cpus Per Node',
        'Allocation',
        allocation_idx_mapped_title,
    ],
    as_index=False,
).sum(numeric_only=True)

# Ratios are recomputed from the summed counters rather than summed themselves.
df_world_sum['Fraction Messages Utilized'] = df_world_sum['Num Reads That Were Fresh'] / df_world_sum['Num Try Puts Attempted']
df_world_sum['Fraction Messages Delivered'] = df_world_sum['Num Try Puts That Succeeded'] / df_world_sum['Num Try Puts Attempted']
df_world_sum['Delivery Failure Rate'] = 1.0 - df_world_sum['Fraction Messages Delivered']
df_world_sum['Fraction Messages Dropped'] = df_world_sum['Delivery Failure Rate']
df_world_sum['Fraction Try Pulls That Were Laden'] = df_world_sum['Num Try Pulls That Were Laden'] / df_world_sum['Num Try Pulls Attempted']
df_world_sum['Round Trip Touches Per Attempted Pull'] = df_world_sum['Num Round Trip Touches Outlet'] / df_world_sum['Num Try Pulls Attempted']
df_world_sum['Round Trip Touches Per Attempted Put'] = df_world_sum['Num Round Trip Touches Inlet'] / df_world_sum['Num Try Puts Attempted']
df_world_sum['Num Inflight Messages'] = 2.0 / df_world_sum['Round Trip Touches Per Attempted Put'] - 1
df_world_sum['Fraction Duct Flux Stepped Through'] = df_world_sum['Num Revisions Pulled'] / df_world_sum['Net Flux Through Duct']
df_world_sum['Fraction Duct Flux Jumped Over'] = 1.0 - df_world_sum['Fraction Duct Flux Stepped Through']
df_world_sum['Round Trip Touches Per Runtime Second'] = df_world_sum['Num Round Trip Touches Inlet'] / df_world_sum['Runtime Seconds Elapsed Inlet']
# These overwrite the summed per-process latency columns with world-level values.
df_world_sum['Latency Simsteps Inlet'] = (df_world_sum['Num Puts Attempted'] - 1) / df_world_sum['Num Round Trip Touches Inlet']
df_world_sum['Latency Simsteps Outlet'] = (df_world_sum['Num Pulls Attempted'] - 1) / df_world_sum['Num Round Trip Touches Outlet']
df_world_sum['Delivery Clumpiness'] = 1.0 - df_world_sum['Num Pulls That Were Laden Immediately'] / df_world_sum[['Net Flux Through Duct', 'Num Pulls Attempted']].min(axis=1)
df_world_sum['Intermittancy'] = df_world_sum['Delivery Clumpiness']
df_world_sum['Simstep Period Inlet (s)'] = df_world_sum['Inlet-Seconds Elapsed'] / df_world_sum['Num Puts Attempted']
df_world_sum['Simstep Period Outlet (s)'] = df_world_sum['Outlet-Seconds Elapsed'] / df_world_sum['Num Pulls Attempted']
df_world_sum['Latency Walltime Inlet (s)'] = df_world_sum['Latency Simsteps Inlet'] * df_world_sum['Simstep Period Inlet (s)']
df_world_sum['Latency Walltime Outlet (s)'] = df_world_sum['Latency Simsteps Outlet'] * df_world_sum['Simstep Period Outlet (s)']


In [None]:
# Collapse each (process instance, snapshot) group to one row: counters and
# final timepoints become their within-group range (max - min via np.ptp),
# runtime columns become their within-group mean.
df_snapshot_diffs = df_blurry_snapshots.groupby(
    [
        'Process Instance UUID',
        'Snapshot',
        # subsequent items aren't meaningful to groupby
        # but are just included so they pass through untouched
        'Async Mode',
        'Num Nodes',
        'Allocated Tasks Per Node',
        'Cpus Per Node',
        'Num Processes',
        'Log Num Processes',
        'Num Simels Per Cpu',
        'Replicate',
        'proc',
        'Hostname',
        'Num Inlets',
        'Num Outlets',
        'Execution Instance UUID',
        'Num Threads',
        'Allocation',
        allocation_idx_mapped_title,
    ],
    as_index=False,
).aggregate({
    **dict.fromkeys(
        [
            'Num Puts Attempted',
            'Num Try Puts Attempted',
            'Num Blocking Puts',
            'Num Try Puts That Succeeded',
            'Num Puts That Succeeded Eventually',
            'Num Blocking Puts That Succeeded Immediately',
            'Num Puts That Succeeded Immediately',
            'Num Puts That Blocked',
            'Num Dropped Puts',
            'Num Reads Performed',
            'Num Reads That Were Fresh',
            'Num Reads That Were Stale',
            'Num Revisions Pulled',
            'Num Try Pulls Attempted',
            'Num Blocking Pulls',
            'Num Blocking Pulls That Blocked',
            'Num Revisions From Try Pulls',
            'Num Revisions From Blocking Pulls',
            'Num Pulls Attempted',
            'Num Pulls That Were Laden Eventually',
            'Num Blocking Pulls That Were Laden Immediately',
            'Num Blocking Pulls That Were Laden Eventually',
            'Num Pulls That Were Laden Immediately',
            'Num Try Pulls That Were Laden',
            'Num Try Pulls That Were Unladen',
            'Net Flux Through Duct',
            'Num Round Trip Touches Inlet',
            'Num Round Trip Touches Outlet',
            # why are these missing?
            #     'Row Initial Timepoint (ns) Inlet',
            #     'Row Initial Timepoint (ns) Outlet',
            'Row Final Timepoint (ns) Inlet',
            'Row Final Timepoint (ns) Outlet',
        ],
        np.ptp,
    ),
    # String alias rather than np.mean: pandas deprecated swapping NumPy
    # callables for its own implementation inside agg.
    'Runtime Seconds Elapsed Inlet': 'mean',
    'Runtime Seconds Elapsed Outlet': 'mean',
})


In [None]:
# Delivery rates over the snapshot window.
df_snapshot_diffs['Fraction Messages Delivered'] = df_snapshot_diffs['Num Try Puts That Succeeded'] / df_snapshot_diffs['Num Try Puts Attempted']
df_snapshot_diffs['Delivery Success Rate'] = df_snapshot_diffs['Num Try Puts That Succeeded'] / df_snapshot_diffs['Num Try Puts Attempted']
df_snapshot_diffs['Delivery Failure Rate'] = (
    1 - df_snapshot_diffs['Delivery Success Rate']
)
df_snapshot_diffs['Fraction Messages Dropped'] = (
    df_snapshot_diffs['Delivery Failure Rate']
)
df_snapshot_diffs['Fraction Try Pulls That Were Laden'] = df_snapshot_diffs['Num Try Pulls That Were Laden'] / df_snapshot_diffs['Num Try Pulls Attempted']

# Round-trip touch rates.
df_snapshot_diffs['Round Trip Touches Per Attempted Put'] = df_snapshot_diffs['Num Round Trip Touches Inlet'] / df_snapshot_diffs['Num Try Puts Attempted']
df_snapshot_diffs['Round Trip Touches Per Attempted Pull'] = df_snapshot_diffs['Num Round Trip Touches Outlet'] / df_snapshot_diffs['Num Try Pulls Attempted']
df_snapshot_diffs['Round Trip Touches Per Runtime Nanosecond'] = df_snapshot_diffs['Num Round Trip Touches Outlet'] / df_snapshot_diffs['Row Final Timepoint (ns) Outlet']

# Latency in simsteps: attempts per round-trip touch within the window.
df_snapshot_diffs['Latency Simsteps Inlet'] = (
    df_snapshot_diffs['Num Puts Attempted']
    / df_snapshot_diffs['Num Round Trip Touches Inlet']
)
df_snapshot_diffs['Latency Simsteps Outlet'] = (
    df_snapshot_diffs['Num Pulls Attempted']
    / df_snapshot_diffs['Num Round Trip Touches Outlet']
)
df_snapshot_diffs['Delivery Clumpiness'] = 1.0 - (
    df_snapshot_diffs['Num Pulls That Were Laden Immediately']
    / df_snapshot_diffs[['Net Flux Through Duct', 'Num Pulls Attempted']].min(axis=1)
)
df_snapshot_diffs['Intermittancy'] = (
    df_snapshot_diffs['Delivery Clumpiness']
)

# Elapsed time and simsteps, then the per-simstep period.
df_snapshot_diffs['Inlet-Nanoseconds Elapsed'] = (
    df_snapshot_diffs['Num Inlets']
    * df_snapshot_diffs['Row Final Timepoint (ns) Inlet']
)
df_snapshot_diffs['Outlet-Nanoseconds Elapsed'] = (
    df_snapshot_diffs['Num Outlets']
    * df_snapshot_diffs['Row Final Timepoint (ns) Outlet']
)
df_snapshot_diffs['Simsteps Elapsed Inlet'] = (
    df_snapshot_diffs['Num Puts Attempted'] / df_snapshot_diffs['Num Inlets']
)
df_snapshot_diffs['Simsteps Elapsed Outlet'] = (
    df_snapshot_diffs['Num Pulls Attempted'] / df_snapshot_diffs['Num Outlets']
)
df_snapshot_diffs['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs['Inlet-Nanoseconds Elapsed']
    / df_snapshot_diffs['Num Puts Attempted']
)
df_snapshot_diffs['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs['Outlet-Nanoseconds Elapsed']
    / df_snapshot_diffs['Num Pulls Attempted']
)

# Walltime latency = latency in simsteps x duration of one simstep.
df_snapshot_diffs['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs['Latency Simsteps Inlet']
    * df_snapshot_diffs['Simstep Period Inlet (ns)']
)
df_snapshot_diffs['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs['Latency Simsteps Outlet']
    * df_snapshot_diffs['Simstep Period Outlet (ns)']
)


In [None]:
# Cast the pass-through grouping columns to int64.
df_snapshot_diffs = df_snapshot_diffs.astype({
    column: 'int64'
    for column in (
        'Num Inlets',
        'Num Outlets',
        'proc',
        'Snapshot',
        'Replicate',
        'Async Mode',
        'Num Threads',
        'Num Processes',
        'Allocated Tasks Per Node',
        'Cpus Per Node',
        'Num Nodes',
        'Num Simels Per Cpu',
        allocation_idx_mapped_title,
    )
})


# End-state Data Analysis

This data appears to be skewed by ragged network launch/completion.


In [None]:
def facet_boxplot(*, data, col=None, row=None, x, y, showfliers=False):
    """Draw a seaborn box plot of ``y`` against ``x`` on a FacetGrid.

    ``col`` / ``row`` are used for faceting only when the named column of
    ``data`` holds more than one distinct value; otherwise that facet
    dimension is dropped.  ``sharey='row'`` is passed to the grid.
    ``showfliers`` is forwarded to ``sns.boxplot``.  Returns None.
    """
    def _facet_or_none(column):
        # facet only on columns that actually vary
        if column is None:
            return None
        return column if data[column].nunique() > 1 else None

    grid = sns.FacetGrid(
        data,
        col=_facet_or_none(col),
        row=_facet_or_none(row),
        margin_titles=True,
        sharey='row',
    )
    # x and y stay positional, exactly as before
    grid.map_dataframe(sns.boxplot, x, y, showfliers=showfliers)


## Latency Walltime


In [None]:
# End-state inlet walltime latency by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Latency Walltime Inlet (s)',
    showfliers=True,
    teeplot_subdir='latency-walltime-inlet-s',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


In [None]:
# End-state outlet walltime latency by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Latency Walltime Outlet (s)',
    showfliers=True,
    teeplot_subdir='latency-walltime-outlet-s',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


## Latency Simsteps


In [None]:
# End-state inlet simstep latency by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Latency Simsteps Inlet',
    showfliers=True,
    teeplot_subdir='latency-simsteps-inlet',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


In [None]:
# End-state outlet simstep latency by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Latency Simsteps Outlet',
    showfliers=True,
    teeplot_subdir='latency-simsteps-outlet',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


## Delivery Failure Rate


In [None]:
# End-state delivery failure rate by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Delivery Failure Rate',
    showfliers=True,
    teeplot_subdir='delivery-failure-rate',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


## Delivery Clumpiness


In [None]:
# End-state delivery clumpiness by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Delivery Clumpiness',
    showfliers=True,
    teeplot_subdir='delivery-clumpiness',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


## Simstep Period


In [None]:
# End-state inlet simstep period by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Simstep Period Inlet (s)',
    showfliers=True,
    teeplot_subdir='simstep-period-inlet-s',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


In [None]:
# End-state outlet simstep period by allocation, faceted by simel count.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row='Num Simels Per Cpu',
    x=allocation_idx_mapped_title,
    y='Simstep Period Outlet (s)',
    showfliers=True,
    teeplot_subdir='simstep-period-outlet-s',
    teeplot_outattrs={
        'transform': 'endstate_sumedbyrep',
        **nbm.collate_outattr_metadata(),
    },
)


# Live Snapshot Analysis


In [None]:
def facet_barplot(*, data, col=None, row=None, x, y, hue=None):
    """Draw a seaborn bar plot of ``y`` against ``x`` on a FacetGrid.

    ``col`` / ``row`` are used for faceting only when the named column of
    ``data`` holds more than one distinct value; otherwise that facet
    dimension is dropped.  ``sharey='row'`` is passed to the grid and
    ``hue`` is forwarded to ``sns.barplot``.  Returns None.
    """
    def _facet_or_none(column):
        # facet only on columns that actually vary
        if column is None:
            return None
        return column if data[column].nunique() > 1 else None

    grid = sns.FacetGrid(
        data,
        col=_facet_or_none(col),
        row=_facet_or_none(row),
        margin_titles=True,
        sharey='row',
    )
    grid.map_dataframe(sns.barplot, x=x, y=y, hue=hue)

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)


In [None]:
def facet_boxplot_withfliers(*, data, col=None, row=None, x, y, hue=None):
    """Draw a seaborn box plot (fliers shown) of ``y`` against ``x`` on a FacetGrid.

    ``col`` / ``row`` are used for faceting only when the named column of
    ``data`` holds more than one distinct value; otherwise that facet
    dimension is dropped.  ``sharey='row'`` is passed to the grid and
    ``hue`` is forwarded to ``sns.boxplot``.  Returns None.
    """
    def _facet_or_none(column):
        # facet only on columns that actually vary
        if column is None:
            return None
        return column if data[column].nunique() > 1 else None

    grid = sns.FacetGrid(
        data,
        col=_facet_or_none(col),
        row=_facet_or_none(row),
        margin_titles=True,
        sharey='row',
    )
    grid.map_dataframe(sns.boxplot, x=x, y=y, hue=hue, showfliers=True)

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)


In [None]:
def facet_boxplot_nofliers(*, data, col=None, row=None, x, y, hue=None):
    """Draw a seaborn box plot (fliers hidden) of ``y`` against ``x`` on a FacetGrid.

    ``col`` / ``row`` are used for faceting only when the named column of
    ``data`` holds more than one distinct value; otherwise that facet
    dimension is dropped.  ``sharey='row'`` is passed to the grid and
    ``hue`` is forwarded to ``sns.boxplot``.  Returns None.
    """
    def _facet_or_none(column):
        # facet only on columns that actually vary
        if column is None:
            return None
        return column if data[column].nunique() > 1 else None

    grid = sns.FacetGrid(
        data,
        col=_facet_or_none(col),
        row=_facet_or_none(row),
        margin_titles=True,
        sharey='row',
    )
    grid.map_dataframe(sns.boxplot, x=x, y=y, hue=hue, showfliers=False)

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)


## Latency Walltime


In [None]:
# Snapshot-diff inlet walltime latency, rendered as bars and as box plots
# with and without fliers.
for viz in (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Inlet (ns)',
        teeplot_subdir='latency-walltime-inlet-ns',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )


### Simple Mean and Median


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median per allocation over all snapshot observations.  String
# aliases are used because passing np.mean / np.median to agg is deprecated
# (pandas substitutes its own implementation for them).
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Walltime Inlet (ns)': ['mean', 'median'],
})


### Median of Replicate Means


In [None]:
# Step 1: mean within each (allocation, replicate).
# NOTE: variable names are shared with the median-of-medians cell; here
# group_medians holds per-replicate means.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Latency Walltime Inlet (ns)': 'mean'})
    .reset_index()
)

# Step 2: median across replicates, as the section title states (the outer
# aggregation previously took the mean, giving a mean of means).
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Walltime Inlet (ns)': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Latency Walltime Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Latency Walltime Inlet (ns)"] - baseline) / baseline * 100
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median within each (allocation, replicate).  The 'median' string
# alias replaces the deprecated np.median callable in agg.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Latency Walltime Inlet (ns)': 'median'})
    .reset_index()
)

# Step 2: median across replicates.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Walltime Inlet (ns)': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Latency Walltime Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Latency Walltime Inlet (ns)"] - baseline) / baseline * 100
median_of_medians


### Median Absolute Deviation


In [None]:
# Per (allocation, replicate): median and median absolute deviation of the
# metric.  The metric is duplicated under two names so each copy can receive
# its own aggregation; 'median' replaces the deprecated np.median alias.
xdf = (
    df_snapshot_diffs
    .assign(**{
        'median': df_snapshot_diffs['Latency Walltime Inlet (ns)'],
        'median_abs_deviation': df_snapshot_diffs['Latency Walltime Inlet (ns)'],
    })
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({
        'median': 'median',
        'median_abs_deviation': stats.median_abs_deviation,
    })
    .reset_index()
)
# MAD as a percentage of the median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Exactly two allocations are expected; the unpacking raises otherwise.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normalised MADs for each allocation
group1 = xdf.loc[xdf[allocation_idx_mapped_title] == name1, 'normed median_abs_deviation']
group2 = xdf.loc[xdf[allocation_idx_mapped_title] == name2, 'normed median_abs_deviation']

print(len(group1), len(group2))
stats.mannwhitneyu(group1, group2)


In [None]:
# Distribution of the per-replicate normalised MAD, coloured by allocation.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normalised MAD per allocation.
(
    xdf
    .groupby(allocation_idx_mapped_title)['normed median_abs_deviation']
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Per-allocation result of count_nonoutliers on the metric.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Latency Walltime Inlet (ns)': count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-allocation result of count_outliers on the metric.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Latency Walltime Inlet (ns)': count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Contingency table: first row outlier counts, second row non-outlier counts,
# one column per allocation.
stats.chi2_contingency(
    np.vstack((
        outlier_counts["Latency Walltime Inlet (ns)"],
        nonoutlier_counts["Latency Walltime Inlet (ns)"],
    )),
)


In [None]:
# Outlier proportion per allocation, scaled to percent.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        'Latency Walltime Inlet (ns)': lambda values: count_proportion_outliers(values) * 100,
    })
    .reset_index()
)


### Distributions


In [None]:
# Snapshot-diff outlet walltime latency, rendered as bars and as box plots
# with and without fliers.
for viz in (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Outlet (ns)',
        teeplot_subdir='latency-walltime-outlet-ns',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median per allocation over all snapshot observations.  String
# aliases are used because passing np.mean / np.median to agg is deprecated
# (pandas substitutes its own implementation for them).
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Walltime Outlet (ns)': ['mean', 'median'],
})


## Latency Simsteps


In [None]:
# Snapshot-diff inlet simstep latency, rendered as bars and as box plots
# with and without fliers.
for viz in (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Inlet',
        teeplot_subdir='latency-simsteps-inlet',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )


### Simple Mean and Median


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median per allocation over all snapshot observations.  String
# aliases are used because passing np.mean / np.median to agg is deprecated
# (pandas substitutes its own implementation for them).
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Simsteps Inlet': ['mean', 'median'],
})


### Median of Replicate Means


In [None]:
# Step 1: mean within each (allocation, replicate).
# NOTE: variable names are shared with the median-of-medians cell; here
# group_medians holds per-replicate means.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Latency Simsteps Inlet': 'mean'})
    .reset_index()
)

# Step 2: median across replicates, as the section title states (the outer
# aggregation previously took the mean, giving a mean of means).
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Simsteps Inlet': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Latency Simsteps Inlet"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Latency Simsteps Inlet"] - baseline) / baseline * 100
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median within each (allocation, replicate).  The 'median' string
# alias replaces the deprecated np.median callable in agg.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Latency Simsteps Inlet': 'median'})
    .reset_index()
)

# Step 2: median across replicates.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Simsteps Inlet': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Latency Simsteps Inlet"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Latency Simsteps Inlet"] - baseline) / baseline * 100
median_of_medians


### Median Absolute Deviation


In [None]:
# Per (allocation, replicate): median and median absolute deviation of the
# metric.  The metric is duplicated under two names so each copy can receive
# its own aggregation; 'median' replaces the deprecated np.median alias.
xdf = (
    df_snapshot_diffs
    .assign(**{
        'median': df_snapshot_diffs['Latency Simsteps Inlet'],
        'median_abs_deviation': df_snapshot_diffs['Latency Simsteps Inlet'],
    })
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({
        'median': 'median',
        'median_abs_deviation': stats.median_abs_deviation,
    })
    .reset_index()
)
# MAD as a percentage of the median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Exactly two allocations are expected; the unpacking raises otherwise.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normalised MADs for each allocation
group1 = xdf.loc[xdf[allocation_idx_mapped_title] == name1, 'normed median_abs_deviation']
group2 = xdf.loc[xdf[allocation_idx_mapped_title] == name2, 'normed median_abs_deviation']

print(len(group1), len(group2))
stats.mannwhitneyu(group1, group2)


In [None]:
# Distribution of the per-replicate normalised MAD, coloured by allocation.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normalised MAD per allocation.
(
    xdf
    .groupby(allocation_idx_mapped_title)['normed median_abs_deviation']
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Per-allocation result of count_nonoutliers on the metric.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Latency Simsteps Inlet': count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-allocation result of count_outliers on the metric.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Latency Simsteps Inlet': count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Contingency table: first row outlier counts, second row non-outlier counts,
# one column per allocation.
stats.chi2_contingency(
    np.vstack((
        outlier_counts["Latency Simsteps Inlet"],
        nonoutlier_counts["Latency Simsteps Inlet"],
    )),
)


In [None]:
# Outlier proportion per allocation, scaled to percent.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        'Latency Simsteps Inlet': lambda values: count_proportion_outliers(values) * 100,
    })
    .reset_index()
)


In [None]:
# Snapshot-diff outlet simstep latency, rendered as bars and as box plots
# with and without fliers.
for viz in (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Outlet',
        teeplot_subdir='latency-simsteps-outlet',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median per allocation over all snapshot observations.  String
# aliases are used because passing np.mean / np.median to agg is deprecated
# (pandas substitutes its own implementation for them).
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Simsteps Outlet': ['mean', 'median'],
})


## Delivery Failure Rate


In [None]:
# Snapshot-diff delivery failure rate, rendered as bars and as box plots
# with and without fliers.
for viz in (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Failure Rate',
        teeplot_subdir='delivery-failure-rate',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )


### Simple Mean and Median


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median per allocation over all snapshot observations.  String
# aliases are used because passing np.mean / np.median to agg is deprecated
# (pandas substitutes its own implementation for them).
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Delivery Failure Rate': ['mean', 'median'],
})


### Median of Replicate Means


In [None]:
# Step 1: mean within each (allocation, replicate).
# NOTE: variable names are shared with the median-of-medians cell; here
# group_medians holds per-replicate means.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Delivery Failure Rate': 'mean'})
    .reset_index()
)

# Step 2: median across replicates, as the section title states (the outer
# aggregation previously took the mean, giving a mean of means).
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Failure Rate': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Delivery Failure Rate"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Delivery Failure Rate"] - baseline) / baseline * 100
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median within each (allocation, replicate).  The 'median' string
# alias replaces the deprecated np.median callable in agg.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Delivery Failure Rate': 'median'})
    .reset_index()
)

# Step 2: median across replicates.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Failure Rate': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Delivery Failure Rate"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Delivery Failure Rate"] - baseline) / baseline * 100
median_of_medians


### Median Absolute Deviation


In [None]:
# Per (allocation, replicate): median and median absolute deviation of the
# metric.  The metric is duplicated under two names so each copy can receive
# its own aggregation; 'median' replaces the deprecated np.median alias.
xdf = (
    df_snapshot_diffs
    .assign(**{
        'median': df_snapshot_diffs['Delivery Failure Rate'],
        'median_abs_deviation': df_snapshot_diffs['Delivery Failure Rate'],
    })
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({
        'median': 'median',
        'median_abs_deviation': stats.median_abs_deviation,
    })
    .reset_index()
)
# MAD as a percentage of the median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Exactly two allocations are expected; the unpacking raises otherwise.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normalised MADs for each allocation
group1 = xdf.loc[xdf[allocation_idx_mapped_title] == name1, 'normed median_abs_deviation']
group2 = xdf.loc[xdf[allocation_idx_mapped_title] == name2, 'normed median_abs_deviation']

# Best effort: a failure in the test is reported as a warning so the rest of
# the notebook still runs.
try:
    print(len(group1), len(group2))
    # flags groups that consist entirely of NaN
    print(group1.isna().all(), group2.isna().all())
    res = stats.mannwhitneyu(group1, group2)
    display(res)
except Exception as e:
    warnings.warn(f"{type(e).__name__}: {e}", RuntimeWarning)


In [None]:
# Best effort: plotting / summary failures are reported as warnings so the
# rest of the notebook still runs.
try:
    ax = sns.histplot(data=xdf, x="normed median_abs_deviation", hue=allocation_idx_mapped_title)
except Exception as e:
    warnings.warn(f"{type(e).__name__}: {e}", RuntimeWarning)

try:
    # display() explicitly: an expression nested inside ``try`` is not echoed
    # as cell output, so the table was previously computed and discarded.
    display(
        xdf.groupby(allocation_idx_mapped_title)['normed median_abs_deviation'].median().reset_index()
    )
except Exception as e:
    warnings.warn(f"{type(e).__name__}: {e}", RuntimeWarning)


### Percent Outliers


In [None]:
# Per-allocation result of count_nonoutliers on the metric.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Delivery Failure Rate': count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-allocation result of count_outliers on the metric.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Delivery Failure Rate': count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Contingency table: first row outlier counts, second row non-outlier counts,
# one column per allocation.
stats.chi2_contingency(
    np.vstack((
        outlier_counts["Delivery Failure Rate"],
        nonoutlier_counts["Delivery Failure Rate"],
    )),
)


In [None]:
# Outlier proportion per allocation, scaled to percent.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        'Delivery Failure Rate': lambda values: count_proportion_outliers(values) * 100,
    })
    .reset_index()
)


## Delivery Clumpiness


In [None]:
# Snapshot-diff delivery clumpiness, rendered as bars and as box plots
# with and without fliers.
for viz in (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Clumpiness',
        teeplot_subdir='delivery-clumpiness',
        teeplot_outattrs={
            'transform': 'snapshot_diffs',
            **nbm.collate_outattr_metadata(),
        },
    )


### Simple Mean and Median


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median per allocation over all snapshot observations.  String
# aliases are used because passing np.mean / np.median to agg is deprecated
# (pandas substitutes its own implementation for them).
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Delivery Clumpiness': ['mean', 'median'],
})


### Median of Replicate Means


In [None]:
# Step 1: mean within each (allocation, replicate).
# NOTE: variable names are shared with the median-of-medians cell; here
# group_medians holds per-replicate means.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Delivery Clumpiness': 'mean'})
    .reset_index()
)

# Step 2: median across replicates, as the section title states (the outer
# aggregation previously took the mean, giving a mean of means).
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Clumpiness': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Delivery Clumpiness"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Delivery Clumpiness"] - baseline) / baseline * 100
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median within each (allocation, replicate).  The 'median' string
# alias replaces the deprecated np.median callable in agg.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Delivery Clumpiness': 'median'})
    .reset_index()
)

# Step 2: median across replicates.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Clumpiness': 'median',
})

# Percent change relative to the allocation labelled 1.
baseline = median_of_medians.loc[1, "Delivery Clumpiness"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Delivery Clumpiness"] - baseline) / baseline * 100
median_of_medians


### Median Absolute Deviation


In [None]:
# For each (allocation, replicate) group compute the median and the median
# absolute deviation (MAD) of Delivery Clumpiness, express the MAD as a
# percentage of the median, and compare the two allocation groups.
xdf = df_snapshot_diffs.copy()
# duplicate the metric under two column names so each gets its own aggregator
xdf['median'] = xdf['Delivery Clumpiness']
xdf['median_abs_deviation'] = xdf['Delivery Clumpiness']
xdf = xdf.groupby([allocation_idx_mapped_title, 'Replicate']).agg({
    'median': np.median,
    'median_abs_deviation': stats.median_abs_deviation,
}).reset_index()
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# unpacking requires exactly two distinct allocation groups
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MADs for each group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

print(len(group1), len(group2))
# Mann-Whitney U test between the two groups' normed MADs
stats.mannwhitneyu(group1, group2)


In [None]:
# Histogram of the per-replicate normed MAD, coloured by allocation group.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normed MAD per allocation group.
(
    xdf
    .groupby(allocation_idx_mapped_title)['normed median_abs_deviation']
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Aggregate Delivery Clumpiness within each allocation group using the
# count_nonoutliers helper.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Delivery Clumpiness': count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Aggregate Delivery Clumpiness within each allocation group using the
# count_outliers helper.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Delivery Clumpiness': count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the two-row contingency table
# (row 0: outlier counts, row 1: non-outlier counts; one column per group).
stats.chi2_contingency(np.vstack([
    outlier_counts["Delivery Clumpiness"],
    nonoutlier_counts["Delivery Clumpiness"],
]))


In [None]:
# Per-group value of count_proportion_outliers, scaled by 100 (percent).
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        'Delivery Clumpiness': lambda values: count_proportion_outliers(values) * 100,
    })
    .reset_index()
)


## Simstep Period


In [None]:
# Plot Simstep Period Inlet (ns) against the allocation index once per plot
# style (bar plot, box plot with fliers, box plot without fliers). Each call
# is routed through tp.tee, which receives the plotter plus its keyword
# arguments and the teeplot_* output settings.
for viz in facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers:
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Inlet (ns)',
        # 'transform' tag merged with the notebook metadata attributes
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-inlet-ns',
    )


### Simple Mean and Median


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median of Simstep Period Inlet (ns) within each allocation group.
df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title],
).agg(
    {'Simstep Period Inlet (ns)': [np.mean, np.median]},
)


### Median of Replicate Means


In [None]:
# Median of replicate means:
#  1. average Simstep Period Inlet (ns) within each (allocation, replicate) pair,
#  2. take the median of those replicate means within each allocation.
# (Variable names are kept from the sibling "medians" cell; here
# `group_medians` actually holds per-replicate means.)
group_medians = df_snapshot_diffs.groupby([allocation_idx_mapped_title, 'Replicate']).agg({
    'Simstep Period Inlet (ns)': np.mean
}).reset_index()

# the outer aggregation must be the median to match this section's heading
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Simstep Period Inlet (ns)': np.median
})

# percent change of every group relative to the group with index label 1
baseline = median_of_medians.loc[1, "Simstep Period Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (median_of_medians["Simstep Period Inlet (ns)"] - baseline) / baseline * 100
median_of_medians


### Median of Replicate Medians


In [None]:
# Median of replicate medians: median within each (allocation, replicate)
# pair, then median of those values within each allocation.
group_medians = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg({'Simstep Period Inlet (ns)': np.median})
    .reset_index()
)
median_of_medians = (
    group_medians
    .groupby(allocation_idx_mapped_title)
    .agg({'Simstep Period Inlet (ns)': np.median})
)

# percent change of every group relative to the group with index label 1
baseline = median_of_medians.loc[1, "Simstep Period Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Simstep Period Inlet (ns)"] - baseline) / baseline * 100
)
median_of_medians


### Median Absolute Deviation


In [None]:
# For each (allocation, replicate) group compute the median and the median
# absolute deviation (MAD) of Simstep Period Inlet (ns), express the MAD as
# a percentage of the median, and compare the two allocation groups.
xdf = df_snapshot_diffs.copy()
# duplicate the metric under two column names so each gets its own aggregator
xdf['median'] = xdf['Simstep Period Inlet (ns)']
xdf['median_abs_deviation'] = xdf['Simstep Period Inlet (ns)']
xdf = xdf.groupby([allocation_idx_mapped_title, 'Replicate']).agg({
    'median': np.median,
    'median_abs_deviation': stats.median_abs_deviation,
}).reset_index()
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# unpacking requires exactly two distinct allocation groups
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MADs for each group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

print(len(group1), len(group2))
# Mann-Whitney U test between the two groups' normed MADs
stats.mannwhitneyu(group1, group2)


In [None]:
# Histogram of the per-replicate normed MAD, coloured by allocation group.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normed MAD per allocation group.
(
    xdf
    .groupby(allocation_idx_mapped_title)['normed median_abs_deviation']
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Aggregate Simstep Period Inlet (ns) within each allocation group using
# the count_nonoutliers helper.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Simstep Period Inlet (ns)': count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Aggregate Simstep Period Inlet (ns) within each allocation group using
# the count_outliers helper.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({'Simstep Period Inlet (ns)': count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the two-row contingency table
# (row 0: outlier counts, row 1: non-outlier counts; one column per group).
stats.chi2_contingency(np.vstack([
    outlier_counts["Simstep Period Inlet (ns)"],
    nonoutlier_counts["Simstep Period Inlet (ns)"],
]))


In [None]:
# Per-group value of count_proportion_outliers, scaled by 100 (percent).
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        'Simstep Period Inlet (ns)': lambda values: count_proportion_outliers(values) * 100,
    })
    .reset_index()
)


In [None]:
# Plot Simstep Period Outlet (ns) against the allocation index once per plot
# style (bar plot, box plot with fliers, box plot without fliers). Each call
# is routed through tp.tee, which receives the plotter plus its keyword
# arguments and the teeplot_* output settings.
for viz in facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers:
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Outlet (ns)',
        # 'transform' tag merged with the notebook metadata attributes
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-outlet-ns',
    )


In [None]:
# adapted from https://stackoverflow.com/a/13592901
# Mean and median of Simstep Period Outlet (ns) within each allocation group.
df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title],
).agg(
    {'Simstep Period Outlet (ns)': [np.mean, np.median]},
)


# Model Fits


In [None]:
def make_regression_row(*, data, independent_variable, dependent_variable, regression, row_filter):
    """Fit one quantile regression and summarize it as a flat record.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations to regress over. Must contain the independent and
        dependent variable columns plus 'Cpus Per Node',
        'Num Simels Per Cpu' and 'Num Processes'.
    independent_variable : str
        Column name used as the regressor.
    dependent_variable : str
        Column name used as the response.
    regression : str
        Either 'Quantile Regression over Means' or
        'Quantile Regression over Medians'; selects the statistic used to
        normalize the effect size and the labels written to the record.
    row_filter : str
        Python source of a row predicate (e.g. 'lambda row: True'); rows for
        which it returns a truthy value are kept for the fit.

    Returns
    -------
    dict
        Slope/intercept estimates, confidence bounds, effect sizes, p-value,
        significance flags and bookkeeping fields for this fit.

    Side effects
    ------------
    Writes the statsmodels fit summary to a text file under 'outplots/'.
    """

    # SECURITY NOTE: row_filter is passed to eval(); only supply trusted,
    # notebook-authored predicate strings.
    filtered_data = data[ data.apply(eval(row_filter), axis=1) ]

    regfun = {
        'Quantile Regression over Means' : smf.quantreg,
        'Quantile Regression over Medians' : smf.quantreg,
    }[regression]
    # Q('...') quotes column names that are not valid Python identifiers
    independent_term = f"Q('{independent_variable}')"
    model = regfun(f"Q('{dependent_variable}') ~ {independent_term}", filtered_data)
    # no arguments are passed, so the library's default quantile is fitted
    fit_model = model.fit()

    slope = fit_model.params[independent_term]
    intercept = fit_model.params['Intercept']

    # compute the confidence-interval table once and read both rows from it
    conf_int = fit_model.conf_int()
    slope_ci_lb, slope_ci_ub = conf_int.loc[independent_term].tolist()
    intercept_ci_lb, intercept_ci_ub = conf_int.loc['Intercept'].tolist()

    p = fit_model.pvalues.loc[independent_term]

    # normalize to "control", i.e., the rows holding the lowest observed
    # value of the independent variable (taken from the unfiltered data)
    effect_size_normalization_data = data[
        data[independent_variable] == data[independent_variable].min()
    ][dependent_variable]
    effect_size_normalization_factor = {
        'Quantile Regression over Means' : lambda x: x.mean(),
        'Quantile Regression over Medians' : lambda x: x.median(),
    }[regression](effect_size_normalization_data)
    relative_effect_size = slope / effect_size_normalization_factor
    relative_effect_size_ci_lb = slope_ci_lb / effect_size_normalization_factor
    relative_effect_size_ci_ub = slope_ci_ub / effect_size_normalization_factor
    relative_effect_size_ci_width = (
        relative_effect_size_ci_ub
        - relative_effect_size_ci_lb
    )

    # None marks "undetermined" when the p-value is NaN or infinite
    is_significant = p < 0.05 if np.isfinite(p) else None

    res = {
        'Independent Variable' : independent_variable,
        'Dependent Variable' : dependent_variable,
        'Dependent Variable Slug' : slugify(dependent_variable),
        'Cpus Per Node' : ib.dub( data['Cpus Per Node'] ),
        'Num Simels Per Cpu' : ip.pophomogeneous( data['Num Simels Per Cpu'] ),
        'Slope Estimate' : slope,
        'Slope Estimate 95% CI Lower Bound' : slope_ci_lb,
        'Slope Estimate 95% CI Upper Bound' : slope_ci_ub,
        'Absolute Effect Size' : slope,
        'Absolute Effect Size 95% CI Lower Bound' : slope_ci_lb,
        'Absolute Effect Size 95% CI Upper Bound' : slope_ci_ub,
        'Absolute Effect Size 95% CI Width' : slope_ci_ub - slope_ci_lb,
        'Relative Effect Size' : relative_effect_size,
        'Relative Effect Size 95% CI Lower Bound' : relative_effect_size_ci_lb,
        'Relative Effect Size 95% CI Upper Bound' : relative_effect_size_ci_ub,
        'Relative Effect Size 95% CI Width' : relative_effect_size_ci_width,
        'Intercept Estimate' : intercept,
        'Intercept Estimate 95% CI Lower Bound' : intercept_ci_lb,
        'Intercept Estimate 95% CI Upper Bound' : intercept_ci_ub,
        # NOTE(review): quantile-regression results may report `rsquared` as
        # NaN (the pseudo R^2 lives in `prsquared`) -- confirm which is wanted
        'R^2' : fit_model.rsquared,
        'p' : p,
        'Significant?' : is_significant,
        'Significant Effect Sign' : (
            '-' if is_significant and slope < 0
            else '+' if is_significant and slope > 0
            else '0' if is_significant is not None
            else None
        ),
        'n' : len(filtered_data),
        'Filter' : row_filter,
        'Num Processes' : ib.dub(filtered_data['Num Processes']),
        'Num Processes Prettyprint' : (
            '/'.join(filtered_data['Num Processes'].sort_values().astype(str).unique())
        ),
        'Regression Model' : regression,
        'Regression Model Slug' : slugify(regression),
        'Statistic' : {
            'Quantile Regression over Means' : 'mean',
            'Quantile Regression over Medians' : 'median',
        }[regression],
    }

    # dump regression summary to file; the filename encodes the fields that
    # identify this fit
    summary_filename = kn.pack({
        **{
            'a' : 'regression_summary',
            'ext' : '.txt',
        },
        **{
            slugify(k) : slugify(str(v))
            for k, v in res.items()
            if k in [
                'Independent Variable',
                'Dependent Variable',
                'Cpus Per Node',
                'Num Simels Per Cpu',
                'Regression Model',
            ]
        },
    })

    pathlib.Path('outplots').mkdir(parents=True, exist_ok=True)
    with open(f'outplots/{summary_filename}', 'w') as file:
        print(fit_model.summary(), file=file)

    return res


In [None]:
dependent_variables = [
    'Latency Walltime Inlet (ns)',
    'Latency Walltime Outlet (ns)',
    'Latency Simsteps Inlet',
    'Latency Simsteps Outlet',
    'Delivery Failure Rate',
    'Delivery Clumpiness',
    'Simstep Period Inlet (ns)',
    'Simstep Period Outlet (ns)',
]

# best-case approximation to replace infs/nans
# see listings of infs/nans below
# (each denominator is clamped to at least 1 so that a zero count yields the
# full elapsed quantity rather than a division by zero)
df_snapshot_diffs_copy = df_snapshot_diffs.copy()
df_snapshot_diffs_copy['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)
df_snapshot_diffs_copy['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)
df_snapshot_diffs_copy['Latency Simsteps Inlet'] = (
    df_snapshot_diffs_copy['Num Puts Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)
df_snapshot_diffs_copy['Latency Simsteps Outlet'] = (
    df_snapshot_diffs_copy['Num Pulls Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)
df_snapshot_diffs_copy['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Puts Attempted'], 1)
)
df_snapshot_diffs_copy['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Pulls Attempted'], 1)
)

# One aggregated frame (one row per execution instance) per regression model.
regression_data_tuples = [
    (
        'Quantile Regression over Means',
        # means are taken over the inf/nan-free approximation built above
        # (previously the copy was built but never used); this matches the
        # per-metric mean plots later in the notebook
        df_snapshot_diffs_copy.groupby([
            'Execution Instance UUID',
        ]).mean().reset_index().astype({
            'Num Processes' : 'int64',
            'Allocated Tasks Per Node' : 'int64',
            'Cpus Per Node' : 'int64',
            'Num Simels Per Cpu' : 'int64',
        })
    ),
    (
        'Quantile Regression over Medians',
        # medians are taken over the raw snapshot diffs, as in the
        # per-metric median plots later in the notebook
        df_snapshot_diffs.groupby([
            'Execution Instance UUID',
        ]).median().reset_index().astype({
            'Num Processes' : 'int64',
            'Allocated Tasks Per Node' : 'int64',
            'Cpus Per Node' : 'int64',
            'Num Simels Per Cpu' : 'int64',
        })
    ),
]

# row predicates, given as source strings (make_regression_row evals them)
row_filters = [
    'lambda row: True',
]

# one record per (filter, regression model, Num Simels Per Cpu, dependent variable)
regression_results = pd.DataFrame.from_records([
    make_regression_row(
        data=data_subset,
        independent_variable=allocation_idx_mapped_title,
        dependent_variable=dependent_variable,
        regression=regression,
        row_filter=row_filter,
    )
    for row_filter in row_filters
    for regression, data in regression_data_tuples
    for _, data_subset in data.groupby([
        'Num Simels Per Cpu',
    ])
    for dependent_variable in dependent_variables
])


In [None]:
# Raise pandas' display limits so wide/long result tables are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Regression rows whose p-value is NaN or infinite.
regression_results.loc[~np.isfinite(regression_results['p'])]


In [None]:
# Gather the keyname attributes encoded in every inlet/outlet source
# filename, skipping empty keys, underscore-prefixed keys and 'ext'; keep
# only attributes present for every file.
input_attrs = pd.DataFrame.from_records([
    {
        k : v
        for k, v in kn.unpack(source_filename).items()
        if k and k[0] != '_' and k != 'ext'
    }
    for source_filename in [
        *df['Source File Inlet'].unique(),
        *df['Source File Outlet'].unique(),
    ]
]).dropna(
    axis='columns',
    how='any',
)

# Build the output filename from the input attributes plus a readability tag.
out_filename = lambda readability: kn.pack({
    **{
        col : ib.dub(input_attrs[col])
        for col in input_attrs.columns
    },
    **{
        'a' : 'with_lac_417_vs_sans_lac_417_regression_results',
        'readability' : readability,
        'ext' : '.csv',
    },
})


def _format_latexcsv_float(values):
    """Format float(s) to two significant figures for the LaTeX csv reader.

    Magnitudes strictly between 10 and 10e5 are written as integers with an
    apostrophe as thousands separator; everything else uses '.2g'.

    Depending on the pandas version, a callable `float_format` may be handed
    either a single value or a whole array of values, so both are accepted:
    a scalar yields a string, a sequence yields a list of strings.
    """
    def format_one(x):
        return (
            '{:_.0f}'.format(float(f'{x:.2g}')).replace('_', "'")
            if 10 < abs(x) < 10e5
            else f'{x:.2g}'
        )

    if np.ndim(values) == 0:
        return format_one(values)
    return [format_one(x) for x in values]


pathlib.Path('outplots').mkdir(parents=True, exist_ok=True)

# full-precision table for human inspection
out_filepath = f"outplots/{out_filename('human')}"
print(out_filepath)

regression_results.to_csv(
    out_filepath,
    index=False,
)

# LaTeX-friendly table: alphanumeric-only column names, rounded floats
out_filepath = f"outplots/{out_filename('latexcsvreader')}"
print(out_filepath)

regression_results.rename(
    columns=lambda col: ''.join(filter(str.isalnum, col)),
).to_csv(
    out_filepath,
    index=False,
    float_format=_format_latexcsv_float,
    na_rep='NaN',
)


In [None]:
# Raise pandas' display limits, then show the full regression results table.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
regression_results


In [None]:
# adapted from https://stackoverflow.com/questions/30385975/seaborn-factor-plot-custom-error-bars
# and https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html#visualization-errorbars
def errplot(x, y, err_lb, err_ub, **kwargs):
    """Draw bars of column `y` with asymmetric error bars on the current axes.

    `kwargs` must contain 'data' (a DataFrame); the remaining keyword
    arguments are forwarded to `DataFrame.plot`. `err_lb` / `err_ub` name the
    columns holding the lower and upper bounds. When `x` is None the x-axis
    ticks and tick labels are hidden.
    """
    axes = plt.gca()
    frame = kwargs.pop('data')

    # distance from each estimate to its lower / upper bound, one row per bound
    bounds = frame[[err_lb, err_ub]].to_numpy()
    estimates = frame[[y, y]].to_numpy()
    yerr = np.abs(bounds - estimates).transpose()

    # reference line at zero, drawn beneath the bars
    plt.axhline(y=0, zorder=1, color='black', linewidth=2)

    bar_axes = frame.plot(
        x=x,
        y=y,
        yerr=yerr,
        kind='bar',
        ax=axes,
        zorder=3,
        **kwargs,
    )
    bar_axes.grid(axis='y', zorder=0)

    if x is not None:
        return

    # adapted from https://stackoverflow.com/a/12998531
    plt.tick_params(
        axis='x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom=False,      # ticks along the bottom edge are off
        top=False,         # ticks along the top edge are off
        labelbottom=False,
    )


def facet_errplot(*, data, x=None, y, err_lb, err_ub, estimated_statistic, col=None, row=None, size_inches=None, **kwargs):
    """Draw a faceted bar chart with error bars and a figure-level title.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to plot; faceted and handed to `errplot` one facet at a time.
    x : str or None
        Column for the bar labels; None hides the x-axis ticks.
    y, err_lb, err_ub : str
        Columns holding the estimate and its lower / upper bounds.
    estimated_statistic : str
        Text shown in the figure title.
    col, row : str or None
        Facet columns; a facet dimension is only used when the column has
        more than one distinct value in `data`.
    size_inches : tuple or None
        Optional figure size passed to `Figure.set_size_inches`.
    **kwargs
        Forwarded to `errplot`.
    """
    g = sns.FacetGrid(
        # facet the frame passed by the caller, not a notebook-global variable
        data,
        col=col if col is not None and data[col].nunique() > 1 else None,
        row=row if row is not None and data[row].nunique() > 1 else None,
        margin_titles=True,
        sharey=False,
    )
    g.map_dataframe(
        errplot,
        x,
        y,
        err_lb,
        err_ub,
        **kwargs,
    )

    if size_inches is not None:
        plt.gcf().set_size_inches(*size_inches)

    # adapted from https://stackoverflow.com/a/29814281
    # leave headroom above the facets for the suptitle
    plt.gcf().subplots_adjust(top=0.9)
    plt.gcf().suptitle(
        f"Estimated Statistic = {estimated_statistic}",
    )


In [None]:
# One relative-effect-size figure per regression model.
# Group by a scalar key (not a one-element list) so that `regression` is the
# plain model name on every pandas version; a length-1 list key makes newer
# pandas yield 1-tuples, which would break the dict lookup below.
for regression, subset in regression_results.groupby(
    'Regression Model',
):
    tp.tee(
        # prevent filename length error
        lambda *args, **kwargs: facet_errplot(
            err_lb='Relative Effect Size 95% CI Lower Bound',
            err_ub='Relative Effect Size 95% CI Upper Bound',
            *args,
            **kwargs,
        ),
        data=subset,

        row='Num Simels Per Cpu',
        x='Dependent Variable',
        y='Relative Effect Size',
        estimated_statistic={
            'Quantile Regression over Medians' : 'Median',
            'Quantile Regression over Means' : 'Mean',
        }[regression],
        size_inches=(8, 8),
        teeplot_outattrs={
            **{
                'transform' : 'fit_regression',
            },
            **nbm.collate_outattr_metadata(),
        },
    )


In [None]:
# relative estimates, alternate
# One figure per (regression model, dependent variable) pair; no x column is
# passed, so facet_errplot falls back to its default x=None.

for (regression, dependent_variable), subset in regression_results.groupby([
    'Regression Model',
    'Dependent Variable',
]):
    tp.tee(
        # prevent filename length error
        lambda *args, **kwargs: facet_errplot(
            err_lb='Relative Effect Size 95% CI Lower Bound',
            err_ub='Relative Effect Size 95% CI Upper Bound',
            *args,
            **kwargs,
        ),
        data=subset,

        row='Num Simels Per Cpu',
        y='Relative Effect Size',
        # title text: dependent variable plus the statistic the model estimates
        estimated_statistic={
            'Quantile Regression over Medians' : f'{dependent_variable} Median',
            'Quantile Regression over Means' : f'{dependent_variable} Mean',
        }[regression],
        teeplot_outattrs={
            **{
                'transform' : 'fit_regression',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir=slugify(dependent_variable),
    )


In [None]:
# absolute estimates
# One figure per (regression model, dependent variable) pair; no x column is
# passed, so facet_errplot falls back to its default x=None.

for (regression, dependent_variable), subset in regression_results.groupby([
    'Regression Model',
    'Dependent Variable',
]):
    tp.tee(
        # prevent filename length error
        lambda *args, **kwargs: facet_errplot(
            err_lb='Absolute Effect Size 95% CI Lower Bound',
            err_ub='Absolute Effect Size 95% CI Upper Bound',
            *args,
            **kwargs,
        ),
        data=subset,

        row='Num Simels Per Cpu',
        y='Absolute Effect Size',
        # title text: dependent variable plus the statistic the model estimates
        estimated_statistic={
            'Quantile Regression over Medians' : f'{dependent_variable} Median',
            'Quantile Regression over Means' : f'{dependent_variable} Mean',
        }[regression],
        teeplot_outattrs={
            **{
                'transform' : 'fit_regression',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir=slugify(dependent_variable),
    )


In [None]:
def quantile_regplot(fit_reg=True, color=None, *args, **kwargs):
    """seaborn.regplot variant whose fitted line is a median (q=0.5) quantile regression.

    Parameters
    ----------
    fit_reg : bool
        When true, overlay the quantile-regression line and a shaded band.
    color : optional
        Colour used for the scatter, line and band.
    *args, **kwargs
        Forwarded to `seaborn.regplot` (always with `fit_reg=False`, so
        seaborn never draws its own fit). `kwargs` must contain 'x', 'y'
        (column names) and 'data' (a DataFrame).
    """
    x, y, data = kwargs['x'], kwargs['y'], kwargs['data']
    sns.regplot(
        *args,
        **kwargs,
        fit_reg=False,
        color=color,
    )

    if not fit_reg:
        return

    res = smf.quantreg(
        f"Q('{y}') ~ Q('{x}')",
        data,
    ).fit(q=0.5)
    m = res.params[f"Q('{x}')"]
    b = res.params['Intercept']

    # only the slope's confidence bounds are needed for the band
    m_ci = res.conf_int().loc[f"Q('{x}')"].tolist()

    # the band pivots around the fitted value at the midpoint of the x range
    center_x = np.mean([data[x].min(), data[x].max()])
    center_y = m * center_x + b

    xs = sorted(set(data[x]) | {center_x})
    ys = [
        m * x_ + b
        for x_ in xs
    ]
    # lower / upper envelope over the lines through the pivot whose slopes
    # are the slope confidence bounds
    y1 = [ min(
            m_ * ( x_ - center_x ) + center_y
            for m_ in m_ci
    ) for x_ in xs ]
    y2 = [ max(
            m_ * ( x_ - center_x ) + center_y
            for m_ in m_ci
    ) for x_ in xs ]

    plt.gca().plot(
        xs,
        ys,
        color=color,
    )
    plt.gca().fill_between(
        xs,
        y1,
        y2,
        alpha=0.2,
        color=color,
    )


In [None]:
def unsplit_regression(*args, regplot, **kwargs):
    """Draw scatter points in black and the regression fit in purple.

    `regplot` is called twice with the same arguments: once with
    `fit_reg=False` (points only) and once with `scatter=False` (fit only).
    Any 'color' entry in `kwargs` is discarded so the two fixed colours
    apply. Afterwards the x-axis of the current axes is restricted to
    integer tick locations.
    """
    # tolerate callers that do not pass a colour (FacetGrid always does)
    kwargs.pop('color', None)
    regplot(
        *args,
        **kwargs,
        color='black',
        fit_reg=False,
    )
    regplot(
        *args,
        **kwargs,
        color='purple',
        scatter=False,
    )

    # adapted from https://www.scivision.dev/matplotlib-force-integer-labeling-of-axis/
    plt.gca().xaxis.set_major_locator(
        matplotlib.ticker.MaxNLocator(
            integer=True,
        ),
    )

def facet_unsplit_regression(*, data, col=None, row=None, x, y, regression, **kwargs):
    """Faceted scatter-plus-fit plot of `y` against `x`.

    `regression` must be 'Ordinary Least Squares Regression' or
    'Quantile Regression'; both labels currently map to `quantile_regplot`
    and only select the figure title. A facet dimension is used only when its
    column has more than one distinct value in `data`. Remaining keyword
    arguments are forwarded to `unsplit_regression`.
    """
    def facet_or_none(variable):
        # drop degenerate facets (absent or single-valued columns)
        if variable is not None and data[variable].nunique() > 1:
            return variable
        return None

    grid = sns.FacetGrid(
        data,
        col=facet_or_none(col),
        row=facet_or_none(row),
        margin_titles=True,
        sharey=False,
    )

    regplot_by_label = {
        'Ordinary Least Squares Regression' : quantile_regplot,
        'Quantile Regression' : quantile_regplot,
    }
    grid.map_dataframe(
        unsplit_regression,
        regplot=regplot_by_label[regression],
        x=x,
        y=y,
        **kwargs,
    )

    # adapted from https://stackoverflow.com/a/48208266
    grid.set_axis_labels(x_var=x, y_var=y)

    # adapted from https://stackoverflow.com/a/29814281
    title_by_label = {
        "Quantile Regression": "Quantile Regression over Medians",
        "Ordinary Least Squares Regression": "Quantile Regression over Means",
    }
    figure = plt.gcf()
    figure.subplots_adjust(top=0.8)
    plt.gcf().suptitle(title_by_label[regression])


## Latency Walltime Inlet (ns)


In [None]:
# Snapshot-diff rows whose 'Latency Walltime Inlet (ns)' is NaN or infinite,
# shown with identifying columns.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Walltime Inlet (ns)']),
    [
        'Latency Walltime Inlet (ns)',
        'Latency Walltime Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the denominator is clamped to at least 1, so a zero touch count yields
# the full elapsed time instead of a division by zero)
df_snapshot_diffs_copy['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)

# one row per execution instance (column-wise means), with the listed
# columns cast to int64
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Means" title
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-inlet-ns',
    )


In [None]:
# one row per execution instance (column-wise medians of the raw snapshot
# diffs), with the listed columns cast to int64
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Medians" title
        regression='Quantile Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-inlet-ns',
    )


## Latency Walltime Outlet (ns)


In [None]:
# Snapshot-diff rows whose 'Latency Walltime Outlet (ns)' is NaN or infinite,
# shown with identifying columns.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Walltime Outlet (ns)']),
    [
        'Latency Walltime Inlet (ns)',
        'Latency Walltime Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the denominator is clamped to at least 1, so a zero touch count yields
# the full elapsed time instead of a division by zero)
df_snapshot_diffs_copy['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)

# one row per execution instance (column-wise means), with the listed
# columns cast to int64
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Means" title
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-outlet-ns',
    )


In [None]:
# one row per execution instance (column-wise medians of the raw snapshot
# diffs), with the listed columns cast to int64
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Medians" title
        regression='Quantile Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-outlet-ns',
    )


## Latency Simsteps Inlet


In [None]:
# Snapshot-diff rows whose 'Latency Simsteps Inlet' is NaN or infinite,
# shown with identifying columns.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Simsteps Inlet']),
    [
        'Latency Simsteps Inlet',
        'Latency Simsteps Outlet',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the denominator is clamped to at least 1, so a zero touch count yields
# the full put count instead of a division by zero)
df_snapshot_diffs_copy['Latency Simsteps Inlet'] = (
    df_snapshot_diffs_copy['Num Puts Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)

# one row per execution instance (column-wise means), with the listed
# columns cast to int64
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    # cast the x variable too, as every sibling regression-plot cell does
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Inlet',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Means" title
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-inlet',
    )


In [None]:
# one row per execution instance (column-wise medians of the raw snapshot
# diffs), with the listed columns cast to int64
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Inlet',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Medians" title
        regression='Quantile Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-inlet',
    )


## Latency Simsteps Outlet


In [None]:
# Snapshot-diff rows whose 'Latency Simsteps Outlet' is NaN or infinite,
# shown with identifying columns.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Simsteps Outlet']),
    [
        'Latency Simsteps Inlet',
        'Latency Simsteps Outlet',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the denominator is clamped to at least 1, so a zero touch count yields
# the full pull count instead of a division by zero)
df_snapshot_diffs_copy['Latency Simsteps Outlet'] = (
    df_snapshot_diffs_copy['Num Pulls Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)

# one row per execution instance (column-wise means), with the listed
# columns cast to int64
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Outlet',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Means" title
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-outlet',
    )


In [None]:
# one row per execution instance (column-wise medians of the raw snapshot
# diffs), with the listed columns cast to int64
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median().reset_index().astype({
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: only one plot style is drawn
for viz in facet_unsplit_regression,:
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Outlet',
        marker='+',
        x_jitter=0.15,
        # this label selects the "Quantile Regression over Medians" title
        regression='Quantile Regression',
        teeplot_outattrs={
            **{
                'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            },
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-outlet',
    )


## Delivery Failure Rate


In [None]:
# Snapshot-diff rows whose 'Delivery Failure Rate' is NaN or infinite,
# shown with identifying columns.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Delivery Failure Rate']),
    [
        'Delivery Failure Rate',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
# Aggregate the snapshot diffs to one row per execution instance by averaging.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .mean() meets non-numeric columns (the frame carries e.g. 'Hostname'), while
# older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # .mean() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Failure Rate',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-failure-rate',
    )


In [None]:
# Aggregate the snapshot diffs to one row per execution instance by taking
# the median.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .median() meets non-numeric columns (the frame carries e.g. 'Hostname'),
# while older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # .median() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Failure Rate',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-failure-rate',
    )


## Delivery Clumpiness


In [None]:
# Show the snapshot-diff rows whose 'Delivery Clumpiness' is not finite
# (NaN or +/-inf), restricted to the columns that identify each row.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Delivery Clumpiness']),
    [
        'Delivery Clumpiness',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
# Aggregate the snapshot diffs to one row per execution instance by averaging.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .mean() meets non-numeric columns (the frame carries e.g. 'Hostname'), while
# older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # .mean() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Clumpiness',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-clumpiness',
    )


In [None]:
# Aggregate the snapshot diffs to one row per execution instance by taking
# the median.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .median() meets non-numeric columns (the frame carries e.g. 'Hostname'),
# while older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # .median() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Clumpiness',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-clumpiness',
    )


## Simstep Period Inlet (ns)


In [None]:
# Show the snapshot-diff rows whose 'Simstep Period Inlet (ns)' is not finite
# (NaN or +/-inf); the outlet period is listed next to it for comparison,
# followed by the columns that identify each row.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Simstep Period Inlet (ns)']),
    [
        'Simstep Period Inlet (ns)',
        'Simstep Period Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
# work on a copy so the substitution below leaves df_snapshot_diffs untouched
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs
# see listing of infs above
# (the denominator is clamped to at least 1, so a zero put count can no
# longer produce a division by zero that would poison the mean)
df_snapshot_diffs_copy['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Puts Attempted'], 1)
)

# Aggregate to one row per execution instance by averaging.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .mean() meets non-numeric columns (the frame carries e.g. 'Hostname'), while
# older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # .mean() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-inlet-ns',
    )


In [None]:
# Aggregate the raw snapshot diffs to one row per execution instance by
# taking the median.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .median() meets non-numeric columns (the frame carries e.g. 'Hostname'),
# while older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # .median() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-inlet-ns',
    )


## Simstep Period Outlet (ns)


In [None]:
# Show the snapshot-diff rows whose 'Simstep Period Outlet (ns)' is not finite
# (NaN or +/-inf); the inlet period is listed next to it for comparison,
# followed by the columns that identify each row.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Simstep Period Outlet (ns)']),
    [
        'Simstep Period Inlet (ns)',
        'Simstep Period Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
# work on a copy so the substitution below leaves df_snapshot_diffs untouched
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs
# see listing of infs above
# (the denominator is clamped to at least 1, so a zero pull count can no
# longer produce a division by zero that would poison the mean)
df_snapshot_diffs_copy['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Pulls Attempted'], 1)
)

# Aggregate to one row per execution instance by averaging.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .mean() meets non-numeric columns (the frame carries e.g. 'Hostname'), while
# older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # .mean() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-outlet-ns',
    )


In [None]:
# Aggregate the raw snapshot diffs to one row per execution instance by
# taking the median.
# numeric_only=True is spelled out because pandas >= 2.0 raises TypeError when
# .median() meets non-numeric columns (the frame carries e.g. 'Hostname'),
# while older releases dropped such columns silently; this form works on both.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # .median() yields floats; cast these columns back to integers
    # (presumably constant within an execution instance -- TODO confirm)
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# one-element tuple: the loop body runs once, with facet_unsplit_regression
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        # record how the plotted data was derived, plus notebook metadata
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-outlet-ns',
    )


# Outlier Analysis


In [None]:
# Raise pandas' display limits so the outlier rows below are rendered in full.
# NOTE: these options are process-wide and stay in effect for later cells.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Snapshot diffs with inlet latency above 50 simsteps at 1 simel per CPU.
df_snapshot_diffs.loc[
    df_snapshot_diffs['Latency Simsteps Inlet'].gt(50)
    & df_snapshot_diffs['Num Simels Per Cpu'].eq(1)
]


In [None]:
# Raise pandas' display limits so the outlier rows below are rendered in full.
# NOTE: these options are process-wide and stay in effect for later cells.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Snapshot diffs with inlet latency above 50 simsteps at 2048 simels per CPU.
df_snapshot_diffs.loc[
    df_snapshot_diffs['Latency Simsteps Inlet'].gt(50)
    & df_snapshot_diffs['Num Simels Per Cpu'].eq(2048)
]


In [None]:
# Derive two extra columns on df_snapshot_diffs (added in place).
#
# 'Num Messages Per Pull': net flux divided by the number of immediately-laden
# pulls and then by the delivery success rate, floored at 1.
df_snapshot_diffs["Num Messages Per Pull"] = (
    df_snapshot_diffs['Net Flux Through Duct']
    .div(df_snapshot_diffs['Num Pulls That Were Laden Immediately'])
    .div(df_snapshot_diffs["Delivery Success Rate"])
    .clip(lower=1)
)
# clip() leaves NaN untouched, so this also fails if any quotient is NaN
assert df_snapshot_diffs["Num Messages Per Pull"].ge(1).all()

# 'Any Messages Dropped': truthiness of the dropped fraction (nonzero -> True).
# NOTE(review): NaN also casts to True under astype(bool) -- confirm that
# 'Fraction Messages Dropped' never holds NaN, or that True is intended there.
df_snapshot_diffs["Any Messages Dropped"] = (
    df_snapshot_diffs["Fraction Messages Dropped"].astype(bool)
)


In [None]:
# Draw conduitpylib's performance-semantics plot over all snapshot diffs,
# split by the 'Allocation' column with "Sans lac-417" ordered first.
performance_semantics_plot(
    hue_order=["Sans lac-417", "With lac-417"],
    hue="Allocation",
    data=df_snapshot_diffs,
)
