In [None]:
import warnings

from iterdub import iterdub as ib
from iterpop import iterpop as ip
from keyname import keyname as kn
from matplotlib import pyplot as plt
import matplotlib
from nbmetalog import nbmetalog as nbm
import numpy as np
import pandas as pd
import pathlib
from scipy import stats
import seaborn as sns
from slugify import slugify
import statsmodels.api as sm
import statsmodels.formula.api as smf
from teeplot import teeplot as tp


In [None]:
from conduitpylib.utils import (
    consolidate_merge,
    count_outliers,
    count_nonoutliers,
    count_proportion_outliers,
)

from conduitpylib.wrangle import (
    find_treat_idx_mapped_col,
    retrieve_and_prepare_delta_dataframes,
    wrangle_world_sums,
)


In [None]:
# Print notebook metadata via nbmetalog (the exact fields are decided by nbmetalog).
nbm.print_metadata()


# Get Data


In [None]:
# Build the working dataframes from the inlet/outlet sources (presumably
# fetched from the OSF URLs below -- confirm in conduitpylib.wrangle).
# With return_merge_df=True the helper's result unpacks into the merged frame
# plus a (finalized observations, snapshot diffs) pair.
merge_df, (
    df_finalized_observations,
    df_snapshot_diffs,
) = retrieve_and_prepare_delta_dataframes(
    df_inlet_url="https://osf.io/jgpnv/download",
    df_outlet_url="https://osf.io/ncdfq/download",
    treatment_column="Allocation",
    return_merge_df=True,
)

# Column name used throughout the notebook as the treatment axis / groupby key.
allocation_idx_mapped_title = find_treat_idx_mapped_col(df_snapshot_diffs)
allocation_idx_mapped_title


In [None]:
# End-state frame plotted in the "End-state Data Analysis" section below.
df_world_sum = wrangle_world_sums(df_finalized_observations)


# End-state Data Analysis

This data appears to be skewed by ragged network launch/completion.


In [None]:
from conduitpylib.viz import facet_boxplot


## Latency Walltime


In [None]:
# End-state inlet walltime latency by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Latency Walltime Inlet (s)",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="latency-walltime-inlet-s",
)


In [None]:
# End-state outlet walltime latency by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Latency Walltime Outlet (s)",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="latency-walltime-outlet-s",
)


## Latency Simsteps


In [None]:
# End-state inlet simstep latency by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Latency Simsteps Inlet",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="latency-simsteps-inlet",
)


In [None]:
# End-state outlet simstep latency by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Latency Simsteps Outlet",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="latency-simsteps-outlet",
)


## Delivery Failure Rate


In [None]:
# End-state delivery failure rate by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Delivery Failure Rate",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="delivery-failure-rate",
)


## Delivery Clumpiness


In [None]:
# End-state delivery clumpiness by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Delivery Clumpiness",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="delivery-clumpiness",
)


## Simstep Period


In [None]:
# End-state inlet simstep period by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Simstep Period Inlet (s)",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="simstep-period-inlet-s",
)


In [None]:
# End-state outlet simstep period by treatment, one facet row per
# 'Num Simels Per Cpu' value, drawn with facet_boxplot through tp.tee.
tp.tee(
    facet_boxplot,
    data=df_world_sum,
    row="Num Simels Per Cpu",
    x=allocation_idx_mapped_title,
    y="Simstep Period Outlet (s)",
    showfliers=True,
    teeplot_outattrs={
        "transform": "endstate_sumedbyrep",
        **nbm.collate_outattr_metadata(),
    },
    teeplot_subdir="simstep-period-outlet-s",
)


# Live Snapshot Analysis


In [None]:
from conduitpylib.viz import (
    facet_barplot,
    facet_boxplot_withfliers,
    facet_boxplot_nofliers,
)


## Latency Walltime


In [None]:
# Draw snapshot-diff inlet walltime latency once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Latency Walltime Inlet (ns)",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="latency-walltime-inlet-ns",
    )


### Simple Mean and Median


In [None]:
# Per-treatment mean and median of inlet walltime latency over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Walltime Inlet (ns)': [
        'mean',
        'median',
    ],
})


### Median of Replicate Means


In [None]:
# Step 1: mean inlet walltime latency within each (treatment, replicate) pair.
# NOTE: despite its name, `group_medians` holds replicate *means* in this cell.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Latency Walltime Inlet (ns)': 'mean',
}).reset_index()

# Step 2: median across replicates of those means, per treatment -- the
# "Median of Replicate Means" named by the section heading.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Walltime Inlet (ns)': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Latency Walltime Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Latency Walltime Inlet (ns)"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median inlet walltime latency within each (treatment, replicate) pair.
# Aggregations are named by string because NumPy callables in agg() are
# deprecated in recent pandas.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Latency Walltime Inlet (ns)': 'median',
}).reset_index()

# Step 2: median across replicates of those medians, per treatment.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Walltime Inlet (ns)': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Latency Walltime Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Latency Walltime Inlet (ns)"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median Absolute Deviation


In [None]:
# Median and median absolute deviation (MAD) of inlet walltime latency for
# each (treatment, replicate) pair. Named aggregation produces the 'median'
# and 'median_abs_deviation' columns directly, without copying the frame.
xdf = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg(
        median=('Latency Walltime Inlet (ns)', 'median'),
        median_abs_deviation=(
            'Latency Walltime Inlet (ns)',
            stats.median_abs_deviation,
        ),
    )
    .reset_index()
)
# MAD expressed as a percentage of the replicate's median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Unpacking requires exactly two distinct treatment values.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MAD values for each treatment group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

# Group sizes, then a Mann-Whitney U test between the two treatments.
print(len(group1), len(group2))
stats.mannwhitneyu(group1, group2)


In [None]:
# Histogram of per-replicate normed MAD, coloured by treatment.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normed MAD per treatment.
(
    xdf
    .groupby(allocation_idx_mapped_title)["normed median_abs_deviation"]
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Per-treatment result of count_nonoutliers on inlet walltime latency.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Latency Walltime Inlet (ns)": count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-treatment result of count_outliers on inlet walltime latency.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Latency Walltime Inlet (ns)": count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the contingency table with rows
# (outlier counts, non-outlier counts) and one column per treatment.
stats.chi2_contingency(
    np.vstack([
        outlier_counts["Latency Walltime Inlet (ns)"],
        nonoutlier_counts["Latency Walltime Inlet (ns)"],
    ]),
)


In [None]:
# Per-treatment count_proportion_outliers of inlet walltime latency,
# scaled by 100 to give a percentage.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        "Latency Walltime Inlet (ns)": (
            lambda series: count_proportion_outliers(series) * 100
        ),
    })
    .reset_index()
)


### Distributions


In [None]:
# Draw snapshot-diff outlet walltime latency once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Latency Walltime Outlet (ns)",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="latency-walltime-outlet-ns",
    )


In [None]:
# Per-treatment mean and median of outlet walltime latency over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Walltime Outlet (ns)': [
        'mean',
        'median',
    ],
})


## Latency Simsteps


In [None]:
# Draw snapshot-diff inlet simstep latency once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Latency Simsteps Inlet",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="latency-simsteps-inlet",
    )


### Simple Mean and Median


In [None]:
# Per-treatment mean and median of inlet simstep latency over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Latency Simsteps Inlet': [
        'mean',
        'median',
    ],
})


### Median of Replicate Means


In [None]:
# Step 1: mean inlet simstep latency within each (treatment, replicate) pair.
# NOTE: despite its name, `group_medians` holds replicate *means* in this cell.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Latency Simsteps Inlet': 'mean',
}).reset_index()

# Step 2: median across replicates of those means, per treatment -- the
# "Median of Replicate Means" named by the section heading.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Simsteps Inlet': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Latency Simsteps Inlet"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Latency Simsteps Inlet"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median inlet simstep latency within each (treatment, replicate) pair.
# Aggregations are named by string because NumPy callables in agg() are
# deprecated in recent pandas.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Latency Simsteps Inlet': 'median',
}).reset_index()

# Step 2: median across replicates of those medians, per treatment.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Latency Simsteps Inlet': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Latency Simsteps Inlet"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Latency Simsteps Inlet"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median Absolute Deviation


In [None]:
# Median and median absolute deviation (MAD) of inlet simstep latency for
# each (treatment, replicate) pair. Named aggregation produces the 'median'
# and 'median_abs_deviation' columns directly, without copying the frame.
xdf = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg(
        median=('Latency Simsteps Inlet', 'median'),
        median_abs_deviation=(
            'Latency Simsteps Inlet',
            stats.median_abs_deviation,
        ),
    )
    .reset_index()
)
# MAD expressed as a percentage of the replicate's median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Unpacking requires exactly two distinct treatment values.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MAD values for each treatment group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

# Group sizes, then a Mann-Whitney U test between the two treatments.
print(len(group1), len(group2))
stats.mannwhitneyu(group1, group2)


In [None]:
# Histogram of per-replicate normed MAD, coloured by treatment.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normed MAD per treatment.
(
    xdf
    .groupby(allocation_idx_mapped_title)["normed median_abs_deviation"]
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Per-treatment result of count_nonoutliers on inlet simstep latency.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Latency Simsteps Inlet": count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-treatment result of count_outliers on inlet simstep latency.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Latency Simsteps Inlet": count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the contingency table with rows
# (outlier counts, non-outlier counts) and one column per treatment.
stats.chi2_contingency(
    np.vstack([
        outlier_counts["Latency Simsteps Inlet"],
        nonoutlier_counts["Latency Simsteps Inlet"],
    ]),
)


In [None]:
# Per-treatment count_proportion_outliers of inlet simstep latency,
# scaled by 100 to give a percentage.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        "Latency Simsteps Inlet": (
            lambda series: count_proportion_outliers(series) * 100
        ),
    })
    .reset_index()
)


In [None]:
# Draw snapshot-diff outlet simstep latency once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Latency Simsteps Outlet",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="latency-simsteps-outlet",
    )


In [None]:
# Per-treatment mean and median of outlet simstep latency over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby(
    [
        allocation_idx_mapped_title,
    ]
).agg(
    {
        "Latency Simsteps Outlet": [
            "mean",
            "median",
        ],
    }
)


## Delivery Failure Rate


In [None]:
# Draw snapshot-diff delivery failure rate once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Delivery Failure Rate",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="delivery-failure-rate",
    )


### Simple Mean and Median


In [None]:
# Per-treatment mean and median of delivery failure rate over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Delivery Failure Rate': [
        'mean',
        'median',
    ],
})


### Median of Replicate Means


In [None]:
# Step 1: mean delivery failure rate within each (treatment, replicate) pair.
# NOTE: despite its name, `group_medians` holds replicate *means* in this cell.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Delivery Failure Rate': 'mean',
}).reset_index()

# Step 2: median across replicates of those means, per treatment -- the
# "Median of Replicate Means" named by the section heading.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Failure Rate': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Delivery Failure Rate"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Delivery Failure Rate"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median delivery failure rate within each (treatment, replicate) pair.
# Aggregations are named by string because NumPy callables in agg() are
# deprecated in recent pandas.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Delivery Failure Rate': 'median',
}).reset_index()

# Step 2: median across replicates of those medians, per treatment.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Failure Rate': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Delivery Failure Rate"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Delivery Failure Rate"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median Absolute Deviation


In [None]:
# Median and median absolute deviation (MAD) of delivery failure rate for
# each (treatment, replicate) pair. Named aggregation produces the 'median'
# and 'median_abs_deviation' columns directly, without copying the frame.
xdf = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg(
        median=('Delivery Failure Rate', 'median'),
        median_abs_deviation=(
            'Delivery Failure Rate',
            stats.median_abs_deviation,
        ),
    )
    .reset_index()
)
# MAD expressed as a percentage of the replicate's median; a zero median
# yields non-finite values here.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Unpacking requires exactly two distinct treatment values.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MAD values for each treatment group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

# Best-effort test: any failure is downgraded to a RuntimeWarning so the
# rest of the notebook still runs.
try:
    print(len(group1), len(group2))
    print(group1.isna().all(), group2.isna().all())
    res = stats.mannwhitneyu(group1, group2)
    display(res)
except Exception as e:
    warnings.warn(f"{type(e).__name__}: {e}", RuntimeWarning)


In [None]:
# Best-effort histogram of per-replicate normed MAD, coloured by treatment;
# failures are downgraded to a RuntimeWarning.
try:
    ax = sns.histplot(data=xdf, x="normed median_abs_deviation", hue=allocation_idx_mapped_title)
except Exception as e:
    warnings.warn(f"{type(e).__name__}: {e}", RuntimeWarning)

# Median normed MAD per treatment. An expression nested inside `try` is not
# the cell's last top-level expression, so it must be passed to display()
# explicitly or the table is silently discarded.
try:
    display(
        xdf.groupby(allocation_idx_mapped_title)['normed median_abs_deviation'].median().reset_index()
    )
except Exception as e:
    warnings.warn(f"{type(e).__name__}: {e}", RuntimeWarning)


### Percent Outliers


In [None]:
# Per-treatment result of count_nonoutliers on delivery failure rate.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Delivery Failure Rate": count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-treatment result of count_outliers on delivery failure rate.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Delivery Failure Rate": count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the contingency table with rows
# (outlier counts, non-outlier counts) and one column per treatment.
stats.chi2_contingency(
    np.vstack([
        outlier_counts["Delivery Failure Rate"],
        nonoutlier_counts["Delivery Failure Rate"],
    ]),
)


In [None]:
# Per-treatment count_proportion_outliers of delivery failure rate,
# scaled by 100 to give a percentage.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        "Delivery Failure Rate": (
            lambda series: count_proportion_outliers(series) * 100
        ),
    })
    .reset_index()
)


## Delivery Clumpiness


In [None]:
# Draw snapshot-diff delivery clumpiness once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Delivery Clumpiness",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="delivery-clumpiness",
    )


### Simple Mean and Median


In [None]:
# Per-treatment mean and median of delivery clumpiness over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Delivery Clumpiness': [
        'mean',
        'median',
    ],
})


### Median of Replicate Means


In [None]:
# Step 1: mean delivery clumpiness within each (treatment, replicate) pair.
# NOTE: despite its name, `group_medians` holds replicate *means* in this cell.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Delivery Clumpiness': 'mean',
}).reset_index()

# Step 2: median across replicates of those means, per treatment -- the
# "Median of Replicate Means" named by the section heading.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Clumpiness': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Delivery Clumpiness"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Delivery Clumpiness"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median delivery clumpiness within each (treatment, replicate) pair.
# Aggregations are named by string because NumPy callables in agg() are
# deprecated in recent pandas.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Delivery Clumpiness': 'median',
}).reset_index()

# Step 2: median across replicates of those medians, per treatment.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Delivery Clumpiness': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Delivery Clumpiness"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Delivery Clumpiness"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median Absolute Deviation


In [None]:
# Median and median absolute deviation (MAD) of delivery clumpiness for
# each (treatment, replicate) pair. Named aggregation produces the 'median'
# and 'median_abs_deviation' columns directly, without copying the frame.
xdf = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg(
        median=('Delivery Clumpiness', 'median'),
        median_abs_deviation=(
            'Delivery Clumpiness',
            stats.median_abs_deviation,
        ),
    )
    .reset_index()
)
# MAD expressed as a percentage of the replicate's median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Unpacking requires exactly two distinct treatment values.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MAD values for each treatment group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

# Group sizes, then a Mann-Whitney U test between the two treatments.
print(len(group1), len(group2))
stats.mannwhitneyu(group1, group2)


In [None]:
# Histogram of per-replicate normed MAD, coloured by treatment.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normed MAD per treatment.
(
    xdf
    .groupby(allocation_idx_mapped_title)["normed median_abs_deviation"]
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Per-treatment result of count_nonoutliers on delivery clumpiness.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Delivery Clumpiness": count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-treatment result of count_outliers on delivery clumpiness.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Delivery Clumpiness": count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the contingency table with rows
# (outlier counts, non-outlier counts) and one column per treatment.
stats.chi2_contingency(
    np.vstack([
        outlier_counts["Delivery Clumpiness"],
        nonoutlier_counts["Delivery Clumpiness"],
    ]),
)


In [None]:
# Per-treatment count_proportion_outliers of delivery clumpiness,
# scaled by 100 to give a percentage.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        "Delivery Clumpiness": (
            lambda series: count_proportion_outliers(series) * 100
        ),
    })
    .reset_index()
)


## Simstep Period


In [None]:
# Draw snapshot-diff inlet simstep period once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Simstep Period Inlet (ns)",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="simstep-period-inlet-ns",
    )


### Simple Mean and Median


In [None]:
# Per-treatment mean and median of inlet simstep period over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Simstep Period Inlet (ns)': [
        'mean',
        'median',
    ],
})


### Median of Replicate Means


In [None]:
# Step 1: mean inlet simstep period within each (treatment, replicate) pair.
# NOTE: despite its name, `group_medians` holds replicate *means* in this cell.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Simstep Period Inlet (ns)': 'mean',
}).reset_index()

# Step 2: median across replicates of those means, per treatment -- the
# "Median of Replicate Means" named by the section heading.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Simstep Period Inlet (ns)': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Simstep Period Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Simstep Period Inlet (ns)"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median of Replicate Medians


In [None]:
# Step 1: median inlet simstep period within each (treatment, replicate) pair.
# Aggregations are named by string because NumPy callables in agg() are
# deprecated in recent pandas.
group_medians = df_snapshot_diffs.groupby(
    [allocation_idx_mapped_title, 'Replicate'],
).agg({
    'Simstep Period Inlet (ns)': 'median',
}).reset_index()

# Step 2: median across replicates of those medians, per treatment.
median_of_medians = group_medians.groupby(allocation_idx_mapped_title).agg({
    'Simstep Period Inlet (ns)': 'median',
})

# Percent change relative to the treatment whose index label is 1.
baseline = median_of_medians.loc[1, "Simstep Period Inlet (ns)"].squeeze()
median_of_medians["normed delta"] = (
    (median_of_medians["Simstep Period Inlet (ns)"] - baseline)
    / baseline
    * 100
)
median_of_medians


### Median Absolute Deviation


In [None]:
# Median and median absolute deviation (MAD) of inlet simstep period for
# each (treatment, replicate) pair. Named aggregation produces the 'median'
# and 'median_abs_deviation' columns directly, without copying the frame.
xdf = (
    df_snapshot_diffs
    .groupby([allocation_idx_mapped_title, 'Replicate'])
    .agg(
        median=('Simstep Period Inlet (ns)', 'median'),
        median_abs_deviation=(
            'Simstep Period Inlet (ns)',
            stats.median_abs_deviation,
        ),
    )
    .reset_index()
)
# MAD expressed as a percentage of the replicate's median.
xdf["normed median_abs_deviation"] = xdf["median_abs_deviation"] / xdf["median"] * 100

# Unpacking requires exactly two distinct treatment values.
name1, name2 = xdf[allocation_idx_mapped_title].unique()

# Extract the normed MAD values for each treatment group
group1 = xdf[xdf[allocation_idx_mapped_title] == name1]['normed median_abs_deviation']
group2 = xdf[xdf[allocation_idx_mapped_title] == name2]['normed median_abs_deviation']

# Group sizes, then a Mann-Whitney U test between the two treatments.
print(len(group1), len(group2))
stats.mannwhitneyu(group1, group2)


In [None]:
# Histogram of per-replicate normed MAD, coloured by treatment.
ax = sns.histplot(
    data=xdf,
    x="normed median_abs_deviation",
    hue=allocation_idx_mapped_title,
)

# Median normed MAD per treatment.
(
    xdf
    .groupby(allocation_idx_mapped_title)["normed median_abs_deviation"]
    .median()
    .reset_index()
)


### Percent Outliers


In [None]:
# Per-treatment result of count_nonoutliers on inlet simstep period.
nonoutlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Simstep Period Inlet (ns)": count_nonoutliers})
    .reset_index()
)
nonoutlier_counts


In [None]:
# Per-treatment result of count_outliers on inlet simstep period.
outlier_counts = (
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({"Simstep Period Inlet (ns)": count_outliers})
    .reset_index()
)
outlier_counts


In [None]:
# Chi-squared test on the contingency table with rows
# (outlier counts, non-outlier counts) and one column per treatment.
stats.chi2_contingency(
    np.vstack([
        outlier_counts["Simstep Period Inlet (ns)"],
        nonoutlier_counts["Simstep Period Inlet (ns)"],
    ]),
)


In [None]:
# Per-treatment count_proportion_outliers of inlet simstep period,
# scaled by 100 to give a percentage.
(
    df_snapshot_diffs
    .groupby(allocation_idx_mapped_title)
    .agg({
        "Simstep Period Inlet (ns)": (
            lambda series: count_proportion_outliers(series) * 100
        ),
    })
    .reset_index()
)


In [None]:
# Draw snapshot-diff outlet simstep period once per plot style
# (bar plot, boxplot with fliers, boxplot without fliers).
for viz in (facet_barplot, facet_boxplot_withfliers, facet_boxplot_nofliers):
    tp.tee(
        viz,
        data=df_snapshot_diffs,
        row="Num Simels Per Cpu",
        x=allocation_idx_mapped_title,
        y="Simstep Period Outlet (ns)",
        teeplot_outattrs={
            "transform": "snapshot_diffs",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir="simstep-period-outlet-ns",
    )


In [None]:
# Per-treatment mean and median of outlet simstep period over all snapshot
# diffs. Aggregations are given by name: passing NumPy callables (np.mean,
# np.median) to groupby().agg() is deprecated in recent pandas, and the
# string names keep the behaviour pandas has used for them so far.
# adapted from https://stackoverflow.com/a/13592901
df_snapshot_diffs.groupby([
    allocation_idx_mapped_title,
]).agg({
    'Simstep Period Outlet (ns)': [
        'mean',
        'median',
    ],
})


# Model Fits


In [None]:
from conduitpylib.wrangle import make_regression_row


In [None]:
# Response variables fitted, one regression per variable, per model and per
# 'Num Simels Per Cpu' group.
dependent_variables = [
    'Latency Walltime Inlet (ns)',
    'Latency Walltime Outlet (ns)',
    'Latency Simsteps Inlet',
    'Latency Simsteps Outlet',
    'Delivery Failure Rate',
    'Delivery Clumpiness',
    'Simstep Period Inlet (ns)',
    'Simstep Period Outlet (ns)',
]

# best-case approximation to replace infs/nans
# see listings of infs/nans below
# Each ratio is recomputed with its denominator clamped to at least 1, so a
# zero count no longer produces a division by zero.
df_snapshot_diffs_copy = df_snapshot_diffs.copy()
df_snapshot_diffs_copy['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)
df_snapshot_diffs_copy['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)
df_snapshot_diffs_copy['Latency Simsteps Inlet'] = (
    df_snapshot_diffs_copy['Num Puts Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)
df_snapshot_diffs_copy['Latency Simsteps Outlet'] = (
    df_snapshot_diffs_copy['Num Pulls Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)
df_snapshot_diffs_copy['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Puts Attempted'], 1)
)
df_snapshot_diffs_copy['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Pulls Attempted'], 1)
)

# (model name, per-execution-instance summary frame) pairs.
regression_data_tuples = [
    (
        'Quantile Regression over Means',
        # Means are taken over the approximated copy built above: a single
        # inf/nan snapshot would otherwise make the whole per-instance mean
        # non-finite. (The copy was previously built but never used.)
        # NOTE(review): confirm the means -- and not also the medians --
        # are what the approximation was intended for.
        df_snapshot_diffs_copy.groupby([
            'Execution Instance UUID',
        ]).mean().reset_index().astype({
            'Num Processes' : 'int64',
            'Allocated Tasks Per Node' : 'int64',
            'Cpus Per Node' : 'int64',
            'Num Simels Per Cpu' : 'int64',
        })
    ),
    (
        'Quantile Regression over Medians',
        # Medians are taken over the raw snapshot diffs.
        df_snapshot_diffs.groupby([
            'Execution Instance UUID',
        ]).median().reset_index().astype({
            'Num Processes' : 'int64',
            'Allocated Tasks Per Node' : 'int64',
            'Cpus Per Node' : 'int64',
            'Num Simels Per Cpu' : 'int64',
        })
    ),
]

# Row filters are passed to make_regression_row as source strings; the only
# one used here keeps every row.
row_filters = [
    'lambda row: True',
]

# One result row per (row filter, model, 'Num Simels Per Cpu' group,
# dependent variable) combination.
regression_results = pd.DataFrame.from_records([
    make_regression_row(
        data=data_subset,
        independent_variable=allocation_idx_mapped_title,
        dependent_variable=dependent_variable,
        regression=regression,
        row_filter=row_filter,
    )
    for row_filter in row_filters
    for regression, data in regression_data_tuples
    for _, data_subset in data.groupby([
        'Num Simels Per Cpu',
    ])
    for dependent_variable in dependent_variables
])


In [None]:
# Widen pandas' display limits so the table below is not truncated.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)

# Regression rows whose p-value is inf or NaN.
regression_results.loc[~np.isfinite(regression_results["p"])]


In [None]:
# Keyname attributes of every inlet/outlet source file, dropping empty keys,
# underscore-prefixed keys and 'ext'. Only attributes present for *all*
# source files survive the dropna.
input_attrs = pd.DataFrame.from_records([
    {
        k : v
        for k, v in kn.unpack(source_filename).items()
        if k and k[0] != '_' and k != 'ext'
    }
    for source_filename in [
        *merge_df['Source File Inlet'].unique(),
        *merge_df['Source File Outlet'].unique(),
    ]
]).dropna(
    axis='columns',
    how='any',
)


def out_filename(readability):
    """Pack the output CSV filename for the given `readability` variant.

    The name combines the dubbed input attributes with this notebook's
    artifact tag, the readability tag and the '.csv' extension.
    """
    return kn.pack({
        **{
            col : ib.dub(input_attrs[col])
            for col in input_attrs.columns
        },
        **{
            'a' : 'with_lac_417_vs_sans_lac_417_regression_results',
            'readability' : readability,
            'ext' : '.csv',
        },
    })


def _format_latex_float(x):
    """Format one float at two significant figures for the LaTeX csvreader.

    Magnitudes strictly between 10 and 10e5 are written as integers with an
    apostrophe as thousands separator; everything else uses '.2g'.
    """
    if 10 < abs(x) < 10e5:
        return '{:_.0f}'.format(float(f'{x:.2g}')).replace('_', "'")
    return f'{x:.2g}'


def _latex_float_format(values):
    """`float_format` callable for `to_csv` accepting scalars and arrays.

    Depending on the pandas version, the callable appears to be invoked
    either once per float value or with an array of values (the original
    code assumed the latter); handle both so the export does not raise.
    """
    if np.ndim(values) == 0:
        return _format_latex_float(values)
    return [_format_latex_float(x) for x in values]


pathlib.Path('outplots').mkdir(parents=True, exist_ok=True)

# Human-readable variant: original column names and default float formatting.
out_filepath = f"outplots/{out_filename('human')}"
print(out_filepath)

regression_results.to_csv(
    out_filepath,
    index=False,
)

# LaTeX csvreader variant: alphanumeric-only column names, compact floats.
out_filepath = f"outplots/{out_filename('latexcsvreader')}"
print(out_filepath)

regression_results.rename(
    columns=lambda col: ''.join(filter(str.isalnum, col)),
).to_csv(
    out_filepath,
    index=False,
    float_format=_latex_float_format,
    na_rep='NaN',
)


In [None]:
# Widen pandas' display limits, then show the full regression table.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
regression_results


In [None]:
from conduitpylib.viz import errplot, facet_errplot


In [None]:
# One faceted error-bar plot of relative effect sizes per regression model.
# Group by the scalar column name rather than a one-element list: with a
# length-1 list, recent pandas yields 1-tuples as group keys, which would
# make the `[regression]` dict lookup below raise KeyError.
for regression, subset in regression_results.groupby('Regression Model'):
    tp.tee(
        # prevent filename length error
        lambda *args, **kwargs: facet_errplot(
            err_lb='Relative Effect Size 95% CI Lower Bound',
            err_ub='Relative Effect Size 95% CI Upper Bound',
            *args,
            **kwargs,
        ),
        data=subset,

        row='Num Simels Per Cpu',
        x='Dependent Variable',
        y='Relative Effect Size',
        # Statistic label derived from the regression model's name.
        estimated_statistic={
            'Quantile Regression over Medians' : 'Median',
            'Quantile Regression over Means' : 'Mean',
        }[regression],
        size_inches=(8, 8),
        teeplot_outattrs={
            **{
                'transform' : 'fit_regression',
            },
            **nbm.collate_outattr_metadata(),
        },
    )


In [None]:
# relative estimates, alternate

# relative estimates, alternate
# One plot per (regression model, dependent variable) pair, each saved under
# a subdirectory slugified from the dependent variable's name.
for (regression, dependent_variable), subset in regression_results.groupby(
    ["Regression Model", "Dependent Variable"],
):
    tp.tee(
        # prevent filename length error
        lambda *args, **kwargs: facet_errplot(
            err_lb="Relative Effect Size 95% CI Lower Bound",
            err_ub="Relative Effect Size 95% CI Upper Bound",
            *args,
            **kwargs,
        ),
        data=subset,
        row="Num Simels Per Cpu",
        y="Relative Effect Size",
        estimated_statistic={
            "Quantile Regression over Medians": f"{dependent_variable} Median",
            "Quantile Regression over Means": f"{dependent_variable} Mean",
        }[regression],
        teeplot_outattrs={
            "transform": "fit_regression",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir=slugify(dependent_variable),
    )


In [None]:
# absolute estimates

# absolute estimates
# One plot per (regression model, dependent variable) pair, each saved under
# a subdirectory slugified from the dependent variable's name.
for (regression, dependent_variable), subset in regression_results.groupby(
    ["Regression Model", "Dependent Variable"],
):
    tp.tee(
        # prevent filename length error
        lambda *args, **kwargs: facet_errplot(
            err_lb="Absolute Effect Size 95% CI Lower Bound",
            err_ub="Absolute Effect Size 95% CI Upper Bound",
            *args,
            **kwargs,
        ),
        data=subset,
        row="Num Simels Per Cpu",
        y="Absolute Effect Size",
        estimated_statistic={
            "Quantile Regression over Medians": f"{dependent_variable} Median",
            "Quantile Regression over Means": f"{dependent_variable} Mean",
        }[regression],
        teeplot_outattrs={
            "transform": "fit_regression",
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir=slugify(dependent_variable),
    )


In [None]:
from conduitpylib.viz import quantile_regplot


In [None]:
from conduitpylib.viz import unsplit_regression, facet_unsplit_regression


## Latency Walltime Inlet (ns)


In [None]:
# List snapshot rows whose inlet walltime latency is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Walltime Inlet (ns)']),
    [
        'Latency Walltime Inlet (ns)',
        'Latency Walltime Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 to avoid dividing by zero)
df_snapshot_diffs_copy['Latency Walltime Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)

# Collapse snapshots to one row per execution instance by averaging.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-inlet-ns',
    )


In [None]:
# Collapse the raw snapshot diffs (no inf/nan replacement) to one row per
# execution instance by taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-inlet-ns',
    )


## Latency Walltime Outlet (ns)


In [None]:
# List snapshot rows whose outlet walltime latency is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Walltime Outlet (ns)']),
    [
        'Latency Walltime Inlet (ns)',
        'Latency Walltime Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 to avoid dividing by zero)
df_snapshot_diffs_copy['Latency Walltime Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)

# Collapse snapshots to one row per execution instance by averaging.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-outlet-ns',
    )


In [None]:
# Collapse the raw snapshot diffs (no inf/nan replacement) to one row per
# execution instance by taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Walltime Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-walltime-outlet-ns',
    )


## Latency Simsteps Inlet


In [None]:
# List snapshot rows whose inlet simstep latency is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Simsteps Inlet']),
    [
        'Latency Simsteps Inlet',
        'Latency Simsteps Outlet',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 to avoid dividing by zero)
df_snapshot_diffs_copy['Latency Simsteps Inlet'] = (
    df_snapshot_diffs_copy['Num Puts Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Inlet'], 1)
)

# Collapse snapshots to one row per execution instance by averaging.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    # the x-axis column gets the same integer cast as in every other
    # per-instance aggregation in this notebook
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Inlet',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-inlet',
    )


In [None]:
# Collapse the raw snapshot diffs (no inf/nan replacement) to one row per
# execution instance by taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Inlet',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-inlet',
    )


## Latency Simsteps Outlet


In [None]:
# List snapshot rows whose outlet simstep latency is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Latency Simsteps Outlet']),
    [
        'Latency Simsteps Inlet',
        'Latency Simsteps Outlet',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs/nans
# see listing of infs/nans above
# (the touch count is clamped to at least 1 to avoid dividing by zero)
df_snapshot_diffs_copy['Latency Simsteps Outlet'] = (
    df_snapshot_diffs_copy['Num Pulls Attempted']
    / np.maximum(df_snapshot_diffs_copy['Num Round Trip Touches Outlet'], 1)
)

# Collapse snapshots to one row per execution instance by averaging.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Outlet',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-outlet',
    )


In [None]:
# Collapse the raw snapshot diffs (no inf/nan replacement) to one row per
# execution instance by taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Latency Simsteps Outlet',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='latency-simsteps-outlet',
    )


## Delivery Failure Rate


In [None]:
# List snapshot rows whose delivery failure rate is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Delivery Failure Rate']),
    [
        'Delivery Failure Rate',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
# Collapse snapshots to one row per execution instance by averaging.
# NOTE(review): unlike the latency sections, no inf/nan replacement is
# applied before averaging here -- confirm the non-finite listing for
# 'Delivery Failure Rate' is empty.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Failure Rate',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-failure-rate',
    )


In [None]:
# Collapse the raw snapshot diffs to one row per execution instance by
# taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Failure Rate',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-failure-rate',
    )


## Delivery Clumpiness


In [None]:
# List snapshot rows whose delivery clumpiness is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Delivery Clumpiness']),
    [
        'Delivery Clumpiness',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
# Collapse snapshots to one row per execution instance by averaging.
# NOTE(review): unlike the latency sections, no inf/nan replacement is
# applied before averaging here -- confirm the non-finite listing for
# 'Delivery Clumpiness' is empty.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Clumpiness',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-clumpiness',
    )


In [None]:
# Collapse the raw snapshot diffs to one row per execution instance by
# taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Delivery Clumpiness',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='delivery-clumpiness',
    )


## Simstep Period Inlet (ns)


In [None]:
# List snapshot rows whose inlet simstep period is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Simstep Period Inlet (ns)']),
    [
        'Simstep Period Inlet (ns)',
        'Simstep Period Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs
# see listing of infs above
# (the put count is clamped to at least 1 to avoid dividing by zero)
df_snapshot_diffs_copy['Simstep Period Inlet (ns)'] = (
    df_snapshot_diffs_copy['Inlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Puts Attempted'], 1)
)

# Collapse snapshots to one row per execution instance by averaging.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-inlet-ns',
    )


In [None]:
# Collapse the raw snapshot diffs (no inf replacement) to one row per
# execution instance by taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Inlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-inlet-ns',
    )


## Simstep Period Outlet (ns)


In [None]:
# List snapshot rows whose outlet simstep period is inf or NaN, together
# with the columns that identify which run and snapshot they came from.
df_snapshot_diffs.loc[
    ~np.isfinite(df_snapshot_diffs['Simstep Period Outlet (ns)']),
    [
        'Simstep Period Inlet (ns)',
        'Simstep Period Outlet (ns)',
        'Snapshot',
        'Runtime Seconds Elapsed Outlet',
        'Hostname',
        'Replicate',
        'Num Simels Per Cpu',
        'Cpus Per Node',
        'Num Processes',
    ],
]


In [None]:
df_snapshot_diffs_copy = df_snapshot_diffs.copy()

# best-case approximation to replace infs
# see listing of infs above
# (the pull count is clamped to at least 1 to avoid dividing by zero)
df_snapshot_diffs_copy['Simstep Period Outlet (ns)'] = (
    df_snapshot_diffs_copy['Outlet-Nanoseconds Elapsed']
    / np.maximum(df_snapshot_diffs_copy['Num Pulls Attempted'], 1)
)

# Collapse snapshots to one row per execution instance by averaging.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs_copy.groupby([
    'Execution Instance UUID',
]).mean(numeric_only=True).reset_index().astype({
    # averaging yields floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Ordinary Least Squares Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-mean',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-outlet-ns',
    )


In [None]:
# Collapse the raw snapshot diffs (no inf replacement) to one row per
# execution instance by taking medians.
# numeric_only=True: pandas >= 2.0 raises on non-numeric columns (e.g. the
# presumably string-valued 'Hostname') instead of silently dropping them;
# older releases already defaulted to numeric-only here, so results are
# unchanged there.
data = df_snapshot_diffs.groupby([
    'Execution Instance UUID',
]).median(numeric_only=True).reset_index().astype({
    # medians come back as floats; cast these columns back to int64
    'Num Processes' : 'int64',
    'Allocated Tasks Per Node' : 'int64',
    'Cpus Per Node' : 'int64',
    'Num Simels Per Cpu' : 'int64',
    allocation_idx_mapped_title : 'int64',
})

# single-element tuple: the loop form is kept so more plotters can be added
for viz in (facet_unsplit_regression,):
    tp.tee(
        viz,
        data=data,
        row='Num Simels Per Cpu',
        x=allocation_idx_mapped_title,
        y='Simstep Period Outlet (ns)',
        marker='+',
        x_jitter=0.15,
        regression='Quantile Regression',
        teeplot_outattrs={
            'transform' : 'snapshot_diffs-groupby_exec_instance-median',
            **nbm.collate_outattr_metadata(),
        },
        teeplot_subdir='simstep-period-outlet-ns',
    )


# Outlier Analysis


In [None]:
# Raise the pandas display limits (rows, columns, line width) before
# listing the outlier rows; set_option accepts several option/value pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)
# Snapshots with inlet simstep latency above 50 at 1 simel per CPU.
df_snapshot_diffs.loc[
    lambda frame: (
        (frame['Latency Simsteps Inlet'] > 50)
        & (frame['Num Simels Per Cpu'] == 1)
    )
]


In [None]:
# Raise the pandas display limits (rows, columns, line width) before
# listing the outlier rows; set_option accepts several option/value pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)
# Snapshots with inlet simstep latency above 50 at 2048 simels per CPU.
df_snapshot_diffs.loc[
    lambda frame: (
        (frame['Latency Simsteps Inlet'] > 50)
        & (frame['Num Simels Per Cpu'] == 2048)
    )
]
