In [9]:
import os

import pandas as pd
from scipy import stats

import numpy as np

import performance
from report_util import *
from run_data import extract

# NOTE(review): absolute, user-specific paths; the notebook will not run on another
# machine until these two directories are changed.
reports_dir = '/home/katie/Downloads/galette/slurm-1191447/'
new_reports_dir = '/home/katie/Downloads/galette/slurm-1194149/'
# Each run directory is loaded together with the performance.csv located inside it.
data = extract(reports_dir, os.path.join(reports_dir, 'performance.csv'))
data2 = extract(new_reports_dir, os.path.join(new_reports_dir, 'performance.csv'))
# Merge the runs: keep every row of the newer run, and from the older run keep only
# the rows whose tool is not 'galette' (the older galette measurements are dropped).
# NOTE(review): `data` is rebound here, so re-running only this line would merge twice.
data = pd.concat([data2, data[data['tool'] != 'galette']])
# Preview the first rows of the merged frame.
data.head()

Searching for runs in /home/katie/Downloads/galette/slurm-1191447/.
Found 1680 runs.
Checking runs.
	Failed run 557 --- {'benchmark': 'luindex', 'tool': 'mirror-taint', 'status': 'RUN_FAILURE'}
	Failed run 900 --- {'benchmark': 'tradesoap', 'tool': 'mirror-taint', 'status': 'RUN_FAILURE'}
	Failed run 1666 --- {'benchmark': 'h2o', 'tool': 'phosphor', 'status': 'RUN_FAILURE'}
	Failed run 313 --- {'benchmark': 'xalan', 'tool': 'mirror-taint', 'status': 'RUN_FAILURE'}
	Failed run 227 --- {'benchmark': 'tradebeans', 'tool': 'mirror-taint', 'status': 'RUN_FAILURE'}
	Failed run 664 --- {'benchmark': 'pmd', 'tool': 'phosphor', 'status': 'RUN_FAILURE'}
	Failed run 805 --- {'benchmark': 'h2o', 'tool': 'mirror-taint', 'status': 'RUN_FAILURE'}
	Failed run 305 --- {'benchmark': 'luindex', 'tool': 'mirror-taint', 'status': 'RUN_FAILURE'}
	Failed run 335 --- {'benchmark': 'zxing', 'tool': 'phosphor', 'status': 'RUN_FAILURE'}
	Failed run 167 --- {'benchmark': 'zxing', 'tool': 'phosphor', 'status': 'RU

Unnamed: 0,iteration,rss,elapsed_time,benchmark,tool,run_id
0,5,719204,4447,jme,galette,368
1,6,735556,4267,jme,galette,368
2,7,727344,4374,jme,galette,368
3,8,765496,3513,jme,galette,368
4,9,743480,3199,jme,galette,368


In [16]:
from arch.bootstrap import IndependentSamplesBootstrap


def overhead(baseline, treatment):
    """Return the relative change of the treatment median over the baseline median, in percent.

    :param baseline: sample of measurements taken without the tool
    :param treatment: sample of measurements taken with the tool
    :return: ``100 * (median(treatment) - median(baseline)) / median(baseline)``
    """
    reference = np.median(baseline)
    difference = np.median(treatment) - reference
    return 100.0 * (difference / reference)


def bootstrap_ci(data, statistic):
    """Compute a two-tailed 95% bootstrap confidence interval for ``statistic``.

    :param data: sequence of samples; each element is passed to
        ``IndependentSamplesBootstrap`` as a separate positional argument
    :param statistic: callable evaluated on the resampled data
    :return: tuple of the entries at ``[0, 0]`` and ``[1, 0]`` of the interval array,
        which the callers use as the lower and upper confidence limits
    """
    # Note: BCa fails for the baseline on pmd for memory
    # The seed is fixed so repeated executions produce the same interval.
    sampler = IndependentSamplesBootstrap(*data, seed=4034)
    limits = sampler.conf_int(statistic, reps=1_000, method='bc', size=0.95, tail='two')
    lower = limits[0, 0]
    upper = limits[1, 0]
    return lower, upper


def create_performance_row(data, y, tool, benchmark, sig_level):
    """Build one row of the performance table for a (benchmark, tool) pair.

    The row is a dict with the keys ``benchmark``, ``tool``, ``value``, ``LCL``, ``UCL``,
    ``p``, ``a12`` and ``sig``:

    * tool ``'none'`` (baseline): ``value`` is the median of column ``y`` and
      ``LCL``/``UCL`` bound that median.
    * any other tool: ``value`` is the overhead (in percent) of the tool's sample over
      the baseline sample and ``LCL``/``UCL`` bound that overhead.
    * tools other than ``'none'`` and ``'galette'`` are additionally compared against
      the ``'galette'`` sample: ``p`` and ``a12`` hold the results of ``mann_whitney``
      and ``a12``, and ``sig`` holds a CSS colour when ``p < sig_level``.

    Fields that do not apply, or cannot be computed because a sample is empty, are NaN
    (``sig`` is the empty string).

    :param data: frame of measurements, filtered through ``select``
    :param y: name of the measured column to analyse
    :param tool: tool identifier
    :param benchmark: benchmark identifier
    :param sig_level: threshold below which ``p`` is treated as significant
    :return: dict describing the row
    """

    def row(value, lower, upper, p=np.nan, effect_size=np.nan, sig=''):
        # Single place that fixes the key set and key order of every returned row.
        return dict(benchmark=benchmark, tool=tool, value=value, LCL=lower, UCL=upper, p=p, a12=effect_size, sig=sig)

    # np.nan is used throughout: the np.NaN alias was removed in NumPy 2.0.
    baseline = select(data, benchmark=benchmark, tool='none')[y]
    if tool == 'none':
        # Baseline (no tool used)
        value = np.median(baseline)
        lower, upper = bootstrap_ci((baseline,), np.median)
        return row(value, lower, upper)
    treatment = select(data, benchmark=benchmark, tool=tool)[y]
    if len(treatment) == 0:
        # No samples available for tool on benchmark
        return row(np.nan, np.nan, np.nan)
    value = overhead(baseline, treatment)
    lower, upper = bootstrap_ci((baseline, treatment), overhead)
    if tool == 'galette':
        # The alternate tool
        return row(value, lower, upper)
    alternative = select(data, benchmark=benchmark, tool='galette')[y]
    if len(alternative) == 0:
        # Nothing to compare against: report the overhead without a significance test
        return row(value, lower, upper)
    p = mann_whitney(treatment, alternative)
    effect_size = a12(treatment, alternative)
    sig = ''
    if p < sig_level:
        # Red when this tool's overhead is below galette's, green otherwise.
        sig = 'color: red;' if value < overhead(baseline, alternative) else 'color: green;'
    return row(value, lower, upper, p=p, effect_size=effect_size, sig=sig)


def create_performance_table(data, y):
    """Assemble the long-form performance table for the measured column ``y``.

    One row is created per (benchmark, tool) combination, iterating over
    ``performance.BENCHMARKS`` in the outer loop and ``performance.TOOLS`` in the inner
    loop. Every row is built with a significance level of ``0.05 / 3``.

    :param data: frame of measurements
    :param y: name of the measured column to analyse
    :return: ``pd.DataFrame`` with one row per (benchmark, tool) pair
    """
    rows = []
    for benchmark in performance.BENCHMARKS:
        for tool in performance.TOOLS:
            rows.append(create_performance_row(data, y, tool, benchmark, sig_level=0.05 / 3))
    return pd.DataFrame(rows)


def pivot_performance_table(table):
    """Reshape the long-form table into one row per benchmark with (tool, statistic) columns.

    :param table: long-form frame containing at least the columns ``benchmark``,
        ``tool``, ``value``, ``LCL`` and ``UCL``
    :return: wide frame indexed by benchmark whose columns are a two-level MultiIndex:
        tool on the outer level, statistic label on the inner level
    """
    tool_order = ['Base', 'Galette', 'MirrorTaint', 'Phosphor']
    stat_order = ['value', 'LCL', 'UCL']

    wide = format_tool_names(table).pivot(index=['benchmark'], values=stat_order, columns=['tool'])
    # Put the tool level outermost, then sort both axes before imposing the display order.
    wide = wide.reorder_levels(axis=1, order=['tool', None])
    wide = wide.sort_index(axis=1)
    wide = wide.sort_index(axis=0)
    wide = wide.reindex(tool_order, axis=1, level=0)
    wide = wide.reindex(stat_order, axis=1, level=1)

    # Drop the axis level names so they are not rendered as extra headers.
    wide.index.names = [None] * len(wide.index.names)
    wide.columns.names = [None] * len(wide.columns.names)

    # Replace the generic 'value' label with the tool-specific one.
    relabelled = [(tool, fix_column_name(tool, stat)) for tool, stat in wide.columns]
    wide.columns = pd.MultiIndex.from_tuples(relabelled)
    return wide


def fix_column_name(tool, x):
    """Return the display label for statistic column ``x`` of ``tool``.

    Only the ``'value'`` column is renamed: to ``'MED'`` for the ``'Base'`` tool and to
    ``'OV%'`` for every other tool. Any other column name is returned unchanged.
    """
    if x != 'value':
        return x
    if tool == 'Base':
        return 'MED'
    return 'OV%'


def create_sig_table(table):
    """Return a copy of ``table`` whose ``value``, ``LCL`` and ``UCL`` columns hold the ``sig`` styles.

    The result has the same shape as ``table`` so it can be pivoted the same way and
    used as a cell-by-cell style frame.

    ``pd.DataFrame(table)`` alone is not guaranteed to be independent of ``table``;
    ``assign`` works on a copy, so the caller's numeric columns are never overwritten.

    :param table: long-form performance table containing a ``sig`` column
    :return: new ``pd.DataFrame``; ``table`` itself is left unmodified
    """
    source = pd.DataFrame(table)
    styles = source['sig']
    return source.assign(value=styles, LCL=styles, UCL=styles)


def style_table(table, title):
    """Create the styled, captioned wide table for display or LaTeX export.

    Columns whose key contains ``'Base'`` are formatted as integers with thousands
    separators; all other columns get two decimals. Missing values are shown as
    ``---``. Cell styles come from the pivoted ``sig`` column of ``table``.

    :param table: long-form performance table
    :param title: caption of the resulting table
    :return: pandas ``Styler`` wrapping the pivoted values
    """
    values = pivot_performance_table(table)
    sigs = pivot_performance_table(create_sig_table(table))

    def number_format(column):
        # `column` is a (tool, statistic) tuple, so this tests membership in the tuple.
        return "{:,.0f}" if 'Base' in column else "{:,.2f}"

    formats = {column: number_format(column) for column in values.columns}

    def cell_styles(_):
        # With axis=None the whole frame is styled at once; the styles are precomputed.
        return sigs

    styler = values.style.format(formats, na_rep='---')
    styler = styler.apply(cell_styles, axis=None)
    return styler.set_caption(title)

For the baseline, we report the median peak memory usage (MED) in kilobytes.
For each taint tracking system, we report the peak memory usage overhead as a percentage (OV%).
For each statistic (median or overhead), we also report the lower confidence limit (LCL) and upper confidence limit (UCL) of a two-tailed, bias-corrected 95% bootstrap confidence interval for the statistic.
We used 1,000 resamples to compute each confidence interval.
For MirrorTaint and Phosphor, values that differ statistically significantly from Galette's (Mann-Whitney test, significance level 0.05/3) are colored: green when the overhead is greater than Galette's and red when it is less.

In [17]:
# Long-form statistics for the 'rss' column, reported in the text as peak memory usage.
memory = create_performance_table(data, 'rss')
# Styled wide table; left as the last expression so the notebook renders it.
s_memory = style_table(memory, 'Peak Memory Usage.')
s_memory

Unnamed: 0_level_0,Base,Base,Base,Galette,Galette,Galette,MirrorTaint,MirrorTaint,MirrorTaint,Phosphor,Phosphor,Phosphor
Unnamed: 0_level_1,MED,LCL,UCL,OV%,LCL,UCL,OV%,LCL,UCL,OV%,LCL,UCL
avrora,120864,118980,126868,80.0,71.51,83.23,1789.98,1704.14,1855.07,169.94,157.83,174.95
batik,222608,217384,225816,157.95,151.46,166.3,992.50,955.33,1024.53,---,---,---
biojava,172694,171549,173304,219.2,214.46,224.36,394.00,388.30,401.84,128.44,123.56,130.72
eclipse,301030,295688,311706,100.44,88.66,123.14,---,---,---,---,---,---
fop,142594,141164,147108,113.16,105.44,115.38,---,---,---,227.14,214.77,231.94
graphchi,412470,406610,417721,116.62,108.67,122.72,---,---,---,31.45,29.45,33.19
h2,332658,329884,334990,115.65,110.94,119.03,149.34,131.89,152.42,148.88,146.48,151.18
h2o,393566,384404,403824,68.83,63.68,73.67,---,---,---,---,---,---
jme,261894,251382,274226,173.74,159.9,186.89,860.27,808.54,902.21,134.88,121.00,145.08
jython,421112,415532,425402,667.98,628.22,693.56,---,---,---,---,---,---


In [12]:
# Export the styled memory table as LaTeX; convert_css=True turns the CSS colour
# styles into LaTeX colour commands.
# NOTE(review): this cell's execution count is lower than that of the cell defining
# s_memory, so the stored output may be stale -- re-run after regenerating s_memory.
print(s_memory.to_latex(multicol_align='c', hrules=True, multirow_align='t', convert_css=True))

\begin{table}
\caption{Peak Memory Usage.}
\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & \multicolumn{3}{c}{Base} & \multicolumn{3}{c}{Galette} & \multicolumn{3}{c}{MirrorTaint} & \multicolumn{3}{c}{Phosphor} \\
 & MED & LCL & UCL & OV & LCL & UCL & OV & LCL & UCL & OV & LCL & UCL \\
\midrule
avrora & 120,864 & 118,980 & 126,868 & 80.00 & 71.51 & 83.23 & \color{green} 1,789.98 & \color{green} 1,704.14 & \color{green} 1,855.07 & \color{green} 169.94 & \color{green} 157.83 & \color{green} 174.95 \\
batik & 222,608 & 217,384 & 225,816 & 157.95 & 151.46 & 166.30 & \color{green} 992.50 & \color{green} 955.33 & \color{green} 1,024.53 & --- & --- & --- \\
biojava & 172,694 & 171,549 & 173,304 & 219.20 & 214.46 & 224.36 & \color{green} 394.00 & \color{green} 388.30 & \color{green} 401.84 & \color{red} 128.44 & \color{red} 123.56 & \color{red} 130.72 \\
eclipse & 301,030 & 295,688 & 311,706 & 100.44 & 88.66 & 123.14 & --- & --- & --- & --- & --- & --- \\
fop & 142,594 & 141,164 & 147,108 & 113.16 

For the baseline, we report the median execution time (MED) in milliseconds.
For each taint tracking system, we report the execution time overhead as a percentage (OV%).
For each statistic (median or overhead), we also report the lower confidence limit (LCL) and upper confidence limit (UCL) of a two-tailed, bias-corrected 95% bootstrap confidence interval for the statistic.
We used 1,000 resamples to compute each confidence interval.
For MirrorTaint and Phosphor, values that differ statistically significantly from Galette's (Mann-Whitney test, significance level 0.05/3) are colored: green when the overhead is greater than Galette's and red when it is less.

In [18]:
# Long-form statistics for the 'elapsed_time' column, reported in the text as execution time.
# NOTE(review): the name `time` would shadow the stdlib `time` module if that were
# ever imported in this notebook.
time = create_performance_table(data, 'elapsed_time')
# Styled wide table; left as the last expression so the notebook renders it.
s_time = style_table(time, 'Execution Time')
s_time

Unnamed: 0_level_0,Base,Base,Base,Galette,Galette,Galette,MirrorTaint,MirrorTaint,MirrorTaint,Phosphor,Phosphor,Phosphor
Unnamed: 0_level_1,MED,LCL,UCL,OV%,LCL,UCL,OV%,LCL,UCL,OV%,LCL,UCL
avrora,2432,2383,2522,87.97,79.72,95.24,57904.11,55604.87,59592.86,161.96,149.11,175.37
batik,265,250,279,1158.3,1077.91,1233.69,306529.25,290280.90,323906.75,---,---,---
biojava,154,148,158,2616.88,2475.01,2758.91,236252.60,222861.47,243522.08,1413.31,1344.32,1478.60
eclipse,4283,1272,16137,-67.81,-91.61,2.54,---,---,---,---,---,---
fop,118,103,137,346.81,272.39,411.0,---,---,---,574.47,464.65,656.83
graphchi,534,517,547,601.97,574.95,629.33,---,---,---,1340.82,1284.58,1396.34
h2,148,141,154,403.39,368.11,436.72,80113.56,76497.13,83594.89,780.34,713.16,842.66
h2o,604,540,611,1553.02,1529.43,1728.71,---,---,---,---,---,---
jme,1029,987,1079,305.25,280.32,335.33,387625.90,368215.51,403685.44,389.16,358.03,437.62
jython,392,382,402,935.92,901.5,974.29,---,---,---,---,---,---


In [19]:
# Export the styled execution-time table as LaTeX; convert_css=True turns the CSS
# colour styles into LaTeX colour commands.
print(s_time.to_latex(multicol_align='c', hrules=True, multirow_align='t', convert_css=True))

\begin{table}
\caption{Execution Time}
\begin{tabular}{lrrrrrrrrrrrr}
\toprule
 & \multicolumn{3}{c}{Base} & \multicolumn{3}{c}{Galette} & \multicolumn{3}{c}{MirrorTaint} & \multicolumn{3}{c}{Phosphor} \\
 & MED & LCL & UCL & OV% & LCL & UCL & OV% & LCL & UCL & OV% & LCL & UCL \\
\midrule
avrora & 2,432 & 2,383 & 2,522 & 87.97 & 79.72 & 95.24 & \color{green} 57,904.11 & \color{green} 55,604.87 & \color{green} 59,592.86 & \color{green} 161.96 & \color{green} 149.11 & \color{green} 175.37 \\
batik & 265 & 250 & 279 & 1,158.30 & 1,077.91 & 1,233.69 & \color{green} 306,529.25 & \color{green} 290,280.90 & \color{green} 323,906.75 & --- & --- & --- \\
biojava & 154 & 148 & 158 & 2,616.88 & 2,475.01 & 2,758.91 & \color{green} 236,252.60 & \color{green} 222,861.47 & \color{green} 243,522.08 & \color{red} 1,413.31 & \color{red} 1,344.32 & \color{red} 1,478.60 \\
eclipse & 4,283 & 1,272 & 16,137 & -67.81 & -91.61 & 2.54 & --- & --- & --- & --- & --- & --- \\
fop & 118 & 103 & 137 & 346.81 & 272.