In [72]:
import json
import os
import re
from collections import defaultdict

from report_util import *

# Names of the two files that FunctionalTrial looks for inside a trial directory.
DATA_FILE_NAME = 'report.csv'
INFO_FILE_NAME = 'info.json'
# Status labels used as the fixed set of status categories when the count table is built.
STATUSES = ['tag', 'success', 'timeout', 'crash', 'abort', 'fail', 'vm-crash']
# Minimum JDK version on which each benchmark group is expected to execute.
# Groups that are not listed fall back to 8 (the defaultdict factory).
MINIMUM_VERSIONS = defaultdict(
    lambda: 8,
    {
        'JdkUnsafe': 9,
        'MethodHandleJava9': 9,
        'RecordType': 16,
        'StringIndyConcat': 9,
        'VarHandle': 9
    }
)


class FunctionalTrial:
    """Represents the results of a taint tracking tool on the functional benchmarks.

    A trial is backed by a directory that is expected to contain a data file
    (DATA_FILE_NAME) and an info file (INFO_FILE_NAME).

    Attributes set by the constructor:
        id: final component of the trial directory path.
        data_file: path of the data file inside the trial directory.
        info_file: path of the info file inside the trial directory.
        valid: True only if both files exist.
        version, tool, vendor: read from the info file when it exists, otherwise None.
    """

    def __init__(self, trial_dir):
        """Inspect ``trial_dir`` and load the trial's metadata if the info file is present.

        Raises KeyError if the info file lacks 'version', 'tool' or 'vendor', and
        ValueError if 'version' cannot be converted to an int.
        """
        # Normalise first: basename() of a path with a trailing separator is the
        # empty string, which would leave the trial without a usable id.
        self.id = os.path.basename(os.path.normpath(trial_dir))
        self.data_file = os.path.join(trial_dir, DATA_FILE_NAME)
        self.info_file = os.path.join(trial_dir, INFO_FILE_NAME)
        self.valid = all(os.path.isfile(f) for f in [self.data_file, self.info_file])
        self.vendor = self.version = self.tool = None
        # The metadata is loaded even when the data file is missing so that
        # get_id() can still describe an invalid trial.
        if os.path.isfile(self.info_file):
            with open(self.info_file, 'r') as f:
                info = json.load(f)
            self.version = int(info['version'])
            self.tool = info['tool']
            self.vendor = info['vendor']

    def get_id(self):
        """Return a human-readable description of the trial for log messages."""
        return f"<id={self.id}, tool={self.tool}, vendor={self.vendor}, version={self.version}>"

    def get_data_frame(self):
        """Read the trial's data file and tag it with the trial's metadata.

        Column names are stripped of surrounding whitespace. The version, vendor
        and tool are passed to ``set_columns``.
        NOTE(review): ``pd`` and ``set_columns`` are presumably provided by the
        ``report_util`` star import -- confirm.
        """
        data = pd.read_csv(self.data_file) \
            .rename(columns=lambda x: x.strip())
        return set_columns(data, version=self.version, vendor=self.vendor, tool=self.tool)


def get_benchmark_group(test_identifier):
    """Return the benchmark group encoded in a test identifier.

    The group is the text between the last '.' preceding 'ITCase' and 'ITCase'
    itself. An identifier without such a segment raises AttributeError, because
    the search result is None.
    """
    return re.search(r'\.([^\.]*?)ITCase', test_identifier).group(1)


def compute_status(fp, fn, status):
    """Refine a test status using its false positive and false negative counts.

    Any status other than 'success' is returned unchanged. A 'success' with at
    least one false positive or false negative is reported as 'tag'.
    """
    if status != 'success':
        return status
    return "tag" if fp + fn != 0 else 'success'


def find_trials(input_dir):
    """Create a FunctionalTrial for every sub-directory of ``input_dir``.

    Prints the directory being searched and the number of trials found, and
    returns the trials in the order reported by ``os.listdir``.
    """
    print(f'Searching for trials in {input_dir}.')
    trials = []
    for entry in os.listdir(input_dir):
        candidate = os.path.join(input_dir, entry)
        # Plain files sitting next to the trial directories are ignored
        if os.path.isdir(candidate):
            trials.append(FunctionalTrial(candidate))
    print(f"\tFound {len(trials)} trials.")
    return trials


def check_trials(trials):
    """Return the trials whose ``valid`` flag is set, preserving their order.

    A message naming each rejected trial is printed, followed by the number of
    trials that were kept.
    """
    print('Checking trials.')
    valid_trials = []
    for trial in trials:
        if trial.valid:
            valid_trials.append(trial)
            continue
        print(f"\tMissing required files for {trial.get_id()}.")
    print(f'\t{len(valid_trials)} trials were valid.')
    return valid_trials


def combine_data(trials, file):
    """Merge the data frames of all trials, write them to ``file`` and return them.

    Rows whose status is 'disabled' are removed and a 'group' column is derived
    from the 'test' column with ``get_benchmark_group``. The CSV is written
    without the index.
    """
    print('Creating functional benchmark CSV.')
    frames = [trial.get_data_frame() for trial in trials]
    data = pd.concat(frames).reset_index(drop=True)
    # Remove disabled tests. The same name is rebound on purpose so that the
    # unfiltered frame is released before a column is added to the selection.
    data = data[data['status'] != 'disabled']
    # Convert test class names into benchmark groups
    data['group'] = data['test'].apply(get_benchmark_group)
    data.to_csv(file, index=False)
    print(f'\tWrote functional benchmark CSV to {file}.')
    return data


def complete_cartesian_index(data, columns, fill_value=0, categories=None):
    """Return ``data`` with one row for every combination of values in ``columns``.

    ``categories`` supplies the values to combine for each column, in the same
    order as ``columns``; when omitted, the unique values found in ``data`` are
    used. Combinations absent from ``data`` are added with ``fill_value`` in the
    remaining columns, and rows whose combination is not part of the product
    are dropped.
    """
    levels = categories
    if levels is None:
        levels = [data[column].unique() for column in columns]
    # Build an index that is the cartesian product of the values of the selected columns
    full_index = pd.MultiIndex.from_product(levels, names=columns)
    # Index the data by the selected columns, align it with the full index,
    # then turn the index levels back into ordinary columns
    indexed = data.set_index(columns)
    aligned = indexed.reindex(full_index, fill_value=fill_value)
    return aligned.reset_index()


def compute_executions_counts(data):
    """Count the rows of ``data`` for every group, tool, version and vendor.

    The count is stored in an 'executed' column. Combinations that do not occur
    in ``data`` are added by ``complete_cartesian_index`` with its default fill
    value.
    """
    keys = ['group', 'tool', 'version', 'vendor']
    sizes = data.groupby(keys)['status'].size()
    executed = sizes.rename('executed').reset_index()
    executed = executed.drop_duplicates().reset_index(drop=True)
    return complete_cartesian_index(executed, keys)


def check_execution_counts(executed):
    """Assert that the execution counts respect each group's minimum version.

    For every group in ``executed``:
      * rows with a version below the group's entry in MINIMUM_VERSIONS must all
        have an 'executed' count of 0 (or there must be no such rows);
      * rows with a version at or above that minimum must all share one single
        'executed' count.

    Raises AssertionError when either condition is violated.
    """
    for group in list(executed['group'].unique()):
        min_version = MINIMUM_VERSIONS[group]
        rows = select(executed, group=group)
        # Nothing may have been executed on JDKs older than the group's minimum version
        too_old = rows[rows['version'] < min_version]['executed']
        assert len(too_old) == 0 or set(too_old) == {0}
        # The number of tests executed must be the same wherever the group is enabled
        supported = rows[rows['version'] >= min_version]['executed']
        assert len(set(supported)) == 1


def assert_record_equality(frames):
    """Assert that every frame in ``frames`` holds the same set of records.

    Each row is compared as a tuple of (column, value) pairs, so both the column
    names and the values must agree. Row order, the index and duplicate rows are
    ignored. Nothing is checked when fewer than two frames are given.

    Cell values must be hashable. Rows containing NaN are unlikely to compare
    equal, since NaN is not equal to itself.

    Raises AssertionError if any frame differs from the first one.
    """
    if len(frames) <= 1:
        return
    # tuple(record) would keep only the dict keys (the column names) and make the
    # comparison vacuous, so the (column, value) pairs are used instead.
    record_sets = [
        {tuple(record.items()) for record in frame.to_dict(orient='records')}
        for frame in frames
    ]
    assert all(record_sets[0] == record_set for record_set in record_sets[1:]), \
        'Frames do not contain the same records.'


def assert_vendor_matches(counts):
    """Assert that, for every version, the rows of ``counts`` are identical across vendors.

    Raises AssertionError if the rows selected for one vendor differ from those
    selected for another vendor on the same version.
    """
    vendors = list(counts['vendor'].unique())
    for version in list(counts['version'].unique()):
        # The 'vendor' column necessarily differs between the frames being
        # compared, so it is removed before the records are checked.
        frames = [
            select(counts, version=version, vendor=vendor).drop(columns='vendor')
            for vendor in vendors
        ]
        assert_record_equality(frames)


# NOTE(review): absolute, machine-specific path; the notebook cannot be re-run
# elsewhere without editing this line.
reports_dir = '/home/katie/Downloads/slurm-1189404/'
pd.set_option('display.max_rows', 100)
# Discover the trial directories and keep only those with both required files
trials = check_trials(find_trials(reports_dir))
# Aggregate the individual CSVs
data = combine_data(trials, os.path.join(reports_dir, 'semantic.csv'))
data.head()

Searching for trials in /home/katie/Downloads/slurm-1189404/.
	Found 24 trials.
Checking trials.
	24 trials were valid.
Creating functional benchmark CSV.
	Wrote functional benchmark CSV to /home/katie/Downloads/slurm-1189404/semantic.csv.


Unnamed: 0,test,tp,fp,fn,status,version,vendor,tool,group
0,[engine:junit-jupiter]/[class:edu.neu.ccs.prl....,1,0,0,success,21,corretto,galette,RecordType
1,[engine:junit-jupiter]/[class:edu.neu.ccs.prl....,0,0,0,success,21,corretto,galette,RecordType
2,[engine:junit-jupiter]/[class:edu.neu.ccs.prl....,1,0,0,success,21,corretto,galette,RecordType
3,[engine:junit-jupiter]/[class:edu.neu.ccs.prl....,0,0,0,success,21,corretto,galette,RecordType
4,[engine:junit-jupiter]/[class:edu.neu.ccs.prl....,1,0,0,success,21,corretto,galette,RecordType


In [73]:
# Count the recorded rows for every group, tool, version and vendor
executed = compute_executions_counts(data)
# Fail early if the counts are inconsistent with the groups' minimum versions
check_execution_counts(executed)
executed

Unnamed: 0,group,tool,version,vendor,executed
0,ArrayAccess,galette,8,corretto,74
1,ArrayAccess,galette,8,temurin,74
2,ArrayAccess,galette,11,corretto,74
3,ArrayAccess,galette,11,temurin,74
4,ArrayAccess,galette,17,corretto,74
...,...,...,...,...,...
643,VarHandle,phosphor,11,temurin,1932
644,VarHandle,phosphor,17,corretto,1932
645,VarHandle,phosphor,17,temurin,1932
646,VarHandle,phosphor,21,corretto,1932


In [74]:
def create_count_table(data, executed):
    """Build a table with the number of tests in each status.

    The result has one row per group, tool, version and vendor found in
    ``data``, one column per entry of STATUSES, a 'total' column (the largest
    'executed' value of the group) and an 'executed' column (the value recorded
    in ``executed`` for the group and version).

    Side effect: the 'status' column of ``data`` is rewritten in place with the
    result of ``compute_status``.

    Relies on the counts column produced by ``value_counts().reset_index()``
    being named 'count'.
    """
    keys = ['group', 'tool', 'version', 'vendor']
    # Compute statuses
    status_inputs = data[['fp', 'fn', 'status']]
    data['status'] = status_inputs.apply(lambda row: compute_status(*row), axis=1)
    # Count the number of entries in each status for each group for each tool on each JDK
    status_counts = data.groupby(keys)['status'].value_counts().reset_index()
    # Fill in zeros for missing combinations
    levels = [data[key].unique() for key in keys]
    levels.append(STATUSES)
    completed = complete_cartesian_index(status_counts, keys + ['status'], categories=levels)
    # Pivot along the statuses to put the statuses in columns
    pivoted = completed.pivot(columns='status', index=keys, values='count')
    table = pivoted.fillna(0).astype('int64').reset_index()
    # Compute the total number of tests per group
    group_totals = executed.groupby(['group'])['executed'].max()
    group_totals = group_totals.rename('total').reset_index().drop_duplicates()
    # Add the totals to the table
    table = table.merge(group_totals, on=['group'], how='left')
    # Add the number of tests executed for each group on each version
    per_version = executed[['group', 'version', 'executed']].drop_duplicates()
    return table.merge(per_version, on=['group', 'version'], how='left')


# Build the per-status count table. Note that create_count_table also rewrites
# data['status'] in place.
counts = create_count_table(data, executed)
# Confirm that each tool performed the same on each JDK version for all vendors
assert_vendor_matches(counts)
# Since the results for different vendors are identical, select a single vendor
counts = select(counts, vendor='temurin') \
    .drop(columns='vendor') \
    .reset_index(drop=True)
counts

Unnamed: 0,group,tool,version,abort,crash,fail,success,tag,timeout,vm-crash,total,executed
0,ArrayAccess,galette,8,0,0,0,74,0,0,0,74,74
1,ArrayAccess,galette,11,0,0,0,74,0,0,0,74,74
2,ArrayAccess,galette,17,0,0,0,74,0,0,0,74,74
3,ArrayAccess,galette,21,0,0,0,74,0,0,0,74,74
4,ArrayAccess,mirror-taint,8,0,0,0,19,55,0,0,74,74
...,...,...,...,...,...,...,...,...,...,...,...,...
319,VarHandle,mirror-taint,21,0,0,0,579,1353,0,0,1932,1932
320,VarHandle,phosphor,8,0,0,0,0,0,0,0,1932,0
321,VarHandle,phosphor,11,0,0,190,1016,726,0,0,1932,1932
322,VarHandle,phosphor,17,0,0,0,0,0,0,1932,1932,1932


In [75]:
# Drop rows for groups where nothing was executed (because the minimum version was not satisfied).
# The selection is wrapped in a new DataFrame object before the 'sem' column is added to it.
counts = pd.DataFrame(counts[counts['executed'] != 0])
# Compute the total number of tests with statuses indicating a deviation from
# the original program semantics
counts['sem'] = counts['abort'] + counts['crash'] + counts['fail'] + counts['timeout'] + counts['vm-crash']
counts

Unnamed: 0,group,tool,version,abort,crash,fail,success,tag,timeout,vm-crash,total,executed,sem
0,ArrayAccess,galette,8,0,0,0,74,0,0,0,74,74,0
1,ArrayAccess,galette,11,0,0,0,74,0,0,0,74,74,0
2,ArrayAccess,galette,17,0,0,0,74,0,0,0,74,74,0
3,ArrayAccess,galette,21,0,0,0,74,0,0,0,74,74,0
4,ArrayAccess,mirror-taint,8,0,0,0,19,55,0,0,74,74,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
318,VarHandle,mirror-taint,17,0,0,0,579,1353,0,0,1932,1932,0
319,VarHandle,mirror-taint,21,0,0,0,579,1353,0,0,1932,1932,0
321,VarHandle,phosphor,11,0,0,190,1016,726,0,0,1932,1932,190
322,VarHandle,phosphor,17,0,0,0,0,0,0,1932,1932,1932,1932


In [76]:
# Reshape to long format: the 'sem' and 'tag' counts become separate rows,
# distinguished by the 'variable' column, with the count in 'value'
failures = counts.melt(id_vars=['group', 'tool', 'version', 'total'], value_vars=['sem', 'tag'])
failures

Unnamed: 0,group,tool,version,total,variable,value
0,ArrayAccess,galette,8,74,sem,0
1,ArrayAccess,galette,11,74,sem,0
2,ArrayAccess,galette,17,74,sem,0
3,ArrayAccess,galette,21,74,sem,0
4,ArrayAccess,mirror-taint,8,74,sem,0
...,...,...,...,...,...,...
607,VarHandle,mirror-taint,17,1932,tag,1353
608,VarHandle,mirror-taint,21,1932,tag,1353
609,VarHandle,phosphor,11,1932,tag,726
610,VarHandle,phosphor,17,1932,tag,0


In [94]:
def format_column_names(names):
    """Format a (tool, variable) column key for display.

    Both entries are title-cased; hyphens are then removed from the first one,
    so ('mirror-taint', 'sem') becomes ('MirrorTaint', 'Sem').
    """
    tool_label = names[0].title().replace('-', '')
    variable_label = names[1].title()
    return tool_label, variable_label


def format_functional_table(failures):
    """Turn the long-format failure counts into a styled wide table.

    Rows are indexed by group, total and version; columns are keyed by tool and
    variable and renamed with ``format_column_names``. Returns a Styler that
    formats values with a precision of 0 and shows missing values as '---'.
    """
    wide = failures.pivot(index=['group', 'total', 'version'], values=['value'], columns=['tool', 'variable'])
    # Move the unnamed level created by `values` to the end so it can be dropped after sorting
    wide = wide.reorder_levels(axis=1, order=['tool', 'variable', None])
    wide = wide.sort_index(axis=1).sort_index(axis=0)
    table = wide.droplevel(2, axis=1)
    table.index.names = [name.title() for name in table.index.names]
    table.columns = table.columns.map(format_column_names)
    # Clear the column level names
    table.columns.names = [None] * len(table.columns.names)
    return table.style.format(precision=0, na_rep='---')


# Render the failures as a single styled table.
# NOTE(review): `groups` is not used by any of the cells visible here; the split
# into two tables is performed later by fix_latex.
groups = sorted(list(failures['group'].unique()))
styler = format_functional_table(failures)
display(styler)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Galette,Galette,MirrorTaint,MirrorTaint,Phosphor,Phosphor
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Sem,Tag,Sem,Tag,Sem,Tag
Group,Total,Version,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
ArrayAccess,74,8,0,0,0,55,0,0
ArrayAccess,74,11,0,0,0,55,0,0
ArrayAccess,74,17,0,0,0,55,74,0
ArrayAccess,74,21,0,0,0,55,74,0
ArrayLength,5,8,0,0,0,3,0,0
ArrayLength,5,11,0,0,0,3,0,0
ArrayLength,5,17,0,0,0,3,5,0
ArrayLength,5,21,0,0,0,3,5,0
ArrayReflection,75,8,0,0,0,65,0,7
ArrayReflection,75,11,0,0,0,65,0,7


In [95]:
# Export the styled table as LaTeX source; it is post-processed by fix_latex
latex = styler.to_latex(multicol_align='c', hrules=True, multirow_align='c', convert_css=True)

In [106]:
def fix_chunk(group, stripe):
    r"""Rewrite the LaTeX rows of one group so its \multirow cells sit on the last row.

    ``group`` holds the rows of one group separated by ``\\``; the first two
    cells of its first row are \multirow cells (group name and total). Those two
    cells are blanked on the first row and re-emitted on the last row with a
    negative row count. The group name is wrapped in a \shortstack with a line
    break before every upper-case letter after the first character. When
    ``stripe`` is true, every row is preceded by ``\rowcolor{row-stripe}``.

    Returns the rows joined by `` \\`` and a newline, without a terminator after
    the last row.
    """
    rows = [piece.strip() for piece in group.strip().split(r'\\')]
    rows = [row for row in rows if row]
    # Take the two \multirow cells out of the first row
    head = [cell.strip() for cell in rows[0].split('&')]
    group_cell, total_cell = head[0], head[1]
    head[0] = head[1] = ''
    rows[0] = ' & '.join(head)
    # Captures the row count and the content of a \multirow cell
    span_pattern = r'\{(.*?)\}\{\*\}\{(.*?)\}'
    group_match = re.search(span_pattern, group_cell)
    label = group_match.group(2)
    # Break the name before each upper-case letter, e.g. ArrayAccess -> Array\\Access
    stacked = label[0] + ''.join(r"\\" + ch if ch.isupper() else ch for ch in label[1:])
    total_match = re.search(span_pattern, total_cell)
    # Put both cells on the last row, spanning upwards
    tail = [cell.strip() for cell in rows[-1].split('&')]
    tail[0] = r'\multirow[c]{-' + group_match.group(1) + r'}{*}{\shortstack[c]{' + stacked + '}} '
    tail[1] = r'\multirow[c]{-' + total_match.group(1) + r'}{*}{' + total_match.group(2) + '} '
    rows[-1] = ' & '.join(tail)
    if stripe:
        rows = ['\\rowcolor{row-stripe}\n' + row for row in rows]
    return ' \\\\\n'.join(rows)


def create_table(prefix, suffix, chunks, stripe):
    r"""Assemble a LaTeX table from ``prefix``, the processed ``chunks`` and ``suffix``.

    Each chunk is passed through ``fix_chunk``; striping alternates from one
    chunk to the next, starting with ``stripe``. Rows are separated by `` \\``
    followed by a newline, and the last row is terminated the same way so that
    whatever ``suffix`` starts with (e.g. \bottomrule) begins on its own line
    after a complete row. With no chunks the result is ``prefix + suffix``.
    """
    row_end = " \\\\\n"
    processed = []
    for chunk in chunks:
        processed.append(fix_chunk(chunk, stripe))
        stripe = not stripe
    body = row_end.join(processed)
    # fix_chunk leaves its last row unterminated; close it before the suffix
    if processed:
        body += row_end
    return prefix + body + suffix


def fix_latex(latex):
    r"""Split the LaTeX table in ``latex`` into two tables with half of the groups each.

    Everything before the first \multirow is used as the prefix of both tables
    and everything from \bottomrule onwards as their suffix. The text in between
    is cut into one chunk per group, a group starting at each \multirow that
    begins a line. The first table is built with striping starting on, the
    second with striping starting off.

    Returns a tuple with the two tables. Raises ValueError if ``latex`` contains
    no \multirow or no \bottomrule.
    """
    start = latex.index(r'\multirow')
    end = latex.index(r'\bottomrule')
    prefix, body, suffix = latex[:start], latex[start:end], latex[end:]
    # The lookahead keeps the \multirow at the start of each chunk, so nothing has
    # to be re-attached (re-attaching used to double it on the first chunk).
    chunks = [chunk for chunk in re.split(r'\n\s*?(?=\\multirow)', body) if len(chunk) > 0]
    # Split the table in half
    i = len(chunks) // 2
    return (create_table(prefix, suffix, chunks[:i], True),
            create_table(prefix, suffix, chunks[i:], False))


# Post-process the LaTeX into two tables and print the first one
a, b = fix_latex(latex)
print(a)

\begin{tabular}{lllrrrrrr}
\toprule
 &  &  & \multicolumn{2}{c}{Galette} & \multicolumn{2}{c}{MirrorTaint} & \multicolumn{2}{c}{Phosphor} \\
 &  &  & Sem & Tag & Sem & Tag & Sem & Tag \\
Group & Total & Version &  &  &  &  &  &  \\
\midrule
\rowcolor{row-stripe}
 &  & 8 & 0 & 0 & 0 & 55 & 0 & 0 \\
\rowcolor{row-stripe}
&  & 11 & 0 & 0 & 0 & 55 & 0 & 0 \\
\rowcolor{row-stripe}
&  & 17 & 0 & 0 & 0 & 55 & 74 & 0 \\
\rowcolor{row-stripe}
\multirow[c]{-4}{*}{\shortstack[c]{Array\\Access}}  & \multirow[c]{-4}{*}{74}  & 21 & 0 & 0 & 0 & 55 & 74 & 0 \\
 &  & 8 & 0 & 0 & 0 & 3 & 0 & 0 \\
&  & 11 & 0 & 0 & 0 & 3 & 0 & 0 \\
&  & 17 & 0 & 0 & 0 & 3 & 5 & 0 \\
\multirow[c]{-4}{*}{\shortstack[c]{Array\\Length}}  & \multirow[c]{-4}{*}{5}  & 21 & 0 & 0 & 0 & 3 & 5 & 0 \\
\rowcolor{row-stripe}
 &  & 8 & 0 & 0 & 0 & 65 & 0 & 7 \\
\rowcolor{row-stripe}
&  & 11 & 0 & 0 & 0 & 65 & 0 & 7 \\
\rowcolor{row-stripe}
&  & 17 & 0 & 0 & 0 & 65 & 75 & 0 \\
\rowcolor{row-stripe}
\multirow[c]{-4}{*}{\shortstack[c]{Ar

In [107]:
# Print the second of the two tables returned by fix_latex
print(b)

\begin{tabular}{lllrrrrrr}
\toprule
 &  &  & \multicolumn{2}{c}{Galette} & \multicolumn{2}{c}{MirrorTaint} & \multicolumn{2}{c}{Phosphor} \\
 &  &  & Sem & Tag & Sem & Tag & Sem & Tag \\
Group & Total & Version &  &  &  &  &  &  \\
\midrule
 &  & 8 & 0 & 0 & 0 & 1 & 0 & 0 \\
&  & 11 & 0 & 0 & 0 & 1 & 0 & 0 \\
&  & 17 & 0 & 0 & 0 & 1 & 4 & 0 \\
\multirow[c]{-4}{*}{\shortstack[c]{Loop}}  & \multirow[c]{-4}{*}{4}  & 21 & 0 & 0 & 0 & 1 & 4 & 0 \\
\rowcolor{row-stripe}
 &  & 8 & 0 & 0 & 0 & 0 & 0 & 0 \\
\rowcolor{row-stripe}
&  & 11 & 0 & 0 & 0 & 0 & 0 & 0 \\
\rowcolor{row-stripe}
&  & 17 & 0 & 0 & 0 & 0 & 14 & 0 \\
\rowcolor{row-stripe}
\multirow[c]{-4}{*}{\shortstack[c]{Method\\Call}}  & \multirow[c]{-4}{*}{14}  & 21 & 0 & 0 & 0 & 0 & 14 & 0 \\
 &  & 8 & 0 & 0 & 0 & 34 & 3 & 36 \\
&  & 11 & 0 & 0 & 0 & 34 & 3 & 36 \\
&  & 17 & 0 & 0 & 0 & 34 & 40 & 0 \\
\multirow[c]{-4}{*}{\shortstack[c]{Method\\Handle}}  & \multirow[c]{-4}{*}{40}  & 21 & 0 & 0 & 0 & 34 & 40 & 0 \\
\rowcolor{row-stripe}
 