In [173]:
import json
import os
from collections import defaultdict

import numpy as np

from data_util import *

# Name of the CSV file inside a trial directory that holds the benchmark results
SEMANTIC_DATA_FILE_NAME = 'report.csv'
# Name of the JSON file inside a trial directory that describes the trial (version, tool, vendor)
INFO_FILE_NAME = 'info.json'
# Lowest JDK version on which each benchmark group may report executed tests.
# Groups that are not listed explicitly fall back to version 8.
MINIMUM_VERSIONS = defaultdict(lambda: 8)
MINIMUM_VERSIONS.update(
    JdkUnsafe=9,
    MethodHandleJava9=9,
    RecordType=16,
    StringIndyConcat=9,
    VarHandle=9,
)


class SemanticTrial:
    """Represents the results of a taint tracking tool on the synthetic benchmarks.

    A trial is backed by a directory that is expected to contain a results CSV
    (``SEMANTIC_DATA_FILE_NAME``) and a JSON description (``INFO_FILE_NAME``) with the keys
    ``version``, ``tool`` and ``vendor``.

    Attributes:
        id: Final path component of the trial directory.
        data_file: Path of the results CSV inside the trial directory.
        info_file: Path of the JSON description inside the trial directory.
        valid: True only if both ``data_file`` and ``info_file`` exist as regular files.
        version: JDK version read from the info file (as an int), or None if there is no info file.
        tool: Tool name read from the info file, or None if there is no info file.
        vendor: JDK vendor read from the info file, or None if there is no info file.
    """

    def __init__(self, trial_dir):
        """Locate the files of the trial stored in ``trial_dir`` and load its description.

        Raises:
            KeyError, ValueError: If the info file exists but lacks a required key, is not valid
                JSON, or has a version that cannot be converted to an int.
        """
        # Normalize first: os.path.basename('some/dir/') is '' and would yield an empty id
        self.id = os.path.basename(os.path.normpath(trial_dir))
        self.data_file = os.path.join(trial_dir, SEMANTIC_DATA_FILE_NAME)
        self.info_file = os.path.join(trial_dir, INFO_FILE_NAME)
        self.valid = os.path.isfile(self.data_file) and os.path.isfile(self.info_file)
        self.vendor = self.version = self.tool = None
        # The description is loaded even when the CSV is missing so that get_id() can still
        # identify the incomplete trial
        if os.path.isfile(self.info_file):
            # JSON is UTF-8 encoded; do not depend on the platform's default encoding
            with open(self.info_file, 'r', encoding='utf-8') as f:
                info = json.load(f)
            self.version = int(info['version'])
            self.tool = info['tool']
            self.vendor = info['vendor']

    def get_id(self):
        """Return a human-readable description of this trial for log messages."""
        return f"<id={self.id}, tool={self.tool}, vendor={self.vendor}, version={self.version}>"

    def get_data_frame(self):
        """Read the results CSV and return it tagged with this trial's version, vendor and tool.

        Column names are stripped of surrounding whitespace. The tagging is delegated to
        ``set_columns`` from ``data_util`` (presumably it adds one constant column per keyword;
        verify against ``data_util``).
        """
        data = pd.read_csv(self.data_file) \
            .rename(columns=lambda x: x.strip())
        return set_columns(data, version=self.version, vendor=self.vendor, tool=self.tool)


def get_semantic_group(class_name):
    """Return the benchmark group for a test class name.

    The group is the part of ``class_name`` after the last '.', with a trailing 'ITCase' removed.
    """
    _, _, simple_name = class_name.rpartition('.')
    return simple_name.removesuffix('ITCase')


def classify(fp, fn, status):
    """Map one raw test result to its result category.

    Returns "BEHAVIOR" for any status other than 'success'. A successful test is "PASSED" when
    it has neither false positives nor false negatives, and "PROPAGATION" otherwise.
    """
    if status == 'success':
        return "PROPAGATION" if fp + fn != 0 else "PASSED"
    return "BEHAVIOR"


def find_trials(input_dir):
    """Create a SemanticTrial for every immediate subdirectory of ``input_dir``.

    Trials are returned sorted by directory name. Regular files in ``input_dir`` are ignored.
    """
    print(f'Searching for trials in {input_dir}.')
    # os.listdir returns entries in arbitrary order; sort them so that the order of the trials
    # (and of the rows in the combined CSV) is the same on every run
    candidates = (os.path.join(input_dir, name) for name in sorted(os.listdir(input_dir)))
    trials = [SemanticTrial(path) for path in candidates if os.path.isdir(path)]
    print(f"\tFound {len(trials)} trials.")
    return trials


def check_trials(trials):
    """Return the trials whose ``valid`` flag is set, reporting every other trial on stdout."""
    print('Checking trials.')
    valid_trials = []
    for trial in trials:
        if trial.valid:
            valid_trials.append(trial)
            continue
        print(f"\tMissing required files for {trial.get_id()}.")
    print(f'\t{len(valid_trials)} trials were valid.')
    return valid_trials


def combine_semantic_data(trials, file):
    """Concatenate the data frames of all trials, write the result to ``file`` and return it.

    The combined frame gets a fresh 0..n-1 index, which is not written to the CSV.
    """
    print('Creating semantic experiment CSV.')
    frames = [trial.get_data_frame() for trial in trials]
    combined = pd.concat(frames)
    combined = combined.reset_index(drop=True)
    combined.to_csv(file, index=False)
    print(f'\tWrote semantic experiment CSV to {file}.')
    return combined


def assert_record_equality(frames):
    """Assert that all of the given data frames contain the same set of records.

    A record is one row, represented by its (column, value) pairs in column order. Row order and
    duplicate rows are ignored. Zero or one frame trivially passes.

    Note: records containing NaN never compare equal to each other, so frames with missing values
    should be cleaned before being compared.

    Raises:
        AssertionError: If any frame's records differ from those of the first frame.
    """
    if len(frames) <= 1:
        return
    # tuple(record) on a dict would keep only the keys (the column names), which made every record
    # identical; use the items so that the values are actually compared
    record_sets = [
        {tuple(record.items()) for record in frame.to_dict(orient='records')}
        for frame in frames
    ]
    expected = record_sets[0]
    assert all(expected == record_set for record_set in record_sets[1:]), \
        'Data frames do not contain the same records.'


def assert_vendor_matches(counts):
    """Assert that, for every JDK version, the rows of all vendors are identical.

    Raises:
        AssertionError: If two vendors have different rows for the same version.
    """
    vendors = list(counts['vendor'].unique())
    for version in list(counts['version'].unique()):
        # The vendor column itself necessarily differs between the frames, so it is removed
        # before the remaining columns are compared
        frames = [
            select(counts, version=version, vendor=vendor).drop(columns='vendor')
            for vendor in vendors
        ]
        assert_record_equality(frames)


def assert_consistent_execution_counts(counts):
    """Assert that each group has exactly one distinct non-zero value in its 'executed' column."""
    for group in counts['group'].unique():
        executed = set(select(counts, group=group)['executed'])
        # Rows with no executed tests are permitted and do not count as a distinct value
        executed.discard(0)
        assert len(executed) == 1


def assert_minimum_versions_satisfied(counts):
    """Assert that no group reports executed tests on a JDK older than its minimum version."""
    for group in counts['group'].unique():
        rows = select(counts, group=group)
        below_minimum = rows['version'] < MINIMUM_VERSIONS[group]
        assert_empty_or_zeros(rows[below_minimum]['executed'])


def assert_empty_or_zeros(series):
    """Assert that ``series`` holds no values other than zero, ignoring missing entries."""
    distinct_values = set(series.dropna().unique())
    distinct_values.discard(0)
    assert len(distinct_values) == 0


# Input and output locations. The defaults are the original local paths; set the environment
# variables SEMANTIC_REPORTS_DIR / SEMANTIC_OUTPUT_DIR to run the notebook against other
# directories without editing this cell.
reports_dir = os.environ.get('SEMANTIC_REPORTS_DIR', '/home/katie/Downloads/slurm-1189075/')
output_dir = os.environ.get('SEMANTIC_OUTPUT_DIR', '/home/katie/Downloads/processed-1189075/')
pd.set_option('display.max_rows', 100)
os.makedirs(output_dir, exist_ok=True)
# Discover the trial directories and keep only those that have all required files
trials = check_trials(find_trials(reports_dir))
# Aggregate the individual CSVs
data = combine_semantic_data(trials, os.path.join(output_dir, 'semantic.csv'))
data.head()

Searching for trials in /home/katie/Downloads/slurm-1189075/.
	Found 24 trials.
Checking trials.
	Missing required files for <id=10, tool=phosphor, vendor=temurin, version=17>.
	Missing required files for <id=14, tool=phosphor, vendor=corretto, version=17>.
	Missing required files for <id=15, tool=phosphor, vendor=corretto, version=21>.
	Missing required files for <id=11, tool=phosphor, vendor=temurin, version=21>.
	20 trials were valid.
Creating semantic experiment CSV.
	Wrote semantic experiment CSV to /home/katie/Downloads/processed-1189075/semantic.csv.


Unnamed: 0,class,method,name,tp,fp,fn,status,version,vendor,tool
0,edu.neu.ccs.prl.galette.bench.ArrayAccessITCase,getSetElement,"getSetElement(type=class java.lang.Object, tai...",3,0,0,success,21,corretto,galette
1,edu.neu.ccs.prl.galette.bench.ArrayAccessITCase,getSetElement,"getSetElement(type=class java.lang.Object, tai...",2,0,0,success,21,corretto,galette
2,edu.neu.ccs.prl.galette.bench.ArrayAccessITCase,getSetElement,"getSetElement(type=class java.lang.Object, tai...",2,0,0,success,21,corretto,galette
3,edu.neu.ccs.prl.galette.bench.ArrayAccessITCase,getSetElement,"getSetElement(type=class java.lang.Object, tai...",1,0,0,success,21,corretto,galette
4,edu.neu.ccs.prl.galette.bench.ArrayAccessITCase,getSetElement,"getSetElement(type=class java.lang.Object, tai...",2,0,0,success,21,corretto,galette


In [189]:
def create_count_table(data):
    """Count the test results per category for each group, tool, JDK version and vendor.

    Args:
        data: Combined raw results with at least the columns 'class', 'fp', 'fn', 'status',
            'tool', 'version' and 'vendor'. A 'group' column is added to this frame in place
            (kept from the original implementation so that later cells still see it).

    Returns:
        A frame with one row per (group, tool, version, vendor) holding:
        - 'PASSED', 'BEHAVIOR', 'PROPAGATION': number of results in each category
        - 'enabled': largest number of tests executed for the group on that version
        - 'MISSING': enabled tests for which this row has no result
        - 'total': largest 'enabled' value of the group over all versions

    Raises:
        AssertionError: If the execution counts are inconsistent within a group, or a group has
            results on a JDK older than its minimum version.
    """
    # Fixed set of categories (in display order) so that all three count columns exist even when
    # a category never occurs in the data; deriving them from the data made the sums below fail
    # with a KeyError in that case
    categories = ['PASSED', 'BEHAVIOR', 'PROPAGATION']
    by = ['group', 'tool', 'version', 'vendor']
    # Convert test class names into benchmark groups
    data['group'] = data['class'].apply(get_semantic_group)
    # Remove disabled tests; copy so that the column added below is not written to a slice of
    # the caller's frame
    data = data[data['status'] != 'disabled'].copy()
    # Convert raw data into result category
    data['category'] = pd.Categorical(
        [classify(fp, fn, status) for fp, fn, status in zip(data['fp'], data['fn'], data['status'])],
        categories=categories
    )
    # Count the number of entries in each category for each group for each tool on each JDK
    # Pivot along the categories to put the categories in columns
    # observed=False is passed explicitly: the all-zero rows that MISSING is computed from rely
    # on it, and pandas has deprecated that implicit default for categorical groupings
    counts = data.groupby(by, observed=False)['category'] \
        .value_counts() \
        .reset_index() \
        .pivot(columns='category', index=by, values='count') \
        .fillna(0) \
        .astype('int64') \
        .reset_index()
    # Compute the total number of tests executed in each group for each tool on each JDK
    counts['executed'] = counts['BEHAVIOR'] + counts['PASSED'] + counts['PROPAGATION']
    # Confirm that the total number of tests executed in each group for each tool on each JDK
    # is consistent
    assert_consistent_execution_counts(counts)
    # Confirm that there are no results for each group on JDKs lower than its minimum version
    assert_minimum_versions_satisfied(counts)
    # Compute the total number of enabled tests per group per version
    enabled = counts.groupby(['group', 'version'])['executed'] \
        .max() \
        .rename('enabled') \
        .reset_index()
    # Add the enabled totals to the table
    counts = counts.merge(enabled, on=['group', 'version'], how='left')
    # Compute the number of tests that were not run
    counts['MISSING'] = counts['enabled'] - counts['executed']
    # Compute the total number of tests per group
    totals = counts.groupby(['group'])['enabled'] \
        .max() \
        .rename('total') \
        .reset_index()
    return counts.drop(columns=['executed']) \
        .merge(totals, on=['group'], how='left')


# Build the per-group, per-tool, per-JDK result counts from the combined trial data
counts = create_count_table(data)
# Check that each tool performed the same on each JDK version for all vendors
assert_vendor_matches(counts)
# The vendors are expected to agree, so keep the temurin rows only and drop the redundant column
counts = (
    select(counts, vendor='temurin')
    .drop(columns='vendor')
    .reset_index(drop=True)
)
counts

Unnamed: 0,group,tool,version,PASSED,BEHAVIOR,PROPAGATION,enabled,MISSING,total
0,ArrayAccess,galette,8,74,0,0,74,0,74
1,ArrayAccess,galette,11,74,0,0,74,0,74
2,ArrayAccess,galette,17,74,0,0,74,0,74
3,ArrayAccess,galette,21,74,0,0,74,0,74
4,ArrayAccess,mirror-taint,8,19,0,55,74,0,74
...,...,...,...,...,...,...,...,...,...
319,VarHandle,mirror-taint,21,579,0,1353,1932,0,1932
320,VarHandle,phosphor,8,0,0,0,0,0,1932
321,VarHandle,phosphor,11,0,1932,0,1932,0,1932
322,VarHandle,phosphor,17,0,0,0,1932,1932,1932


In [190]:
# Show the rows with failures caused by unexpected exceptions, failed behavioral assertions, or
# missing entries, as opposed to incorrect taint tag propagation
counts[counts['BEHAVIOR'].ne(0) | counts['MISSING'].ne(0)]

Unnamed: 0,group,tool,version,PASSED,BEHAVIOR,PROPAGATION,enabled,MISSING,total
8,ArrayAccess,phosphor,8,0,74,0,74,0,74
9,ArrayAccess,phosphor,11,0,74,0,74,0,74
10,ArrayAccess,phosphor,17,0,0,0,74,74,74
11,ArrayAccess,phosphor,21,0,0,0,74,74,74
20,ArrayLength,phosphor,8,0,5,0,5,0,5
...,...,...,...,...,...,...,...,...,...
310,Throwable,phosphor,17,0,0,0,5,5,5
311,Throwable,phosphor,21,0,0,0,5,5,5
321,VarHandle,phosphor,11,0,1932,0,1932,0,1932
322,VarHandle,phosphor,17,0,0,0,1932,1932,1932


In [195]:
def compute_failures(behavior, missing, propagation, enabled):
    """Format the number of failed tests of one table row as an annotated label.

    Args:
        behavior: Number of tests that did not finish with status 'success'.
        missing: Number of enabled tests without a result.
        propagation: Number of successful tests with false positives or false negatives.
        enabled: Number of tests enabled for the row.

    Returns:
        NaN if no tests were enabled. Otherwise the sum of the three failure counts as a string,
        prefixed with '*' if there were behavior failures and with '^' if there were missing
        results (both apply as '^*').
    """
    if enabled == 0:
        # np.NaN was removed in NumPy 2.0; np.nan is the spelling available in every version
        return np.nan
    result = str(behavior + missing + propagation)
    if behavior != 0:
        result = "*" + result
    if missing != 0:
        result = "^" + result
    return result


# Derive the annotated failure label of every row from its raw counts
counts['FAILURES'] = counts[['BEHAVIOR', 'MISSING', 'PROPAGATION', 'enabled']].apply(
    lambda row: compute_failures(*row),
    axis=1
)
counts

Unnamed: 0,group,tool,version,PASSED,BEHAVIOR,PROPAGATION,enabled,MISSING,total,FAILURES
0,ArrayAccess,galette,8,74,0,0,74,0,74,0
1,ArrayAccess,galette,11,74,0,0,74,0,74,0
2,ArrayAccess,galette,17,74,0,0,74,0,74,0
3,ArrayAccess,galette,21,74,0,0,74,0,74,0
4,ArrayAccess,mirror-taint,8,19,0,55,74,0,74,55
...,...,...,...,...,...,...,...,...,...,...
319,VarHandle,mirror-taint,21,579,0,1353,1932,0,1932,1353
320,VarHandle,phosphor,8,0,0,0,0,0,1932,
321,VarHandle,phosphor,11,0,1932,0,1932,0,1932,*1932
322,VarHandle,phosphor,17,0,0,0,1932,1932,1932,^1932


In [196]:
# Reshape the counts into the final table: one row per (group, total) and one column per
# (tool, version), with the FAILURES label as the cell value.
# Because `values` is passed as a list, the pivoted columns carry an additional unnamed level
# (referred to as None below); it is moved behind 'tool' and 'version' and then dropped again
# after both axes have been sorted.
table = counts.pivot(index=['group', 'total'], values=['FAILURES'], columns=['tool', 'version']) \
    .reorder_levels(axis=1, order=['tool', 'version', None]) \
    .sort_index(axis=1) \
    .sort_index(axis=0) \
    .droplevel(2, axis=1)
# Title-case the row index names for display ('group' -> 'Group', 'total' -> 'Total')
table.index.names = [x.title() for x in table.index.names]
# Title-case the tool names and remove hyphens (e.g. 'mirror-taint' -> 'MirrorTaint');
# the version level is left unchanged
table.columns = table.columns.map(lambda l: (l[0].title().replace('-', ''), l[1]))
# Clear the names of the column levels so they are not rendered in the header
table.columns.names = [None for _ in table.columns.names]
# Missing cells are rendered as '---'
styler = table.style.format(precision=0, na_rep='---')
display(styler)

Unnamed: 0_level_0,Unnamed: 1_level_0,Galette,Galette,Galette,Galette,MirrorTaint,MirrorTaint,MirrorTaint,MirrorTaint,Phosphor,Phosphor,Phosphor,Phosphor
Unnamed: 0_level_1,Unnamed: 1_level_1,8,11,17,21,8,11,17,21,8,11,17,21
Group,Total,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
ArrayAccess,74,0,0,0,0,55,55,55,55,*74,*74,^74,^74
ArrayLength,5,0,0,0,0,3,3,3,3,*5,*5,^5,^5
ArrayReflection,75,0,0,0,0,65,65,65,65,*75,*75,^75,^75
Assignment,12,0,0,0,0,0,0,0,0,*12,*12,^12,^12
BoxedType,8,0,0,0,0,0,0,0,0,*8,*8,^8,^8
ClassReflection,7,0,0,0,0,0,0,0,0,*7,*7,^7,^7
Collection,6,0,0,0,0,2,2,2,2,*6,*6,^6,^6
Conditional,4,0,0,0,0,0,0,0,0,*4,*4,^4,^4
ConstructorReflection,88,0,0,0,0,12,12,12,12,*88,*88,^88,^88
Field,6,0,0,0,0,2,2,2,2,*6,*6,^6,^6
