In [None]:
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mannwhitneyu

import experiment
import extract

# Global seaborn look shared by every figure in the notebook
sns.set_style("whitegrid", rc={"font.family": "Arial"})

# Fixed colour per experiment so a given experiment looks the same in all plots
PALETTE = {
    "Constant / FLUFFI": "#6f4e7b",
    "FAST / FLUFFI": "#c9472f",
    "Constant / Round-Robin": "#ffa056",
    "FAST / Round-Robin": "#f7c860",
    "Constant / AFLFast": "#9dd766",
    "FAST / AFLFast": "#267895",
}  # leftover note from the author: 8dddd0 (presumably a spare colour — confirm)

# Axis labels keyed by the measurement column being plotted
Y_KEY_LABELS = {
    "paths": "# Paths Covered",
    "covered_blocks": "# Blocks Covered",
    "crashes_unique": "# Crashes Found",
}

# Load the measurements and derive the helper columns in a single chain.
# Later entries of assign() may refer to columns created by earlier ones.
df_measurements = pd.read_parquet("measurements.parquet").assign(
    # cpu_time rounded to the nearest 1000, and the same value divided by 3600
    cpu_seconds_round=lambda df: df["cpu_time"].round(-3),
    cpu_hours_round=lambda df: df["cpu_seconds_round"] / 3600,
    # Unique crashes and unique access violations counted together
    bugs=lambda df: df["crashes_unique"] + df["access_violations_unique"],
    # Coverage metrics divided by the number of completed test cases
    covered_blocks_exec=lambda df: df["covered_blocks"] / df["completed_testcases"],
    paths_exec=lambda df: df["paths"] / df["completed_testcases"],
)

# Get maxes in steps
def get_max(steps=1, df=None, total_time=None):
    """Snapshot each trial's latest measurement at evenly spaced time limits.

    The time budget is split into ``steps`` equal slices. For the end of each
    slice, only rows with ``cpu_time`` at or below that limit are kept, and for
    every (experiment, benchmark, trial) group the row with the largest
    ``cpu_time`` is selected.

    Args:
        steps: Number of evenly spaced time limits to evaluate.
        df: Measurements to snapshot. Defaults to the notebook-level
            ``df_measurements``.
        total_time: Full time budget, in the same unit as ``cpu_time``.
            Defaults to ``experiment.TRIAL_TIME``.

    Returns:
        dict mapping each time limit (float, ascending) to the DataFrame of
        selected rows. The last key is exactly ``total_time``.
    """
    if df is None:
        df = df_measurements
    if total_time is None:
        total_time = experiment.TRIAL_TIME

    dfs = {}
    for i in range(1, steps + 1):
        if i == steps:
            # (total / steps) * steps can miss `total` by one ulp, e.g.
            # (1 / 49) * 49 != 1. Pin the last limit to the budget itself so
            # callers can compare it to the budget with ==.
            trial_time = float(total_time)
        else:
            trial_time = (total_time / steps) * i
        df_lim = df.loc[df["cpu_time"] <= trial_time]
        # idxmax yields one index label per group; .loc pulls those rows back.
        # NOTE(review): this assumes the frame's index labels are unique.
        latest = df_lim.groupby(["experiment", "benchmark", "trial"])[
            "cpu_time"
        ].idxmax()
        dfs[trial_time] = df_lim.loc[latest]
    return dfs


In [None]:
# Mann-Whitney U test
#
# For every time limit and every pair of experiments, run a Mann-Whitney U test
# per benchmark on `y_key`. A benchmark counts as a win for the experiment with
# the higher mean when p < 0.05; otherwise it is recorded as inconclusive.

y_key = "covered_blocks"

for trial_time, df_max in get_max(30).items():
    print(trial_time / 3600)
    # Significant wins per experiment, summed over all pairings at this limit
    wins = {exp: 0 for exp in extract.EXPERIMENTS}

    # For each experiment combo
    for exp_x, exp_y in itertools.combinations(extract.EXPERIMENTS, 2):
        result = {exp_x: 0, exp_y: 0, "Inconclusive": 0}  # counts
        result_b = {exp_x: [], exp_y: [], "Inconclusive": []}  # benchmark names

        # Calculate for each benchmark
        for benchmark in experiment.BENCHMARKS:
            df_benchmark = df_max.loc[df_max["benchmark"] == benchmark]
            # Nothing was measured for this metric on this benchmark
            if df_benchmark[y_key].max() == 0:
                continue
            x = df_benchmark.loc[df_benchmark["experiment"] == exp_x][y_key]
            y = df_benchmark.loc[df_benchmark["experiment"] == exp_y][y_key]
            try:
                _, p = mannwhitneyu(x, y)
            except ValueError:
                # SciPy rejects samples it cannot test (depending on the
                # version: empty samples, or all values identical). Treat
                # those as inconclusive, but let every other error surface
                # instead of hiding it behind a bare except.
                p = None

            if p is not None and p < 0.05:
                outcome = exp_x if x.mean() > y.mean() else exp_y
                wins[outcome] += 1
            else:
                outcome = "Inconclusive"
            result[outcome] += 1
            result_b[outcome].append(benchmark)

        # Print result
        print(result)
        print(result_b)
    print(sorted(wins.items(), key=lambda item: item[1], reverse=True))


In [None]:
# Line graphs for code coverage over time for a single benchmark
#
# One figure per metric for every benchmark whose name contains "njs":
# the median over rows with a 95% confidence band, plotted against CPU hours.

# Legend / drawing order of the experiments for each metric
hue_orders = {
    "covered_blocks": [
        "FAST / AFLFast",
        "Constant / FLUFFI",
        "Constant / AFLFast",
        "FAST / FLUFFI",
        "FAST / Round-Robin",
        "Constant / Round-Robin",
    ],
    "paths": [
        "FAST / AFLFast",
        "Constant / AFLFast",
        "Constant / FLUFFI",
        "FAST / FLUFFI",
        "FAST / Round-Robin",
        "Constant / Round-Robin",
    ],
}

for y_key in ("covered_blocks", "paths"):
    for benchmark in experiment.BENCHMARKS:
        if "njs" in benchmark:
            is_benchmark = df_measurements["benchmark"] == benchmark
            df_benchmark = df_measurements.loc[is_benchmark]
            plt.figure(figsize=(6, 4), dpi=100)
            g = sns.lineplot(
                data=df_benchmark,
                x="cpu_hours_round",
                y=y_key,
                hue="experiment",
                hue_order=hue_orders[y_key],
                palette=PALETTE,
                estimator=np.median,
                ci=95,
            )
            g.legend(title=None)
            g.set_xlim(0, 30)
            # Lower y-limit is chosen per metric; the upper limit stays automatic
            if y_key == "paths":
                g.set_ylim(0)
            elif y_key == "covered_blocks":
                g.set_ylim(2000)
            g.set_ylabel(Y_KEY_LABELS[y_key])
            g.set_xlabel("CPU Hours")


In [None]:
# Graphs for average normalized score
#
# For each of 180 time limits, compute per experiment:
#   * score: median of `y_key` as a percentage of the benchmark's best value,
#            averaged over benchmarks
#   * rank:  position when experiments are ordered by median (1 = highest),
#            averaged over benchmarks

y_key = "paths_exec"
# Long-format records for the score / rank line plots
coverage_dict = {"cpu_hours": [], "experiment": [], "score": []}
rank_dict = {"cpu_hours": [], "experiment": [], "rank": []}

for trial_time, df_max in get_max(180).items():

    # Per-step accumulators, one slot per experiment
    sum_coverage = {exp: 0 for exp in extract.EXPERIMENTS}
    sum_ranks = {exp: 0 for exp in extract.EXPERIMENTS}

    # Get median for each benchmark
    for benchmark in experiment.BENCHMARKS:
        df_benchmark = df_max[df_max["benchmark"] == benchmark]
        max_coverage = df_benchmark[y_key].max()
        # Skip benchmarks with nothing measured; this also guarantees the
        # normalisation below never divides by zero.
        if max_coverage == 0:
            continue
        exp_coverage = {}
        for exp in extract.EXPERIMENTS:
            median_coverage = df_benchmark.loc[df_benchmark["experiment"] == exp][
                y_key
            ].median()
            sum_coverage[exp] += (median_coverage / max_coverage) * 100.0
            exp_coverage[exp] = median_coverage
        # Rank 1 goes to the experiment with the highest median
        for rank, key in enumerate(
            sorted(exp_coverage, key=exp_coverage.get, reverse=True), 1
        ):
            sum_ranks[key] += rank

    # Calculate the scores
    # NOTE(review): skipped benchmarks add 0 to both sums but still count in
    # the denominator — confirm this is intended, especially for the ranks.
    coverage_score = {}
    rank_score = {}
    for exp in extract.EXPERIMENTS:
        coverage_score[exp] = sum_coverage[exp] / len(experiment.BENCHMARKS)
        rank_score[exp] = sum_ranks[exp] / len(experiment.BENCHMARKS)

    # Add to dict
    for exp, score in coverage_score.items():
        coverage_dict["cpu_hours"].append(trial_time / 3600)
        coverage_dict["experiment"].append(exp)
        coverage_dict["score"].append(score)
    for exp, rank in rank_score.items():
        rank_dict["cpu_hours"].append(trial_time / 3600)
        rank_dict["experiment"].append(exp)
        rank_dict["rank"].append(rank)

# Sort and print results
# The last iteration is the end-of-trial step, so `coverage_score` and
# `rank_score` now hold the final values. Building the tables here, rather than
# behind `trial_time == experiment.TRIAL_TIME` inside the loop, avoids an exact
# float comparison that would leave them undefined (or stale from an earlier
# run) if rounding made it fail.
coverage_score_sorted = dict(
    sorted(coverage_score.items(), key=lambda item: item[1], reverse=True)
)
rank_score_sorted = dict(
    sorted(rank_score.items(), key=lambda item: item[1], reverse=True)
)
print(coverage_score_sorted)
print(rank_score_sorted)

# Coverage bar plot: final average normalised score per experiment
sns.set(font_scale=1)
sns.set_style("whitegrid", {"font.family": "Arial"})
plt.figure(figsize=(6, 4), dpi=100)
g = sns.barplot(
    x=list(coverage_score_sorted.values()),
    y=list(coverage_score_sorted.keys()),
    palette=PALETTE,
)
# Hand-picked x-axis window per metric; any other metric keeps automatic limits
score_xlims = {
    "paths": (70, 80),
    "paths_exec": (72, 82),
    "covered_blocks": (90, 96),
    "covered_blocks_exec": (80, 86),
    "bugs": (30, 34),
    "completed_testcases": (90, 92.5),
}
if y_key in score_xlims:
    g.set_xlim(*score_xlims[y_key])
g.set_xlabel("Average Normalized Score")

# Rank bar plot: same experiments, drawn in the reverse of rank_score_sorted
plt.figure(figsize=(6, 6), dpi=100)
rank_score_sorted_y = list(rank_score_sorted.keys())[::-1]
rank_score_sorted_x = list(rank_score_sorted.values())[::-1]
g = sns.barplot(
    x=rank_score_sorted_x,
    y=rank_score_sorted_y,
    palette=PALETTE,
)
g.set_xlim(1, 5)
g.set_xlabel("Average Rank")

# Coverage line plot: average normalised score over CPU hours
df_coverage = pd.DataFrame(coverage_dict)
plt.figure(figsize=(6, 4), dpi=100)
g = sns.lineplot(
    data=df_coverage,
    x="cpu_hours",
    y="score",
    hue="experiment",
    hue_order=coverage_score_sorted.keys(),
    palette=PALETTE,
)
g.legend(title=None)
g.set_xlim(0, 30)
g.set_ylabel("Average Normalized Score")
g.set_xlabel("CPU Hours")

# Rank line plot: average rank over CPU hours
df_rank = pd.DataFrame(rank_dict)
plt.figure(figsize=(6, 4), dpi=100)
g = sns.lineplot(
    data=df_rank,
    x="cpu_hours",
    y="rank",
    hue="experiment",
    hue_order=rank_score_sorted.keys(),
    palette=PALETTE,
)
g.legend(title=None)
g.set_xlim(0, 30)
g.set_ylabel("Average Rank")
g.set_xlabel("CPU Hours")


In [None]:
# LaTeX table describing each benchmark
#
# Prints one row per benchmark: name, a "???" placeholder column, the number of
# seeds (capped at experiment.SEED_NUM_LIMIT) and the size of the target file.

benchmarks_sorted = sorted(experiment.BENCHMARKS)
for benchmark in benchmarks_sorted:
    benchmark_dir = os.path.join(experiment.FUZZBENCH_DIR, benchmark)
    seeds_path = os.path.join(benchmark_dir, "seeds/")
    # Only the seed count is reported, so list the directory instead of
    # reading every seed file into memory.
    seeds = os.listdir(seeds_path)

    # target.txt holds the file name of the target inside the benchmark dir
    with open(os.path.join(benchmark_dir, "target.txt"), "r") as f:
        target_name = f.read().strip()
    target_path = os.path.join(benchmark_dir, target_name)

    # Human-readable size in 1024-based steps
    num = os.path.getsize(target_path)
    for unit in ["", "K", "M", "G"]:
        if abs(num) < 1024.0:
            break
        num /= 1024.0
    else:
        # Larger than every listed unit: report in the next one up instead of
        # leaving `size` unset or stale from the previous benchmark.
        unit = "T"
    size = f"{num:3.1f} {unit}B"

    # Escape underscores so the name is valid in LaTeX text mode
    benchmark_tex = benchmark.replace("_", r"{\_}")
    print(
        f"{benchmark_tex} & ??? & {min(len(seeds), experiment.SEED_NUM_LIMIT)} & {size} \\\\ \\hline"
    )


In [None]:
# LaTeX table for bugs found
#
# One row per benchmark (alphabetical): the largest "bugs" value seen in any
# measurement row of that benchmark.

for benchmark in sorted(experiment.BENCHMARKS):
    is_benchmark = df_measurements["benchmark"] == benchmark
    df_benchmark = df_measurements.loc[is_benchmark]
    val = df_benchmark["bugs"].max()
    # Underscores must be escaped in LaTeX text mode
    benchmark = benchmark.replace("_", r"{\_}")
    print("{} & {} \\\\ \\hline".format(benchmark, val))
