In [None]:
# reload modules
%load_ext autoreload
%autoreload 2

In [None]:
import os

os.chdir("") # set the root directory of the project
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sra.dataloader import (
    get_covs_runs,
    get_run_paths,
    get_trial_paths,
    get_covs_trial,
)
from sra.cfggen import parse_ll, parse_dotfiles, gen_cfg_inter
from sra.mapper import get_bbs_fuzz, get_mapping, fuzzdat_to_obs
from sra.estimator import Graph, SimpleGraph
from sra.estimator_chunk import structure_estimation
from typing import List

# Get mapping

In [None]:
# Inputs for the jasper target: directory holding the LLVM artefacts and the
# base name of the .ll file inside it.
source_dir = "fuzz-data/ft_data/llvm/jasper"
source_name = "jasper"

# Keep only directory entries whose name starts with "." and ends with ".dot".
dotfiles = [
    entry
    for entry in os.listdir(source_dir)
    if entry.startswith(".") and entry.endswith(".dot")
]
# `functions` is the sorted list of the second dot-separated field of each
# selected file name (".<field>.dot").
functions = sorted(entry.split(".")[1] for entry in dotfiles)

# Basic-block description taken from one fuzzing run.
bbs_fuzz_path = "fuzz-data/jasper-fuzzruns/jasper_01/jasper_aflpp_seed_01_run_01/ft_jasper.json"
bbs_fuzz = get_bbs_fuzz(bbs_fuzz_path)

# Parse the .ll file first; its two results feed the dot-file parser.
debug_info_dict, blocks_dict = parse_ll(
    os.path.join(source_dir, f"{source_name}.ll")
)
cfgs_intra, node_to_bb = parse_dotfiles(
    dotfiles, source_dir, debug_info_dict, blocks_dict, non_overlap_lineidx=False
)

# Combine the parsed CFGs and wrap them in a Graph starting at "jas_init".
cfg_inter = gen_cfg_inter(cfgs_intra)
graph = Graph(cfg_inter, start_func="jas_init")
graph.cache_dict = {"func": {}, "twopoint_prob": {}}

# Two-way mapping between fuzzer nodes and observable nodes.
map_fuzz_to_obs_node, map_obs_to_fuzz_node = get_mapping(
    functions, bbs_fuzz, node_to_bb, cfg_inter, debug=True
)


# Load coverage data

In [None]:
from sra.dataprocessor import get_statistics

os.makedirs("fuzz-result/jasper", exist_ok=True)

non_minus_cov_data_o_path = "fuzz-result/jasper/non_minus_cov_data_o.npy"

if not os.path.exists(non_minus_cov_data_o_path):
    # No cached result on disk: read every trial of every run.
    projname = "jasper"
    runs_path = "fuzz-data/jasper-fuzzruns"
    run_paths = get_run_paths(runs_path, projname)
    # One inner list per trial; rows whose largest entry is not positive
    # are dropped.
    covs_trials_f = [
        [
            cov
            for cov in get_covs_trial(
                trial_path, expected_time_interval=(899, 902)
            )
            if max(cov) > 0
        ]
        for run_path in run_paths
        for trial_path in get_trial_paths(run_path)
    ]
    print(f"number of trials: {len(covs_trials_f)}")
    # Flatten all trials into a single 2-D array, one row per kept entry.
    covs_f = np.array([cov for covs in covs_trials_f for cov in covs])
    print(f"{covs_f.shape=}")
    cov_maxs = np.max(covs_f, axis=1)
    print(f"{cov_maxs.shape=}")
    get_statistics(cov_maxs)
else:
    # The cached .npy file exists: skip the computation and print a
    # hard-coded copy of the output instead.
    print(
        """number of trials: 20
covs_f.shape=(15340, 14637)
cov_maxs.shape=(15340,)
mean: 209123.89198174706, std: 17336911.26044492, min: 3729, max: 2147393560
quantiles:
              0
0.01   10470.56
0.05   13636.00
0.10   16801.70
0.15   24579.80
0.20   38718.60
0.25   49240.75
0.30   57062.60
0.35   60531.90
0.40   64115.00
0.45   67348.55
0.50   70521.00
0.55   73911.90
0.60   76854.40
0.65   79707.00
0.70   82381.20
0.75   84813.00
0.80   87780.20
0.85   95207.90
0.90  110882.90
0.95  134639.00
0.99  176394.89"""
    )


In [None]:
if os.path.exists(non_minus_cov_data_o_path):
    # Cached result present: print a hard-coded copy of the output.
    print(
        """cov_data_f.shape=(13807, 14637)
sum_data_f.shape=(14637,)
min_obs=2342
total_obs=930234012
min prob: 2342/930234012 = 2.5176460651709647e-06
* Note. np.max(sum_data_f)=930233923
* There's no one element that represents all the observations."""
    )
else:
    # Keep only rows whose maximum lies inside [lower, upper]. The bounds are
    # the 0.05 and 0.95 quantiles listed in the statistics printed for cov_maxs.
    cov_max_lower, cov_max_upper = 13636.00, 134639.00
    # Vectorised boolean mask (one comparison pass instead of a Python loop).
    in_between_range = (cov_maxs >= cov_max_lower) & (cov_maxs <= cov_max_upper)
    cov_data_f = covs_f[in_between_range]
    print(f"{cov_data_f.shape=}")
    # Column-wise totals over the kept rows.
    sum_data_f = np.sum(cov_data_f, axis=0)
    print(f"{sum_data_f.shape=}")
    # Smallest strictly positive column total.
    min_obs = np.min(sum_data_f[sum_data_f > 0])
    print(f"{min_obs=}")
    # Per-row maxima of the kept rows; their sum is used as the total count.
    num_obss = cov_maxs[in_between_range]
    total_obs = np.sum(num_obss)
    print(f"{total_obs=}")
    print(f"min prob: {min_obs}/{total_obs} = {min_obs/total_obs}")
    print(f"* Note. {np.max(sum_data_f)=}")
    # np.max (not the builtin max) so the test matches the value printed above.
    if np.max(sum_data_f) != total_obs:
        print("* There's no one element that represents all the observations.")
    else:
        print("* There's one element that represents all the observations.")


In [None]:
if not os.path.exists(non_minus_cov_data_o_path):
    # No cache: convert the filtered fuzzer-side data to observable nodes.
    nodenames = [*cfg_inter["nodes"].keys()]
    cov_data_o = fuzzdat_to_obs(
        cov_data_f, cfg_inter, bbs_fuzz, map_fuzz_to_obs_node, nodenames
    )
    print(f"{cov_data_o.shape=}")
    # Column-wise totals after the mapping.
    sum_data_o = np.sum(cov_data_o, axis=0)
    print(f"{sum_data_o.shape=}")
    # Smallest strictly positive total, compared with the pre-mapping value.
    _min_obs = sum_data_o[sum_data_o > 0].min()
    print(
        "* Let's also check whether the min probability is still the same after the mapping."
    )
    print(f"* {_min_obs=} -> {(_min_obs == min_obs)=}")
else:
    # Cached result present: print a hard-coded copy of the output.
    print(
        """cov_data_o.shape=(13807, 22946)
sum_data_o.shape=(22946,)
* Let's also check whether the min probability is still the same after the mapping.
* _min_obs=2342.0 -> (_min_obs == min_obs)=True"""
    )


In [None]:
observable_nodes_path = "fuzz-result/jasper/observable_nodes"

if os.path.exists(non_minus_cov_data_o_path):
    # Cache hit: load the saved array and the node names (one per line).
    non_minus_cov_data_o = np.load(non_minus_cov_data_o_path)
    with open(observable_nodes_path) as f:
        observable_nodes = [line.strip() for line in f]
    # Hard-coded copy of the output printed by the other branch.
    print(
        """len(non_minus_ids_o)=6143
non_minus_cov_data_o.shape=(13807, 6143)"""
    )
else:
    # Indices of the columns whose total is strictly positive.
    non_minus_ids_o = np.flatnonzero(sum_data_o > 0).tolist()
    print(f"{len(non_minus_ids_o)=}")
    observable_nodes = [nodenames[i] for i in non_minus_ids_o]
    # Persist the kept node names, one per line, next to the array.
    with open(observable_nodes_path, "w") as f:
        f.writelines(node + "\n" for node in observable_nodes)
    non_minus_cov_data_o = cov_data_o[:, non_minus_ids_o]
    print(f"{non_minus_cov_data_o.shape=}")
    # Save to the very path that the cache check above tests, so the two
    # cannot drift apart.
    np.save(non_minus_cov_data_o_path, non_minus_cov_data_o)


## Data re-organizing

### 1. Load _f data and _o data

In [None]:
# Re-read the fuzzer-side (_f) data from the raw runs.
projname = "jasper"
runs_path = "fuzz-data/jasper-fuzzruns"
run_paths = get_run_paths(runs_path, projname)

# One inner list per trial; rows whose largest entry is not positive are dropped.
covs_trials_f = [
    [
        cov
        for cov in get_covs_trial(trial_path, expected_time_interval=(899, 902))
        if max(cov) > 0
    ]
    for run_path in run_paths
    for trial_path in get_trial_paths(run_path)
]

# Flatten the trials into one 2-D array and keep the rows whose maximum lies
# in the closed range [13636.00, 134639.00].
covs_f = np.array([cov for covs in covs_trials_f for cov in covs])
cov_maxs = covs_f.max(axis=1)
in_between_range = [
    13636.00 <= cov_max_f <= 134639.00 for cov_max_f in cov_maxs
]
cov_data_f = covs_f[in_between_range]

# Load the saved observable-side (_o) array and its node names (one per line).
non_minus_cov_data_o_path = "fuzz-result/jasper/non_minus_cov_data_o.npy"
observable_nodes_path = "fuzz-result/jasper/observable_nodes"
non_minus_cov_data_o = np.load(non_minus_cov_data_o_path)
with open(observable_nodes_path) as f:
    observable_nodes = [line.strip() for line in f]


### 2. Check consistency
Repeat many times: pick a random element from a random i-th row of the _o data and check that its value also appears in the i-th row of the _f data.

In [None]:
num_check = 10000
# Fixed seed so that a failing sample can be reproduced on a re-run.
consistency_rng = np.random.default_rng(0)

n_rows_o, n_cols_o = non_minus_cov_data_o.shape
# Row i of the _o data is compared with row i of the _f data, so both arrays
# must have the same number of rows.
assert n_rows_o == cov_data_f.shape[0], (
    f"row count mismatch: {n_rows_o} (_o) vs {cov_data_f.shape[0]} (_f)"
)

for _ in range(num_check):
    # Cast to int so the values print as plain integers.
    random_row_idx = int(consistency_rng.integers(0, n_rows_o))
    random_val_idx = int(consistency_rng.integers(0, n_cols_o))
    random_val_o = non_minus_cov_data_o[random_row_idx, random_val_idx]
    random_row_f = cov_data_f[random_row_idx]
    # "\r" keeps the progress on a single, continuously overwritten line.
    print(f"{random_row_idx=} {random_val_idx=} {random_val_o=}", end="\r")
    if random_val_o not in random_row_f:
        # Report the offending position before aborting.
        print(f"{random_row_idx=}")
        print(f"{random_val_idx=}")
        print(f"{non_minus_cov_data_o[random_row_idx, random_val_idx]=}")
        print(
            f"{non_minus_cov_data_o[random_row_idx, random_val_idx]} not in cov_data_f[{random_row_idx}]"
        )
        raise Exception("Not consistent!")


### 3. Split and save _o data

In [None]:
# Number of rows each trial contributes after the range filter; the running
# total of these counts gives the end offset of every trial in the stacked
# array.
split_ids = []
for covs_trial in covs_trials_f:
    covs_trial_arr = np.asarray(covs_trial)
    if covs_trial_arr.size == 0:
        # A trial without rows contributes nothing (np.max over axis=1 would
        # fail on it).
        split_ids.append(0)
        continue
    covs_trial_maxs = np.max(covs_trial_arr, axis=1)
    # Same closed range that is applied to the stacked data.
    in_between_range_trial = (covs_trial_maxs >= 13636.00) & (
        covs_trial_maxs <= 134639.00
    )
    split_ids.append(int(np.sum(in_between_range_trial)))
split_ids = np.cumsum(split_ids)
assert len(split_ids) == len(covs_trials_f)
# The last cumulative count must equal the number of rows of the _o data.
assert split_ids[-1] == len(non_minus_cov_data_o)


In [None]:
# split_ids holds cumulative row counts and its last entry equals the total
# number of rows (asserted where split_ids is built). np.split cuts *at* each
# index, so passing that last entry would add an empty trailing chunk and an
# extra file; drop it to get exactly one chunk per trial.
non_minus_cov_trials_o = np.split(non_minus_cov_data_o, split_ids[:-1])
assert len(non_minus_cov_trials_o) == len(split_ids)

save_dir = os.path.join("fuzz-result/jasper", "non_minus_cov_trials_o")
os.makedirs(save_dir, exist_ok=True)
# NOTE(review): files written by an earlier run with a larger index are not
# removed here — clear save_dir manually if the number of trials changed.
for i, non_minus_cov_trial_o in enumerate(non_minus_cov_trials_o):
    np.save(
        os.path.join(save_dir, f"non_minus_cov_trial_o_{i}.npy"),
        non_minus_cov_trial_o,
    )


### 4. Load saved _o data

In [None]:
import os

load_dir = os.path.join("fuzz-result/jasper", "non_minus_cov_trials_o")


def _trial_file_index(fname):
    """Return the integer found between the last "_" and the next "." of `fname`."""
    return int(fname.split("_")[-1].split(".")[0])


# Per-trial files, ordered by their numeric suffix rather than alphabetically.
_trial_fnames = [
    fname
    for fname in os.listdir(load_dir)
    if fname.startswith("non_minus_cov_trial_o_") and fname.endswith(".npy")
]
_trial_fnames.sort(key=_trial_file_index)
non_minus_cov_trial_o_paths = [
    os.path.join(load_dir, fname) for fname in _trial_fnames
]
non_minus_cov_trials_o = [
    np.load(trial_path) for trial_path in non_minus_cov_trial_o_paths
]

# The stacked array for all trials.
non_minus_cov_data_o_path = "fuzz-result/jasper/non_minus_cov_data_o.npy"
non_minus_cov_data_o = np.load(non_minus_cov_data_o_path)

# Node names, one per line, in the column order of the arrays above.
observable_nodes_path = "fuzz-result/jasper/observable_nodes"
with open(observable_nodes_path) as f:
    observable_nodes = [line.strip() for line in f]

## Implement

### 1. Re-create the distribution with several options

In [None]:
def get_strat4_cov_data_o(
    target_node, non_minus_cov_trials_o, observable_nodes
):
    """Split every trial at the first row in which the target column is non-zero.

    For each trial (a 2-D array, one column per entry of ``observable_nodes``):

    * if the target column is zero in every row, or already non-zero in the
      first row, the whole trial goes to the "remaining" group;
    * otherwise the rows up to and including the first non-zero row go to the
      "cov_data" group and the rows after it go to "remaining".

    Args:
        target_node: Name of the target; must be present in ``observable_nodes``.
        non_minus_cov_trials_o: Iterable of 2-D arrays, one per trial.
        observable_nodes: List of node names giving the column order.

    Returns:
        ``(cov_data, remaining)``: the vertically stacked rows of each group.
        A group that received no rows is returned as an empty array of shape
        ``(0, len(observable_nodes))`` instead of raising.

    Raises:
        ValueError: If ``target_node`` is not in ``observable_nodes``.
    """
    target_idx = observable_nodes.index(target_node)
    cov_data_o = []
    remaining = []
    for trial in non_minus_cov_trials_o:
        # Row indices (ascending) where the target column is non-zero.
        hit_rows = np.flatnonzero(trial[:, target_idx])
        if hit_rows.size == 0 or hit_rows[0] == 0:
            # Never non-zero, or non-zero from the very first row.
            remaining.append(trial)
            continue
        cut = hit_rows[0] + 1
        cov_data_o.append(trial[:cut])
        remaining.append(trial[cut:])

    # np.vstack rejects an empty list, so substitute an empty array with the
    # right number of columns.
    n_cols = len(observable_nodes)
    stacked_cov = np.vstack(cov_data_o) if cov_data_o else np.empty((0, n_cols))
    stacked_rest = np.vstack(remaining) if remaining else np.empty((0, n_cols))
    return stacked_cov, stacked_rest


In [None]:
target_node = "Node0x1190697d0"
print(f"{non_minus_cov_data_o.shape=}")
# get_strat4_cov_data_o takes exactly three arguments (target, trials, node
# names); passing an extra positional value raises TypeError.
strat4_coc_data_o_reach, start4_remaining_reach = get_strat4_cov_data_o(
    target_node, non_minus_cov_trials_o, observable_nodes
)
print(f"{strat4_coc_data_o_reach.shape=}, {start4_remaining_reach.shape=}")


## Evaluate

### 1. Check GT difference

In [None]:
def get_GT_stats(target_idx, cov_data, remaining=None):
    """Summarise the target column of ``cov_data`` and, optionally, ``remaining``.

    For each array the column sums are taken; "total" is the largest column
    sum, "target" is the sum of column ``target_idx`` and "prob" is
    target / total.

    Args:
        target_idx: Column index of the target.
        cov_data: 2-D array of rows to summarise.
        remaining: Optional second 2-D array summarised the same way.

    Returns:
        A 6-tuple ``(total_obs, target_obs, target_prob, total_remaining,
        target_remaining, target_remaining_prob)``. The last three entries are
        all 0 when ``remaining`` is None.
    """

    def _column_stats(rows):
        # (largest column sum, target column sum, their ratio)
        col_sums = np.sum(rows, axis=0)
        largest = np.max(col_sums)
        target = col_sums[target_idx]
        return largest, target, target / largest

    main_stats = _column_stats(cov_data)
    rest_stats = (0, 0, 0) if remaining is None else _column_stats(remaining)
    return (*main_stats, *rest_stats)


In [None]:
# Column of the target node; both summaries below use the same column.
_target_col = observable_nodes.index(target_node)

# Statistics over the full data set (no "remaining" part).
total_GT_stat = get_GT_stats(_target_col, non_minus_cov_data_o)
print(
    f"[Total       ] total_obs, target_obs, total_remaining, target_remaining = {total_GT_stat[0], total_GT_stat[1], total_GT_stat[3], total_GT_stat[4]}"
)

# Statistics over the two groups returned by get_strat4_cov_data_o.
strat4_GT_stat_reach = get_GT_stats(
    _target_col, strat4_coc_data_o_reach, start4_remaining_reach
)
print(
    f"[Strat4 REACH] total_obs, target_obs, total_remaining, target_remaining = {strat4_GT_stat_reach[0], strat4_GT_stat_reach[1], strat4_GT_stat_reach[3], strat4_GT_stat_reach[4]}"
)


In [None]:
from sra.dataprocessor import (
    strat4_save_data,
    total_frac_strategy,
    total_frac_draw_graph,
)


def analyze_node_strat4(
    target_node,
    cov_data,
    remain_data,
    observable_nodes,
    graph,
    option="",
    result_root="fuzz-result/jasper",
):
    """Run the estimators for one target node and save the results.

    Steps:
      1. Call ``structure_estimation`` on an all-zero single-row input; its
         first two return values are used only to name the output directory.
      2. Call ``total_frac_strategy`` on ``cov_data`` and print the returned
         ``GT`` together with its reciprocal.
      3. Compute ``get_GT_stats`` for ``cov_data`` / ``remain_data`` and pass
         everything to ``strat4_save_data``.

    Args:
        target_node: Name of the target; must be in ``observable_nodes``.
        cov_data: 2-D array handed to ``total_frac_strategy``.
        remain_data: 2-D array (or None) used only for the GT statistics.
        observable_nodes: List of node names giving the column order.
        graph: Graph object forwarded to the estimators.
        option: Forwarded as ``prefix`` to ``total_frac_strategy`` and as the
            last argument of ``strat4_save_data``.
        result_root: Directory under which the per-node output directory is
            created. Defaults to the jasper result directory used by the rest
            of this notebook (this path was previously hard-coded to another
            project's directory, ``fuzz-result/tcas``).

    Returns:
        None.
    """
    # NOTE(review): the meaning of the trailing positional argument `2` is
    # defined by structure_estimation — confirm against sra.estimator_chunk.
    esti, dist, _ = structure_estimation(
        np.zeros((1, len(observable_nodes))),
        graph,
        target_node,
        observable_nodes,
        2,
    )
    # The "start4" directory name is kept exactly as it was spelled before.
    dirpath = f"{result_root}/start4/{target_node}-{esti:.2e}-{dist}"
    (
        GT,
        lap_esti_df,
        gt_esti_df,
        gt_unob_esti_df,
        struct_esti_df,
    ) = total_frac_strategy(
        target_node, cov_data, observable_nodes, graph, dirpath, prefix=option
    )
    print(f"{GT=}")
    print(f"reciprocal: {1/GT}")

    # save
    GT_stat = get_GT_stats(
        observable_nodes.index(target_node),
        cov_data,
        remain_data,
    )
    strat4_save_data(
        dirpath,
        GT_stat,
        lap_esti_df,
        gt_esti_df,
        gt_unob_esti_df,
        struct_esti_df,
        option,
    )


# Select the "reach" split as the data set to analyse.
strat4_coc_data_o, start4_remaining = (
    strat4_coc_data_o_reach,
    start4_remaining_reach,
)
analyze_node_strat4(
    target_node=target_node,
    cov_data=strat4_coc_data_o,
    remain_data=start4_remaining,
    observable_nodes=observable_nodes,
    graph=graph,
)
