In [1]:
from dataclasses import dataclass, field
import math
import os
from typing import Dict, List

import pandas as pd

# E2E Data Frames

## Load E2E Dataframe

In [2]:
e2e_pareto_path = "csv/e2e-pareto-dataframes"

# Read the seven per-dataset Pareto CSVs in one pass; the targets on the left
# are listed in the same order as the file names on the right.
(
    pareto_cic_iomt_2024_df,
    pareto_cic_iot_2023_df,
    pareto_iscxvpn2016_df,
    pareto_ucsbfinetuning_df,
    pareto_cic_iot_2023_32_df,
    pareto_cic_ids_2017_df,
    pareto_cic_ids_2018_df,
) = (
    pd.read_csv(os.path.join(e2e_pareto_path, pareto_csv_name))
    for pareto_csv_name in (
        "pareto_cic_iomt_2024_df.csv",
        "pareto_cic_iot_2023_df.csv",
        "pareto_iscxvpn2016_df.csv",
        "pareto_ucsbfinetuning_df.csv",
        "pareto_cic_iot_2023_32_df.csv",
        "pareto_cic_ids_2017_df.csv",
        "pareto_cic_ids_2018_df.csv",
    )
)

# Motivation

## Features Table

In [3]:
# LaTeX template for one row of the features table: a bold row label followed by
# one "<dataset>_fpl" cell per dataset. Raw strings keep the LaTeX backslashes
# literal, so no invalid escape sequences (e.g. "\%", "\D", "\m") are parsed.
reference_feature_row = r"{{\bf {row_name}}} & {cic_iomt_2024_fpl} & {cic_iot_2023_fpl} & {iscxvpn2016_fpl} & {ucsbfinetuning_fpl} & {cic_iot_2023_32_fpl} & {cic_ids_2017_fpl} & {cic_ids_2018_fpl} \\"

# column order of the table
datasets_order = [
    "cic_iomt_2024",
    "cic_iot_2023",
    "iscxvpn2016",
    "ucsbfinetuning",
    "cic_iot_2023_32",
    "cic_ids_2017",
    "cic_ids_2018"
]

# value printed in every dataset column of the corresponding row
values = {
    "features_per_layer": 18,
    "features_per_subtree": 4,
    "d_features_per_partition": 4,
    "d_features_per_subtree": 1.4
}

# LaTeX label of each row, keyed like `values`
feature_row_names = {
    "features_per_layer": r"Features\% / Layer",
    "features_per_subtree": r"Features\% / Sub-Tree",
    "d_features_per_partition": r"$\Delta$Features\% / Partition",
    "d_features_per_subtree": r"$\Delta$Features\% / Sub-Tree"
}

# emit one LaTeX row per feature metric
for this_feature_name, this_feature_row in feature_row_names.items():

    # the same value is repeated for every dataset column of this row
    stage_data = {
        f"{dataset_name}_fpl": values[this_feature_name]
        for dataset_name in datasets_order
    }
    stage_data["row_name"] = this_feature_row
    print(reference_feature_row.format(**stage_data))

    # a rule separates the two absolute rows from the two delta rows
    if this_feature_name == "features_per_subtree":
        print(r"\midrule")

{\bf Features\% / Layer} & 18 & 18 & 18 & 18 & 18 & 18 & 18 \\
{\bf Features\% / Sub-Tree} & 4 & 4 & 4 & 4 & 4 & 4 & 4 \\
\midrule
{\bf $\Delta$Features\% / Partition} & 4 & 4 & 4 & 4 & 4 & 4 & 4 \\
{\bf $\Delta$Features\% / Sub-Tree} & 1.4 & 1.4 & 1.4 & 1.4 & 1.4 & 1.4 & 1.4 \\


## Resubmission Bandwidth Table

In [4]:
def get_resubmitted_traffic(num_partitions, mean_ttd, std_ttd, total_flows, verbose=False,
                            resubmitted_bits_per_flow=128):
    """Estimate the resubmission bandwidth for a partitioned model.

    The total resubmitted volume is ``total_flows * resubmitted_bits_per_flow``
    bits, expressed in Mbit (1 Mbit = 1024 * 1024 bits here). That volume is
    divided by a time window of ``ttd / num_partitions`` to obtain a rate.

    Args:
        num_partitions: number of partitions; exactly 1 means nothing is
            resubmitted.
        mean_ttd: mean TTD in milliseconds (converted to seconds internally).
        std_ttd: standard deviation of the TTD in milliseconds.
        total_flows: number of flows.
        verbose: when True, print the intermediate volume and rates.
        resubmitted_bits_per_flow: bits resubmitted per flow (default 128).

    Returns:
        ``(rate_mean, rate_std)`` in Mbit/s. ``rate_std`` is the same rate
        formula evaluated with ``std_ttd`` in place of ``mean_ttd``; it is not
        a propagated standard deviation of the rate. Returns ``(0, 0)`` when
        ``num_partitions == 1``.

    Raises:
        ZeroDivisionError: if ``mean_ttd``, ``std_ttd`` or ``num_partitions``
            is zero.
    """
    if num_partitions == 1:
        return 0, 0

    total_Mbits = (total_flows * resubmitted_bits_per_flow) / 1024 / 1024

    def _mbps_for(ttd_ms):
        # ms -> s, then split the window evenly across the partitions
        window_time = (ttd_ms / 1e3) / num_partitions
        return total_Mbits / window_time

    total_traffic_mean = _mbps_for(mean_ttd)
    total_traffic_std = _mbps_for(std_ttd)

    if verbose:
        print(f"Total MBits (Mean): {round(total_Mbits, 2)}")
        print(f"Total Traffic Mbps (Mean): {round(total_traffic_mean, 2)}")
        print(f"Total Traffic Mbps (Std): {round(total_traffic_std, 2)}")

    return total_traffic_mean, total_traffic_std

In [5]:
# Mean TTD per environment, in milliseconds (ws = Webserver, hd = Hadoop).
ws_mean_ttd = 24605.21
hd_mean_ttd = 12010.67

# Standard deviation of the TTD per environment, in milliseconds.
ws_std_ttd = 43385.34
hd_std_ttd = 30168.41

In [6]:
# get_resubmitted_traffic(2.33, ws_mean_ttd, ws_std_ttd, 533000, verbose=True)

In [7]:
# they correspond to 100K, 500K, 1M flows
SELECTED_FLOWS = [98304, 491520, 983040]
FLOW_STRINGS = {
    98304: "100K",
    491520: "500K",
    983040: "1M"
}

# LaTeX templates. Raw strings keep the LaTeX backslashes literal, so sequences
# such as "\m", "\s", "\," and "\p" are not parsed as (invalid) Python escapes.
# ENVIRONMENT_NAME renders e.g. \multirow{3}{*}{\sf Webserver}
ENVIRONMENT_NAME = r"\multirow{{3}}{{*}}{{\sf {environment_name}}}"
# NOTE(review): the last column separates mean and std with "~" while the first
# two use "\,"; kept as-is so the generated table is unchanged.
DATA_ROW = r" & {num_flows} & {cic_iomt_2024_mean}\,$\pm$\,{cic_iomt_2024_std} & {cic_iot_2023_mean}\,$\pm$\,{cic_iot_2023_std} & {iscxvpn2016_mean}~$\pm$~{iscxvpn2016_std} \\"

# make a dictionary of dataset indices to their pareto dataframes
# (not read by the table loop in this cell)
pareto_dataframes = {
    1: pareto_cic_iomt_2024_df,
    2: pareto_cic_iot_2023_df,
    3: pareto_iscxvpn2016_df,
    4: pareto_ucsbfinetuning_df,
    5: pareto_cic_iot_2023_32_df,
    6: pareto_cic_ids_2017_df,
    7: pareto_cic_ids_2018_df,
}

# environment -> (mean TTD in ms, std dev of TTD in ms)
environments_dictionary = {
    "Webserver": (24605.21, 43385.34),
    "Hadoop": (12010.67, 30168.41)
}

# flow-count label -> dataset -> number of partitions passed to get_resubmitted_traffic
datasets_partitions_and_flows = {
    "100K": {
        "cic_iomt_2024": 3,
        "cic_iot_2023": 2,
        "iscxvpn2016": 2,
    },
    "500K": {
        "cic_iomt_2024": 3,
        "cic_iot_2023": 3,
        "iscxvpn2016": 4,
    },
    "1M": {
        "cic_iomt_2024": 1,
        "cic_iot_2023": 2,
        "iscxvpn2016": 1,
    }
}

# columns of this table (a subset of the datasets)
datasets_order = [
    "cic_iomt_2024",
    "cic_iot_2023",
    "iscxvpn2016",
]

# rows within each environment, in the order of SELECTED_FLOWS
flows_order = list(SELECTED_FLOWS)

# dataset -> environment -> unrounded rates collected over the flow counts;
# rebuilt from scratch so re-running the cell does not accumulate duplicates
dataset_resubmitted_traffic = {
    dataset_name: {
        env_name: {"means": [], "stds": []}
        for env_name in environments_dictionary
    }
    for dataset_name in datasets_order
}

# one group of rows per environment, one row per flow count
for env_name, (env_ttd_ms_mean, env_ttd_ms_std) in environments_dictionary.items():

    for row_index, num_flows in enumerate(flows_order):
        num_flows_str = FLOW_STRINGS[num_flows]

        # only the first row of an environment carries the \multirow label
        table_row = (ENVIRONMENT_NAME if row_index == 0 else "") + DATA_ROW

        table_row_tuple = {
            "environment_name": env_name,
            "num_flows": num_flows_str,
        }

        for dataset_name in datasets_order:
            partitions = datasets_partitions_and_flows[num_flows_str][dataset_name]
            env_mbps_mean, env_mbps_std = get_resubmitted_traffic(
                partitions, env_ttd_ms_mean, env_ttd_ms_std, num_flows
            )

            # the table shows values rounded to 2 decimals ...
            table_row_tuple[f"{dataset_name}_mean"] = round(env_mbps_mean, 2)
            table_row_tuple[f"{dataset_name}_std"] = round(env_mbps_std, 2)

            # ... while the unrounded values are kept for later aggregation
            dataset_resubmitted_traffic[dataset_name][env_name]["means"].append(env_mbps_mean)
            dataset_resubmitted_traffic[dataset_name][env_name]["stds"].append(env_mbps_std)

        print(table_row.format(**table_row_tuple))

    print(r"\midrule")

\multirow{3}{*}{\sf Webserver} & 100K & 1.46\,$\pm$\,0.83 & 0.98\,$\pm$\,0.55 & 0.98~$\pm$~0.55 \\
 & 500K & 7.32\,$\pm$\,4.15 & 7.32\,$\pm$\,4.15 & 9.75~$\pm$~5.53 \\
 & 1M & 0\,$\pm$\,0 & 9.75\,$\pm$\,5.53 & 0~$\pm$~0 \\
\midrule
\multirow{3}{*}{\sf Hadoop} & 100K & 3.0\,$\pm$\,1.19 & 2.0\,$\pm$\,0.8 & 2.0~$\pm$~0.8 \\
 & 500K & 14.99\,$\pm$\,5.97 & 14.99\,$\pm$\,5.97 & 19.98~$\pm$~7.96 \\
 & 1M & 0\,$\pm$\,0 & 19.98\,$\pm$\,7.96 & 0~$\pm$~0 \\
\midrule


In [8]:
# Summarize, per dataset and environment, the rates collected over all flow counts.
for dataset_name, environments in dataset_resubmitted_traffic.items():
    for environment, data in environments.items():
        rates, spreads = data["means"], data["stds"]

        # arithmetic mean of the collected rates
        mean_of_means = sum(rates) / len(rates)

        # the spreads are pooled as a root-mean-square: sqrt(mean of squared stds)
        mean_of_stds = math.sqrt(sum(spread ** 2 for spread in spreads) / len(spreads))

        print(dataset_name, environment, round(mean_of_means, 2), round(mean_of_stds, 2))

cic_iomt_2024 Webserver 2.93 2.44
cic_iomt_2024 Hadoop 5.99 3.51
cic_iot_2023 Webserver 6.01 4.01
cic_iot_2023 Hadoop 12.32 5.76
iscxvpn2016 Webserver 3.58 3.21
iscxvpn2016 Hadoop 7.33 4.62


# Time to Accuracy

In [9]:
# Line offsets into hypermapper_log.log. get_tta adds each of them to the index
# of a line containing "Communication protocol: receiving message....".
DOE_TIME_LINE = 205  # first marker only: offset of the line holding the DoE (random sampling) time
TOTAL_BLACK_BOX_FUNCTION_TIME_LINE = 18  # offset of the line holding the iteration's black-box function time
TOTAL_ITERATION_TIME_LINE = 19  # offset of the line holding the iteration's total time
FIRST_ITERATION_RESULT = 2  # offset of the first comma-separated result row of an iteration
LAST_ITERATION_RESULT = 17  # offset of the last result row (inclusive)

@dataclass
class TTA:
    """Time-to-accuracy (TTA) summary of one optimization run, as built by get_tta.

    All time fields are in seconds. Every derived quantity is safe to compute on
    an instance with no recorded iterations (the default state): averages are
    reported as 0.0 instead of raising ZeroDivisionError.
    """
    dataset: str = ""
    # maps an integer key to a fetch time in seconds
    partition_fetch_time: Dict[int, float] = field(default_factory=dict)
    doe_time: float = 0.0
    total_black_box_function_time: float = 0.0
    total_iteration_time: float = 0.0
    total_iterations: int = 0
    best_f1_score: float = 0.0
    best_f1_score_flows: float = 0.0
    # iteration at which best_f1_score was first reached
    best_f1_score_iteration: int = 0
    accuracies: List[float] = field(default_factory=list)

    def _per_iteration(self, total):
        """Return `total` averaged over the iterations, rounded to 2 decimals (0.0 if none)."""
        if not self.total_iterations:
            return 0.0
        return round(total / self.total_iterations, 2)

    def black_box_time(self):
        """Return the share of the TTA (including DoE) attributed to the black-box function.

        The share is the ratio of total black-box time to total iteration time,
        applied to ``tta(doe=True)`` and rounded to whole seconds. Returns 0 when
        no iteration time was recorded.

        Raises:
            AssertionError: if the black-box time is not strictly smaller than
                the total iteration time.
        """
        if not self.total_iteration_time:
            return 0
        # get ratio of black box function time to total iteration time
        ratio = self.total_black_box_function_time / self.total_iteration_time
        assert ratio < 1.0
        # this ratio of tta with doe is the time spent in the black box function
        return round(ratio * self.tta(doe=True))

    def tta(self, doe=True):
        """Return the time to reach the best F1 score, in whole seconds.

        Computed as ``best_f1_score_iteration`` times the average iteration
        time (rounded to 2 decimals), plus ``doe_time`` when ``doe`` is True.
        """
        average_iter_time = self._per_iteration(self.total_iteration_time)
        tta_without_doe = self.best_f1_score_iteration * average_iter_time
        return round(self.doe_time + tta_without_doe) if doe else round(tta_without_doe)

    def __str__(self):
        return f"\n\tBest F1 Score: {self.best_f1_score} (Flows: {self.best_f1_score_flows})\n" \
            + f"\tDoE: {round(self.doe_time)} (sec)\n" \
            + f"\tBest Iteration: {self.best_f1_score_iteration}\n" \
            + f"\tAverage Iteration Time: {self._per_iteration(self.total_iteration_time)} (sec)\n" \
            + f"\tTotal Black Box Function Time: {self.black_box_time()} (sec)\n" \
            + f"\tTTA (w/o DoE): {self.tta(doe=False)} (sec)\n" \
            + f"\tTTA (with DoE): {self.tta()} (sec)\n"

    def to_dict(self):
        """Return the summary as a flat dict (one row of the breakdown table).

        The optimizer time is the average iteration time minus the average
        black-box time.
        """
        average_iteration_time = self._per_iteration(self.total_iteration_time)
        average_black_box_time = self._per_iteration(self.total_black_box_function_time)
        average_optimizer_time = round(average_iteration_time - average_black_box_time, 2)
        return {
            "dataset": self.dataset,
            "best_f1_score": self.best_f1_score,
            "best_f1_score_iteration": self.best_f1_score_iteration,
            "average_iteration_time": average_iteration_time,
            "average_black_box_time": average_black_box_time,
            "average_optimizer_time": average_optimizer_time,
            # NOTE: despite the key name this is the MAXIMUM fetch time (seconds);
            # the key is kept because it becomes a column of the saved CSVs.
            "average_fetch_time": round(max(self.partition_fetch_time.values(), default=0.0), 2),
            "tta": self.tta()
        }

In [10]:
def get_tta(hypermapper_log, partition_fetch_timing, dataset, verbose=False):
    """Parse a HyperMapper log file into a TTA summary.

    The log is scanned for lines containing
    "Communication protocol: receiving message....". The first such line is used
    only to read the DoE (random sampling) time, found DOE_TIME_LINE lines below
    it. Every later marker is treated as one iteration whose black-box time,
    total time and result rows sit at fixed offsets below the marker
    (TOTAL_BLACK_BOX_FUNCTION_TIME_LINE, TOTAL_ITERATION_TIME_LINE,
    FIRST_ITERATION_RESULT..LAST_ITERATION_RESULT).

    Args:
        hypermapper_log: path of the log file to read.
        partition_fetch_timing: stored unchanged as TTA.partition_fetch_time.
        dataset: stored unchanged as TTA.dataset.
        verbose: when True, print each iteration's time line and every parsed
            (F1 score, number of flows) pair.

    Returns:
        A TTA holding the DoE time, the summed black-box and iteration times,
        the iteration count, the best F1 score (with its flow count and the
        iteration at which it was found) and the best-so-far F1 curve. The
        curve starts with 0 and is padded with its last value up to 500 entries.

    NOTE(review): the DoE lookup for the first marker is not bounds-checked, so
    a log shorter than marker + DOE_TIME_LINE lines would raise IndexError.
    NOTE(review): the scan starts at index 1; lines[0] is never inspected.
    """
    # True until the first marker line (the DoE phase) has been consumed
    first = True
    with open(hypermapper_log, 'r') as f:
        # read all lines
        lines = f.readlines()

    random_sampling_time = 0
    total_black_box_function_time = 0
    total_iteration_time = 0
    total_iterations = 0
    max_f1_score = 0.0
    max_f1_score_flows = 0
    max_f1_score_iteration = 0
    # best-so-far F1 curve; the leading 0 is the value before any iteration
    accuracies = [0]

    total_lines = len(lines)
    curr_line = 0

    while True:
        # stay within bounds for total lines
        curr_line += 1
        if curr_line >= total_lines:
            break
        line = lines[curr_line]

        # marker line: every offset below is relative to this line
        if "Communication protocol: receiving message...." in line:
            # if this is the first marker, save random sampling (DoE) time
            if first:
                first = False
                # pick the DoE time line that follows the first marker
                doe_time_line = lines[curr_line+DOE_TIME_LINE].strip()
                # the time is the 6th whitespace-separated token of that line
                random_sampling_time = round(float(doe_time_line.split()[5]), 2)
                continue

            # stop at a trailing iteration whose time line is missing from the log
            if curr_line+TOTAL_ITERATION_TIME_LINE >= total_lines:
                break

            # pick the black box function time line from this iteration (5th token)
            black_box_function_time_line = lines[curr_line+TOTAL_BLACK_BOX_FUNCTION_TIME_LINE].strip()
            black_box_function_time = round(float(black_box_function_time_line.split()[4]), 2)

            # add to total black box function time
            total_black_box_function_time += black_box_function_time

            # pick the iteration time line from this iteration (4th token)
            iteration_time_line = lines[curr_line+TOTAL_ITERATION_TIME_LINE].strip()
            iteration_time = round(float(iteration_time_line.split()[3]), 2)

            # add to total search time
            total_iteration_time += iteration_time
            total_iterations += 1
            if verbose:
                print(iteration_time_line)
                print(iteration_time)

            # pick the best F1 score from this iteration's result rows
            for k in range(FIRST_ITERATION_RESULT, LAST_ITERATION_RESULT+1):
                # the last two comma-separated fields are negated in the log:
                # second-to-last is the F1 score, last is the number of flows
                this_config_perf = lines[curr_line+k].strip().split(',')
                f1_score = round(-1.0 * float(this_config_perf[-2]), 2)
                num_flows = int(-1.0 * float(this_config_perf[-1]))

                # if this is the best F1 score so far (and the row has a
                # non-zero flow count), save it and the iteration it was found at
                if f1_score > max_f1_score and num_flows:
                    max_f1_score = f1_score
                    max_f1_score_flows = num_flows
                    max_f1_score_iteration = total_iterations

                if verbose:
                    print(f"F1 score = {f1_score}, Number of Flows = {num_flows}")

            # append best accuracy up to this point into accuracies list
            accuracies.append(max_f1_score)

            # skip past the lines of this iteration that were just consumed
            curr_line += TOTAL_ITERATION_TIME_LINE

    # accuracies should be 500 long, so extend last value to 500
    # (a list that is already 500 or longer is left unchanged)
    accuracies.extend([accuracies[-1]] * (500 - len(accuracies)))

    return TTA(
        dataset,
        partition_fetch_timing,
        round(random_sampling_time, 2), # seconds
        round(total_black_box_function_time, 2), # seconds
        round(total_iteration_time, 2), # seconds
        total_iterations, 
        max_f1_score, 
        max_f1_score_flows,
        max_f1_score_iteration,
        accuracies
    )

In [11]:
# Fetch times per dataset, all in seconds, keyed by an integer 1-7
# (presumably the number of partitions -- TODO confirm).
# Each mapping is passed to get_tta as `partition_fetch_timing`; TTA.to_dict()
# reports the maximum value of the mapping.
cic_iomt_2024_partition_fetch_seconds = {1: 0.19, 2: 0.32, 3: 0.44, 4: 0.58, 5: 0.66, 6: 0.57, 7: 0.9}
cic_iot_2023_partition_fetch_seconds = {1: 0.06, 2: 0.11, 3: 0.16, 4: 0.21, 5: 0.22, 6: 0.27, 7: 0.32}
iscxvpn2016_partition_fetch_seconds = {1: 0.0, 2: 0.0, 3: 0.01, 4: 0.01, 5: 0.01, 6: 0.01, 7: 0.01}
ucsbfinetuning_partition_fetch_seconds = {1: 0.02, 2: 0.03, 3: 0.04, 4: 0.05, 5: 0.05, 6: 0.06, 7: 0.07}
cic_iot_2023_32_partition_fetch_seconds = {1: 0.21, 2: 0.32, 3: 0.43, 4: 0.57, 5: 0.67, 6: 0.8, 7: 0.91}
cic_ids_2017_partition_fetch_seconds = {1: 0.05, 2: 0.08, 3: 0.12, 4: 0.16, 5: 0.18, 6: 0.21, 7: 0.24}
cic_ids_2018_partition_fetch_seconds = {1: 0.03, 2: 0.06, 3: 0.09, 4: 0.12, 5: 0.13, 6: 0.15, 7: 0.18}

In [12]:
# Parse one HyperMapper log per dataset (D1..D7) into a TTA summary.
cic_iomt_2024_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-CIC-IoMT-2024-PCAPS1-f10-2024-12-08-01:49:23/hypermapper_log.log",
    partition_fetch_timing=cic_iomt_2024_partition_fetch_seconds,
    dataset="D1",
)
cic_iot_2023_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-CIC-IOT-2023-PCAPS1-f10-2024-12-08-01:49:49/hypermapper_log.log",
    partition_fetch_timing=cic_iot_2023_partition_fetch_seconds,
    dataset="D2",
)
iscxvpn2016_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-ISCXVPN2016-PCAPS0-f10-2024-12-07-05:03:19/hypermapper_log.log",
    partition_fetch_timing=iscxvpn2016_partition_fetch_seconds,
    dataset="D3",
)
ucsbfinetuning_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-UCSBFinetuning-PCAPS0-f10-2024-11-06-11:35:22/hypermapper_log.log",
    partition_fetch_timing=ucsbfinetuning_partition_fetch_seconds,
    dataset="D4",
)
cic_iot_2023_32_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-CIC-IOT-2023-32-PCAPS1-f10-2024-12-08-01:49:36/hypermapper_log.log",
    partition_fetch_timing=cic_iot_2023_32_partition_fetch_seconds,
    dataset="D5",
)
cic_ids_2017_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-CIC-IDS-2017-PCAPS1-f10-2024-12-01-21:18:00/hypermapper_log.log",
    partition_fetch_timing=cic_ids_2017_partition_fetch_seconds,
    dataset="D6",
)
cic_ids_2018_tta = get_tta(
    hypermapper_log="../sample-results/hypermapper-bayesian_optimization-CIC-IDS-2018-PCAPS1-f10-2024-12-08-03:08:46/hypermapper_log.log",
    partition_fetch_timing=cic_ids_2018_partition_fetch_seconds,
    dataset="D7",
)

# Print the human-readable summary of every run, labelled by variable name.
for tta_label, tta_summary in (
    ("cic_iomt_2024_tta: ", cic_iomt_2024_tta),
    ("cic_iot_2023_tta: ", cic_iot_2023_tta),
    ("iscxvpn2016_tta: ", iscxvpn2016_tta),
    ("ucsbfinetuning_tta: ", ucsbfinetuning_tta),
    ("cic_iot_2023_32_tta: ", cic_iot_2023_32_tta),
    ("cic_ids_2017_tta: ", cic_ids_2017_tta),
    ("cic_ids_2018_tta: ", cic_ids_2018_tta),
):
    print(tta_label, tta_summary)

cic_iomt_2024_tta:  
	Best F1 Score: 0.58 (Flows: 393216)
	DoE: 5786 (sec)
	Best Iteration: 70
	Average Iteration Time: 589.97 (sec)
	Total Black Box Function Time: 44393 (sec)
	TTA (w/o DoE): 41298 (sec)
	TTA (with DoE): 47084 (sec)

cic_iot_2023_tta:  
	Best F1 Score: 0.86 (Flows: 196608)
	DoE: 2708 (sec)
	Best Iteration: 4
	Average Iteration Time: 273.56 (sec)
	Total Black Box Function Time: 3171 (sec)
	TTA (w/o DoE): 1094 (sec)
	TTA (with DoE): 3802 (sec)

iscxvpn2016_tta:  
	Best F1 Score: 0.82 (Flows: 294912)
	DoE: 109 (sec)
	Best Iteration: 35
	Average Iteration Time: 54.34 (sec)
	Total Black Box Function Time: 402 (sec)
	TTA (w/o DoE): 1902 (sec)
	TTA (with DoE): 2011 (sec)

ucsbfinetuning_tta:  
	Best F1 Score: 0.68 (Flows: 196608)
	DoE: 1173 (sec)
	Best Iteration: 3
	Average Iteration Time: 128.06 (sec)
	Total Black Box Function Time: 1033 (sec)
	TTA (w/o DoE): 384 (sec)
	TTA (with DoE): 1557 (sec)

cic_iot_2023_32_tta:  
	Best F1 Score: 0.44 (Flows: 98304)
	DoE: 8297 (sec)
	

In [13]:
def get_tta_series(tta_object, name, limit=500, normalize=False):
    """Turn the accuracy curve of a TTA object into a long-format dataframe.

    Args:
        tta_object: object exposing `accuracies` (a sequence of scores) and,
            when `normalize` is True, `best_f1_score`.
        name: label written into the "dataset" column of every row.
        limit: keep only the first `limit` entries of the curve.
        normalize: when True, divide each score by `best_f1_score` and round
            the result to 2 decimals.

    Returns:
        A dataframe with the columns "dataset", "iteration" (1-based position
        in the curve) and "accuracy", in that order.
    """
    curve = pd.DataFrame(tta_object.accuracies, columns=["accuracy"]).head(limit)

    if normalize:
        # express each score as a fraction of the best score of the run
        curve = curve.assign(accuracy=round(curve["accuracy"] / tta_object.best_f1_score, 2))

    # iteration numbers are the 0-based row positions shifted to start at 1
    curve = curve.assign(iteration=curve.index + 1, dataset=name)
    return curve[["dataset", "iteration", "accuracy"]]

# Build the first-160-iteration accuracy curve of every dataset (not normalized).
(
    cic_iomt_2024_tta_accuracies,
    cic_iot_2023_tta_accuracies,
    iscxvpn2016_tta_accuracies,
    ucsbfinetuning_tta_accuracies,
    cic_iot_2023_32_tta_accuracies,
    cic_ids_2017_tta_accuracies,
    cic_ids_2018_tta_accuracies,
) = (
    get_tta_series(tta_result, dataset_label, limit=160, normalize=False)
    for tta_result, dataset_label in (
        (cic_iomt_2024_tta, "D1"),
        (cic_iot_2023_tta, "D2"),
        (iscxvpn2016_tta, "D3"),
        (ucsbfinetuning_tta, "D4"),
        (cic_iot_2023_32_tta, "D5"),
        (cic_ids_2017_tta, "D6"),
        (cic_ids_2018_tta, "D7"),
    )
)

# Stack the per-dataset curves; each keeps its own 0-based row index.
tta_accuracies_df = pd.concat(
    [
        cic_iomt_2024_tta_accuracies,
        cic_iot_2023_tta_accuracies,
        iscxvpn2016_tta_accuracies,
        ucsbfinetuning_tta_accuracies,
        cic_iot_2023_32_tta_accuracies,
        cic_ids_2017_tta_accuracies,
        cic_ids_2018_tta_accuracies,
    ]
)

tta_accuracies_df

Unnamed: 0,dataset,iteration,accuracy
0,D1,1,0.00
1,D1,2,0.51
2,D1,3,0.51
3,D1,4,0.51
4,D1,5,0.51
...,...,...,...
155,D7,156,1.00
156,D7,157,1.00
157,D7,158,1.00
158,D7,159,1.00


## TTA Breakdown Table

In [14]:
# One row per dataset, built from each run's to_dict() summary and reduced to
# the columns needed for the breakdown: dataset label, average black-box time,
# average optimizer time, fetch time and TTA.
tta_breakdown_df = pd.DataFrame(
    [
        tta_result.to_dict()
        for tta_result in (
            cic_iomt_2024_tta,
            cic_iot_2023_tta,
            iscxvpn2016_tta,
            ucsbfinetuning_tta,
            cic_iot_2023_32_tta,
            cic_ids_2017_tta,
            cic_ids_2018_tta,
        )
    ]
)[
    [
        "dataset",
        "average_black_box_time",
        "average_optimizer_time",
        "average_fetch_time",
        "tta",
    ]
]

## Save TTA Accuracies And Breakdown

In [15]:
# Output directory for the TTA CSVs. exist_ok=True creates it (and any missing
# parents) without a separate existence check, so re-running the cell is safe.
tta_path = "csv/tta-dataframes"
os.makedirs(tta_path, exist_ok=True)

# combined TTA dataframes, each saved under the name of its variable
tta_accuracies_df.to_csv(os.path.join(tta_path, "tta_accuracies_df.csv"), index=False)
tta_breakdown_df.to_csv(os.path.join(tta_path, "tta_breakdown_df.csv"), index=False)

In [16]:
# Measured timings. The *_us list is in microseconds and the *_s lists are in
# seconds; only the *_s lists are read below.
tofino_model_timings_us = [44.345855712890625, 47.44529724121094, 47.206878662109375, 42.438507080078125, 45.678932189941406, 46.8924560546875, 43.12384796142578]
tofino_model_timings_s = [0.000044345855712890625, 0.00004744529724121094, 0.000047206878662109375, 0.000042438507080078125, 0.000045678932189941406, 0.0000468924560546875, 0.00004312384796142578]
tcam_rulegen_timings_s = [0.7146813869476318, 0.9074795246124268, 0.986879825592041, 1.0741093158721924, 0.7132437229156494, 0.8042738437652588, 0.9074795246124268, 0.9961817264556885, 1.0827314853668213]

# start from each run's summary dictionary
tta_dictionary = {
    "cic_iomt_2024_tta": cic_iomt_2024_tta.to_dict(),
    "cic_iot_2023_tta": cic_iot_2023_tta.to_dict(),
    "iscxvpn2016_tta": iscxvpn2016_tta.to_dict(),
    "ucsbfinetuning_tta": ucsbfinetuning_tta.to_dict(),
    "cic_iot_2023_32_tta": cic_iot_2023_32_tta.to_dict(),
    "cic_ids_2017_tta": cic_ids_2017_tta.to_dict(),
    "cic_ids_2018_tta": cic_ids_2018_tta.to_dict()
}

# Attach two extra timings to every entry:
#   average_tofino_model_time -- seconds, taken unrounded from tofino_model_timings_s
#   average_tcam_rulegen_time -- seconds, taken from tcam_rulegen_timings_s, rounded to 2 decimals
# Each row is (entry key, index into tofino_model_timings_s, index into tcam_rulegen_timings_s).
for tta_entry_key, tofino_index, tcam_index in (
    ("cic_iomt_2024_tta", 3, 5),
    ("cic_iot_2023_tta", 4, 2),
    ("iscxvpn2016_tta", 6, 1),
    ("ucsbfinetuning_tta", 3, 8),
    ("cic_iot_2023_32_tta", 5, 4),
    ("cic_ids_2017_tta", 0, 0),
    ("cic_ids_2018_tta", 1, 6),
):
    tta_entry = tta_dictionary[tta_entry_key]
    tta_entry["average_tofino_model_time"] = tofino_model_timings_s[tofino_index]
    tta_entry["average_tcam_rulegen_time"] = round(tcam_rulegen_timings_s[tcam_index], 2)

tta_dictionary

{'cic_iomt_2024_tta': {'dataset': 'D1',
  'best_f1_score': 0.58,
  'best_f1_score_iteration': 70,
  'average_iteration_time': 589.97,
  'average_black_box_time': 556.25,
  'average_optimizer_time': 33.72,
  'average_fetch_time': 0.9,
  'tta': 47084,
  'average_tofino_model_time': 4.2438507080078125e-05,
  'average_tcam_rulegen_time': 0.8},
 'cic_iot_2023_tta': {'dataset': 'D2',
  'best_f1_score': 0.86,
  'best_f1_score_iteration': 4,
  'average_iteration_time': 273.56,
  'average_black_box_time': 228.16,
  'average_optimizer_time': 45.4,
  'average_fetch_time': 0.32,
  'tta': 3802,
  'average_tofino_model_time': 4.567893218994141e-05,
  'average_tcam_rulegen_time': 0.99},
 'iscxvpn2016_tta': {'dataset': 'D3',
  'best_f1_score': 0.82,
  'best_f1_score_iteration': 35,
  'average_iteration_time': 54.34,
  'average_black_box_time': 10.87,
  'average_optimizer_time': 43.47,
  'average_fetch_time': 0.01,
  'tta': 2011,
  'average_tofino_model_time': 4.312384796142578e-05,
  'average_tcam_rul

In [17]:
tta_path = "csv/tta-dataframes"

# Wrap every per-dataset summary dictionary into a single-row dataframe.
(
    cic_iomt_2024_tta_dict_df,
    cic_iot_2023_tta_dict_df,
    iscxvpn2016_tta_dict_df,
    ucsbfinetuning_tta_dict_df,
    cic_iot_2023_32_tta_dict_df,
    cic_ids_2017_tta_dict_df,
    cic_ids_2018_tta_dict_df,
) = (
    pd.DataFrame([tta_dictionary[tta_entry_key]])
    for tta_entry_key in (
        "cic_iomt_2024_tta",
        "cic_iot_2023_tta",
        "iscxvpn2016_tta",
        "ucsbfinetuning_tta",
        "cic_iot_2023_32_tta",
        "cic_ids_2017_tta",
        "cic_ids_2018_tta",
    )
)

# Write each frame to a CSV named after its variable (no index column).
for tta_csv_name, tta_dict_frame in (
    ("cic_iomt_2024_tta_dict_df.csv", cic_iomt_2024_tta_dict_df),
    ("cic_iot_2023_tta_dict_df.csv", cic_iot_2023_tta_dict_df),
    ("iscxvpn2016_tta_dict_df.csv", iscxvpn2016_tta_dict_df),
    ("ucsbfinetuning_tta_dict_df.csv", ucsbfinetuning_tta_dict_df),
    ("cic_iot_2023_32_tta_dict_df.csv", cic_iot_2023_32_tta_dict_df),
    ("cic_ids_2017_tta_dict_df.csv", cic_ids_2017_tta_dict_df),
    ("cic_ids_2018_tta_dict_df.csv", cic_ids_2018_tta_dict_df),
):
    tta_dict_frame.to_csv(os.path.join(tta_path, tta_csv_name), index=False)

# Resource Tables

In [18]:
# directory holding the end-to-end inference timing CSV
inference_time_path = "csv/e2e-inference-timing"

# Columns read by get_inference_time below: model, dataset, num_flows, inference_time_us.
inference_times_df = pd.read_csv(os.path.join(inference_time_path, "inference_time.csv"))
# display the loaded frame as the cell output
inference_times_df

Unnamed: 0,model,dataset,num_flows,inference_time_us
0,netbeacon,D1,100K,67190614.41
1,netbeacon,D1,500K,67174017.26
2,netbeacon,D1,1M,67082343.09
3,netbeacon,D2,100K,29893832.56
4,netbeacon,D2,500K,29964133.95
...,...,...,...,...
58,cap,D6,500K,32854096.66
59,cap,D6,1M,33840055.59
60,cap,D7,100K,6826751.55
61,cap,D7,500K,6971679.53


In [19]:
def get_inference_time(this_df, model, dataset, num_flows):
    """Look up the inference time for one (model, dataset, num_flows) combination.

    Args:
        this_df: dataframe with 'model', 'dataset', 'num_flows' and
            'inference_time_us' columns.
        model: value to match in the 'model' column.
        dataset: value to match in the 'dataset' column.
        num_flows: value to match in the 'num_flows' column.

    Returns:
        The 'inference_time_us' value of the first matching row, or None when
        no row matches all three values.
    """
    is_match = (
        (this_df['model'] == model)
        & (this_df['dataset'] == dataset)
        & (this_df['num_flows'] == num_flows)
    )
    matching_times = this_df.loc[is_match, 'inference_time_us']

    # nothing matched: signal it with None instead of raising
    if matching_times.empty:
        return None

    # several rows may match; only the first one is reported
    return matching_times.iloc[0]

# quick sanity check: look up one (model, dataset, flow count) entry
get_inference_time(inference_times_df, model="netbeacon", dataset="D1", num_flows="100K")

67190614.41

## Resource Table (Format 1)

In [20]:
# they correspond to 100K, 500K, 1M flows
SELECTED_FLOWS = [98304, 491520, 983040]
FLOW_STRINGS = {
    98304: "100K",
    491520: "500K",
    983040: "1M"
}

# LaTeX row templates, filled in with str.format (hence the doubled braces).
# Every literal backslash is spelled "\\": sequences such as "\m", "\s" and "\c"
# are not valid Python escapes and raise a SyntaxWarning on recent interpreters.
# "\t" is intentionally left as a real tab; it only indents the emitted rows.
DATASET_INDEX = "\\multirow{{9}}{{*}}{{\\sf D{index}}} "
NETBEACON_ROW = "& NetBeacon & {netbeacon_depth} & {netbeacon_partitions} & {netbeacon_features} & {netbeacon_flows} & {netbeacon_f1_score} & {netbeacon_tcam} & {netbeacon_memory} & {netbeacon_latency} \\\\"
LEO_ROW = "\t\t\t\t& Leo       & {leo_depth} & {leo_partitions} & {leo_features} & {leo_flows} & {leo_f1_score} & {leo_tcam} & {leo_memory} & {leo_latency} \\\\"
CAP_ROW = "\t\t\t\t& \\cellcolor{{customblue}}CAP & \\cellcolor{{customblue}}{cap_depth} & \\cellcolor{{customblue}}{cap_partitions} & \\cellcolor{{customblue}}{cap_features} & \\cellcolor{{customblue}}{cap_flows} & \\cellcolor{{customblue}}{cap_f1_score} & \\cellcolor{{customblue}}{cap_tcam} & \\cellcolor{{customblue}}{cap_memory} & \\cellcolor{{customblue}}{cap_latency} \\\\"

# template to use for each model; placeholders are prefixed with the model name
ROW_TEMPLATES = {
    'netbeacon': NETBEACON_ROW,
    'leo': LEO_ROW,
    'cap': CAP_ROW,
}

# make a dictionary of dataset indices to their pareto dataframes
pareto_dataframes = {
    1: pareto_cic_iomt_2024_df,
    2: pareto_cic_iot_2023_df,
    3: pareto_iscxvpn2016_df,
    4: pareto_ucsbfinetuning_df,
    5: pareto_cic_iot_2023_32_df,
    6: pareto_cic_ids_2017_df,
    7: pareto_cic_ids_2018_df,
}

# loop over the pareto dataframes and print the results
for index, this_dataset_pareto_df in pareto_dataframes.items():
    # only the very first row of a dataset carries the \multirow label
    first = True

    # loop over the selected flows
    for num_flows in SELECTED_FLOWS:
        flow_label = FLOW_STRINGS[num_flows]

        # loop over the model types
        for this_model in ['netbeacon', 'leo', 'cap']:

            # get the row corresponding to this model and num_flows
            this_row = this_dataset_pareto_df[
                (this_dataset_pareto_df['name'] == this_model) &
                (this_dataset_pareto_df['num_flows'] == num_flows)
            ]

            # read the values from the row
            # (.values[0] raises IndexError when no row matches)
            f1_score = round(this_row['f1_score'].values[0], 2)
            max_depth = this_row['max_depth'].values[0]
            num_partitions = this_row['num_partitions'].values[0]
            total_features = this_row['total_features'].values[0]
            tcam_entries = int(this_row['tcam_entries'].values[0])
            memory_perc = this_row['memory_perc'].values[0]

            # get the latency for this model / dataset / flow count;
            # the CSV stores microseconds, the table shows seconds
            inf_latency_us = get_inference_time(
                inference_times_df, this_model, f"D{index}", flow_label
            )
            inf_latency_s = f"{inf_latency_us/1e6:.2f}"

            data_row = ""

            # the NetBeacon row opens each flow-count group: it is prefixed with
            # the dataset label the first time and with plain indentation after
            if this_model == 'netbeacon':
                if first:
                    data_row += DATASET_INDEX.format(index=index)
                    first = False
                else:
                    data_row += "\t\t\t\t"

            # the baselines are always reported with a single partition;
            # only CAP shows the partition count read from the dataframe
            shown_partitions = num_partitions if this_model == 'cap' else 1

            data_row += ROW_TEMPLATES[this_model].format(**{
                f"{this_model}_depth": max_depth,
                f"{this_model}_partitions": shown_partitions,
                f"{this_model}_features": total_features,
                f"{this_model}_flows": flow_label,
                f"{this_model}_f1_score": f1_score,
                f"{this_model}_tcam": tcam_entries,
                f"{this_model}_memory": memory_perc,
                f"{this_model}_latency": inf_latency_s,
            })

            # the CAP row closes a flow-count group: separate it from the next
            # group with a partial rule, except after the last group
            if this_model == 'cap' and num_flows != SELECTED_FLOWS[-1]:
                data_row += " \\cline{2-10}"

            print(data_row)

    print("\n\\midrule\n")

\multirow{9}{*}{\sf D1} & NetBeacon & 13 & 1 & 6 & 100K & 0.44 & 3041 & 30.0 & 67.19 \\
				& Leo       & 11 & 1 & 5 & 100K & 0.51 & 16384 & 25.0 & 100.25 \\
				& \cellcolor{customblue}CAP & \cellcolor{customblue}27 & \cellcolor{customblue}3 & \cellcolor{customblue}41 & \cellcolor{customblue}100K & \cellcolor{customblue}0.58 & \cellcolor{customblue}3802 & \cellcolor{customblue}30.0 & \cellcolor{customblue}81.08 \\ \cline{2-10}
				& NetBeacon & 15 & 1 & 3 & 500K & 0.43 & 4071 & 37.5 & 67.17 \\
				& Leo       & 5 & 1 & 4 & 500K & 0.44 & 2048 & 60.0 & 100.25 \\
				& \cellcolor{customblue}CAP & \cellcolor{customblue}28 & \cellcolor{customblue}3 & \cellcolor{customblue}39 & \cellcolor{customblue}500K & \cellcolor{customblue}0.56 & \cellcolor{customblue}3867 & \cellcolor{customblue}25.0 & \cellcolor{customblue}81.09 \\ \cline{2-10}
				& NetBeacon & 3 & 1 & 2 & 1M & 0.18 & 86 & 50.0 & 67.08 \\
				& Leo       & 3 & 1 & 2 & 1M & 0.23 & 2048 & 50.0 & 100.25 \\
				& \cellcolor{customblue}C

## Resource Table (Format 2)

In [21]:
# they correspond to 100K, 500K, 1M flows
SELECTED_FLOWS = [98304, 491520, 983040]
FLOW_STRINGS = {
    98304: "100K",
    491520: "500K",
    983040: "1M"
}

# make a dictionary of dataset indices to their pareto dataframes
pareto_dataframes = {
    1: pareto_cic_iomt_2024_df,
    2: pareto_cic_iot_2023_df,
    3: pareto_iscxvpn2016_df,
    4: pareto_ucsbfinetuning_df,
    5: pareto_cic_iot_2023_32_df,
    6: pareto_cic_ids_2017_df,
    7: pareto_cic_ids_2018_df,
}

# LaTeX row templates: one table row per property, one column per
# (model, flow count) pair. Literal backslashes are spelled "\\" because "\m",
# "\s" and "\c" are not valid Python escapes (SyntaxWarning on recent
# interpreters); "\t" is a real tab used to indent the emitted rows.
DEPTH_ROW = "\\multirow{{7}}{{*}}{{\\sf D{index}}} "
DEPTH_ROW += "& Depth & {netbeacon_100K_max_depth} & {leo_100K_max_depth} & \\cellcolor{{customblue}}{cap_100K_max_depth} & {netbeacon_500K_max_depth} & {leo_500K_max_depth} & \\cellcolor{{customblue}}{cap_500K_max_depth} & {netbeacon_1M_max_depth} & {leo_1M_max_depth} & \\cellcolor{{customblue}}{cap_1M_max_depth} \\\\"
PARTITIONS_ROW = "\t\t\t\t& Partitions & {netbeacon_100K_num_partitions} & {leo_100K_num_partitions} & \\cellcolor{{customblue}}{cap_100K_num_partitions} & {netbeacon_500K_num_partitions} & {leo_500K_num_partitions} & \\cellcolor{{customblue}}{cap_500K_num_partitions} & {netbeacon_1M_num_partitions} & {leo_1M_num_partitions} & \\cellcolor{{customblue}}{cap_1M_num_partitions} \\\\"
FEATURES_ROW = "\t\t\t\t& Features & {netbeacon_100K_total_features} & {leo_100K_total_features} & \\cellcolor{{customblue}}{cap_100K_total_features} & {netbeacon_500K_total_features} & {leo_500K_total_features} & \\cellcolor{{customblue}}{cap_500K_total_features} & {netbeacon_1M_total_features} & {leo_1M_total_features} & \\cellcolor{{customblue}}{cap_1M_total_features} \\\\"
F1_SCORE_ROW = "\t\t\t\t& F1 Score & {netbeacon_100K_f1_score} & {leo_100K_f1_score} & \\cellcolor{{customblue}}{cap_100K_f1_score} & {netbeacon_500K_f1_score} & {leo_500K_f1_score} & \\cellcolor{{customblue}}{cap_500K_f1_score} & {netbeacon_1M_f1_score} & {leo_1M_f1_score} & \\cellcolor{{customblue}}{cap_1M_f1_score} \\\\"
# NOTE(review): the label says microseconds, but the values filled in below are
# inference_time_us / 1e6, i.e. seconds -- confirm which unit the table should show.
LATENCY_ROW = "\t\t\t\t& Latency ($\\mu$s) & {netbeacon_100K_latency} & {leo_100K_latency} & \\cellcolor{{customblue}}{cap_100K_latency} & {netbeacon_500K_latency} & {leo_500K_latency} & \\cellcolor{{customblue}}{cap_500K_latency} & {netbeacon_1M_latency} & {leo_1M_latency} & \\cellcolor{{customblue}}{cap_1M_latency} \\\\"
TCAM_ENTRIES_ROW = "\t\t\t\t& TCAM Entries & {netbeacon_100K_tcam_entries} & {leo_100K_tcam_entries} & \\cellcolor{{customblue}}{cap_100K_tcam_entries} & {netbeacon_500K_tcam_entries} & {leo_500K_tcam_entries} & \\cellcolor{{customblue}}{cap_500K_tcam_entries} & {netbeacon_1M_tcam_entries} & {leo_1M_tcam_entries} & \\cellcolor{{customblue}}{cap_1M_tcam_entries} \\\\"
MEMORY_ROW = "\t\t\t\t& Memory \\% & {netbeacon_100K_memory_perc} & {leo_100K_memory_perc} & \\cellcolor{{customblue}}{cap_100K_memory_perc} & {netbeacon_500K_memory_perc} & {leo_500K_memory_perc} & \\cellcolor{{customblue}}{cap_500K_memory_perc} & {netbeacon_1M_memory_perc} & {leo_1M_memory_perc} & \\cellcolor{{customblue}}{cap_1M_memory_perc} \\\\"

# loop over the pareto dataframes and print the results
for index, this_dataset_pareto_df in pareto_dataframes.items():
    # templates to fill in for this dataset, in printing order
    # (strings are immutable, so the templates can be referenced directly)
    dataset_rows = {
        "max_depth": DEPTH_ROW,
        "num_partitions": PARTITIONS_ROW,
        "total_features": FEATURES_ROW,
        "f1_score": F1_SCORE_ROW,
        "latency": LATENCY_ROW,
        "tcam_entries": TCAM_ENTRIES_ROW,
        "memory_perc": MEMORY_ROW
    }

    # placeholder name -> value, collected over all models and flow counts
    dataset_values = {"index": index}

    # loop over the selected flows
    for num_flows in SELECTED_FLOWS:
        num_flow_str = FLOW_STRINGS[num_flows]

        # loop over the model types
        for this_model in ['netbeacon', 'leo', 'cap']:

            # get the row corresponding to this model and num_flows
            this_row = this_dataset_pareto_df[
                (this_dataset_pareto_df['name'] == this_model) &
                (this_dataset_pareto_df['num_flows'] == num_flows)
            ]

            # read the values from the row
            # (.values[0] raises IndexError when no row matches)
            max_depth = this_row['max_depth'].values[0]
            # a partition count below 1 is displayed as 1
            num_partitions = max(this_row['num_partitions'].values[0], 1)
            total_features = this_row['total_features'].values[0]
            f1_score = round(this_row['f1_score'].values[0], 2)
            tcam_entries = int(this_row['tcam_entries'].values[0])
            memory_perc = this_row['memory_perc'].values[0]

            # get the latency for this model / dataset / flow count;
            # the CSV stores microseconds, converted here to seconds
            inf_latency_us = get_inference_time(
                inference_times_df, this_model, f"D{index}", num_flow_str
            )
            inf_latency_s = f"{inf_latency_us/1e6:.2f}"

            # fill in the values for these columns
            column_prefix = f"{this_model}_{num_flow_str}"
            dataset_values[f"{column_prefix}_max_depth"] = max_depth
            dataset_values[f"{column_prefix}_num_partitions"] = num_partitions
            dataset_values[f"{column_prefix}_total_features"] = total_features
            dataset_values[f"{column_prefix}_f1_score"] = f1_score
            dataset_values[f"{column_prefix}_latency"] = inf_latency_s
            dataset_values[f"{column_prefix}_num_flows"] = num_flow_str
            dataset_values[f"{column_prefix}_tcam_entries"] = tcam_entries
            dataset_values[f"{column_prefix}_memory_perc"] = memory_perc

    # now format all data rows for this dataset
    for this_property, property_string in dataset_rows.items():
        dataset_rows[this_property] = property_string.format(**dataset_values)
        print(dataset_rows[this_property])
    print("\\midrule")

\multirow{7}{*}{\sf D1} & Depth & 13 & 11 & \cellcolor{customblue}27 & 15 & 5 & \cellcolor{customblue}28 & 3 & 3 & \cellcolor{customblue}13 \\
				& Partitions & 1 & 1 & \cellcolor{customblue}3 & 1 & 1 & \cellcolor{customblue}3 & 1 & 1 & \cellcolor{customblue}1 \\
				& Features & 6 & 5 & \cellcolor{customblue}41 & 3 & 4 & \cellcolor{customblue}39 & 2 & 2 & \cellcolor{customblue}2 \\
				& F1 Score & 0.44 & 0.51 & \cellcolor{customblue}0.58 & 0.43 & 0.44 & \cellcolor{customblue}0.56 & 0.18 & 0.23 & \cellcolor{customblue}0.46 \\
				& Latency ($\mu$s) & 67.19 & 100.25 & \cellcolor{customblue}81.08 & 67.17 & 100.25 & \cellcolor{customblue}81.09 & 67.08 & 100.25 & \cellcolor{customblue}100.25 \\
				& TCAM Entries & 3041 & 16384 & \cellcolor{customblue}3802 & 4071 & 2048 & \cellcolor{customblue}3867 & 86 & 2048 & \cellcolor{customblue}4874 \\
				& Memory \% & 30.0 & 25.0 & \cellcolor{customblue}30.0 & 37.5 & 60.0 & \cellcolor{customblue}25.0 & 50.0 & 50.0 & \cellcolor{customblue}50.0 \\


## Resource Table (Format 3)

In [22]:
# they correspond to 100K, 500K, 1M flows
SELECTED_FLOWS = [98304, 491520, 983040]
FLOW_STRINGS = {
    98304: "100K",
    491520: "500K",
    983040: "1M"
}

# make a dictionary of dataset indices to their pareto dataframes
pareto_dataframes = {
    1: pareto_cic_iomt_2024_df,
    2: pareto_cic_iot_2023_df,
    3: pareto_iscxvpn2016_df,
    4: pareto_ucsbfinetuning_df,
    5: pareto_cic_iot_2023_32_df,
    6: pareto_cic_ids_2017_df,
    7: pareto_cic_ids_2018_df,
}

# LaTeX row templates: three rows per dataset, each cell combining several
# properties of one (model, flow count) pair. Literal backslashes are spelled
# "\\" because "\m", "\s" and "\c" are not valid Python escapes (SyntaxWarning
# on recent interpreters); "\t" is a real tab used to indent the emitted rows.
DEPTH_PARTITIONS_FEATURES_ROW = "\\multirow{{3}}{{*}}{{\\sf D{index}}} "
DEPTH_PARTITIONS_FEATURES_ROW += "& Depth - Partitions - Features "
DEPTH_PARTITIONS_FEATURES_ROW += "& {netbeacon_100K_max_depth} - {netbeacon_100K_num_partitions} - {netbeacon_100K_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& {leo_100K_max_depth} - {leo_100K_num_partitions} - {leo_100K_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& \\cellcolor{{customblue}}{cap_100K_max_depth} - \\cellcolor{{customblue}}{cap_100K_num_partitions} - \\cellcolor{{customblue}}{cap_100K_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& {netbeacon_500K_max_depth} - {netbeacon_500K_num_partitions} - {netbeacon_500K_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& {leo_500K_max_depth} - {leo_500K_num_partitions} - {leo_500K_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& \\cellcolor{{customblue}}{cap_500K_max_depth} - \\cellcolor{{customblue}}{cap_500K_num_partitions} - \\cellcolor{{customblue}}{cap_500K_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& {netbeacon_1M_max_depth} - {netbeacon_1M_num_partitions} - {netbeacon_1M_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& {leo_1M_max_depth} - {leo_1M_num_partitions} - {leo_1M_total_features} "
DEPTH_PARTITIONS_FEATURES_ROW += "& \\cellcolor{{customblue}}{cap_1M_max_depth} - \\cellcolor{{customblue}}{cap_1M_num_partitions} - \\cellcolor{{customblue}}{cap_1M_total_features} \\\\"

# NOTE(review): the label says microseconds, but the values filled in below are
# inference_time_us / 1e6, i.e. seconds -- confirm which unit the table should show.
F1_SCORE_LATENCY_ROW = "\t\t\t\t& F1 Score - Latency ($\\mu$s) "
F1_SCORE_LATENCY_ROW += "& {netbeacon_100K_f1_score} - {netbeacon_100K_latency} "
F1_SCORE_LATENCY_ROW += "& {leo_100K_f1_score} - {leo_100K_latency} "
F1_SCORE_LATENCY_ROW += "& \\cellcolor{{customblue}}{cap_100K_f1_score} - \\cellcolor{{customblue}}{cap_100K_latency} "
F1_SCORE_LATENCY_ROW += "& {netbeacon_500K_f1_score} - {netbeacon_500K_latency} "
F1_SCORE_LATENCY_ROW += "& {leo_500K_f1_score} - {leo_500K_latency} "
F1_SCORE_LATENCY_ROW += "& \\cellcolor{{customblue}}{cap_500K_f1_score} - \\cellcolor{{customblue}}{cap_500K_latency} "
F1_SCORE_LATENCY_ROW += "& {netbeacon_1M_f1_score} - {netbeacon_1M_latency} "
F1_SCORE_LATENCY_ROW += "& {leo_1M_f1_score} - {leo_1M_latency} "
F1_SCORE_LATENCY_ROW += "& \\cellcolor{{customblue}}{cap_1M_f1_score} - \\cellcolor{{customblue}}{cap_1M_latency} \\\\"

TCAM_ENTRIES_MEMORY_ROW = "\t\t\t\t& TCAM Entries - Memory \\% "
TCAM_ENTRIES_MEMORY_ROW += "& {netbeacon_100K_tcam_entries} - {netbeacon_100K_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& {leo_100K_tcam_entries} - {leo_100K_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& \\cellcolor{{customblue}}{cap_100K_tcam_entries} - \\cellcolor{{customblue}}{cap_100K_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& {netbeacon_500K_tcam_entries} - {netbeacon_500K_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& {leo_500K_tcam_entries} - {leo_500K_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& \\cellcolor{{customblue}}{cap_500K_tcam_entries} - \\cellcolor{{customblue}}{cap_500K_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& {netbeacon_1M_tcam_entries} - {netbeacon_1M_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& {leo_1M_tcam_entries} - {leo_1M_memory_perc} "
TCAM_ENTRIES_MEMORY_ROW += "& \\cellcolor{{customblue}}{cap_1M_tcam_entries} - \\cellcolor{{customblue}}{cap_1M_memory_perc} \\\\"

# loop over the pareto dataframes and print the results
for index, this_dataset_pareto_df in pareto_dataframes.items():
    # templates to fill in for this dataset, in printing order
    # (strings are immutable, so the templates can be referenced directly)
    dataset_rows = {
        "max_depth/num_partitions/total_features": DEPTH_PARTITIONS_FEATURES_ROW,
        "f1_score/latency": F1_SCORE_LATENCY_ROW,
        "tcam_entries/memory_perc": TCAM_ENTRIES_MEMORY_ROW
    }

    # placeholder name -> value, collected over all models and flow counts
    dataset_values = {"index": index}

    # loop over the selected flows
    for num_flows in SELECTED_FLOWS:
        num_flow_str = FLOW_STRINGS[num_flows]

        # loop over the model types
        for this_model in ['netbeacon', 'leo', 'cap']:

            # get the row corresponding to this model and num_flows
            this_row = this_dataset_pareto_df[
                (this_dataset_pareto_df['name'] == this_model) &
                (this_dataset_pareto_df['num_flows'] == num_flows)
            ]

            # read the values from the row
            # (.values[0] raises IndexError when no row matches)
            max_depth = this_row['max_depth'].values[0]
            # a partition count below 1 is displayed as 1
            num_partitions = max(this_row['num_partitions'].values[0], 1)
            total_features = this_row['total_features'].values[0]
            f1_score = round(this_row['f1_score'].values[0], 2)
            tcam_entries = int(this_row['tcam_entries'].values[0])
            memory_perc = this_row['memory_perc'].values[0]

            # get the latency for this model / dataset / flow count;
            # the CSV stores microseconds, converted here to seconds
            inf_latency_us = get_inference_time(
                inference_times_df, this_model, f"D{index}", num_flow_str
            )
            inf_latency_s = f"{inf_latency_us/1e6:.2f}"

            # fill in the values for these columns
            column_prefix = f"{this_model}_{num_flow_str}"
            dataset_values[f"{column_prefix}_max_depth"] = max_depth
            dataset_values[f"{column_prefix}_num_partitions"] = num_partitions
            dataset_values[f"{column_prefix}_total_features"] = total_features
            dataset_values[f"{column_prefix}_f1_score"] = f1_score
            dataset_values[f"{column_prefix}_latency"] = inf_latency_s
            dataset_values[f"{column_prefix}_num_flows"] = num_flow_str
            dataset_values[f"{column_prefix}_tcam_entries"] = tcam_entries
            dataset_values[f"{column_prefix}_memory_perc"] = memory_perc

    # now format all data rows for this dataset
    for this_property, property_string in dataset_rows.items():
        dataset_rows[this_property] = property_string.format(**dataset_values)
        print(dataset_rows[this_property])
    print("\\midrule")

\multirow{3}{*}{\sf D1} & Depth - Partitions - Features & 13 - 1 - 6 & 11 - 1 - 5 & \cellcolor{customblue}27 - \cellcolor{customblue}3 - \cellcolor{customblue}41 & 15 - 1 - 3 & 5 - 1 - 4 & \cellcolor{customblue}28 - \cellcolor{customblue}3 - \cellcolor{customblue}39 & 3 - 1 - 2 & 3 - 1 - 2 & \cellcolor{customblue}13 - \cellcolor{customblue}1 - \cellcolor{customblue}2 \\
				& F1 Score - Latency ($\mu$s) & 0.44 - 67.19 & 0.51 - 100.25 & \cellcolor{customblue}0.58 - \cellcolor{customblue}81.08 & 0.43 - 67.17 & 0.44 - 100.25 & \cellcolor{customblue}0.56 - \cellcolor{customblue}81.09 & 0.18 - 67.08 & 0.23 - 100.25 & \cellcolor{customblue}0.46 - \cellcolor{customblue}100.25 \\
				& TCAM Entries - Memory \% & 3041 - 30.0 & 16384 - 25.0 & \cellcolor{customblue}3802 - \cellcolor{customblue}30.0 & 4071 - 37.5 & 2048 - 60.0 & \cellcolor{customblue}3867 - \cellcolor{customblue}25.0 & 86 - 50.0 & 2048 - 50.0 & \cellcolor{customblue}4874 - \cellcolor{customblue}50.0 \\
\midrule
\multirow{3}{*}{\sf

## Resource Table (Format 4)

In [23]:
# they correspond to 100K, 500K, 1M flows
SELECTED_FLOWS = [98304, 491520, 983040]
FLOW_STRINGS = {
    98304: "100K",
    491520: "500K",
    983040: "1M"
}

# make a dictionary of dataset indices to their pareto dataframes
pareto_dataframes = {
    1: pareto_cic_iomt_2024_df,
    2: pareto_cic_iot_2023_df,
    3: pareto_iscxvpn2016_df,
    4: pareto_ucsbfinetuning_df,
    5: pareto_cic_iot_2023_32_df,
    6: pareto_cic_ids_2017_df,
    7: pareto_cic_ids_2018_df,
}

# LaTeX templates: one table row per flow count, each cell grouping several
# properties of one model. Literal backslashes are spelled "\\" because "\m",
# "\s" and "\c" are not valid Python escapes (SyntaxWarning on recent
# interpreters); "\t" is a real tab used to indent the emitted rows.
DATASET_INDEX = "\\multirow{{3}}{{*}}{{\\sf D{index}}} "
DATA_ROW = "& {num_flows_str} "
DATA_ROW += "& {netbeacon_max_depth}, {netbeacon_num_partitions}, {netbeacon_total_features} "
DATA_ROW += "& {leo_max_depth}, {leo_num_partitions}, {leo_total_features} "
DATA_ROW += "& \\cellcolor{{customblue}}{{{cap_max_depth}, {cap_num_partitions}, {cap_total_features}}} "

DATA_ROW += "& {netbeacon_f1_score}, {netbeacon_latency} "
DATA_ROW += "& {leo_f1_score}, {leo_latency} "
DATA_ROW += "& \\cellcolor{{customblue}}{{\\textbf{{{cap_f1_score}}}, {cap_latency}}} "

DATA_ROW += "& {netbeacon_tcam_entries}, {netbeacon_memory_perc} "
DATA_ROW += "& {leo_tcam_entries}, {leo_memory_perc} "
DATA_ROW += "& \\cellcolor{{customblue}}{{{cap_tcam_entries}, {cap_memory_perc}}} \\\\"

# loop over the pareto dataframes and print the results
for index, this_dataset_pareto_df in pareto_dataframes.items():
    # one template per flow count; only the first row carries the \multirow
    # dataset label, the other two are just indented
    dataset_rows = {
        "100K": DATASET_INDEX + DATA_ROW,
        "500K": "\t\t\t" + DATA_ROW,
        "1M": "\t\t\t" + DATA_ROW
    }

    # loop over the selected flows
    for num_flows in SELECTED_FLOWS:
        # resolved up front so the row key below does not depend on a variable
        # leaking out of the model loop
        num_flow_str = FLOW_STRINGS[num_flows]

        # placeholder name -> value for the row at this flow count
        dataset_values_at_N_flows = {
            "index": index,
            "num_flows_str": num_flow_str,
        }

        # loop over the model types
        for this_model in ['netbeacon', 'leo', 'cap']:

            # get the row corresponding to this model and num_flows
            this_row = this_dataset_pareto_df[
                (this_dataset_pareto_df['name'] == this_model) &
                (this_dataset_pareto_df['num_flows'] == num_flows)
            ]

            # read the values from the row
            # (.values[0] raises IndexError when no row matches)
            max_depth = this_row['max_depth'].values[0]
            # a partition count below 1 is displayed as 1
            num_partitions = max(this_row['num_partitions'].values[0], 1)
            total_features = this_row['total_features'].values[0]
            f1_score = round(this_row['f1_score'].values[0], 2)
            tcam_entries = int(this_row['tcam_entries'].values[0])
            memory_perc = this_row['memory_perc'].values[0]

            # get the latency for this model / dataset / flow count;
            # the CSV stores microseconds, converted here to seconds
            inf_latency_us = get_inference_time(
                inference_times_df, this_model, f"D{index}", num_flow_str
            )
            inf_latency_s = f"{inf_latency_us/1e6:.2f}"

            # fill in the values for these columns
            dataset_values_at_N_flows[f"{this_model}_max_depth"] = max_depth
            dataset_values_at_N_flows[f"{this_model}_num_partitions"] = num_partitions
            dataset_values_at_N_flows[f"{this_model}_total_features"] = total_features
            dataset_values_at_N_flows[f"{this_model}_f1_score"] = f1_score
            dataset_values_at_N_flows[f"{this_model}_latency"] = inf_latency_s
            dataset_values_at_N_flows[f"{this_model}_num_flows"] = num_flow_str
            dataset_values_at_N_flows[f"{this_model}_tcam_entries"] = tcam_entries
            # int() truncates toward zero, so e.g. 37.5 is shown as 37
            dataset_values_at_N_flows[f"{this_model}_memory_perc"] = int(memory_perc)

        # now format the data row for this dataset at these many flows
        dataset_rows[num_flow_str] = dataset_rows[num_flow_str].format(**dataset_values_at_N_flows)
        print(dataset_rows[num_flow_str])

    print("\\midrule")

\multirow{3}{*}{\sf D1} & 100K & 13, 1, 6 & 11, 1, 5 & \cellcolor{customblue}{27, 3, 41} & 0.44, 67.19 & 0.51, 100.25 & \cellcolor{customblue}{\textbf{0.58}, 81.08} & 3041, 30 & 16384, 25 & \cellcolor{customblue}{3802, 30} \\
			& 500K & 15, 1, 3 & 5, 1, 4 & \cellcolor{customblue}{28, 3, 39} & 0.43, 67.17 & 0.44, 100.25 & \cellcolor{customblue}{\textbf{0.56}, 81.09} & 4071, 37 & 2048, 60 & \cellcolor{customblue}{3867, 25} \\
			& 1M & 3, 1, 2 & 3, 1, 2 & \cellcolor{customblue}{13, 1, 2} & 0.18, 67.08 & 0.23, 100.25 & \cellcolor{customblue}{\textbf{0.46}, 100.25} & 86, 50 & 2048, 50 & \cellcolor{customblue}{4874, 50} \\
\midrule
\multirow{3}{*}{\sf D2} & 100K & 13, 1, 6 & 10, 1, 6 & \cellcolor{customblue}{28, 2, 43} & 0.8, 29.89 & 0.79, 47.23 & \cellcolor{customblue}{\textbf{0.86}, 39.40} & 3740, 30 & 8192, 30 & \cellcolor{customblue}{10633, 20} \\
			& 500K & 15, 1, 2 & 6, 1, 4 & \cellcolor{customblue}{24, 3, 40} & 0.8, 29.96 & 0.74, 47.23 & \cellcolor{customblue}{\textbf{0.83}, 38.24}

## Resource Table (Format 5)

In [24]:
# they correspond to 100K, 500K, 1M flows
SELECTED_FLOWS = [98304, 491520, 983040]
FLOW_STRINGS = {
    98304: "100K",
    491520: "500K",
    983040: "1M"
}

# make a dictionary of dataset indices to their pareto dataframes
pareto_dataframes = {
    1: pareto_cic_iomt_2024_df,
    2: pareto_cic_iot_2023_df,
    3: pareto_iscxvpn2016_df,
    4: pareto_ucsbfinetuning_df,
    5: pareto_cic_iot_2023_32_df,
    6: pareto_cic_ids_2017_df,
    7: pareto_cic_ids_2018_df,
}

# LaTeX templates: one table row per flow count, one column per
# (property, model) pair. Literal backslashes are spelled "\\" because "\m",
# "\s" and "\c" are not valid Python escapes (SyntaxWarning on recent
# interpreters); "\t" is a real tab used to indent the emitted rows.
DATASET_INDEX = "\\multirow{{3}}{{*}}{{\\sf D{index}}} "
DATA_ROW = "& {num_flows_str} "
DATA_ROW += "& {netbeacon_f1_score} & {leo_f1_score} & \\cellcolor{{customblue}}{cap_f1_score} "
DATA_ROW += "& {netbeacon_latency} & {leo_latency} & \\cellcolor{{customblue}}{cap_latency} "
DATA_ROW += "& {netbeacon_max_depth} & {leo_max_depth} & \\cellcolor{{customblue}}{{{cap_max_depth}~/~{cap_num_partitions}}} "
DATA_ROW += "& {netbeacon_total_features} & {leo_total_features} & \\cellcolor{{customblue}}{cap_total_features} "
DATA_ROW += "& {netbeacon_tcam_entries} & {leo_tcam_entries} & \\cellcolor{{customblue}}{cap_tcam_entries} "
DATA_ROW += "& {netbeacon_memory_perc} & {leo_memory_perc} & \\cellcolor{{customblue}}{cap_memory_perc} \\\\"

# (dataset index, flow label, partition count) of every CAP entry visited
collection = []

# loop over the pareto dataframes and print the results
for index, this_dataset_pareto_df in pareto_dataframes.items():
    # one template per flow count; only the first row carries the \multirow
    # dataset label, the other two are just indented
    dataset_rows = {
        "100K": DATASET_INDEX + DATA_ROW,
        "500K": "\t\t\t" + DATA_ROW,
        "1M": "\t\t\t" + DATA_ROW
    }

    # loop over the selected flows
    for num_flows in SELECTED_FLOWS:
        # resolved up front so the row key below does not depend on a variable
        # leaking out of the model loop
        num_flow_str = FLOW_STRINGS[num_flows]

        # placeholder name -> value for the row at this flow count
        dataset_values_at_N_flows = {
            "index": index,
            "num_flows_str": num_flow_str,
        }

        # loop over the model types
        for this_model in ['netbeacon', 'leo', 'cap']:

            # get the row corresponding to this model and num_flows
            this_row = this_dataset_pareto_df[
                (this_dataset_pareto_df['name'] == this_model) &
                (this_dataset_pareto_df['num_flows'] == num_flows)
            ]

            # read the values from the row
            # (.values[0] raises IndexError when no row matches)
            max_depth = this_row['max_depth'].values[0]
            # a partition count below 1 is displayed as 1
            num_partitions = max(this_row['num_partitions'].values[0], 1)
            feature_limit = this_row['feature_limit'].values[0]
            total_features = this_row['total_features'].values[0]
            f1_score = round(this_row['f1_score'].values[0], 2)
            tcam_entries = int(this_row['tcam_entries'].values[0])

            # get the latency for this model / dataset / flow count;
            # the CSV stores microseconds, converted here to seconds
            inf_latency_us = get_inference_time(
                inference_times_df, this_model, f"D{index}", num_flow_str
            )
            inf_latency_s = f"{inf_latency_us/1e6:.2f}"

            # register bits: 32 per feature, counting the per-partition feature
            # limit for CAP and all features for the baselines
            # NOTE(review): presumably one 32-bit register per feature -- confirm
            if this_model == 'cap':
                reg_bits = 32 * feature_limit
                collection.append((index, num_flow_str, num_partitions))
            else:
                reg_bits = 32 * total_features

            # fill in the values for these columns
            dataset_values_at_N_flows[f"{this_model}_max_depth"] = max_depth
            dataset_values_at_N_flows[f"{this_model}_num_partitions"] = num_partitions
            dataset_values_at_N_flows[f"{this_model}_total_features"] = total_features
            dataset_values_at_N_flows[f"{this_model}_f1_score"] = f"{f1_score:.2f}"
            dataset_values_at_N_flows[f"{this_model}_latency"] = inf_latency_s
            dataset_values_at_N_flows[f"{this_model}_num_flows"] = num_flow_str
            dataset_values_at_N_flows[f"{this_model}_tcam_entries"] = "{:,}".format(tcam_entries)
            # the "memory_perc" placeholder holds register bits in this format,
            # not the memory percentage read from the dataframe
            dataset_values_at_N_flows[f"{this_model}_memory_perc"] = reg_bits

        # now format the data row for this dataset at these many flows
        dataset_rows[num_flow_str] = dataset_rows[num_flow_str].format(**dataset_values_at_N_flows)
        print(dataset_rows[num_flow_str])

    print("\\midrule")

\multirow{3}{*}{\sf D1} & 100K & 0.44 & 0.51 & \cellcolor{customblue}0.58 & 67.19 & 100.25 & \cellcolor{customblue}81.08 & 13 & 11 & \cellcolor{customblue}{27~/~3} & 6 & 5 & \cellcolor{customblue}41 & 3,041 & 16,384 & \cellcolor{customblue}3,802 & 192 & 160 & \cellcolor{customblue}128 \\
			& 500K & 0.43 & 0.44 & \cellcolor{customblue}0.56 & 67.17 & 100.25 & \cellcolor{customblue}81.09 & 15 & 5 & \cellcolor{customblue}{28~/~3} & 3 & 4 & \cellcolor{customblue}39 & 4,071 & 2,048 & \cellcolor{customblue}3,867 & 96 & 128 & \cellcolor{customblue}64 \\
			& 1M & 0.18 & 0.23 & \cellcolor{customblue}0.46 & 67.08 & 100.25 & \cellcolor{customblue}100.25 & 3 & 3 & \cellcolor{customblue}{13~/~1} & 2 & 2 & \cellcolor{customblue}2 & 86 & 2,048 & \cellcolor{customblue}4,874 & 64 & 64 & \cellcolor{customblue}64 \\
\midrule
\multirow{3}{*}{\sf D2} & 100K & 0.80 & 0.79 & \cellcolor{customblue}0.86 & 29.89 & 47.23 & \cellcolor{customblue}39.40 & 13 & 10 & \cellcolor{customblue}{28~/~2} & 6 & 6 & \cellcol

## Resource Table (Format 6: No Latency)

In [25]:
# Flow counts used for the table rows and the rounded labels they are printed
# under: 96 Ki, 480 Ki and 960 Ki flows appear as 100K, 500K and 1M.
FLOW_STRINGS = {
    96 * 1024: "100K",
    480 * 1024: "500K",
    960 * 1024: "1M",
}
# the rows are emitted in the insertion order of the mapping above
SELECTED_FLOWS = list(FLOW_STRINGS)

# Dataset number (the N of the printed "DN" label) -> that dataset's pareto
# dataframe; numbering starts at 1 and follows the list order below.
pareto_dataframes = dict(
    enumerate(
        [
            pareto_cic_iomt_2024_df,
            pareto_cic_iot_2023_df,
            pareto_iscxvpn2016_df,
            pareto_ucsbfinetuning_df,
            pareto_cic_iot_2023_32_df,
            pareto_cic_ids_2017_df,
            pareto_cic_ids_2018_df,
        ],
        start=1,
    )
)

# LaTeX row templates for the resource table, filled in later with str.format.
# They are raw strings so that LaTeX commands (\multirow, \sf, \cellcolor) are
# not parsed as invalid Python escape sequences, which newer interpreters warn
# about. Doubled braces become literal braces in the formatted output.

# Leading cell of a dataset's first row: a \multirow label spanning three rows.
DATASET_INDEX = r"\multirow{{3}}{{*}}{{\sf D{index}}} "

# One table row: the flow-count label followed by a netbeacon / leo / cap
# triple per metric; the cap cell of every triple is wrapped in \cellcolor.
DATA_ROW = r"& {num_flows_str} "
DATA_ROW += r"& {netbeacon_f1_score} & {leo_f1_score} & \cellcolor{{customblue}}{cap_f1_score} "
DATA_ROW += r"& {netbeacon_max_depth} & {leo_max_depth} & \cellcolor{{customblue}}{{{cap_max_depth}~/~{cap_num_partitions}}} "
DATA_ROW += r"& {netbeacon_total_features} & {leo_total_features} & \cellcolor{{customblue}}{cap_total_features} "
DATA_ROW += r"& {netbeacon_tcam_entries} & {leo_tcam_entries} & \cellcolor{{customblue}}{cap_tcam_entries} "
# the row ends with two backslashes, the LaTeX end-of-row marker
DATA_ROW += r"& {netbeacon_memory_perc} & {leo_memory_perc} & \cellcolor{{customblue}}{cap_memory_perc} \\"

# Print the LaTeX body of the resource table (no latency columns): for each
# dataset, one row per selected flow count, followed by a \midrule separator.
for index, this_dataset_pareto_df in pareto_dataframes.items():
    # fresh row templates for this dataset; only the first row carries the
    # \multirow dataset label, the other two are tab-indented continuation rows
    dataset_rows = {
        "100K": DATASET_INDEX + DATA_ROW,
        "500K": "\t\t\t" + DATA_ROW,
        "1M": "\t\t\t" + DATA_ROW,
    }

    # loop over the selected flows
    for num_flows in SELECTED_FLOWS:
        # the row label depends only on the flow count, so resolve it here
        # instead of relying on a value left over from the model loop below
        num_flow_str = FLOW_STRINGS[num_flows]

        dataset_values_at_N_flows = {
            "index": index,
            "num_flows_str": num_flow_str,
        }

        # loop over the model types
        for this_model in ['netbeacon', 'leo', 'cap']:

            # get the row corresponding to this model and num_flows
            this_row = this_dataset_pareto_df[
                (this_dataset_pareto_df['name'] == this_model) &
                (this_dataset_pareto_df['num_flows'] == num_flows)
            ]
            if this_row.empty:
                # same exception type as the bare `.values[0]` lookup would
                # raise, but saying which combination is missing
                raise IndexError(
                    f"no pareto row for dataset D{index}, "
                    f"model {this_model!r}, num_flows={num_flows}"
                )

            # read the values from the row (the first match is used if the
            # selection returns more than one row)
            max_depth = this_row['max_depth'].values[0]
            # a partition count below 1 is reported as 1
            num_partitions = max(this_row['num_partitions'].values[0], 1)
            feature_limit = this_row['feature_limit'].values[0]
            total_features = this_row['total_features'].values[0]
            f1_score = round(this_row['f1_score'].values[0], 2)
            tcam_entries = int(this_row['tcam_entries'].values[0])

            # 32 bits per feature: cap is counted by its feature_limit, the
            # other two models by their total_features
            if this_model == 'cap':
                reg_bits = 32 * feature_limit
            else:
                reg_bits = 32 * total_features

            # fill in the values for these columns
            dataset_values_at_N_flows[f"{this_model}_max_depth"] = max_depth
            dataset_values_at_N_flows[f"{this_model}_num_partitions"] = num_partitions
            dataset_values_at_N_flows[f"{this_model}_total_features"] = total_features
            dataset_values_at_N_flows[f"{this_model}_f1_score"] = f"{f1_score:.2f}"
            dataset_values_at_N_flows[f"{this_model}_num_flows"] = num_flow_str
            dataset_values_at_N_flows[f"{this_model}_tcam_entries"] = "{:,}".format(tcam_entries)
            # the template slot is still called *_memory_perc, but it is filled
            # with reg_bits, not with the dataframe's memory_perc column
            dataset_values_at_N_flows[f"{this_model}_memory_perc"] = reg_bits

        # now format the data row for this dataset at these many flows
        dataset_rows[num_flow_str] = dataset_rows[num_flow_str].format(**dataset_values_at_N_flows)
        print(dataset_rows[num_flow_str])

    print(r"\midrule")

\multirow{3}{*}{\sf D1} & 100K & 0.44 & 0.51 & \cellcolor{customblue}0.58 & 13 & 11 & \cellcolor{customblue}{27~/~3} & 6 & 5 & \cellcolor{customblue}41 & 3,041 & 16,384 & \cellcolor{customblue}3,802 & 192 & 160 & \cellcolor{customblue}128 \\
			& 500K & 0.43 & 0.44 & \cellcolor{customblue}0.56 & 15 & 5 & \cellcolor{customblue}{28~/~3} & 3 & 4 & \cellcolor{customblue}39 & 4,071 & 2,048 & \cellcolor{customblue}3,867 & 96 & 128 & \cellcolor{customblue}64 \\
			& 1M & 0.18 & 0.23 & \cellcolor{customblue}0.46 & 3 & 3 & \cellcolor{customblue}{13~/~1} & 2 & 2 & \cellcolor{customblue}2 & 86 & 2,048 & \cellcolor{customblue}4,874 & 64 & 64 & \cellcolor{customblue}64 \\
\midrule
\multirow{3}{*}{\sf D2} & 100K & 0.80 & 0.79 & \cellcolor{customblue}0.86 & 13 & 10 & \cellcolor{customblue}{28~/~2} & 6 & 6 & \cellcolor{customblue}43 & 3,740 & 8,192 & \cellcolor{customblue}10,633 & 192 & 192 & \cellcolor{customblue}128 \\
			& 500K & 0.80 & 0.74 & \cellcolor{customblue}0.83 & 15 & 6 & \cellcolor{custom

# Time to Accuracy Breakdown Table

In [26]:
tta_path = "csv/tta-dataframes"

# One "<dataset>_tta_dict_df.csv" file per dataset, read in table-column order.
(
    cic_iomt_2024_tta_dict_df,
    cic_iot_2023_tta_dict_df,
    iscxvpn2016_tta_dict_df,
    ucsbfinetuning_tta_dict_df,
    cic_iot_2023_32_tta_dict_df,
    cic_ids_2017_tta_dict_df,
    cic_ids_2018_tta_dict_df,
) = (
    pd.read_csv(os.path.join(tta_path, f"{dataset}_tta_dict_df.csv"))
    for dataset in (
        "cic_iomt_2024",
        "cic_iot_2023",
        "iscxvpn2016",
        "ucsbfinetuning",
        "cic_iot_2023_32",
        "cic_ids_2017",
        "cic_ids_2018",
    )
)

# Keep only the first row of every frame, as a plain {column: value} dict,
# keyed by "<dataset>_tta".
tta_dictionary = {
    key: frame.iloc[0].to_dict()
    for key, frame in (
        ("cic_iomt_2024_tta", cic_iomt_2024_tta_dict_df),
        ("cic_iot_2023_tta", cic_iot_2023_tta_dict_df),
        ("iscxvpn2016_tta", iscxvpn2016_tta_dict_df),
        ("ucsbfinetuning_tta", ucsbfinetuning_tta_dict_df),
        ("cic_iot_2023_32_tta", cic_iot_2023_32_tta_dict_df),
        ("cic_ids_2017_tta", cic_ids_2017_tta_dict_df),
        ("cic_ids_2018_tta", cic_ids_2018_tta_dict_df),
    )
}

tta_dictionary

{'cic_iomt_2024_tta': {'dataset': 'D1',
  'best_f1_score': 0.58,
  'best_f1_score_iteration': 70,
  'average_iteration_time': 589.97,
  'average_black_box_time': 556.25,
  'average_optimizer_time': 33.72,
  'average_fetch_time': 0.9,
  'tta': 47084,
  'average_tofino_model_time': 4.243850708007813e-05,
  'average_tcam_rulegen_time': 0.8},
 'cic_iot_2023_tta': {'dataset': 'D2',
  'best_f1_score': 0.86,
  'best_f1_score_iteration': 4,
  'average_iteration_time': 273.56,
  'average_black_box_time': 228.16,
  'average_optimizer_time': 45.4,
  'average_fetch_time': 0.32,
  'tta': 3802,
  'average_tofino_model_time': 4.5678932189941414e-05,
  'average_tcam_rulegen_time': 0.99},
 'iscxvpn2016_tta': {'dataset': 'D3',
  'best_f1_score': 0.82,
  'best_f1_score_iteration': 35,
  'average_iteration_time': 54.34,
  'average_black_box_time': 10.87,
  'average_optimizer_time': 43.47,
  'average_fetch_time': 0.01,
  'tta': 2011,
  'average_tofino_model_time': 4.312384796142578e-05,
  'average_tcam_rul

In [27]:
# LaTeX row templates for the time-to-accuracy breakdown table, filled in with
# str.format: a stage label, one time cell per dataset, and the row average.
# Raw strings keep LaTeX commands such as \sf and \cellcolor from being parsed
# as (invalid) Python escape sequences; doubled braces are literal braces and
# every template ends with two backslashes, the LaTeX end-of-row marker.

# plain row, no background colour
breakdown_data_row = (
    r"{{\sf {stage}}}"
    r" & {cic_iomt_2024_stage_time}"
    r" & {cic_iot_2023_stage_time}"
    r" & {iscxvpn2016_stage_time}"
    r" & {ucsbfinetuning_stage_time}"
    r" & {cic_iot_2023_32_stage_time}"
    r" & {cic_ids_2017_stage_time}"
    r" & {cic_ids_2018_stage_time}"
    r" & {average_stage_time}"
    r" \\"
)

# same layout with every cell wrapped in \cellcolor{custompink}
training_data_row = (
    r"\cellcolor{{custompink}}{{\sf {stage}}}"
    r" & \cellcolor{{custompink}}{cic_iomt_2024_stage_time}"
    r" & \cellcolor{{custompink}}{cic_iot_2023_stage_time}"
    r" & \cellcolor{{custompink}}{iscxvpn2016_stage_time}"
    r" & \cellcolor{{custompink}}{ucsbfinetuning_stage_time}"
    r" & \cellcolor{{custompink}}{cic_iot_2023_32_stage_time}"
    r" & \cellcolor{{custompink}}{cic_ids_2017_stage_time}"
    r" & \cellcolor{{custompink}}{cic_ids_2018_stage_time}"
    r" & \cellcolor{{custompink}}{average_stage_time}"
    r" \\"
)

# same layout with every cell wrapped in \cellcolor{customblue}
total_data_row = (
    r"\cellcolor{{customblue}}{{\sf {stage}}}"
    r" & \cellcolor{{customblue}}{cic_iomt_2024_stage_time}"
    r" & \cellcolor{{customblue}}{cic_iot_2023_stage_time}"
    r" & \cellcolor{{customblue}}{iscxvpn2016_stage_time}"
    r" & \cellcolor{{customblue}}{ucsbfinetuning_stage_time}"
    r" & \cellcolor{{customblue}}{cic_iot_2023_32_stage_time}"
    r" & \cellcolor{{customblue}}{cic_ids_2017_stage_time}"
    r" & \cellcolor{{customblue}}{cic_ids_2018_stage_time}"
    r" & \cellcolor{{customblue}}{average_stage_time}"
    r" \\"
)

# Dataset names in the order their columns appear in the breakdown table.
datasets_order = """
    cic_iomt_2024
    cic_iot_2023
    iscxvpn2016
    ucsbfinetuning
    cic_iot_2023_32
    cic_ids_2017
    cic_ids_2018
""".split()

# Printed stage label -> key of that stage's time in each tta dictionary entry;
# the rows are emitted in this order.
stages_dictionary = dict(
    [
        ("Data Fetch", "average_fetch_time"),
        ("Training", "average_black_box_time"),
        ("Optimizer", "average_optimizer_time"),
        ("Rulegen", "average_tcam_rulegen_time"),
        ("Backend", "average_tofino_model_time"),
        ("Total Time", "average_iteration_time"),
    ]
)

# Print the LaTeX body of the time-to-accuracy breakdown table: one row per
# stage holding that stage's time for every dataset plus the row average.
for stage_name, column_name in stages_dictionary.items():

    # the Training and Total Time rows use the coloured templates
    if stage_name == "Training":
        data_row = training_data_row
    elif stage_name == "Total Time":
        data_row = total_data_row
    else:
        data_row = breakdown_data_row

    # The average is taken over the values as they are displayed (after
    # truncation / rounding), not over the raw measurements.
    row_total, row_count = 0, 0
    row_unit = "s"

    # loop through each dataset and fill this stage's time
    stage_data = {}
    for dataset_name in datasets_order:

        dataset_stage_time = tta_dictionary[f"{dataset_name}_tta"][column_name]
        row_count += 1

        if dataset_stage_time > 10:
            # above 10 the value is shown as whole seconds (fraction truncated)
            dataset_stage_time = int(dataset_stage_time)
            row_total += dataset_stage_time
            dataset_stage_time = f"{dataset_stage_time}s"
        else:
            # split the scientific notation "m.mme±xx" into exponent and mantissa
            sn_dataset_stage_time = "{:.2e}".format(dataset_stage_time)
            exponent = int(sn_dataset_stage_time.split("e")[1])
            mantissa = float(sn_dataset_stage_time.split("e")[0])

            if exponent == 0:
                # 1 <= value < 10: seconds with two decimals
                row_total += mantissa
                dataset_stage_time = f"{mantissa:.2f}s"
            elif exponent == -5:
                # mantissa * 1e-5 s == (mantissa * 10) microseconds
                # NOTE(review): only this exponent switches the row to
                # microseconds; any other sub-second exponent in the same row
                # is still added and printed in seconds -- confirm that every
                # dataset of such a stage lands in this branch.
                row_total += mantissa * 10
                row_unit = "$\\mu$s"
                dataset_stage_time = f"{int(mantissa * 10)}$\\mu$s"
            else:
                # every other magnitude is printed as the raw value in seconds
                row_total += dataset_stage_time
                dataset_stage_time = f"{dataset_stage_time}s"

        stage_data[f"{dataset_name}_stage_time"] = dataset_stage_time

    # this happens after all datasets have been processed
    stage_data["average_stage_time"] = f"{round(row_total / row_count, 2)}" + row_unit

    # now populate the stage name
    stage_data["stage"] = stage_name
    if stage_name == "Total Time":
        # rule separating the per-stage rows from the total
        print(r"\midrule")
    print(data_row.format(**stage_data))

print()

{\sf Data Fetch} & 0.9s & 0.32s & 0.01s & 0.07s & 0.91s & 0.24s & 0.18s & 0.38s \\
\cellcolor{custompink}{\sf Training} & \cellcolor{custompink}556s & \cellcolor{custompink}228s & \cellcolor{custompink}10s & \cellcolor{custompink}84s & \cellcolor{custompink}725s & \cellcolor{custompink}163s & \cellcolor{custompink}111s & \cellcolor{custompink}268.14s \\
{\sf Optimizer} & 33s & 45s & 43s & 43s & 30s & 32s & 37s & 37.57s \\
{\sf Rulegen} & 0.8s & 0.99s & 0.91s & 1.08s & 0.71s & 0.71s & 0.91s & 0.87s \\
{\sf Backend} & 42$\mu$s & 45$\mu$s & 43$\mu$s & 42$\mu$s & 46$\mu$s & 44$\mu$s & 47$\mu$s & 44.6$\mu$s \\
\midrule
\cellcolor{customblue}{\sf Total Time} & \cellcolor{customblue}589s & \cellcolor{customblue}273s & \cellcolor{customblue}54s & \cellcolor{customblue}128s & \cellcolor{customblue}756s & \cellcolor{customblue}196s & \cellcolor{customblue}148s & \cellcolor{customblue}306.29s \\



# Microbenchmarks

## Recirculation Bandwidth

In [28]:
# Rounded flow label -> actual number of flows (96 Ki, 480 Ki and 960 Ki).
actual_flows = {
    "100K": 96 * 1024,
    "500K": 480 * 1024,
    "1M": 960 * 1024,
}

# Flow label -> {"D1".."D7": partition count}. Each tuple below lists the
# counts for D1 through D7 in order.
cap_flows_and_partitions = {
    flow_label: {
        f"D{dataset_number}": partition_count
        for dataset_number, partition_count in enumerate(partition_counts, start=1)
    }
    for flow_label, partition_counts in (
        ("100K", (3, 2, 2, 1, 5, 5, 5)),
        ("500K", (3, 3, 4, 1, 1, 3, 6)),
        ("1M", (1, 2, 1, 1, 1, 2, 1)),
    )
}

# Environment name -> (mean, std) pair of its *_ttd values; the print loop
# emits one table cell per environment in this order.
datacenter_environments = dict(
    Webserver=(ws_mean_ttd, ws_std_ttd),
    Hadoop=(hd_mean_ttd, hd_std_ttd),
)

In [29]:
# Print one LaTeX row per flow label: the label followed, for every dataset
# and every datacenter environment, by a "mean$\pm$std" cell built from the
# pair returned by get_resubmitted_traffic.
for these_flows, these_partitions in cap_flows_and_partitions.items():
    row_cells = [f"{these_flows}"]

    for dataset, partitions in these_partitions.items():
        for env_name, (env_mean_ttd, env_std_ttd) in datacenter_environments.items():
            resubmit_bw_mean, resubmit_bw_std = get_resubmitted_traffic(
                partitions, env_mean_ttd, env_std_ttd, actual_flows[these_flows]
            )
            # "\\pm" is written with an escaped backslash: a bare "\p" is an
            # invalid escape sequence that newer Python versions warn about
            row_cells.append(
                f"{round(resubmit_bw_mean, 2)}$\\pm${round(resubmit_bw_std, 2)}"
            )

    # cells are separated by " & " and the row ends with the LaTeX "\\" marker
    print(" & ".join(row_cells) + " \\\\")

100K & 1.46$\pm$0.83 & 3.0$\pm$1.19 & 0.98$\pm$0.55 & 2.0$\pm$0.8 & 0.98$\pm$0.55 & 2.0$\pm$0.8 & 0$\pm$0 & 0$\pm$0 & 2.44$\pm$1.38 & 5.0$\pm$1.99 & 2.44$\pm$1.38 & 5.0$\pm$1.99 & 2.44$\pm$1.38 & 5.0$\pm$1.99 \\
500K & 7.32$\pm$4.15 & 14.99$\pm$5.97 & 7.32$\pm$4.15 & 14.99$\pm$5.97 & 9.75$\pm$5.53 & 19.98$\pm$7.96 & 0$\pm$0 & 0$\pm$0 & 0$\pm$0 & 0$\pm$0 & 7.32$\pm$4.15 & 14.99$\pm$5.97 & 14.63$\pm$8.3 & 29.97$\pm$11.93 \\
1M & 0$\pm$0 & 0$\pm$0 & 9.75$\pm$5.53 & 19.98$\pm$7.96 & 0$\pm$0 & 0$\pm$0 & 0$\pm$0 & 0$\pm$0 & 0$\pm$0 & 0$\pm$0 & 9.75$\pm$5.53 & 19.98$\pm$7.96 & 0$\pm$0 & 0$\pm$0 \\
