In [None]:
# Personally I had to add the root folder of the repo to the sys.path.  If certain imports do not work you should uncomment and set the following.
# import sys
# sys.path.append('/root/of/repo/folder/')

In [None]:
from experiments.utils import generate_trace_if_not_exists, read_resource_map, setup_nodes, setup_stats_file_writers, make_dir, setup_node_map, TraceIteratorProxy, plot_with_error_bars, calc_variance, aggregate_runs_in_dir, calc_ratio_over
from simulation.evaluator.strategy.runner import StrategyRunner
from simulation.evaluator.strategy.federated import FederatedStrategy
from simulation.evaluator.statistics.cache_metrics import CacheMetrics

# Resource statistics CSV; parsed into `resource_map` below and also handed to
# `Simulation` when a trace has to be generated.
resource_file = "../dataset/out/dataset-resources-stats.csv"
# Base output location of this experiment: cached traces and the per-setup
# statistics directories are all derived from it.
# NOTE(review): make_dir presumably creates the directory and returns its path — confirm in experiments.utils.
out_dir = make_dir('./out/experiment-federated-vs-single-node/')
# Passed to every StrategyRunner so both setups work from the same resource data.
resource_map = read_resource_map(resource_file)

# Federated versus Single Node

In this experiment we evaluate whether a federated setup of 8 nodes with 1GB capacity each performs the same as a single node with 8GB capacity.

The difference between the two setups is that every node in the 8-node setup runs its own, separate LRU strategy.  While the distribution of files over the nodes might be equal (due to the hash), this does not mean the file sizes are equally distributed over the eight nodes.  This could create a situation where we remove an item with age 2 from node A while there is an item with age 3 on node B.  This would not happen in the single-node setup, as there is only a single LRU list there.

Therefore the main question of this experiment is whether this slight difference has a significant impact on the performance.

## Trace Generation
We use a simple helper to load a trace from file, or generate it from a `TraceConfig` if it does not exist.  This saves time when we want to re-run the experiment as we do not need to invoke the generator again.  The trace is saved under a name based on the `trace_config` which means that a new trace will be generated automatically when you change the `TraceConfig` values.  As we save the trace using the `gzip` format it can take some time, after trace generation, to save the file.  The process is worth it however as it saves a significant amount of space.

If you want to re-generate the trace on every run replace `generate_trace_if_not_exists` with `generate_trace` from `.utils`.

In [None]:
from simulation.generator.main_zipf import TraceConfig, Simulation

def generate_trace(seed: str):
    """Return the trace for ``seed``, loading it from disk when it was generated before.

    A ``TraceConfig`` for an 8-node map with 500 users and 800 iterations is
    built and wrapped in a ``Simulation`` that also receives ``resource_file``.
    The trace file lives under ``out_dir`` and is named after
    ``TraceConfig.to_filename()``.

    Args:
        seed: Seed forwarded to ``TraceConfig``.

    Returns:
        Whatever ``generate_trace_if_not_exists`` returns for that file and simulation.
    """
    config = TraceConfig(
        node_map=setup_node_map(8),
        seed=seed,
        no_users=500,
        no_iterations=800,
    )
    sim = Simulation(config, resource_file)
    cache_path = f"{out_dir}/{config.to_filename()}.trace.gz"
    return generate_trace_if_not_exists(cache_path, simulation=sim)

## Multi Node Federated Setup
In the multi-node setup we evaluate 8 nodes with 1024MB capacity each, working together according to the federated strategy.

In [None]:
def run_multi_node_experiment(trace, marker: str = "") -> dict[str, CacheMetrics]:
    """Replay ``trace`` against the federated multi-node setup.

    Eight nodes of 1 GiB each are created and combined in a
    ``FederatedStrategy``. Statistics writers for those nodes are set up in the
    ``multi-node`` sub-directory of ``out_dir``.

    Args:
        trace: Trace to replay, e.g. the value returned by ``generate_trace``.
        marker: Forwarded to ``setup_stats_file_writers``. The experiment
            passes the trace seed here (presumably to keep the output of
            different runs apart — confirm in experiments.utils).

    Returns:
        The result of ``StrategyRunner.perform()``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    nodes = setup_nodes(8, 1024 * 1024 * 1024)
    # os.path.join adds the separator only when it is missing, so the result is
    # "<out_dir>/multi-node" whether or not out_dir ends with a slash. Plain
    # concatenation silently produced "<out_dir>multi-node" without one.
    stats_dir = make_dir(os.path.join(out_dir, "multi-node"))
    stats_writers = setup_stats_file_writers(nodes, stats_dir, marker=marker)
    strategy = FederatedStrategy(nodes)
    return StrategyRunner(strategy, trace, resource_map, stats_writers=stats_writers).perform()

## Single Node Federated Setup
In the single-node setup we evaluate 1 node with 8192MB capacity.  
To be able to use the same trace we use a `TraceIteratorProxy` that maps all requests to a single node.

In [None]:
def run_single_node_experiment(trace, marker: str = "") -> dict[str, CacheMetrics]:
    """Replay ``trace`` against a single node of 8 GiB.

    The trace addresses the nodes ``cdn1`` .. ``cdn8``. A
    ``TraceIteratorProxy`` over ``trace.instructions`` maps every one of those
    names to ``cdn1`` so the same trace can be used for the single-node setup.
    Statistics writers are set up in the ``single-node`` sub-directory of
    ``out_dir``.

    Args:
        trace: Trace to replay; only its ``instructions`` attribute is read here.
        marker: Forwarded to ``setup_stats_file_writers``. The experiment
            passes the trace seed here (presumably to keep the output of
            different runs apart — confirm in experiments.utils).

    Returns:
        The result of ``StrategyRunner.perform()``.
    """
    import os  # local import so this cell does not depend on another cell's imports

    trace_proxy = TraceIteratorProxy(
        trace.instructions,
        proxy_map={f"cdn{i + 1}": "cdn1" for i in range(8)},
    )
    nodes = setup_nodes(1, 8 * 1024 * 1024 * 1024)
    # os.path.join adds the separator only when it is missing, so the result is
    # "<out_dir>/single-node/" whether or not out_dir ends with a slash. The
    # trailing slash of the original path is kept on purpose.
    stats_dir = make_dir(os.path.join(out_dir, "single-node/"))
    stats_writers = setup_stats_file_writers(nodes, stats_dir, marker=marker)
    strategy = FederatedStrategy(nodes)
    return StrategyRunner(strategy, trace_proxy, resource_map, stats_writers=stats_writers).perform()

In [None]:
# Number of repetitions of the experiment.
no_runs = 10
# One seed per repetition, as strings "0" .. str(no_runs - 1); each seed is used
# both to generate the trace and as the marker of that run's output.
trace_seeds = list(map(str, range(no_runs)))

In [None]:
# If an "AttributeError" reports that `run_experiment` cannot be found, use the `multiprocess` package instead of the standard-library `multiprocessing`.
from multiprocessing import Pool

def run_experiment(trace_seed: str):
    """Run the single-node and the multi-node setup on the trace of one seed.

    The trace is obtained once through ``generate_trace`` and handed to both
    runners, with the seed doubling as the ``marker`` of the run.

    Args:
        trace_seed: Seed identifying the trace (and the run).
    """
    shared_trace = generate_trace(seed=trace_seed)
    # Single-node first, multi-node second.
    for run_setup in (run_single_node_experiment, run_multi_node_experiment):
        run_setup(shared_trace, marker=trace_seed)

# Only start worker processes when this code runs as the main module.
if __name__ == '__main__':
    print("Executing experiments...")
    # Four workers; chunksize=1 hands the seeds out one at a time. map() blocks
    # until every seed has been processed.
    with Pool(processes=4) as worker_pool:
        worker_pool.map(run_experiment, trace_seeds, chunksize=1)

## Analyse
We now analyse the results that were written to the `multi-node` and `single-node` output directories and visualise them with some graphs.

In [None]:
# Collect the statistics of all runs per setup, read from the "multi-node" and
# "single-node" sub-directories of out_dir that the two experiment runners
# write into. The results are indexed by metric name further down
# (e.g. 'cache_total', 'hits_total', 'cache_bytes_total').
aggregated_multi_data = aggregate_runs_in_dir(f"{out_dir}/multi-node")
aggregated_single_data = aggregate_runs_in_dir(f"{out_dir}/single-node")

In [None]:
import matplotlib.pyplot as plt
from palettable.colorbrewer.sequential import Greys_4
from palettable.colorbrewer.diverging import PuOr_4
greys = Greys_4.mpl_colors
puor_4 = PuOr_4.mpl_colors
import experiments.plotter.neat_plotter
import numpy as np

In [None]:
# Capacity in bytes of one federated node and of the single large node.
multi_capacity = 1024 ** 3
single_capacity = multi_capacity * 8

# Turn every 'cache_total' sample into a fraction of the capacity and reduce
# each inner list with calc_variance.
multi_total_items = [
    calc_variance([used / multi_capacity for used in samples])
    for samples in aggregated_multi_data['cache_total']
]
single_total_items = [
    calc_variance([used / single_capacity for used in samples])
    for samples in aggregated_single_data['cache_total']
]

x_labels = range(len(single_total_items))
plt.figure(num=None, figsize=(4, 4), dpi=300)

plot_with_error_bars(
    plt, x_labels, single_total_items,
    label="Single Bytes Used", color=puor_4[3],
)
plot_with_error_bars(
    plt, x_labels, multi_total_items,
    label="Multi Bytes Used", color=puor_4[2], linestyle='dashed',
)

# Keep the y-range of this figure in `ylim`; the ratio plots further down
# apply it so that all figures share one scale.
ylim = plt.ylim()
plt.ylabel('Fraction of total capacity')
plt.xlabel('Iteration')
plt.legend(loc='lower right')
plt.show()

# Each calc_variance result unpacks as a pair; only the first component is
# compared here (presumably the central value, the second being the spread — confirm).
single_used, _ = zip(*single_total_items)
multi_bytes, _ = zip(*multi_total_items)
largest_gap = np.abs(np.asarray(single_used) - np.asarray(multi_bytes)).max()
print(f"Max Difference: {largest_gap}")

In [None]:
# Ratio series built from the 'hits_total' and 'misses_total' metrics of each setup.
# NOTE(review): presumably hits / (hits + misses) — confirm in calc_ratio_over.
multi_average_hit_ratio = calc_ratio_over(
    aggregated_multi_data, 'hits_total', 'misses_total',
)
single_average_hit_ratio = calc_ratio_over(
    aggregated_single_data, 'hits_total', 'misses_total',
)

x_labels = range(len(single_average_hit_ratio))
plt.figure(num=None, figsize=(4, 4), dpi=300)

plot_with_error_bars(
    plt, x_labels, single_average_hit_ratio,
    label="Single Hit Ratio", color=puor_4[1],
)
plot_with_error_bars(
    plt, x_labels, multi_average_hit_ratio,
    label="Multi Hit Ratio", color=puor_4[0], linestyle='dashed',
)

# `ylim` was captured from the capacity figure; that cell has to run first.
plt.ylim(ylim)
plt.ylabel('Hit Ratio')
plt.xlabel('Iteration')
plt.legend(loc='upper right')
plt.show()

# The entries unpack as pairs; compare their first components only.
single_hits, _ = zip(*single_average_hit_ratio)
multi_hits, _ = zip(*multi_average_hit_ratio)
largest_gap = np.abs(np.asarray(single_hits) - np.asarray(multi_hits)).max()
print(f"Max Difference: {largest_gap}")

In [None]:
# Ratio series built from the 'cache_bytes_total' and 'origin_bytes_total' metrics of each setup.
# NOTE(review): presumably cache bytes / (cache bytes + origin bytes) — confirm in calc_ratio_over.
multi_average_byte_ratio = calc_ratio_over(
    aggregated_multi_data, 'cache_bytes_total', 'origin_bytes_total',
)
single_average_byte_ratio = calc_ratio_over(
    aggregated_single_data, 'cache_bytes_total', 'origin_bytes_total',
)

x_labels = range(len(single_average_byte_ratio))
plt.figure(num=None, figsize=(4, 4), dpi=300)

plot_with_error_bars(
    plt, x_labels, single_average_byte_ratio,
    label="Single Byte Ratio", color=puor_4[3],
)
plot_with_error_bars(
    plt, x_labels, multi_average_byte_ratio,
    label="Multi Byte Ratio", color=puor_4[2], linestyle='dashed',
)

# `ylim` was captured from the capacity figure; that cell has to run first.
plt.ylim(ylim)
plt.ylabel('Byte Ratio')
plt.xlabel('Iteration')
plt.legend(loc='upper right')
plt.show()

# The entries unpack as pairs; compare their first components only.
single_bytes, _ = zip(*single_average_byte_ratio)
multi_bytes, _ = zip(*multi_average_byte_ratio)
largest_gap = np.abs(np.asarray(single_bytes) - np.asarray(multi_bytes)).max()
print(f"Max Difference: {largest_gap}")