In [None]:
import matplotlib as mpl
from matplotlib import pyplot as plt
import pickle
import numpy as np
from tqdm.notebook import tqdm
from cost_based_selection import preprocessing_utils
from glob import glob
from scipy import stats
import itertools as it
from IPython.display import display
import pandas as pd
import os

# Global figure settings: raise the inline resolution first, then apply the shared
# style sheet (in that order, so anything the style sheet sets takes precedence).
mpl.rc('figure', dpi=144)
plt.style.use('../scrartcl.mplstyle')

In [None]:
# Load the pilot simulations for every (model, size, seed) combination.
# Costs are appended per model in iteration order: for a given model, sizes vary
# slowest and seeds fastest. Later cells depend on this ordering when they reshape
# the stacked costs to (len(sizes), len(seeds), ...).
costs_by_model = {}
features_by_model = {}
sizes = (1 + np.arange(10)) * 100  # 100, 200, ..., 1000
seeds = range(5)
models = ["ba", "dmX"]

# `it.product` returns an iterator without a length, so tqdm cannot infer the number
# of steps on its own; pass it explicitly to get a proper progress fraction and ETA.
num_combinations = len(models) * len(sizes) * len(seeds)
for model, size, seed in tqdm(it.product(models, sizes, seeds), total=num_combinations):
    pattern = f"../workspace/{model}/simulations/pilot/num_nodes-{size}/seed-{seed}/batch-*.pkl"
    simulations = preprocessing_utils.load_simulations(pattern)
    # Overwritten on every iteration, so the entry from the last combination loaded
    # for each model is the one kept (presumably identical across sizes and seeds).
    features_by_model[model] = simulations["features"]
    costs_by_model.setdefault(model, []).append(simulations["costs"])

In [None]:
# Plot the mean total cost of the summaries against the number of nodes for each model,
# together with a power-law fit obtained from a straight-line fit in log-log space.
fig, ax = plt.subplots()

# Per-model presentation, keyed by the model identifiers used in `costs_by_model`.
labels = {'ba': 'Barabási Albert model', 'dmX': "Duplication divergence models"}
markers = {'ba': 'o', 'dmX': 's'}

# Later models are drawn with a lower zorder, and each fitted line sits below its
# own markers.
zorder = 9
for model, costs in costs_by_model.items():
    # Sum over the last axis to get one total per loaded run, then arrange the totals
    # as one row per size and one column per seed.
    costs = np.sum(costs, axis=-1).reshape((len(sizes), len(seeds)))
    y = np.mean(costs, axis=-1)
    # Degree-one fit of log(y) against log(sizes). `convert()` expresses the
    # coefficients in the unscaled domain, so `coef[1]` is the slope, i.e. the
    # exponent of the fitted power law.
    poly = np.polynomial.Polynomial.fit(np.log(sizes), np.log(y), 1).convert()
    exponent = poly.coef[1]

    line, = ax.plot(sizes, y, label=fr"{labels[model]} $\propto n_s^{{{exponent:.2f}}}$", ls='none',
                    marker=markers[model], zorder=zorder)
    line.set_markeredgecolor('w')
    line.set_markersize(7)
    zorder -= 1
    # Fitted curve in the same colour as the markers.
    ax.plot(sizes, np.exp(poly(np.log(sizes))), color=line.get_color(), ls='-', zorder=zorder - 1)
    # Ratio of the mean cost at the largest size to that at the smallest, and the exponent.
    print(model, y[-1] / y[0], exponent)

ax.set_ylabel('Cost of summaries per\nsimulation (seconds)')
ax.set_xlabel('Number of nodes $n_s$')
ax.legend()
fig.tight_layout()
fig.savefig("cost-scaling.pdf")

In [None]:
# Show the cost by feature for each model.
for model, model_costs in costs_by_model.items():
    # `model_costs` must be two-dimensional for this unpacking to succeed (one row per
    # loaded run, one column per feature). Named variables are used instead of `_`
    # because IPython reserves `_` for the most recent cell output and stops updating
    # it once it has been assigned.
    num_runs, num_features = np.shape(model_costs)
    costs_by_size = np.reshape(model_costs, (len(sizes), len(seeds), num_features))
    # Keep only the last (largest) size, average over seeds and scale by 1,000
    # (presumably seconds -> milliseconds; confirm the unit of the stored costs).
    costs = costs_by_size[-1].mean(axis=0) * 1_000
    df = pd.DataFrame({"feature": features_by_model[model], "cost": costs})
    print(model, df.cost.sum())
    display(df)