In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from edgedroid.models.timings import *
import edgedroid.data as default_data

# Load the default dataset shipped with edgedroid. The loader returns four
# objects: the raw per-step execution time table (displayed below) and three
# companion objects for neuroticism, impairment and duration
# (presumably the bin edges used when preprocessing -- TODO confirm).
(
    exec_time_data,
    neuroticism_data,
    imp_data,
    dur_data,
) = default_data.load_default_exec_time_data()
exec_time_data

Unnamed: 0,run_id,ttf,exec_time,neuroticism
0,134146,0.597441,3.654797,0.375
1,134146,0.553513,4.438645,0.375
2,134146,0.561716,2.943222,0.375
3,134146,0.586512,5.405761,0.375
4,134146,0.558940,5.225161,0.375
...,...,...,...,...
6755,137353,0.557074,6.439071,0.625
6756,137353,0.534339,4.680858,0.625
6757,137353,0.560288,3.467878,0.625
6758,137353,0.579000,2.325759,0.625


In [2]:
# Build the preprocessed table from the raw execution times and the three
# companion objects loaded in the previous cell; the resulting frame is shown
# below and is what the validation loop reads its test rows from.
data = preprocess_data(
    exec_time_data,
    neuroticism_data,
    imp_data,
    dur_data,
)
data

Unnamed: 0,run_id,ttf,next_exec_time,neuroticism,neuroticism_raw,impairment,duration,transition,duration_raw
0,134146,0.000000,3.654797,"[-inf, 0.5)",0.375,"[-inf, 1.4817823276676907)","[0.0, 5.0)",NoTransition,1
1,134146,0.597441,4.438645,"[-inf, 0.5)",0.375,"[-inf, 1.4817823276676907)","[0.0, 5.0)",NoTransition,2
2,134146,0.553513,2.943222,"[-inf, 0.5)",0.375,"[-inf, 1.4817823276676907)","[0.0, 5.0)",NoTransition,3
3,134146,0.561716,5.405761,"[-inf, 0.5)",0.375,"[-inf, 1.4817823276676907)","[0.0, 5.0)",NoTransition,4
4,134146,0.586512,5.225161,"[-inf, 0.5)",0.375,"[-inf, 1.4817823276676907)","[5.0, 10.0)",NoTransition,5
...,...,...,...,...,...,...,...,...,...
6755,137353,0.577341,6.439071,"[0.5, inf)",0.625,"[-inf, 1.4817823276676907)","[5.0, 10.0)",Higher2Lower,6
6756,137353,0.557074,4.680858,"[0.5, inf)",0.625,"[-inf, 1.4817823276676907)","[5.0, 10.0)",Higher2Lower,7
6757,137353,0.534339,3.467878,"[0.5, inf)",0.625,"[-inf, 1.4817823276676907)","[5.0, 10.0)",Higher2Lower,8
6758,137353,0.560288,2.325759,"[0.5, inf)",0.625,"[-inf, 1.4817823276676907)","[5.0, 10.0)",Higher2Lower,9


In [3]:
import itertools
from collections import deque
import pandas as pd
from typing import Tuple, Deque
from tqdm.notebook import tqdm
import numpy as np

# --- Leave-one-run-out validation of the execution time models --------------
# For every run: build all model variants from the data of the *other* runs,
# replay the held-out run through each model and record the prediction error
# for every step.

# Configuration (previously hard-coded inside the loops below).
N_REPETITIONS = 30  # how many times each held-out run is replayed through the models
TTF_BIN_COUNTS = range(1, 10)  # values tried for the models' `ttf_levels` argument
WINDOW_SIZES = (4, 8, 12)  # values tried for the kernels' `window_size` argument
RESULTS_PATH = "./full_validation.parquet"
cleanup = CleanupMode.TRUNCATE  # same cleanup mode for every windowed model

# One DataFrame per held-out run. Compacting the rows after each run (instead
# of keeping every row as a Python dict until the very end) keeps peak memory
# far lower while producing the same final frame.
run_frames = []

for run_id in tqdm(data["run_id"].unique(), desc="Run"):
    # Rows of the held-out run: these are the ground truth we predict against.
    test_data = data[data["run_id"] == run_id].copy()

    # Training data: every run except the held-out one. Filtered once per run
    # rather than on every `get_data` call.
    train_exec_time_data = exec_time_data[exec_time_data["run_id"] != run_id]

    def get_test_data(
        *args, _train_data: pd.DataFrame = train_exec_time_data, **kwargs
    ) -> Tuple[pd.DataFrame, pd.arrays.IntervalArray, pd.arrays.IntervalArray, pd.arrays.IntervalArray]:
        """Return the training data with the held-out run removed.

        Installed as ``get_data`` on each test model class below. Any
        arguments the caller passes are accepted and ignored. ``_train_data``
        is bound at definition time so the function never depends on the
        current value of the loop variable; a fresh copy is returned on every
        call so callers cannot affect each other through shared data.
        """
        return _train_data.copy(), neuroticism_data, imp_data, dur_data

    class EmpiricalTestModel(EmpiricalETM):
        """EmpiricalETM with ``get_data`` overridden to exclude the held-out run."""

        get_data = get_test_data

    class EmpiricalMeanTestModel(EmpiricalAggregateETM):
        """EmpiricalAggregateETM using ``np.mean``, without the held-out run."""

        get_data = get_test_data

        def __init__(self, *args, **kwargs):
            super().__init__(*args, aggregate_fn=np.mean, **kwargs)

    class EmpiricalMedianTestModel(EmpiricalAggregateETM):
        """EmpiricalAggregateETM using ``np.median``, without the held-out run."""

        get_data = get_test_data

        def __init__(self, *args, **kwargs):
            super().__init__(*args, aggregate_fn=np.median, **kwargs)

    class TheoreticalTestModel(FittedETM):
        """FittedETM with ``get_data`` overridden to exclude the held-out run."""

        get_data = get_test_data

    class NaiveTestModel(FirstOrderETM):
        """FirstOrderETM with ``get_data`` overridden to exclude the held-out run."""

        get_data = get_test_data

    class NaiveAggTestModel(FirstOrderAggregateETM):
        """FirstOrderAggregateETM with ``get_data`` overridden to exclude the held-out run."""

        get_data = get_test_data

    # Raw neuroticism value of the held-out run (first unique value in its rows).
    neuroticism = test_data["neuroticism_raw"].unique()[0]
    naive_model = NaiveTestModel()
    naive_mean_model = NaiveAggTestModel(np.mean)
    naive_median_model = NaiveAggTestModel(np.median)

    # deque containing (model name, model object, ttf_bins, window_size, kernel name)
    # The first-order baselines take no kernel or TTF bins, hence 0 / 0 / "none".
    models: Deque[Tuple[str, ExecutionTimeModel, int, int, str]] = deque()
    models.extend((
        ("1st order", naive_model, 0, 0, "none"),
        ("1st order (median)", naive_median_model, 0, 0, "none"),
        ("1st order (mean)", naive_mean_model, 0, 0, "none")
    ))

    # Full grid: every TTF bin count x window size x model class x kernel.
    for (ttf_bins, window_size) in itertools.product(TTF_BIN_COUNTS, WINDOW_SIZES):
        exp_kernel = ExponentialTTFWindowKernel(window_size=window_size)
        steep_linear_kernel = LinearTTFWindowKernel(window_size=window_size, max_relative_weight=10)
        shallow_linear_kernel = LinearTTFWindowKernel(window_size=window_size, max_relative_weight=2)
        average_kernel = AverageTTFWindowKernel(window_size=window_size)

        kernels = (
            ("exponential", exp_kernel),
            ("linear_steep", steep_linear_kernel),
            ("linear_shallow", shallow_linear_kernel),
            ("average", average_kernel)
        )
        model_classes = (
            ("empirical", EmpiricalTestModel),
            ("empirical (mean)", EmpiricalMeanTestModel),
            ("empirical (median)", EmpiricalMedianTestModel),
            ("theoretical", TheoreticalTestModel)
        )

        for (class_name, model_cls), (kernel_name, kernel) in itertools.product(model_classes, kernels):
            m = model_cls(kernel=kernel, neuroticism=neuroticism, ttf_levels=ttf_bins, cleanup=cleanup)
            models.append((class_name, m, ttf_bins, window_size, kernel_name))

    # Rows produced for this held-out run only.
    result_rows = deque()

    # Replay the held-out run several times through every model
    # (presumably to average over randomness in the models -- TODO confirm).
    for _ in range(N_REPETITIONS):
        for i, (prev_ttf, exec_time) in enumerate(test_data[["ttf", "next_exec_time"]].itertuples(index=False)):
            for model_name, model, ttf_bins, window_size, kernel_name in models:
                if i == 0:
                    # start of a replay: reset the model instead of advancing
                    # it, since the first row has no previous ttf
                    model.reset()
                else:
                    # feed the ttf of the previous step before predicting
                    model.advance(prev_ttf)

                prediction = model.get_execution_time()
                error = prediction - exec_time
                result_rows.append({
                    "real": exec_time,
                    "prediction": prediction,
                    "error": error,
                    "sqr_error": np.square(error),
                    "model": model_name,
                    "kernel": kernel_name,
                    "ttf_bins": ttf_bins,
                    "window_size": window_size,
                })

    # Compact this run's rows right away; the dicts can then be released.
    run_frames.append(pd.DataFrame(result_rows))

# Same row order as appending everything to a single container: runs in
# iteration order, and within a run repetition -> step -> model.
results = pd.concat(run_frames, ignore_index=True)
for col in ("model", "kernel"):
    results[col] = results[col].astype("category")

results.to_parquet(RESULTS_PATH)
results

Run:   0%|          | 0/40 [00:00<?, ?it/s]

Unnamed: 0,real,prediction,error,sqr_error,model,kernel,ttf_bins,window_size
0,3.654797,6.278157,2.623360,6.882016,1st order,none,0,0
1,3.654797,5.261648,1.606851,2.581971,1st order (median),none,0,0
2,3.654797,5.873496,2.218699,4.922626,1st order (mean),none,0,0
3,3.654797,4.469169,0.814371,0.663201,empirical,exponential,1,4
4,3.654797,3.369724,-0.285073,0.081267,empirical,linear_steep,1,4
...,...,...,...,...,...,...,...,...
88217995,4.615283,5.036001,0.420718,0.177003,empirical (median),average,9,12
88217996,4.615283,5.277464,0.662181,0.438484,theoretical,exponential,9,12
88217997,4.615283,5.836481,1.221198,1.491324,theoretical,linear_steep,9,12
88217998,4.615283,7.666758,3.051475,9.311500,theoretical,linear_shallow,9,12


In [4]:
# Mean squared error of every (model, kernel, window_size, ttf_bins)
# configuration, ranked from lowest (best) to highest.
(
    results
    .groupby(["model", "kernel", "window_size", "ttf_bins"], observed=True)["sqr_error"]
    .mean()
    .sort_values(ascending=True)
)

model             kernel          window_size  ttf_bins
empirical (mean)  linear_steep    8            6           10.133110
                                               7           10.136870
                  exponential     12           6           10.139448
                                  8            6           10.139615
                  linear_steep    8            9           10.140530
                                                             ...    
theoretical       linear_shallow  8            4           14.812868
                                  4            9           14.831268
                  exponential     4            4           14.835295
                  linear_steep    4            9           14.869504
1st order         none            0            0           20.445514
Name: sqr_error, Length: 435, dtype: float64

In [5]:
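# NOTE: disabled and stale -- `results` as built in the validation cell has no
# "cleanup" column, so this would raise a KeyError if re-enabled as-is.
# results["model_tag"] = results["model"] + " " + results["cleanup"].astype(str)
# results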

In [6]:
# NOTE: disabled -- this plot uses a "model_tag" column, which is only created
# by the (also disabled) cell that builds it from `results`.
# # %matplotlib inline
# import matplotlib.pyplot as plt
# import seaborn as sns
# 
# fg = sns.catplot(results, kind="point", x="ttf_bins", y="sqr_error", hue="model_tag", height=5, aspect=2)
# fg.set(ylim=(0, None))
# fg.set_ylabels("MSE")
# for ax in fg.axes.flat:
#     ax.set_xticks(ax.get_xticks(), ax.get_xticklabels(), rotation=45, ha='right')
# plt.show()

In [7]:
# NOTE: disabled and stale -- `results` has no "cleanup" column to group by.
# results.groupby(["model", "ttf_bins", "cleanup"])["sqr_error"].mean().sort_values()