In [1]:
import os
from typing import *

import multiprocess as mp
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import edgedroid.data as e_data
from edgedroid.models import *
from collections import deque


def process_model(
        mcls: Type[ExecutionTimeModel],
        run_id: int,
        fade_dist: Optional[int],
        samples: int,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Replay one recorded run through an execution time model, ``samples`` times.

    The default execution time data is loaded and preprocessed inside the
    function. For every sample the model is reset and then stepped through
    the run: an execution time is drawn, the model state is recorded, and the
    delay recorded for that step is fed back into the model.

    Args:
        mcls: Model class to evaluate. It is instantiated with ``data``,
            ``neuroticism`` and ``transition_fade_distance`` keyword arguments.
        run_id: Value of the ``run_id`` column selecting the run to replay.
        fade_dist: Forwarded as ``transition_fade_distance`` to both
            ``preprocess_data`` and the model constructor. The driver loop in
            this notebook also passes ``None``.
        samples: Number of independent replays of the run.

    Returns:
        A ``(trace_df, task_df)`` tuple.

        ``trace_df`` holds one row per step and sample: the recorded step
        columns plus ``model_exec_time``, ``sample``, ``model``, the model
        state columns, ``reldiff`` and ``fade_distance``. Steps flagged as
        outliers in the preprocessed data are removed.

        ``task_df`` holds one row per sample with ``run_id``,
        ``real_task_duration``, ``model_task_duration``, ``model``,
        ``reldiff`` and ``fade_distance``. Task durations are summed before
        the outlier removal, so they cover every step of the run.

    Raises:
        ValueError: If the raw data contains no rows for ``run_id``.
    """
    # load the data
    params = e_data.load_default_exec_time_data()
    data = preprocess_data(*params, transition_fade_distance=fade_dist)
    raw_data, *_ = params

    # dataframe for participant
    run_df = raw_data[raw_data["run_id"] == run_id].copy()
    if run_df.empty:
        raise ValueError(f"No data found for run_id {run_id!r}")

    # prepare the model; the neuroticism value must come from the selected
    # run, not from the first row of the complete dataset
    neuro = run_df["neuroticism"].iloc[0]
    model: ExecutionTimeModel = mcls(data=data, neuroticism=neuro, transition_fade_distance=fade_dist)

    n_steps = len(run_df.index)
    # float buffer so that the task-level columns end up numeric, not object
    model_task_durations = np.empty(samples, dtype=np.float64)
    traces = deque()

    for i in range(samples):
        model.reset()
        states = deque()
        model_exec_times = np.empty(n_steps, dtype=np.float64)

        for j, step in enumerate(run_df.itertuples(index=True)):
            model_exec_times[j] = model.get_execution_time()
            states.append(model.state_info())
            model.set_delay(step.delay)

        result_df = run_df.copy()
        result_df["model_exec_time"] = model_exec_times
        result_df["sample"] = i
        result_df["model"] = mcls.__name__

        # attach the recorded model state to each step
        model_df = pd.DataFrame(states)
        result_df = result_df.merge(
            right=model_df.drop(columns=["neuroticism_raw"]),
            on="seq",
            suffixes=("", "_model"),
        )

        traces.append(result_df)
        model_task_durations[i] = result_df["model_exec_time"].sum()

    trace_df = pd.concat(traces, ignore_index=True)
    del traces

    # record task duration
    task_df = pd.DataFrame(
        {
            "run_id": run_id,
            "real_task_duration": run_df["exec_time"].sum(),
            "model_task_duration": model_task_durations,
        },
        index=np.arange(samples),
    )
    task_df["model"] = mcls.__name__
    task_df["reldiff"] = (task_df["model_task_duration"] - task_df["real_task_duration"]) / task_df["real_task_duration"]
    task_df["fade_distance"] = fade_dist

    # filter outliers using preprocessed data:
    # within each group, steps whose next_exec_time lies below the 5th or
    # above the 95th percentile are trimmed (removed, not clipped)
    outlier_frames = []
    for _, group in data.groupby(["neuroticism", "impairment", "transition", "duration"]):
        if group.empty:
            continue
        low, high = np.percentile(group["next_exec_time"], [5, 95])
        is_outlier = (group["next_exec_time"] < low) | (group["next_exec_time"] > high)
        outlier_frames.append(group.loc[is_outlier, ["run_id", "seq"]])

    if outlier_frames:
        # match on (run_id, seq) pairs; testing the two columns independently
        # would also drop steps that are only outliers in *other* runs
        outlier_keys = pd.MultiIndex.from_frame(pd.concat(outlier_frames, ignore_index=True))
        trace_keys = pd.MultiIndex.from_frame(trace_df[["run_id", "seq"]])
        trace_df = trace_df[~trace_keys.isin(outlier_keys)]

    trace_df = trace_df.copy()
    trace_df["reldiff"] = (trace_df["model_exec_time"] - trace_df["exec_time"]) / trace_df["exec_time"]
    trace_df["fade_distance"] = fade_dist

    return trace_df, task_df


# number of times each recorded run is replayed per model / fade distance
samples = 50

# only the run identifiers are needed here; each worker loads the data itself
raw_data, *_ = e_data.load_default_exec_time_data()
run_ids = raw_data.run_id.unique()
del raw_data

step_result = deque()
task_result = deque()

# Leave one core free, but always use at least one worker:
# os.cpu_count() may return None, and a pool of 0 processes is rejected.
workers = max(1, (os.cpu_count() or 1) - 1)

for mcls in tqdm((TheoreticalExecutionTimeModel, EmpiricalExecutionTimeModel), leave=True, desc="Models"):
    for i, fade_dist in enumerate(tqdm((2, 4, 8, None), leave=True, desc="Fade distance")):
        with tqdm(total=len(run_ids), leave=True, desc="Runs", position=i) as pbar, mp.Pool(workers) as pool:
            # one asynchronous task per run; the callback advances the bar
            # as soon as a task finishes
            fade_result = [
                pool.apply_async(
                    process_model,
                    args=(
                        mcls,
                        rid,
                        fade_dist,
                        samples
                    ),
                    callback=lambda _: pbar.update()
                )
                for rid in run_ids
            ]

            # collect inside the `with` block: leaving it shuts the pool down;
            # .get() re-raises any exception raised in the worker
            for r in fade_result:
                step_df, task_df = r.get()
                step_result.append(step_df)
                task_result.append(task_df)

trace_result = pd.concat(step_result, ignore_index=True)
trace_task_result = pd.concat(task_result, ignore_index=True)
del step_result
del task_result

In [2]:
# `fade_distance` is None for the "no fading" configuration. Missing keys are
# dropped by groupby (dropna=True by default), so encode it as the sentinel -1.
# This is done for BOTH result frames, since the task-level summaries and
# plots below group on this column as well.
trace_result["fade_distance"] = pd.to_numeric(trace_result["fade_distance"]).fillna(-1).astype(int)
trace_task_result["fade_distance"] = pd.to_numeric(trace_task_result["fade_distance"]).fillna(-1).astype(int)
trace_result

In [3]:
import seaborn as sns
import matplotlib.pyplot as plt

# Per-step relative difference for each model, one panel per fade distance.
fg = sns.catplot(
    data=trace_result,
    kind="bar",
    col="fade_distance",
    x="model",
    y="reldiff",
    ci=95,
)
fg.set_ylabels("Relative difference between\nmodel execution times and underlying data.")
plt.show()

In [4]:
# Per-step relative difference as a function of fade distance, one line per model.
fg = sns.catplot(
    data=trace_result,
    kind="point",
    hue="model",
    x="fade_distance",
    y="reldiff",
    dodge=True,
    ci=95,
)
fg.set_ylabels("Relative difference between\nmodel execution times and underlying data.")
plt.show()

In [5]:
# Summary statistics of the per-step relative difference per model and fade distance.
(
    trace_result
    .groupby(["model", "fade_distance"])["reldiff"]
    .describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
)

In [6]:
# Whole-task relative difference for each model, one panel per fade distance.
fg = sns.catplot(
    data=trace_task_result,
    kind="bar",
    col="fade_distance",
    x="model",
    y="reldiff",
    ci=95,
)
fg.set_ylabels("Relative difference between\nmodel task durations and underlying data.")
plt.show()

In [7]:
# Make sure the task-level relative difference is a float column before summarising it.
trace_task_result["reldiff"] = trace_task_result["reldiff"].astype(float)
(
    trace_task_result
    .groupby(["model", "fade_distance"])["reldiff"]
    .describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
)

In [8]:
# Whole-task relative difference as a function of fade distance, one line per model.
fg = sns.catplot(
    data=trace_task_result,
    kind="point",
    x="fade_distance",
    y="reldiff",
    hue="model",
    dodge=True,
    ci=95,
)
fg.set_ylabels("Relative difference between\nmodel task durations and underlying data.")
plt.show()