
Commit

model selection
molguin92 committed Oct 22, 2023
1 parent 0a94664 commit 0fb2a10
Showing 6 changed files with 1,907 additions and 254 deletions.
898 changes: 768 additions & 130 deletions analysis_2023/errors.ipynb

Large diffs are not rendered by default.

312 changes: 312 additions & 0 deletions analysis_2023/full_validation.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions analysis_2023/requirements.txt
@@ -6,3 +6,4 @@ numpy
 seaborn
 matplotlib
 pmdarima
+tqdm
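tqdm is presumably pulled in for progress reporting in the analysis notebooks; a minimal sketch of the usual pattern (the loop below is a stand-in, not code from the notebooks):

import sys
from tqdm import tqdm

# Wrap any iterable to get a live progress bar; a dummy range stands in here
# for the per-sample validation work presumably done in the notebooks.
total = 0
for i in tqdm(range(1_000), desc="validating", file=sys.stderr):
    total += i
print(total)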
914 changes: 798 additions & 116 deletions analysis_2023/validation.ipynb

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions edgedroid/models/__init__.py
@@ -39,6 +39,7 @@
     "EmpiricalETM",
     "FittedETM",
     "LegacyETM",
+    "CleanupMode",
 ]


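Since CleanupMode is now re-exported from the models package, it can be imported next to the model classes. A minimal sketch (assuming the edgedroid package is installed):

from edgedroid.models import CleanupMode

# The enum defined in timings.py below; member values are assigned by enum.auto().
print(list(CleanupMode))  # NONE, WINSORIZE, TRUNCATE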
35 changes: 27 additions & 8 deletions edgedroid/models/timings.py
@@ -23,7 +23,6 @@
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
-from nptyping import Shape
 from pandas import arrays
 from scipy import stats

@@ -415,16 +414,33 @@ def _convolve_kernel(arr: pd.Series, kernel: npt.NDArray):
     return pd.Series(result[kernel.size :], index=index)


-def _winsorize(arr: npt.NDArray) -> npt.NDArray:
-    low_bound = np.percentile(arr, 5)
-    high_bound = np.percentile(arr, 95)
+def _winsorize(
+    arr: npt.NDArray, low_percentile: int = 5, high_percentile: int = 95
+) -> npt.NDArray:
+    low_bound = np.percentile(arr, low_percentile)
+    high_bound = np.percentile(arr, high_percentile)

     arr[arr < low_bound] = low_bound
     arr[arr > high_bound] = high_bound

     return arr


+def _truncate(
+    arr: npt.NDArray, low_percentile: int = 5, high_percentile: int = 95
+) -> npt.NDArray:
+    low_bound = np.percentile(arr, low_percentile)
+    high_bound = np.percentile(arr, high_percentile)
+
+    return np.copy(arr[np.logical_and(arr >= low_bound, arr <= high_bound)])
+
+
+class CleanupMode(enum.Enum):
+    NONE = enum.auto()
+    WINSORIZE = enum.auto()
+    TRUNCATE = enum.auto()
+
+
 class EmpiricalETM(ExecutionTimeModel):
     @staticmethod
     def make_kernel(window: int, exp_factor: float = 0.7):
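For reference, the two cleanup strategies above treat outliers differently: winsorizing clamps values to the 5th/95th percentile bounds and keeps every sample (and the _winsorize helper modifies its input in place), while truncating discards anything outside those bounds and returns a copy. A small self-contained sketch on synthetic data (not the packaged traces):

import numpy as np

# Synthetic, heavy-tailed "execution times" purely for illustration.
rng = np.random.default_rng(seed=0)
samples = rng.exponential(scale=2.0, size=1000)

low = np.percentile(samples, 5)
high = np.percentile(samples, 95)

# Winsorizing: clamp values beyond the bounds, preserving the sample count.
winsorized = samples.copy()
winsorized[winsorized < low] = low
winsorized[winsorized > high] = high

# Truncating: drop values outside the bounds, so roughly 10% of samples disappear.
truncated = np.copy(samples[np.logical_and(samples >= low, samples <= high)])

print(samples.size, winsorized.size, truncated.size)  # 1000, 1000, ~900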
@@ -439,7 +455,7 @@ def __init__(
         neuroticism: float | None,
         window: int = 12,
         ttf_levels: int = 4,
-        winsorize: bool = True,
+        cleanup: CleanupMode = CleanupMode.WINSORIZE,
     ):
         data, neuro_bins, *_ = self.get_data()

@@ -469,8 +485,11 @@ def __init__(
         self._views: Dict[pd.Interval, npt.NDArray] = {}
         for binned_rolling_ttf, df in data.groupby("binned_rolling_ttf", observed=True):
             exec_times = df["next_exec_time"].to_numpy()
-            if winsorize:
+
+            if cleanup == CleanupMode.WINSORIZE:
                 exec_times = _winsorize(exec_times)
+            elif cleanup == CleanupMode.TRUNCATE:
+                exec_times = _truncate(exec_times)

             self._views[binned_rolling_ttf] = exec_times

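With the dispatch above, the cleanup strategy is applied to the execution-time samples of each TTF bin when the model is built; CleanupMode.NONE leaves the raw samples untouched. A hedged usage sketch — the neuroticism value is illustrative, and constructing the model assumes the packaged trace data is available via get_data():

from edgedroid.models import CleanupMode, EmpiricalETM

# Default behaviour is unchanged: per-bin execution times are winsorized.
etm_default = EmpiricalETM(neuroticism=0.5)

# Truncate outliers instead of clamping them, or skip cleanup entirely.
etm_truncated = EmpiricalETM(neuroticism=0.5, cleanup=CleanupMode.TRUNCATE)
etm_raw = EmpiricalETM(neuroticism=0.5, cleanup=CleanupMode.NONE)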
@@ -534,13 +553,13 @@ def __init__(
         dist: stats.rv_continuous = stats.exponnorm,
         window: int = 12,
         ttf_levels: int = 4,
-        winsorize: bool = True,
+        cleanup: CleanupMode = CleanupMode.WINSORIZE,
     ):
         super(FittedETM, self).__init__(
             neuroticism=neuroticism,
             window=window,
             ttf_levels=ttf_levels,
-            winsorize=winsorize,
+            cleanup=cleanup,
         )

         self._dists: Dict[pd.Interval, stats.rv_continuous] = {}
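FittedETM simply forwards the new cleanup argument to the EmpiricalETM constructor, so the per-bin continuous distributions are presumably fitted to the cleaned samples. A hedged construction sketch (the neuroticism value and explicit distribution choice are illustrative; exponnorm is already the default):

from scipy import stats

from edgedroid.models import CleanupMode, FittedETM

# Fit exponentially modified Gaussians to truncated per-bin execution times.
model = FittedETM(
    neuroticism=0.5,
    dist=stats.exponnorm,
    cleanup=CleanupMode.TRUNCATE,
)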
