Skip to content

Commit

Permalink
A min_weight param to exp functions (#108)
Browse files Browse the repository at this point in the history
* Proposal for a `min_weight` param to exp functions

One issue with using the current exp functions is that there's no `min_periods` parameter, meaning that initial results can be overly influenced by a couple of values.

This PR adds a `min_weight` parameter to the exp functions, which is the minimum weight that a value requires before producing a value.

Assuming that `alpha` is constant, this is equivalent to taking a rolling sum of `1`s and `0`s (1 where a value is present, 0 where it's missing) and then passing the original result through where the corresponding value of the rolling sum is greater than `min_weight/alpha`.

So for a series with a slower moving average (i.e. a lower alpha), we require more points to be present before producing a value. Similarly, a higher `min_weight` value will require more points to be present. If all values exist, the cumulative weight will only asymptotically approach 1 from below — so a `min_weight` of 1 will never produce values.

* Speed up `nanmean` & `nansum` by 20% & 50% by yolo-ing NaNs

This has a surprisingly large effect on the speed of these routines, and without any loss of accuracy.

* Apply `NaN` handling to `rolling_exp_nanvar`

This speeds up `rolling_exp_nanvar` 5-10%

It will likely be slower if there are lots of leading `NaN`s; but I don't expect that's an overridingly common case, and this also makes the code simpler

* Remove some of the attempts to speed it up, it's surprisingly fast already

* Update numbagg/moving.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add pandas comparison `nansum` test

* Add back checks for `nansum`, which can't use `denom` to assess whether it's seen values

* Refine benchmarks

* more thorough testing

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
max-sixty and pre-commit-ci[bot] committed Oct 7, 2023
1 parent 6670eb0 commit 843de2a
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 20 deletions.
3 changes: 3 additions & 0 deletions benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def setup(self, func, n):
def time_numbagg(self, func, n):
    # Benchmark the numbagg implementation (first element of the `func` pair)
    # with alpha=0.5 and no min_weight, i.e. the pre-existing fast path.
    # NOTE(review): `self.array` is built in `setup` (outside this hunk) — its
    # shape/dtype aren't visible here.
    func[0](self.array, 0.5)

def time_numbagg_min_weight(self, func, n):
    # Benchmark the numbagg implementation with `min_weight` set, so the
    # overhead of the weight-threshold check can be compared against the
    # default path in `time_numbagg` (where min_weight defaults to 0).
    func[0](self.array, 0.5, min_weight=0.2)

def time_pandas(self, func, n):
    # Benchmark the equivalent pandas implementation (second element of the
    # `func` pair) against the pre-built `self.df_ewm` object, for comparison.
    func[1](self.df_ewm)

Expand Down
4 changes: 2 additions & 2 deletions numbagg/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def __call__(self, arr: np.ndarray, window, min_count=None, axis=-1):


class NumbaNDMovingExp(NumbaNDMoving):
def __call__(self, arr, alpha, axis=-1):
def __call__(self, arr, alpha, min_weight=0, axis=-1):
if alpha < 0:
raise ValueError(f"alpha must be positive: {alpha}")
# If an empty tuple is passed, there's no reduction to do, so we return the
Expand All @@ -243,7 +243,7 @@ def __call__(self, arr, alpha, axis=-1):
# For the sake of speed, we ignore divide-by-zero and NaN warnings, and test for
# their correct handling in our tests.
with np.errstate(invalid="ignore", divide="ignore"):
return self.gufunc(arr, alpha, axis=axis)
return self.gufunc(arr, alpha, min_weight, axis=axis)


class NumbaGroupNDReduce:
Expand Down
63 changes: 45 additions & 18 deletions numbagg/moving.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,43 @@
from .decorators import ndmoving, ndmovingexp


@ndmovingexp(
    [
        (float32[:], float32, float32, float32[:]),
        (float64[:], float64, float64, float64[:]),
    ]
)
def move_exp_nanmean(a, alpha, min_weight, out):
    """
    Calculate the exponentially decayed mean of `a`, writing into `out`.

    NaN values contribute nothing; existing numerator/denominator simply decay
    past them. `min_weight` is the minimum cumulative weight (which approaches
    1 when every value is present) required before a value is emitted; below
    the threshold, NaN is written instead.
    """
    N = len(a)

    numer = denom = weight = 0.0
    decay = 1.0 - alpha

    for i in range(N):
        a_i = a[i]

        # Decay all accumulators each step, whether or not a_i is present.
        numer *= decay
        denom *= decay
        weight *= decay

        if not np.isnan(a_i):
            numer += a_i
            denom += 1.0
            # With all values present, `weight` asymptotically approaches 1,
            # since alpha * sum((1 - alpha)**k) == 1.
            weight += alpha

        if weight >= min_weight:
            out[i] = numer / denom
        else:
            out[i] = np.nan


@ndmovingexp(
    [
        (float32[:], float32, float32, float32[:]),
        (float64[:], float64, float64, float64[:]),
    ]
)
def move_exp_nansum(a, alpha, min_weight, out):
    """
    Calculate the exponentially decayed sum of `a`, writing into `out`.

    `min_weight` is the minimum cumulative weight required before a value is
    emitted; below the threshold, NaN is written. Unlike the mean, a sum has
    no denominator to signal "no values seen yet", so `zero_count` tracks
    whether any non-NaN value has been observed — until then the output is NaN
    regardless of `min_weight`.
    """
    N = len(a)

    numer = weight = 0.0
    decay = 1.0 - alpha
    zero_count = True

    for i in range(N):
        a_i = a[i]

        # Decay the accumulators each step, whether or not a_i is present.
        numer *= decay
        weight *= decay

        if not np.isnan(a_i):
            zero_count = False
            numer += a_i
            weight += alpha

        if weight >= min_weight and not zero_count:
            out[i] = numer
        else:
            out[i] = np.nan


@ndmovingexp([(float32[:], float32, float32[:]), (float64[:], float64, float64[:])])
def move_exp_nanvar(a, alpha, out):
@ndmovingexp(
[
(float32[:], float32, float32, float32[:]),
(float64[:], float64, float64, float64[:]),
]
)
def move_exp_nanvar(a, alpha, min_weight, out):
N = len(a)

# sum_x: decayed sum of the sequence values.
# sum_x2: decayed sum of the squared sequence values.
# n: decayed count of non-missing values observed so far in the sequence.
# n2: decayed sum of the (already-decayed) weights of non-missing values.
sum_x2 = sum_x = sum_weight = sum_weight2 = 0.0
sum_x2 = sum_x = sum_weight = sum_weight2 = weight = 0.0
decay = 1.0 - alpha

for i in range(N):
a_i = a[i]

if not np.isnan(a_i):
sum_x2 += a_i**2
sum_x += a_i
sum_weight += 1.0
sum_weight2 += 1.0

# decay the values
sum_x2 *= decay
sum_x *= decay
sum_weight *= decay
# We decay this twice because we want the weight^2, so need to decay again
# (We could explain this better; contributions welcome...)
sum_weight2 *= decay**2
weight *= decay

if not np.isnan(a_i):
sum_x2 += a_i**2
sum_x += a_i
sum_weight += 1.0
sum_weight2 += 1.0
weight += alpha

var_biased = (sum_x2 / sum_weight) - ((sum_x / sum_weight) ** 2)

Expand All @@ -97,7 +121,10 @@ def move_exp_nanvar(a, alpha, out):
# = sum_weight2 / sum_weight**2
bias = 1 - sum_weight2 / (sum_weight**2)

out[i] = var_biased / bias
if weight >= min_weight:
out[i] = var_biased / bias
else:
out[i] = np.nan


@ndmoving(
Expand Down
68 changes: 68 additions & 0 deletions numbagg/test/test_moving.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,74 @@ def test_move_exp_nanmean_2d(rand_array):
assert_almost_equal(result, expected)


@pytest.mark.parametrize("func", [move_exp_nanmean, move_exp_nansum, move_exp_nanvar])
def test_move_exp_min_weight(func):
    """
    Check how many non-NaN values each exp function emits for a range of
    `min_weight` / `alpha` combinations.
    """
    # Make an array of 25 values, with the first 5 being NaN, and then look at the final
    # 19. We can't look at the whole series, because `nanvar` will always return NaN for
    # the first value.
    array = np.ones(25)
    array[:5] = np.nan

    # (min_weight, alpha, expected count of non-NaN results among the final 19).
    # min_weight=0 produces values everywhere; min_weight=1 never produces
    # values, since the cumulative weight only asymptotically approaches 1.
    # Higher alphas accumulate weight faster, so fewer results are suppressed.
    cases = [
        (0.0, 0.2, 19),
        (0.0, 0.8, 19),
        (0.5, 0.2, 17),
        (0.5, 0.8, 19),
        (0.9, 0.2, 10),
        (0.9, 0.8, 19),
        (1.0, 0.2, 0),
        (1.0, 0.8, 0),
    ]
    for min_weight, alpha, expected in cases:
        result = np.sum(~np.isnan(func(array, min_weight=min_weight, alpha=alpha))[6:])
        assert result == expected, (min_weight, alpha)


@pytest.mark.parametrize("n", [10, 200])
@pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9])
@pytest.mark.parametrize("test_nans", [True, False])
def test_move_exp_min_weight_numerical(n, alpha, rand_array, test_nans):
    """
    Confirm the `min_weight` cutoff agrees with an independently computed
    cumulative weight: a threshold just above it suppresses the final value,
    and one just below lets it through.
    """
    array = rand_array[0, :n]
    if not test_nans:
        array = np.nan_to_num(array)

    # Per-point contribution to the final value's weight: the newest point
    # contributes `alpha`; each older point decays by `(1 - alpha)` per step.
    # (High alphas mean fast decays, so initial weights are higher.)
    initial_weight = alpha
    decayed = [(1 - alpha) ** (i - 1) for i in range(n, 0, -1)]
    weights = np.array(decayed) * initial_weight
    assert_almost_equal(weights[-1], initial_weight)

    # NaN input points contribute no weight at all.
    weights = np.where(np.isnan(array), np.nan, weights)

    # Total weight backing the final output value.
    final_weight = np.nansum(weights)

    # Slightly above the available weight: the final value must be suppressed.
    above = move_exp_nanmean(array, alpha, min_weight=final_weight + 0.01)
    assert np.isnan(above[-1])

    # Slightly below: the final value must come through.
    below = move_exp_nanmean(array, alpha, min_weight=final_weight - 0.01)
    assert not np.isnan(below[-1])


def test_move_exp_nanmean_numeric():
array = np.array([10, 0, np.nan, 10])

Expand Down

0 comments on commit 843de2a

Please sign in to comment.