Skip to content

Commit

Permalink
A min_weight param to exp functions (#108)
Browse files Browse the repository at this point in the history
* Proposal for a `min_weight` param to exp functions

One issue with using the current exp functions is that there's no `min_periods` parameter, meaning that initial results can be overly influenced by a couple of values.

This PR adds a `min_weight` parameter to the exp functions, which is the minimum weight that a value requires before producing a value.

Assuming that `alpha` is constant, this is equivalent to taking a rolling sum of `1`s and `0`s (1 where a value is present, 0 where it's missing) and then passing the original result through where the corresponding value of the rolling sum is greater than `min_weight/alpha`.

So for a series with a slower moving average (i.e. a lower alpha), we require more points to be present before producing a value. Similarly, a higher `min_weight` value will require more points to be present. If all values exist, the cumulative weight will only asymptotically approach 1 from below — so a `min_weight` of 1 will never produce values.

* Speed up `nanmean` & `nansum` by 20% & 50% by yolo-ing NaNs

This has a surprisingly large effect on the speed of these routines, and without any loss of accuracy.

* Apply `NaN` handling to `rolling_exp_nanvar`

This speeds up `rolling_exp_nanvar` 5-10%

It will likely be slower if there are lots of leading `NaN`s; but I don't expect that's an overridingly common case, and this also makes the code simpler

* Remove some of the attempts to speed it up, it's surprisingly fast already

* Update numbagg/moving.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Add pandas comparison `nansum` test

* Add back checks for `nansum`, which can't use `denom` to assess whether it's seen values

* Refine benchmarks

* more thorough testing

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
max-sixty and pre-commit-ci[bot] committed Oct 7, 2023
1 parent 6670eb0 commit 843de2a
Show file tree
Hide file tree
Showing 4 changed files with 118 additions and 20 deletions.
3 changes: 3 additions & 0 deletions benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,9 @@ def setup(self, func, n):
def time_numbagg(self, func, n):
    # Benchmark the numbagg implementation (first element of the `func` pair)
    # with alpha=0.5 and no min_weight, i.e. the pre-existing fast path.
    # NOTE(review): `self.array` is built in `setup` (outside this hunk) — its
    # shape/dtype aren't visible here.
    func[0](self.array, 0.5)

def time_numbagg_min_weight(self, func, n):
    # Benchmark the numbagg implementation with `min_weight` set, so the
    # overhead of the weight-threshold check can be compared against the
    # default path in `time_numbagg` (where min_weight defaults to 0).
    func[0](self.array, 0.5, min_weight=0.2)

def time_pandas(self, func, n):
    # Benchmark the equivalent pandas implementation (second element of the
    # `func` pair) against the pre-built `self.df_ewm` object, for comparison.
    func[1](self.df_ewm)

Expand Down
4 changes: 2 additions & 2 deletions numbagg/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def __call__(self, arr: np.ndarray, window, min_count=None, axis=-1):


class NumbaNDMovingExp(NumbaNDMoving):
def __call__(self, arr, alpha, axis=-1):
def __call__(self, arr, alpha, min_weight=0, axis=-1):
if alpha < 0:
raise ValueError(f"alpha must be positive: {alpha}")
# If an empty tuple is passed, there's no reduction to do, so we return the
Expand All @@ -243,7 +243,7 @@ def __call__(self, arr, alpha, axis=-1):
# For the sake of speed, we ignore divide-by-zero and NaN warnings, and test for
# their correct handling in our tests.
with np.errstate(invalid="ignore", divide="ignore"):
return self.gufunc(arr, alpha, axis=axis)
return self.gufunc(arr, alpha, min_weight, axis=axis)


class NumbaGroupNDReduce:
Expand Down
63 changes: 45 additions & 18 deletions numbagg/moving.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,28 +4,43 @@
from .decorators import ndmoving, ndmovingexp


@ndmovingexp(
    [
        (float32[:], float32, float32, float32[:]),
        (float64[:], float64, float64, float64[:]),
    ]
)
def move_exp_nanmean(a, alpha, min_weight, out):
    """
    Calculate the exponentially decayed mean of `a`, writing into `out`.

    NaN values contribute nothing; existing numerator/denominator simply decay
    past them. `min_weight` is the minimum cumulative weight (which approaches
    1 when every value is present) required before a value is emitted; below
    the threshold, NaN is written instead.
    """
    N = len(a)

    numer = denom = weight = 0.0
    decay = 1.0 - alpha

    for i in range(N):
        a_i = a[i]

        # Decay all accumulators each step, whether or not a_i is present.
        numer *= decay
        denom *= decay
        weight *= decay

        if not np.isnan(a_i):
            numer += a_i
            denom += 1.0
            # With all values present, `weight` asymptotically approaches 1,
            # since alpha * sum((1 - alpha)**k) == 1.
            weight += alpha

        if weight >= min_weight:
            out[i] = numer / denom
        else:
            out[i] = np.nan


@ndmovingexp(
    [
        (float32[:], float32, float32, float32[:]),
        (float64[:], float64, float64, float64[:]),
    ]
)
def move_exp_nansum(a, alpha, min_weight, out):
    """
    Calculate the exponentially decayed sum of `a`, writing into `out`.

    `min_weight` is the minimum cumulative weight required before a value is
    emitted; below the threshold, NaN is written. Unlike the mean, a sum has
    no denominator to signal "no values seen yet", so `zero_count` tracks
    whether any non-NaN value has been observed — until then the output is NaN
    regardless of `min_weight`.
    """
    N = len(a)

    numer = weight = 0.0
    decay = 1.0 - alpha
    zero_count = True

    for i in range(N):
        a_i = a[i]

        # Decay the accumulators each step, whether or not a_i is present.
        numer *= decay
        weight *= decay

        if not np.isnan(a_i):
            zero_count = False
            numer += a_i
            weight += alpha

        if weight >= min_weight and not zero_count:
            out[i] = numer
        else:
            out[i] = np.nan


@ndmovingexp([(float32[:], float32, float32[:]), (float64[:], float64, float64[:])])
def move_exp_nanvar(a, alpha, out):
@ndmovingexp(
[
(float32[:], float32, float32, float32[:]),
(float64[:], float64, float64, float64[:]),
]
)
def move_exp_nanvar(a, alpha, min_weight, out):
N = len(a)

# sum_x: decayed sum of the sequence values.
# sum_x2: decayed sum of the squared sequence values.
# n: decayed count of non-missing values observed so far in the sequence.
# n2: decayed sum of the (already-decayed) weights of non-missing values.
sum_x2 = sum_x = sum_weight = sum_weight2 = 0.0
sum_x2 = sum_x = sum_weight = sum_weight2 = weight = 0.0
decay = 1.0 - alpha

for i in range(N):
a_i = a[i]

if not np.isnan(a_i):
sum_x2 += a_i**2
sum_x += a_i
sum_weight += 1.0
sum_weight2 += 1.0

# decay the values
sum_x2 *= decay
sum_x *= decay
sum_weight *= decay
# We decay this twice because we want the weight^2, so need to decay again
# (We could explain this better; contributions welcome...)
sum_weight2 *= decay**2
weight *= decay

if not np.isnan(a_i):
sum_x2 += a_i**2
sum_x += a_i
sum_weight += 1.0
sum_weight2 += 1.0
weight += alpha

var_biased = (sum_x2 / sum_weight) - ((sum_x / sum_weight) ** 2)

Expand All @@ -97,7 +121,10 @@ def move_exp_nanvar(a, alpha, out):
# = sum_weight2 / sum_weight**2
bias = 1 - sum_weight2 / (sum_weight**2)

out[i] = var_biased / bias
if weight >= min_weight:
out[i] = var_biased / bias
else:
out[i] = np.nan


@ndmoving(
Expand Down
68 changes: 68 additions & 0 deletions numbagg/test/test_moving.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,74 @@ def test_move_exp_nanmean_2d(rand_array):
assert_almost_equal(result, expected)


@pytest.mark.parametrize("func", [move_exp_nanmean, move_exp_nansum, move_exp_nanvar])
def test_move_exp_min_weight(func):
    """
    Check how many non-NaN values each exp function emits for a range of
    `min_weight` / `alpha` combinations.
    """
    # Make an array of 25 values, with the first 5 being NaN, and then look at the final
    # 19. We can't look at the whole series, because `nanvar` will always return NaN for
    # the first value.
    array = np.ones(25)
    array[:5] = np.nan

    # (min_weight, alpha, expected count of non-NaN results among the final 19).
    # min_weight=0 produces values everywhere; min_weight=1 never produces
    # values, since the cumulative weight only asymptotically approaches 1.
    # Higher alphas accumulate weight faster, so fewer results are suppressed.
    cases = [
        (0.0, 0.2, 19),
        (0.0, 0.8, 19),
        (0.5, 0.2, 17),
        (0.5, 0.8, 19),
        (0.9, 0.2, 10),
        (0.9, 0.8, 19),
        (1.0, 0.2, 0),
        (1.0, 0.8, 0),
    ]
    for min_weight, alpha, expected in cases:
        result = np.sum(~np.isnan(func(array, min_weight=min_weight, alpha=alpha))[6:])
        assert result == expected, (min_weight, alpha)


@pytest.mark.parametrize("n", [10, 200])
@pytest.mark.parametrize("alpha", [0.1, 0.5, 0.9])
@pytest.mark.parametrize("test_nans", [True, False])
def test_move_exp_min_weight_numerical(n, alpha, rand_array, test_nans):
    """
    Confirm the `min_weight` cutoff agrees with an independently computed
    cumulative weight: a threshold just above it suppresses the final value,
    and one just below lets it through.
    """
    array = rand_array[0, :n]
    if not test_nans:
        array = np.nan_to_num(array)

    # Per-point contribution to the final value's weight: the newest point
    # contributes `alpha`; each older point decays by `(1 - alpha)` per step.
    # (High alphas mean fast decays, so initial weights are higher.)
    initial_weight = alpha
    decayed = [(1 - alpha) ** (i - 1) for i in range(n, 0, -1)]
    weights = np.array(decayed) * initial_weight
    assert_almost_equal(weights[-1], initial_weight)

    # NaN input points contribute no weight at all.
    weights = np.where(np.isnan(array), np.nan, weights)

    # Total weight backing the final output value.
    final_weight = np.nansum(weights)

    # Slightly above the available weight: the final value must be suppressed.
    above = move_exp_nanmean(array, alpha, min_weight=final_weight + 0.01)
    assert np.isnan(above[-1])

    # Slightly below: the final value must come through.
    below = move_exp_nanmean(array, alpha, min_weight=final_weight - 0.01)
    assert not np.isnan(below[-1])


def test_move_exp_nanmean_numeric():
array = np.array([10, 0, np.nan, 10])

Expand Down

0 comments on commit 843de2a

Please sign in to comment.