numbagg · dcherian · Oct 15, 2023 · Dec 18, 2023 · Dec 18, 2023 · Dec 18, 2023
diff --git a/numbagg/decorators.py b/numbagg/decorators.py
@@ -432,18 +432,21 @@ def __init__(
                     values_dtypes, labels_dtypes
                 )
             ]
+        nargs = len(signature[0])
         for sig in signature:
             if not isinstance(sig, tuple):
                 raise TypeError(f"signatures for ndmoving must be tuples: {signature}")
-            if len(sig) != 3:
+            if len(sig) != nargs:
                 raise TypeError(
-                    "signature has wrong number of arguments != 3: " f"{signature}"
+                    f"signature has wrong number of argument != {nargs}: "
+                    f"{signature}"
                 )
             if any(ndim(arg) != 0 for arg in sig):
                 raise ValueError(
                     "all arguments in signature for ndreduce must be scalars: "
                     f" {signature}"
                 )
+        self.needs_count = nargs == 4
 
         self.signature = signature
 
@@ -456,12 +459,18 @@ def gufunc(self, core_ndim, *, target):
         numba_sig = []
         slices = (slice(None),) * core_ndim
         for input_sig in self.signature:
-            values, labels, out = input_sig
-            new_sig = (values[slices], labels[slices], out[:])
+            if self.needs_count:
+                values, labels, counts, out = input_sig
+                new_sig = (values[slices], labels[slices], counts[:], out[:])
+            else:
+                values, labels, out = input_sig
+                new_sig = (values[slices], labels[slices], out[:])  # type: ignore[assignment]
             numba_sig.append(new_sig)
 
         first_sig = numba_sig[0]
-        gufunc_sig = ",".join(2 * [_gufunc_arg_str(first_sig[0])]) + ",(z)"
+        gufunc_sig = ",".join(2 * [_gufunc_arg_str(first_sig[0])]) + (
+            ",(z)" if not self.needs_count else ",(z),(z)"
+        )
         vectorize = numba.guvectorize(
             numba_sig,
             gufunc_sig,
@@ -540,6 +549,7 @@ def __call__(
                 )
             values = np.moveaxis(values, axis, -1)
             gufunc = self.gufunc(1, target=target)
+
         else:
             values_shape = tuple(values.shape[ax] for ax in axis)
             if labels.shape != values_shape:
@@ -556,7 +566,11 @@ def __call__(
         # while `prod` uses 1. So we don't initialize with a value here, and instead
         # rely on the function to do so.
         result = np.empty(broadcast_shape + (num_labels,), values.dtype)
-        gufunc(values, labels, result)
+        if self.needs_count:
+            counts = np.zeros(result.shape, dtype=np.int64)
+            gufunc(values, labels, counts, result)
+        else:
+            gufunc(values, labels, result)
         return result
 
 

diff --git a/numbagg/grouped.py b/numbagg/grouped.py
@@ -1,11 +1,32 @@
 import numpy as np
+from numba.types import float32, float64, int32, int64
 
 from .decorators import groupndreduce
 
-
-@groupndreduce.wrap()
-def group_nanmean(values, labels, out):
-    counts = np.zeros(out.shape, dtype=labels.dtype)
+dtypes = [  # 3-argument signatures, presumably (values, labels, out) -- TODO confirm against users of this list
+    (int32, int32, int32),
+    (int32, int64, int32),
+    (int64, int32, int64),
+    (int64, int64, int64),
+    (float32, int32, float32),
+    (float32, int64, float32),
+    (float64, int32, float64),
+    (float64, int64, float64),
+]
+dtypes_counts = [  # 4-argument signatures (values, labels, counts, out); third entry is always int64
+    (int32, int32, int64, int32),
+    (int32, int64, int64, int32),
+    (int64, int32, int64, int64),
+    (int64, int64, int64, int64),
+    (float32, int32, int64, float32),
+    (float32, int64, int64, float32),
+    (float64, int32, int64, float64),
+    (float64, int64, int64, float64),
+]
+
+
+@groupndreduce.wrap(signature=dtypes_counts)
+def group_nanmean(values, labels, counts, out):
     out[:] = 0.0
 
     for indices in np.ndindex(values.shape):