From ca0b39b566b1287568637aa833854ce682179670 Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 18 Nov 2025 21:16:13 -0500 Subject: [PATCH 1/3] BUG: groupby raises on non-C-contiguous masks --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/groupby.pyx | 26 ++++++++++++------------ pandas/tests/groupby/test_all_methods.py | 17 ++++++++++++++++ 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c96bb7f663368..21fcd256ff749 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1260,6 +1260,7 @@ Groupby/resample/rolling - Bug in :meth:`Series.resample` could raise when the date range ended shortly before a non-existent time. (:issue:`58380`) - Bug in :meth:`Series.resample` raising error when resampling non-nanosecond resolutions out of bounds for nanosecond precision (:issue:`57427`) - Bug in :meth:`Series.rolling.var` and :meth:`Series.rolling.std` computing incorrect results due to numerical instability. 
(:issue:`47721`, :issue:`52407`, :issue:`54518`, :issue:`55343`) +- Bug in :meth:`DataFrame.groupby` methods raising on NumPy-nullable data when the NA mask was not C-contiguous (:issue:`61031`) Reshaping ^^^^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9f8ff86cbcb7e..32b29ac7af252 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -819,7 +819,7 @@ def group_prod( int64_t[::1] counts, ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, - const uint8_t[:, ::1] mask, + const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint skipna=True, @@ -893,7 +893,7 @@ def group_var( const intp_t[::1] labels, Py_ssize_t min_count=-1, int64_t ddof=1, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, str name="var", @@ -998,7 +998,7 @@ def group_skew( int64_t[::1] counts, ndarray[float64_t, ndim=2] values, const intp_t[::1] labels, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -1086,7 +1086,7 @@ def group_kurt( int64_t[::1] counts, ndarray[float64_t, ndim=2] values, const intp_t[::1] labels, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -1180,7 +1180,7 @@ def group_mean( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -1324,7 +1324,7 @@ def group_ohlc( ndarray[int64float_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: """ @@ -1870,7 +1870,7 @@ cdef group_min_max( Py_ssize_t min_count=-1, bint is_datetimelike=False, bint 
compute_max=True, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ): @@ -1983,7 +1983,7 @@ def group_idxmin_idxmax( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, str name="idxmin", bint skipna=True, uint8_t[:, ::1] result_mask=None, @@ -2096,7 +2096,7 @@ def group_max( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -2124,7 +2124,7 @@ def group_min( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -2148,7 +2148,7 @@ def group_min( cdef group_cummin_max( numeric_t[:, ::1] out, ndarray[numeric_t, ndim=2] values, - const uint8_t[:, ::1] mask, + const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask, const intp_t[::1] labels, int ngroups, @@ -2264,7 +2264,7 @@ def group_cummin( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: @@ -2290,7 +2290,7 @@ def group_cummax( const intp_t[::1] labels, int ngroups, bint is_datetimelike, - const uint8_t[:, ::1] mask=None, + const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint skipna=True, ) -> None: diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index 2310c3bf59e15..313ce09956f30 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -84,3 +84,20 @@ def test_dup_labels_output_shape(groupby_func, idx): assert result.shape == (1, 2) tm.assert_index_equal(result.columns, idx) + + +def 
test_not_c_contiguous_mask(groupby_func): + if groupby_func == "corrwith": + return + df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64") + reversed = DataFrame( + {"a": [2, 1, 1], "b": [5, 4, 3]}, dtype="Int64", index=[2, 1, 0] + )[::-1] + assert not reversed["b"].array._mask.flags["C_CONTIGUOUS"] + args = get_groupby_method_args(groupby_func, df) + + gb_reversed = reversed.groupby("a") + result = getattr(gb_reversed, groupby_func)(*args) + gb = df.groupby("a") + expected = getattr(gb, groupby_func)(*args) + tm.assert_equal(result, expected) From 19df340e31849e5bd12db0359958dee46db7326b Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 18 Nov 2025 21:19:59 -0500 Subject: [PATCH 2/3] Add comment --- pandas/tests/groupby/test_all_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index 313ce09956f30..5a98b35f485bc 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -88,6 +88,7 @@ def test_dup_labels_output_shape(groupby_func, idx): def test_not_c_contiguous_mask(groupby_func): if groupby_func == "corrwith": + # corrwith is deprecated return df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}, dtype="Int64") reversed = DataFrame( From ad96dd33ec85bb84cd552d8b77c37487521e62b7 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Thu, 20 Nov 2025 07:13:39 -0500 Subject: [PATCH 3/3] GH# --- pandas/tests/groupby/test_all_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_all_methods.py b/pandas/tests/groupby/test_all_methods.py index 5a98b35f485bc..7a012f5da4aa8 100644 --- a/pandas/tests/groupby/test_all_methods.py +++ b/pandas/tests/groupby/test_all_methods.py @@ -87,6 +87,7 @@ def test_dup_labels_output_shape(groupby_func, idx): def test_not_c_contiguous_mask(groupby_func): + # https://github.com/pandas-dev/pandas/issues/61031 if 
groupby_func == "corrwith": # corrwith is deprecated return