From b5b447e982c43a97d3a667acf42237e08f2b5707 Mon Sep 17 00:00:00 2001 From: "T. Koskamp" Date: Sun, 23 Nov 2025 12:52:15 +0100 Subject: [PATCH 1/4] BUG: Inconsistent behavior of Groupby with None values with filter (#62501) --- doc/source/whatsnew/v2.3.4.rst | 1 + pandas/core/groupby/groupby.py | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.4.rst b/doc/source/whatsnew/v2.3.4.rst index 6e729c4bf2e2a..897cbacb03170 100644 --- a/doc/source/whatsnew/v2.3.4.rst +++ b/doc/source/whatsnew/v2.3.4.rst @@ -14,6 +14,7 @@ Bug fixes ^^^^^^^^^ - Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`) - Bug in :meth:`Series.str.replace` raising an error on valid group references (``\1``, ``\2``, etc.) on series converted to PyArrow backend dtype (:issue:`62653`) +- Bug in :meth:`.DataFrameGroupBy.filter` giving inconsistent results for groups whose key is ``None`` when grouping with ``dropna=False`` (:issue:`62501`) .. --------------------------------------------------------------------------- .. 
_whatsnew_234.contributors: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 2c8ec599a19ef..62bceace6acbe 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -650,6 +650,8 @@ def get_converter(s): return lambda key: Timestamp(key) elif isinstance(s, np.datetime64): return lambda key: Timestamp(key).asm8 + elif isna(s): + return lambda key: np.nan else: return lambda key: key @@ -684,11 +686,17 @@ def get_converter(s): for name in names ) + elif any(isna(k) for k in self.indices.keys()): + converters = [get_converter(name) for name in names] + names = (converter(name) for converter, name in zip(converters, names)) + else: converter = get_converter(index_sample) names = (converter(name) for name in names) - return [self.indices.get(name, []) for name in names] + indices = {np.nan if isna(k) else k: v for k, v in self.indices.items()} + + return [indices.get(name, []) for name in names] @final def _get_index(self, name): From d2046e9fc94d35aef1e8df4d9bfdbd924e955499 Mon Sep 17 00:00:00 2001 From: "T. 
Koskamp" Date: Tue, 25 Nov 2025 20:51:26 +0100 Subject: [PATCH 2/4] BUG: Inconsistent behavior of Groupby with None values with filter (#62501) - Add test cases - Add tuple support - Incorporate feedback --- pandas/core/groupby/groupby.py | 25 ++++++++++++++++------- pandas/tests/groupby/test_filters.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 62bceace6acbe..d71342876280d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -680,21 +680,32 @@ def get_converter(s): ) raise ValueError(msg) from err - converters = (get_converter(s) for s in index_sample) + has_nan = any(isna(n) for n in name_sample) + + sample = name_sample if has_nan else index_sample + converters = (get_converter(s) for s in sample) + names = ( tuple(f(n) for f, n in zip(converters, name, strict=True)) for name in names ) - elif any(isna(k) for k in self.indices.keys()): - converters = [get_converter(name) for name in names] - names = (converter(name) for converter, name in zip(converters, names)) - + indices = self.indices + if not self.dropna and has_nan: + indices = {} + for k, v in self.indices.items(): + k = tuple(np.nan if isna(e) else e for e in k) + indices[k] = v else: - converter = get_converter(index_sample) + has_nan = isna(name_sample) + + convert_sample = name_sample if has_nan else index_sample + converter = get_converter(convert_sample) names = (converter(name) for name in names) - indices = {np.nan if isna(k) else k: v for k, v in self.indices.items()} + indices = self.indices + if not self.dropna and has_nan: + indices = {np.nan if isna(k) else k: v for k, v in indices.items()} return [indices.get(name, []) for name in names] diff --git a/pandas/tests/groupby/test_filters.py b/pandas/tests/groupby/test_filters.py index 4fe3aac629513..c20fc9e3d62e7 100644 --- a/pandas/tests/groupby/test_filters.py +++ 
b/pandas/tests/groupby/test_filters.py @@ -606,3 +606,33 @@ def test_filter_consistent_result_before_after_agg_func(): grouper.sum() result = grouper.filter(lambda x: True) tm.assert_frame_equal(result, expected) + + +def test_filter_with_non_values(): + # GH 62501 + df = DataFrame( + [ + [1], + [None], + ], + columns=["a"], + ) + + result = df.groupby("a", dropna=False).filter(lambda x: True) + tm.assert_frame_equal(result, df) + + +def test_filter_with_non_values_multi_index(): + # GH 62501 + df = DataFrame( + [ + [1, 2], + [3, None], + [None, 4], + [None, None], + ], + columns=["a", "b"], + ) + + result = df.groupby(["a", "b"], dropna=False).filter(lambda x: True) + tm.assert_frame_equal(result, df) From 74057eb2efeae34e1cd2b59a717a9d1ae668ce95 Mon Sep 17 00:00:00 2001 From: "T. Koskamp" Date: Wed, 26 Nov 2025 23:01:49 +0100 Subject: [PATCH 3/4] Update indices property from groupby --- pandas/core/groupby/groupby.py | 50 ++++++++++------------------------ pandas/core/groupby/ops.py | 21 ++++++++++++-- 2 files changed, 32 insertions(+), 39 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d71342876280d..5dd2266d0bb22 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -637,7 +637,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: return self._grouper.indices @final - def _get_indices(self, names): + def _get_indices(self, name): """ Safe get multiple indices, translate keys for datelike to underlying repr. 
@@ -650,28 +650,27 @@ def get_converter(s): return lambda key: Timestamp(key) elif isinstance(s, np.datetime64): return lambda key: Timestamp(key).asm8 - elif isna(s): - return lambda key: np.nan else: return lambda key: key - if len(names) == 0: - return [] + if isna(name): + return self.indices.get(np.nan, []) + if isinstance(name, tuple): + name = tuple(np.nan if isna(comp) else comp for comp in name) if len(self.indices) > 0: index_sample = next(iter(self.indices)) else: index_sample = None # Dummy sample - name_sample = names[0] if isinstance(index_sample, tuple): - if not isinstance(name_sample, tuple): + if not isinstance(name, tuple): msg = "must supply a tuple to get_group with multiple grouping keys" raise ValueError(msg) - if not len(name_sample) == len(index_sample): + if not len(name) == len(index_sample): try: # If the original grouper was a tuple - return [self.indices[name] for name in names] + return self.indices[name] except KeyError as err: # turns out it wasn't a tuple msg = ( @@ -680,41 +679,20 @@ def get_converter(s): ) raise ValueError(msg) from err - has_nan = any(isna(n) for n in name_sample) - - sample = name_sample if has_nan else index_sample - converters = (get_converter(s) for s in sample) - - names = ( - tuple(f(n) for f, n in zip(converters, name, strict=True)) - for name in names - ) - - indices = self.indices - if not self.dropna and has_nan: - indices = {} - for k, v in self.indices.items(): - k = tuple(np.nan if isna(e) else e for e in k) - indices[k] = v + converters = (get_converter(s) for s in index_sample) + name = tuple(f(n) for f, n in zip(converters, name, strict=True)) else: - has_nan = isna(name_sample) - - convert_sample = name_sample if has_nan else index_sample - converter = get_converter(convert_sample) - names = (converter(name) for name in names) - - indices = self.indices - if not self.dropna and has_nan: - indices = {np.nan if isna(k) else k: v for k, v in indices.items()} + converter = 
get_converter(index_sample) + name = converter(name) - return [indices.get(name, []) for name in names] + return self.indices.get(name, []) @final def _get_index(self, name): """ Safe get index, translate keys for datelike to underlying repr. """ - return self._get_indices([name])[0] + return self._get_indices(name) @final @cache_readonly diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d86264cb95dc5..2591426906655 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -652,9 +652,24 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 - return self.groupings[0].indices - codes_list = [ping.codes for ping in self.groupings] - return get_indexer_dict(codes_list, self.levels) + result = self.groupings[0].indices + else: + codes_list = [ping.codes for ping in self.groupings] + result = get_indexer_dict(codes_list, self.levels) + if not self.dropna: + has_mi = isinstance(self.result_index, MultiIndex) + if not has_mi and self.result_index.hasnans: + result = { + np.nan if isna(key) else key: value for key, value in result.items() + } + elif has_mi: + # MultiIndex has no efficient way to tell if there are NAs + result = { + tuple(np.nan if isna(comp) else comp for comp in key): value + for key, value in result.items() + } + + return result @final @cache_readonly From f7c5e23876e6f1fea7d4277195893d1b790f2e0a Mon Sep 17 00:00:00 2001 From: "T. 
Koskamp" Date: Sat, 29 Nov 2025 17:00:27 +0100 Subject: [PATCH 4/4] Incorporate review suggestion for issue #63178 BUG: Inconsistent behavior of Groupby with None values with filter --- pandas/core/groupby/groupby.py | 9 +-------- pandas/core/groupby/ops.py | 3 ++- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5dd2266d0bb22..4b8b7717ad7ee 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -637,7 +637,7 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: return self._grouper.indices @final - def _get_indices(self, name): + def _get_index(self, name): """ Safe get multiple indices, translate keys for datelike to underlying repr. @@ -687,13 +687,6 @@ def get_converter(s): return self.indices.get(name, []) - @final - def _get_index(self, name): - """ - Safe get index, translate keys for datelike to underlying repr. - """ - return self._get_indices(name) - @final @cache_readonly def _selected_obj(self): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 2591426906655..f6600f39bbc57 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -665,7 +665,8 @@ def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: elif has_mi: # MultiIndex has no efficient way to tell if there are NAs result = { - tuple(np.nan if isna(comp) else comp for comp in key): value + # error: "Hashable" has no attribute "__iter__" (not iterable) + tuple(np.nan if isna(comp) else comp for comp in key): value # type: ignore[attr-defined] for key, value in result.items() }