From 316dcdf15f9fc642abcf9689b5f989887c67caab Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 29 Sep 2022 21:45:29 -0400 Subject: [PATCH 1/7] bug: MultiIndex.get_indexer with na/missing --- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/core/indexes/base.py | 5 ++++ pandas/core/indexes/multi.py | 9 ++++++++ pandas/tests/indexes/multi/test_indexing.py | 14 +++++++++++ pandas/tests/indexes/multi/test_join.py | 19 +++++++++++++++ pandas/tests/indexes/multi/test_reindex.py | 12 ++++++++++ pandas/tests/indexes/multi/test_setops.py | 23 ++++++++++++++++--- .../indexing/multiindex/test_multiindex.py | 20 ++++++++++++---- 8 files changed, 96 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index a1313f2e20d81..24502d5cad678 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -221,6 +221,7 @@ MultiIndex - Bug in :meth:`MultiIndex.intersection` losing extension array (:issue:`48604`) - Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) +- Bug in :meth:`MultiIndex.get_indexer` returning incorrect indices if index contains na/missing values (:issue:`#####`) - Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d2b4a4c7d130e..f81a024e1063d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4028,6 +4028,11 @@ def get_indexer( target, method=method, limit=limit, tolerance=tolerance ) + if self._is_multi and self._contains_na_any_level: + # GH# + # MultiIndex requires special treatment to handle missing values + return Index(self._values).get_indexer_for(target) + return self._get_indexer(target, method, limit, tolerance) def _get_indexer( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 574dfdf48055d..8500a76521d04 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2613,6 +2613,15 @@ def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index: target.names = self.names return target + @cache_readonly + def _contains_na_any_level(self) -> bool: + """ + Return True if any level of the index contains missing values. + """ + if len(self) == 0: + return False + return any(c.min() == -1 for c in self.codes) + # -------------------------------------------------------------------- # Indexing Methods diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 591b7abf60cc0..0d0a378a137e4 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -924,3 +924,17 @@ def test_get_locs_reordering(keys, expected): result = idx.get_locs(keys) expected = np.array(expected, dtype=np.intp) tm.assert_numpy_array_equal(result, expected) + + +def test_get_indexer_for_multiindex_with_nans(nulls_fixture): + # GH37222 + idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]) + idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"]) + + result = idx2.get_indexer(idx1) + expected = np.array([-1, 1]) + tm.assert_numpy_array_equal(result, expected) + + result = idx1.get_indexer(idx2) + expected = np.array([-1, 1]) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_join.py b/pandas/tests/indexes/multi/test_join.py index e6bec97aedb38..23d5325dde2bb 100644 --- a/pandas/tests/indexes/multi/test_join.py +++ b/pandas/tests/indexes/multi/test_join.py @@ -2,6 +2,7 @@ import pytest from pandas import ( + DataFrame, Index, Interval, MultiIndex, @@ -158,3 +159,21 @@ def test_join_overlapping_interval_level(): result = idx_1.join(idx_2, how="outer") tm.assert_index_equal(result, expected) + + +def test_join_multi_with_nan(): + # GH29252 + df1 = DataFrame( + data={"col1": [1.1, 1.2]}, + index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]), + ) + df2 = DataFrame( + data={"col2": [2.1, 2.2]}, + index=MultiIndex.from_product([["A"], [np.NaN, 2.0]], names=["id1", "id2"]), + ) + result = df1.join(df2) + expected = DataFrame( + data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]}, + index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_reindex.py b/pandas/tests/indexes/multi/test_reindex.py index 5d124c19cd865..7ef739056ada8 100644 --- a/pandas/tests/indexes/multi/test_reindex.py +++ b/pandas/tests/indexes/multi/test_reindex.py @@ -159,3 +159,15 @@ def test_reindex_limit_arg_with_multiindex(): match="limit argument only valid if doing pad, backfill or nearest reindexing", ): df.reindex(new_idx, fill_value=0, limit=1) + + +def test_reindex_with_none_in_nested_multiindex(): + # GH42883 + index = MultiIndex.from_tuples([(("a", None), 1), (("b", None), 2)]) + index2 = MultiIndex.from_tuples([(("b", None), 2), (("a", None), 1)]) + df1_dtype = pd.DataFrame([1, 2], index=index) + df2_dtype = pd.DataFrame([2, 1], index=index2) + + result = df1_dtype.reindex_like(df2_dtype) + expected = df2_dtype + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 718ac407d4a3f..b5cd81bf6fe95 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -4,6 +4,7 @@ import pandas as pd from pandas import ( CategoricalIndex, + DataFrame, Index, IntervalIndex, MultiIndex, @@ -550,6 +551,15 @@ def test_intersection_with_missing_values_on_both_sides(nulls_fixture): tm.assert_index_equal(result, expected) +def test_union_with_missing_values_on_both_sides(nulls_fixture): + # GH#38623 + mi1 = MultiIndex.from_arrays([[1, np.nan]]) + mi2 = MultiIndex.from_arrays([[1, np.nan, 3]]) + result = mi1.union(mi2) + expected = MultiIndex.from_arrays([[1, np.nan, 3]]) + tm.assert_index_equal(result, expected) + + def test_union_nan_got_duplicated(): # GH#38977 mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]]) @@ -620,15 +630,13 @@ def test_union_keep_dtype_precision(any_real_numeric_dtype): def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype): # GH#48498 - arr1 = Series([4, pd.NA], dtype=any_numeric_ea_dtype) arr2 = Series([1, pd.NA], dtype=any_numeric_ea_dtype) midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None]) midx2 = MultiIndex.from_arrays([arr2, [1, 2]]) result = midx.union(midx2) - # Expected is actually off and should contain (1, 1) too. See GH#37222 expected = MultiIndex.from_arrays( - [Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]] + [Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]] ) tm.assert_index_equal(result, expected) @@ -667,3 +675,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype): result = midx.intersection(midx2) expected = MultiIndex.from_arrays([Series([2], dtype=any_numeric_ea_dtype), [1]]) tm.assert_index_equal(result, expected) + + +def test_union_with_na_when_constructing_dataframe(): + # GH43222 + series1 = Series((1,), index=MultiIndex.from_tuples(((None, None),))) + series2 = Series((10, 20), index=MultiIndex.from_tuples(((None, None), ("a", "b")))) + result = DataFrame([series1, series2]) + expected = DataFrame({(np.nan, np.nan): [1.0, 10.0], ("a", "b"): [np.nan, 20.0]}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 08e15545cb998..2cc61eca8df34 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self): [1, 2], ], [ - [(81.0, np.nan), (np.nan, np.nan)], - [(81.0, np.nan), (np.nan, np.nan)], - [1, 2], - [1, 1], + [(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)], + [(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)], + [1, 2, np.nan], + [np.nan, 1, 2], ], ), ( @@ -227,3 +227,15 @@ def test_multiindex_repeated_keys(self): ], Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])), ) + + def test_multiindex_with_na_missing_key(self): + # GH46173 + df = DataFrame.from_dict( + { + ("foo",): [1, 2, 3], + ("bar",): [5, 6, 7], + (None,): [8, 9, 0], + } + ) + with pytest.raises(KeyError, match="missing_key"): + df[[("missing_key",)]] From 626d21637f39522419a87def2c865caaea9cb077 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 29 Sep 2022 21:52:58 -0400 Subject: [PATCH 2/7] add gh refs --- doc/source/whatsnew/v1.6.0.rst | 2 +- pandas/core/indexes/base.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index 24502d5cad678..8f24bd584e27c 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -221,7 +221,7 @@ MultiIndex - Bug in :meth:`MultiIndex.intersection` losing extension array (:issue:`48604`) - Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) -- Bug in :meth:`MultiIndex.get_indexer` returning incorrect indices if index contains na/missing values (:issue:`#####`) +- Bug in :meth:`MultiIndex.get_indexer` returning incorrect indices if index contains na/missing values (:issue:`48877`) - Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index f81a024e1063d..b55d99eafc0a8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4029,7 +4029,7 @@ def get_indexer( ) if self._is_multi and self._contains_na_any_level: - # GH# + # GH48877 # MultiIndex requires special treatment to handle missing values return Index(self._values).get_indexer_for(target) From e81e0bd094a94f3ab80fc1e8458c609e58e5df88 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 30 Sep 2022 06:32:17 -0400 Subject: [PATCH 3/7] fix test --- pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/multi/test_indexing.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b55d99eafc0a8..2f8d2cea01b66 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4031,7 +4031,7 @@ def get_indexer( if self._is_multi and self._contains_na_any_level: # GH48877 # MultiIndex requires special treatment to handle missing values - return Index(self._values).get_indexer_for(target) + return Index(self._values)._get_indexer(target, method, limit, tolerance) return self._get_indexer(target, method, limit, tolerance) diff --git a/pandas/tests/indexes/multi/test_indexing.py b/pandas/tests/indexes/multi/test_indexing.py index 0d0a378a137e4..8fdbd429eb5c8 100644 --- a/pandas/tests/indexes/multi/test_indexing.py +++ b/pandas/tests/indexes/multi/test_indexing.py @@ -932,9 +932,9 @@ def test_get_indexer_for_multiindex_with_nans(nulls_fixture): idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"]) result = idx2.get_indexer(idx1) - expected = np.array([-1, 1]) + expected = np.array([-1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) result = idx1.get_indexer(idx2) - expected = np.array([-1, 1]) + expected = np.array([-1, 1], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) From 1dbafa8b9d7a8906a83a3c5475d7e74bc1756035 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Fri, 30 Sep 2022 20:19:53 -0400 Subject: [PATCH 4/7] mypy --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2f8d2cea01b66..12227c0578c88 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -4028,7 +4028,7 @@ def get_indexer( target, method=method, limit=limit, tolerance=tolerance ) - if self._is_multi and self._contains_na_any_level: + if isinstance(self, ABCMultiIndex) and self._contains_na_any_level: # GH48877 # MultiIndex requires special treatment to handle missing values return Index(self._values)._get_indexer(target, method, limit, tolerance) From e133ad419eef94ed327795fd42760b08fcffb763 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 1 Oct 2022 09:57:27 -0400 Subject: [PATCH 5/7] add test for GH48905 --- pandas/tests/indexes/multi/test_isin.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py index 695458273d16e..bf003019d5387 100644 --- a/pandas/tests/indexes/multi/test_isin.py +++ b/pandas/tests/indexes/multi/test_isin.py @@ -13,6 +13,15 @@ def test_isin_nan(): ) +def test_isin_missing(nulls_fixture): + # GH48905 + mi1 = MultiIndex.from_tuples([(1, nulls_fixture)]) + mi2 = MultiIndex.from_tuples([(1, 1), (1, 2)]) + result = mi2.isin(mi1) + expected = np.array([False, False]) + tm.assert_numpy_array_equal(result, expected) + + def test_isin(): values = [("foo", 2), ("bar", 3), ("quux", 4)] From 2db9f821f572171ea8eb415acfa6afe9d1aad5ba Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Thu, 13 Oct 2022 20:46:37 -0400 Subject: [PATCH 6/7] fix tests --- pandas/tests/indexes/multi/test_setops.py | 6 +++--- pandas/tests/indexing/multiindex/test_multiindex.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py index 4238c74e991cb..74bd553ade765 100644 --- a/pandas/tests/indexes/multi/test_setops.py +++ b/pandas/tests/indexes/multi/test_setops.py @@ -553,10 +553,10 @@ def test_intersection_with_missing_values_on_both_sides(nulls_fixture): def test_union_with_missing_values_on_both_sides(nulls_fixture): # GH#38623 - mi1 = MultiIndex.from_arrays([[1, np.nan]]) - mi2 = MultiIndex.from_arrays([[1, np.nan, 3]]) + mi1 = MultiIndex.from_arrays([[1, nulls_fixture]]) + mi2 = MultiIndex.from_arrays([[1, nulls_fixture, 3]]) result = mi1.union(mi2) - expected = MultiIndex.from_arrays([[1, np.nan, 3]]) + expected = MultiIndex.from_arrays([[1, 3, nulls_fixture]]) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py index 2cc61eca8df34..dc46fb5ad2b47 100644 --- a/pandas/tests/indexing/multiindex/test_multiindex.py +++ b/pandas/tests/indexing/multiindex/test_multiindex.py @@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self): [1, 2], ], [ - [(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)], - [(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)], - [1, 2, np.nan], - [np.nan, 1, 2], + [(81.0, np.nan), (82.0, np.nan), (np.nan, np.nan)], + [(81.0, np.nan), (82.0, np.nan), (np.nan, np.nan)], + [1, np.nan, 2], + [np.nan, 2, 1], ], ), ( From 81c07d4314414d3cb1ef8d52f899c9ca571bcc0b Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Mon, 7 Nov 2022 19:44:47 -0500 Subject: [PATCH 7/7] tests only --- doc/source/whatsnew/v2.0.0.rst | 3 +-- pandas/core/indexes/base.py | 5 ----- pandas/core/indexes/multi.py | 9 --------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 2194ccf98896e..f69c0a7e19f93 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -558,7 +558,7 @@ Missing MultiIndex ^^^^^^^^^^ -- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`37222`) +- Bug in :meth:`MultiIndex.get_indexer` not matching ``NaN`` values (:issue:`29252`, :issue:`37222`, :issue:`38623`, :issue:`42883`, :issue:`43222`, :issue:`46173`, :issue:`48905`) - Bug in :meth:`MultiIndex.argsort` raising ``TypeError`` when index contains :attr:`NA` (:issue:`48495`) - Bug in :meth:`MultiIndex.difference` losing extension array dtype (:issue:`48606`) - Bug in :class:`MultiIndex.set_levels` raising ``IndexError`` when setting empty level (:issue:`48636`) @@ -567,7 +567,6 @@ MultiIndex - Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`, :issue:`48900`) - Bug in :meth:`MultiIndex.union` not sorting when sort=None and index contains missing values (:issue:`49010`) - Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`) -- Bug in :meth:`MultiIndex.get_indexer` returning incorrect indices if index contains na/missing values (:issue:`48877`) - Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`) - diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 0133cd5864293..92a8a7a7adcae 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3729,11 +3729,6 @@ def get_indexer( target, method=method, limit=limit, tolerance=tolerance ) - if isinstance(self, ABCMultiIndex) and self._contains_na_any_level: - # GH48877 - # MultiIndex requires special treatment to handle missing values - return Index(self._values)._get_indexer(target, method, limit, tolerance) - return self._get_indexer(target, method, limit, tolerance) def _get_indexer( diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d9c7f8e023a85..61243118819c0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2509,15 +2509,6 @@ def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index: target.names = self.names return target - @cache_readonly - def _contains_na_any_level(self) -> bool: - """ - Return True if any level of the index contains missing values. - """ - if len(self) == 0: - return False - return any(c.min() == -1 for c in self.codes) - # -------------------------------------------------------------------- # Indexing Methods