Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: MultiIndex.get_indexer with na/missing #48877

Merged
merged 9 commits into from
Nov 9, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.6.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ MultiIndex
- Bug in :meth:`MultiIndex.intersection` losing extension array (:issue:`48604`)
- Bug in :meth:`MultiIndex.union` losing extension array (:issue:`48498`, :issue:`48505`)
- Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
- Bug in :meth:`MultiIndex.get_indexer` returning incorrect indices if index contains na/missing values (:issue:`48877`)
- Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`)
-

Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4028,6 +4028,11 @@ def get_indexer(
target, method=method, limit=limit, tolerance=tolerance
)

if self._is_multi and self._contains_na_any_level:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer fixing this at the engine level rather than special-casing it here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I'll take another look.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is probably non-trivial; I'm not sure what's wrong there.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not seeing an obvious way to fix this at the engine level. If you look a few lines above this, Categorical is already special-cased to handle NaNs in a similar way. Are you OK with special-casing for now?

# GH48877
# MultiIndex requires special treatment to handle missing values
return Index(self._values).get_indexer_for(target)

return self._get_indexer(target, method, limit, tolerance)

def _get_indexer(
Expand Down
9 changes: 9 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2613,6 +2613,15 @@ def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index:
target.names = self.names
return target

@cache_readonly
def _contains_na_any_level(self) -> bool:
    """
    Report whether at least one level of the index holds a missing value.

    A level counts as containing a missing value when the smallest entry
    of its codes array is -1. An empty index never contains missing values.
    """
    if not len(self):
        # min() is undefined on empty codes, so answer before touching them.
        return False
    for level_codes in self.codes:
        if level_codes.min() == -1:
            return True
    return False

# --------------------------------------------------------------------
# Indexing Methods

Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,3 +924,17 @@ def test_get_locs_reordering(keys, expected):
result = idx.get_locs(keys)
expected = np.array(expected, dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)


def test_get_indexer_for_multiindex_with_nans(nulls_fixture):
# GH37222
idx1 = MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"])
idx2 = MultiIndex.from_product([["A"], [nulls_fixture, 2.0]], names=["id1", "id2"])

result = idx2.get_indexer(idx1)
expected = np.array([-1, 1])
tm.assert_numpy_array_equal(result, expected)

result = idx1.get_indexer(idx2)
expected = np.array([-1, 1])
tm.assert_numpy_array_equal(result, expected)
19 changes: 19 additions & 0 deletions pandas/tests/indexes/multi/test_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pytest

from pandas import (
DataFrame,
Index,
Interval,
MultiIndex,
Expand Down Expand Up @@ -158,3 +159,21 @@ def test_join_overlapping_interval_level():
result = idx_1.join(idx_2, how="outer")

tm.assert_index_equal(result, expected)


def test_join_multi_with_nan():
    # GH29252
    # Left-joining on a MultiIndex must not match a missing key in the right
    # frame against a real key in the left frame.
    df1 = DataFrame(
        data={"col1": [1.1, 1.2]},
        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
    )
    df2 = DataFrame(
        data={"col2": [2.1, 2.2]},
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0.
        index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]),
    )
    result = df1.join(df2)
    # Only ("A", 2.0) exists on both sides; ("A", 1.0) gets NaN for col2.
    expected = DataFrame(
        data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]},
        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
    )
    tm.assert_frame_equal(result, expected)
12 changes: 12 additions & 0 deletions pandas/tests/indexes/multi/test_reindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,15 @@ def test_reindex_limit_arg_with_multiindex():
match="limit argument only valid if doing pad, backfill or nearest reindexing",
):
df.reindex(new_idx, fill_value=0, limit=1)


def test_reindex_with_none_in_nested_multiindex():
    # GH42883
    # The first element of every key is itself a tuple containing None; the
    # target index holds the same two keys in the opposite order.
    source_index = MultiIndex.from_tuples([(("a", None), 1), (("b", None), 2)])
    target_index = MultiIndex.from_tuples([(("b", None), 2), (("a", None), 1)])
    source = pd.DataFrame([1, 2], index=source_index)
    expected = pd.DataFrame([2, 1], index=target_index)

    # Reindexing onto the reordered keys must carry each value along with
    # its key, reproducing the target frame exactly.
    result = source.reindex_like(expected)
    tm.assert_frame_equal(result, expected)
23 changes: 20 additions & 3 deletions pandas/tests/indexes/multi/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
from pandas import (
CategoricalIndex,
DataFrame,
Index,
IntervalIndex,
MultiIndex,
Expand Down Expand Up @@ -550,6 +551,15 @@ def test_intersection_with_missing_values_on_both_sides(nulls_fixture):
tm.assert_index_equal(result, expected)


def test_union_with_missing_values_on_both_sides(nulls_fixture):
# GH#38623
mi1 = MultiIndex.from_arrays([[1, np.nan]])
mi2 = MultiIndex.from_arrays([[1, np.nan, 3]])
result = mi1.union(mi2)
expected = MultiIndex.from_arrays([[1, np.nan, 3]])
tm.assert_index_equal(result, expected)


def test_union_nan_got_duplicated():
# GH#38977
mi1 = MultiIndex.from_arrays([[1.0, np.nan], [2, 3]])
Expand Down Expand Up @@ -620,15 +630,13 @@ def test_union_keep_dtype_precision(any_real_numeric_dtype):

def test_union_keep_ea_dtype_with_na(any_numeric_ea_dtype):
# GH#48498

arr1 = Series([4, pd.NA], dtype=any_numeric_ea_dtype)
arr2 = Series([1, pd.NA], dtype=any_numeric_ea_dtype)
midx = MultiIndex.from_arrays([arr1, [2, 1]], names=["a", None])
midx2 = MultiIndex.from_arrays([arr2, [1, 2]])
result = midx.union(midx2)
# Expected is actually off and should contain (1, 1) too. See GH#37222
expected = MultiIndex.from_arrays(
[Series([4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [2, 1, 2]]
[Series([1, 4, pd.NA, pd.NA], dtype=any_numeric_ea_dtype), [1, 2, 1, 2]]
)
tm.assert_index_equal(result, expected)

Expand Down Expand Up @@ -667,3 +675,12 @@ def test_intersection_keep_ea_dtypes(val, any_numeric_ea_dtype):
result = midx.intersection(midx2)
expected = MultiIndex.from_arrays([Series([2], dtype=any_numeric_ea_dtype), [1]])
tm.assert_index_equal(result, expected)


def test_union_with_na_when_constructing_dataframe():
    # GH43222
    # Two Series whose MultiIndexes share an all-missing key; building a
    # DataFrame from them has to union those indexes into the columns.
    na_only_index = MultiIndex.from_tuples(((None, None),))
    mixed_index = MultiIndex.from_tuples(((None, None), ("a", "b")))
    short_row = Series((1,), index=na_only_index)
    long_row = Series((10, 20), index=mixed_index)

    result = DataFrame([short_row, long_row])

    expected_columns = {
        (np.nan, np.nan): [1.0, 10.0],
        ("a", "b"): [np.nan, 20.0],
    }
    expected = DataFrame(expected_columns)
    tm.assert_frame_equal(result, expected)
20 changes: 16 additions & 4 deletions pandas/tests/indexing/multiindex/test_multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,10 +162,10 @@ def test_rename_multiindex_with_duplicates(self):
[1, 2],
],
[
[(81.0, np.nan), (np.nan, np.nan)],
[(81.0, np.nan), (np.nan, np.nan)],
[1, 2],
[1, 1],
[(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)],
[(81.0, np.nan), (np.nan, np.nan), (82.0, np.nan)],
[1, 2, np.nan],
[np.nan, 1, 2],
],
),
(
Expand Down Expand Up @@ -227,3 +227,15 @@ def test_multiindex_repeated_keys(self):
],
Series([1, 1, 2, 2], MultiIndex.from_arrays([["a", "a", "b", "b"]])),
)

def test_multiindex_with_na_missing_key(self):
    # GH46173
    # Columns are keyed by one-element tuples, one of which is (None,).
    column_data = {
        ("foo",): [1, 2, 3],
        ("bar",): [5, 6, 7],
        (None,): [8, 9, 0],
    }
    df = DataFrame.from_dict(column_data)
    absent_keys = [("missing_key",)]
    # Selecting a key that is not among the columns must raise KeyError
    # naming that key.
    with pytest.raises(KeyError, match="missing_key"):
        df[absent_keys]