Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: MultiIndex.putmask losing ea dtype #49847

Merged
merged 4 commits into from
Nov 24, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,7 @@ MultiIndex
- Bug in :meth:`MultiIndex.union` not sorting when sort=None and index contains missing values (:issue:`49010`)
- Bug in :meth:`MultiIndex.append` not checking names for equality (:issue:`48288`)
- Bug in :meth:`MultiIndex.symmetric_difference` losing extension array (:issue:`48607`)
- Bug in :meth:`MultiIndex.putmask` losing extension array (:issue:`49830`)
mroeschke marked this conversation as resolved.
Show resolved Hide resolved
- Bug in :meth:`MultiIndex.value_counts` returning a :class:`Series` indexed by flat index of tuples instead of a :class:`MultiIndex` (:issue:`49558`)
-

Expand Down
10 changes: 8 additions & 2 deletions pandas/core/array_algos/putmask.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
"""
from __future__ import annotations

from typing import Any
from typing import (
TYPE_CHECKING,
Any,
)

import numpy as np

Expand All @@ -19,6 +22,9 @@

from pandas.core.arrays import ExtensionArray

if TYPE_CHECKING:
from pandas import MultiIndex


def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None:
"""
Expand Down Expand Up @@ -96,7 +102,7 @@ def putmask_without_repeat(


def validate_putmask(
values: ArrayLike, mask: np.ndarray
values: ArrayLike | MultiIndex, mask: np.ndarray
) -> tuple[npt.NDArray[np.bool_], bool]:
"""
Validate mask and check if this putmask operation is a no-op.
Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5159,7 +5159,6 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index:

return Index._with_infer(result, name=name)

@final
def putmask(self, mask, value) -> Index:
"""
Return a new Index of the values set with the mask.
Expand Down
42 changes: 42 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
)

import pandas.core.algorithms as algos
from pandas.core.array_algos.putmask import validate_putmask
from pandas.core.arrays import Categorical
from pandas.core.arrays.categorical import factorize_from_iterables
import pandas.core.common as com
Expand Down Expand Up @@ -3659,6 +3660,47 @@ def _validate_fill_value(self, item):
raise ValueError("Item must have length equal to number of levels.")
return item

def putmask(self, mask, value: MultiIndex) -> MultiIndex:
"""
Return a new MultiIndex of the values set with the mask.

Parameters
----------
mask : array like
value : MultiIndex
Must either be the same length as self or length one

Returns
-------
MultiIndex
"""
mask, noop = validate_putmask(self, mask)
if noop:
return self.copy()

if len(mask) == len(value):
subset = value[mask].remove_unused_levels()
else:
subset = value.remove_unused_levels()

new_levels = []
new_codes = []

for i, (value_level, level, level_codes) in enumerate(
zip(subset.levels, self.levels, self.codes)
):
new_elements = value_level.difference(level)
new_level = level.append(new_elements)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the two lines above be replaced with new_level = level.union(value_level)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, was thinking about duplicates when implementing this, but levels are unique…

value_codes = new_level.get_indexer_for(subset.get_level_values(i))
new_code = ensure_int64(level_codes)
new_code[mask] = value_codes
new_levels.append(new_level)
new_codes.append(new_code)

return MultiIndex(
levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False
)

def insert(self, loc: int, item) -> MultiIndex:
"""
Make new MultiIndex inserting new item at location
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/indexes/multi/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,34 @@ def test_putmask_multiindex_other(self):
expected = MultiIndex.from_tuples([right[0], right[1], left[2]])
tm.assert_index_equal(result, expected)

def test_putmask_keep_dtype(self, any_numeric_ea_dtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are the two test cases also covering for the all-nan case in #49830 (comment) or do you think it's not necessary?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's about losing the dtype, so yes this covers the all nan case. But this does not fix the issue yet

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The variable might be a misnomer then any_numeric_ea_dtype (if it covers all nans, this would also include non EA dtype).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It covers the case in a sense that the current tests represent the all nan case as well, I’ll add a specific test when actually fixing the issue anyway

# GH#49830
midx = MultiIndex.from_arrays(
[pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]]
)
midx2 = MultiIndex.from_arrays(
[pd.Series([5, 6, 7], dtype=any_numeric_ea_dtype), [-1, -2, -3]]
)
result = midx.putmask([True, False, False], midx2)
expected = MultiIndex.from_arrays(
[pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]]
)
tm.assert_index_equal(result, expected)

def test_putmask_keep_dtype_shorter_value(self, any_numeric_ea_dtype):
# GH#49830
midx = MultiIndex.from_arrays(
[pd.Series([1, 2, 3], dtype=any_numeric_ea_dtype), [10, 11, 12]]
)
midx2 = MultiIndex.from_arrays(
[pd.Series([5], dtype=any_numeric_ea_dtype), [-1]]
)
result = midx.putmask([True, False, False], midx2)
expected = MultiIndex.from_arrays(
[pd.Series([5, 2, 3], dtype=any_numeric_ea_dtype), [-1, 11, 12]]
)
tm.assert_index_equal(result, expected)


class TestGetIndexer:
def test_get_indexer(self):
Expand Down