Skip to content

Commit

Permalink
Backport PR #51658 on branch 2.0.x (ENH: Improve replace lazy copy ha…
Browse files Browse the repository at this point in the history
…ndling) (#51679)

Backport PR #51658: ENH: Improve replace lazy copy handling

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Feb 27, 2023
1 parent 597d354 commit c047d9c
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 6 deletions.
27 changes: 22 additions & 5 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,8 @@ def replace_list(
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
]
if not len(pairs):
if using_cow:
return [self.copy(deep=False)]
# shortcut, nothing to replace
return [self] if inplace else [self.copy()]

Expand All @@ -734,8 +736,9 @@ def replace_list(
masks = [extract_bool_array(x) for x in masks]

if using_cow and inplace:
# TODO(CoW): Optimize
rb = [self.copy()]
# Don't set up refs here, otherwise we will think that we have
# references when we check again later
rb = [self]
else:
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):
Expand All @@ -762,10 +765,16 @@ def replace_list(
mask=m, # type: ignore[arg-type]
inplace=inplace,
regex=regex,
using_cow=using_cow,
)
if convert and blk.is_object and not all(x is None for x in dest_list):
# GH#44498 avoid unwanted cast-back
result = extend_blocks([b.convert(copy=True) for b in result])
result = extend_blocks(
[
b.convert(copy=True and not using_cow, using_cow=using_cow)
for b in result
]
)
new_rb.extend(result)
rb = new_rb
return rb
Expand All @@ -778,6 +787,7 @@ def _replace_coerce(
mask: npt.NDArray[np.bool_],
inplace: bool = True,
regex: bool = False,
using_cow: bool = False,
) -> list[Block]:
"""
Replace value corresponding to the given boolean array with another
Expand Down Expand Up @@ -811,17 +821,24 @@ def _replace_coerce(
if value is None:
# gh-45601, gh-45836, gh-46634
if mask.any():
nb = self.astype(np.dtype(object), copy=False)
if nb is self and not inplace:
has_ref = self.refs.has_reference()
nb = self.astype(np.dtype(object), copy=False, using_cow=using_cow)
if (nb is self or using_cow) and not inplace:
nb = nb.copy()
elif inplace and has_ref and nb.refs.has_reference():
# no copy in astype and we had refs before
nb = nb.copy()
putmask_inplace(nb.values, mask, value)
return [nb]
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
using_cow=using_cow,
)

# ---------------------------------------------------------------------
Expand Down
71 changes: 70 additions & 1 deletion pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ def test_replace_to_replace_wrong_dtype(using_copy_on_write):
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))


def test_replace_inplace(using_copy_on_write):
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
df.replace(to_replace=1.5, value=15.5, inplace=True)
Expand Down Expand Up @@ -216,3 +217,71 @@ def test_masking_inplace(using_copy_on_write, method):
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a"), arr_a)


def test_replace_empty_list(using_copy_on_write):
df = DataFrame({"a": [1, 2]})

df2 = df.replace([], [])
if using_copy_on_write:
assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
assert not df._mgr._has_no_reference(0)
else:
assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

arr_a = get_array(df, "a")
df.replace([], [])
if using_copy_on_write:
assert np.shares_memory(get_array(df, "a"), arr_a)
assert not df._mgr._has_no_reference(0)
assert not df2._mgr._has_no_reference(0)


@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(using_copy_on_write, value):
df = DataFrame({"a": ["a", "b", "c"]})
arr = get_array(df, "a")
df.replace(["c"], value, inplace=True)
if using_copy_on_write or value is None:
assert np.shares_memory(arr, get_array(df, "a"))
else:
# This could be inplace
assert not np.shares_memory(arr, get_array(df, "a"))
if using_copy_on_write:
assert df._mgr._has_no_reference(0)


def test_replace_list_multiple_elements_inplace(using_copy_on_write):
df = DataFrame({"a": [1, 2, 3]})
arr = get_array(df, "a")
df.replace([1, 2], 4, inplace=True)
if using_copy_on_write:
# TODO(CoW): This should share memory
assert not np.shares_memory(arr, get_array(df, "a"))
assert df._mgr._has_no_reference(0)
else:
assert np.shares_memory(arr, get_array(df, "a"))


def test_replace_list_none(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"]})

df_orig = df.copy()
df2 = df.replace(["b"], value=None)
tm.assert_frame_equal(df, df_orig)

assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))


def test_replace_list_none_inplace_refs(using_copy_on_write):
df = DataFrame({"a": ["a", "b", "c"]})
arr = get_array(df, "a")
df_orig = df.copy()
view = df[:]
df.replace(["a"], value=None, inplace=True)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert not np.shares_memory(arr, get_array(df, "a"))
tm.assert_frame_equal(df_orig, view)
else:
assert np.shares_memory(arr, get_array(df, "a"))

0 comments on commit c047d9c

Please sign in to comment.