Skip to content

Commit

Permalink
Backport PR #50918 on branch 2.0.x (ENH: Optimize replace to avoid copying when not necessary) (#51652)
Browse files Browse the repository at this point in the history

Backport PR #50918: ENH: Optimize replace to avoid copying when not necessary

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Feb 26, 2023
1 parent 8906d4a commit 3a6fd1e
Show file tree
Hide file tree
Showing 4 changed files with 205 additions and 55 deletions.
36 changes: 30 additions & 6 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,7 @@ def replace(
inplace: bool = False,
# mask may be pre-computed if we're called from replace_list
mask: npt.NDArray[np.bool_] | None = None,
using_cow: bool = False,
) -> list[Block]:
"""
replace the to_replace value with value, possible to create new
Expand All @@ -566,7 +567,12 @@ def replace(
if isinstance(values, Categorical):
# TODO: avoid special-casing
# GH49404
blk = self if inplace else self.copy()
if using_cow and (self.refs.has_reference() or not inplace):
blk = self.copy()
elif using_cow:
blk = self.copy(deep=False)
else:
blk = self if inplace else self.copy()
values = cast(Categorical, blk.values)
values._replace(to_replace=to_replace, value=value, inplace=True)
return [blk]
Expand All @@ -576,22 +582,36 @@ def replace(
# replacing it is a no-op.
# Note: If to_replace were a list, NDFrame.replace would call
# replace_list instead of replace.
return [self] if inplace else [self.copy()]
if using_cow:
return [self.copy(deep=False)]
else:
return [self] if inplace else [self.copy()]

if mask is None:
mask = missing.mask_missing(values, to_replace)
if not mask.any():
# Note: we get here with test_replace_extension_other incorrectly
# bc _can_hold_element is incorrect.
return [self] if inplace else [self.copy()]
if using_cow:
return [self.copy(deep=False)]
else:
return [self] if inplace else [self.copy()]

elif self._can_hold_element(value):
blk = self if inplace else self.copy()
# TODO(CoW): Maybe split here as well into columns where mask has True
# and rest?
if using_cow:
if inplace:
blk = self.copy(deep=self.refs.has_reference())
else:
blk = self.copy()
else:
blk = self if inplace else self.copy()
putmask_inplace(blk.values, mask, value)
if not (self.is_object and value is None):
# if the user *explicitly* gave None, we keep None, otherwise
# may downcast to NaN
blocks = blk.convert(copy=False)
blocks = blk.convert(copy=False, using_cow=using_cow)
else:
blocks = [blk]
return blocks
Expand Down Expand Up @@ -619,6 +639,7 @@ def replace(
value=value,
inplace=True,
mask=mask[i : i + 1],
using_cow=using_cow,
)
)
return blocks
Expand Down Expand Up @@ -797,7 +818,10 @@ def _replace_coerce(
return [nb]
return [self] if inplace else [self.copy()]
return self.replace(
to_replace=to_replace, value=value, inplace=inplace, mask=mask
to_replace=to_replace,
value=value,
inplace=inplace,
mask=mask,
)

# ---------------------------------------------------------------------
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,11 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
assert not is_list_like(to_replace)
assert not is_list_like(value)
return self.apply(
"replace", to_replace=to_replace, value=value, inplace=inplace
"replace",
to_replace=to_replace,
value=value,
inplace=inplace,
using_cow=using_copy_on_write(),
)

def replace_regex(self, **kwargs):
Expand Down
38 changes: 0 additions & 38 deletions pandas/tests/copy_view/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,44 +1210,6 @@ def test_items(using_copy_on_write):
assert df.loc[0, name] == 0


@pytest.mark.parametrize(
"replace_kwargs",
[
{"to_replace": {"a": 1, "b": 4}, "value": -1},
# Test CoW splits blocks to avoid copying unchanged columns
{"to_replace": {"a": 1}, "value": -1},
{"to_replace": {"b": 4}, "value": -1},
{"to_replace": {"b": {4: 1}}},
# TODO: Add these in a further optimization
# We would need to see which columns got replaced in the mask
# which could be expensive
# {"to_replace": {"b": 1}},
# 1
],
)
def test_replace(using_copy_on_write, replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if using_copy_on_write:
if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
if using_copy_on_write:
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

if "a" in replace_kwargs["to_replace"]:
arr = get_array(df_replaced, "a")
df_replaced.loc[0, "a"] = 100
assert np.shares_memory(get_array(df_replaced, "a"), arr)
tm.assert_frame_equal(df, df_orig)


@pytest.mark.parametrize("dtype", ["int64", "Int64"])
def test_putmask(using_copy_on_write, dtype):
df = DataFrame({"a": [1, 2], "b": 1, "c": 2}, dtype=dtype)
Expand Down
180 changes: 170 additions & 10 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,34 +9,194 @@
from pandas.tests.copy_view.util import get_array


def test_replace_categorical_inplace_reference(using_copy_on_write):
df = DataFrame({"a": Categorical([1, 2, 3])})
@pytest.mark.parametrize(
"replace_kwargs",
[
{"to_replace": {"a": 1, "b": 4}, "value": -1},
# Test CoW splits blocks to avoid copying unchanged columns
{"to_replace": {"a": 1}, "value": -1},
{"to_replace": {"b": 4}, "value": -1},
{"to_replace": {"b": {4: 1}}},
# TODO: Add these in a further optimization
# We would need to see which columns got replaced in the mask
# which could be expensive
# {"to_replace": {"b": 1}},
# 1
],
)
def test_replace(using_copy_on_write, replace_kwargs):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
df_orig = df.copy()

df_replaced = df.replace(**replace_kwargs)

if using_copy_on_write:
if (df_replaced["b"] == df["b"]).all():
assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

# mutating squeezed df triggers a copy-on-write for that column/block
df_replaced.loc[0, "c"] = -1
if using_copy_on_write:
assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))

if "a" in replace_kwargs["to_replace"]:
arr = get_array(df_replaced, "a")
df_replaced.loc[0, "a"] = 100
assert np.shares_memory(get_array(df_replaced, "a"), arr)
tm.assert_frame_equal(df, df_orig)


def test_replace_mask_all_false_second_block(using_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
df_orig = df.copy()

df2 = df.replace(to_replace=1.5, value=55.5)

if using_copy_on_write:
# TODO: Block splitting would allow us to avoid copying b
assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

else:
assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

df2.loc[0, "c"] = 1
tm.assert_frame_equal(df, df_orig) # Original is unchanged

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
# TODO: This should split and not copy the whole block
# assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))


def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
df_orig = df.copy()

df2 = df.replace(to_replace=1.5, value="a")

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

elif not using_array_manager:
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

if using_copy_on_write:
df2.loc[0, "b"] = 0.5
tm.assert_frame_equal(df, df_orig) # Original is unchanged
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))


def test_replace_to_replace_wrong_dtype(using_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
df_orig = df.copy()

df2 = df.replace(to_replace="xxx", value=1.5)

if using_copy_on_write:
assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

else:
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))

df2.loc[0, "b"] = 0.5
tm.assert_frame_equal(df, df_orig) # Original is unchanged

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))


def test_replace_inplace(using_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
df.replace(to_replace=1.5, value=15.5, inplace=True)

assert np.shares_memory(get_array(df, "a"), arr_a)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)


@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
def test_replace_inplace_reference(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=[1], value=2, inplace=True)
df.replace(to_replace=to_replace, value=15.5, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert not np.shares_memory(get_array(df, "a"), arr_a)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert np.shares_memory(get_array(df, "a"), arr_a)


def test_replace_inplace_reference(using_copy_on_write):
@pytest.mark.parametrize("to_replace", ["a", 100.5])
def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
df = DataFrame({"a": [1.5, 2, 3]})
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=[1.5], value=15.5, inplace=True)
df.replace(to_replace=to_replace, value=15.5, inplace=True)

assert np.shares_memory(get_array(df, "a"), arr_a)
if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a"), arr_a)
assert not df._mgr._has_no_reference(0)
assert not view._mgr._has_no_reference(0)


@pytest.mark.parametrize("to_replace", [1, [1]])
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
df = DataFrame({"a": Categorical([1, 2, 3])})
df_orig = df.copy()
arr_a = get_array(df, "a")
view = df[:]
df.replace(to_replace=to_replace, value=val, inplace=True)

if using_copy_on_write:
assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
assert df._mgr._has_no_reference(0)
assert view._mgr._has_no_reference(0)
tm.assert_frame_equal(view, df_orig)
else:
assert np.shares_memory(get_array(df, "a"), arr_a)
assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)


@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace(using_copy_on_write, val):
df = DataFrame({"a": Categorical([1, 2, 3])})
arr_a = get_array(df, "a")
df.replace(to_replace=1, value=val, inplace=True)

assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
if using_copy_on_write:
assert df._mgr._has_no_reference(0)

expected = DataFrame({"a": Categorical([val, 2, 3])})
tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical(using_copy_on_write, val):
df = DataFrame({"a": Categorical([1, 2, 3])})
df_orig = df.copy()
df2 = df.replace(to_replace=1, value=val)

if using_copy_on_write:
assert df._mgr._has_no_reference(0)
assert df2._mgr._has_no_reference(0)
assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
tm.assert_frame_equal(df, df_orig)

arr_a = get_array(df2, "a").codes
df2.iloc[0, 0] = 2.0
assert np.shares_memory(get_array(df2, "a").codes, arr_a)


@pytest.mark.parametrize("method", ["where", "mask"])
Expand Down

0 comments on commit 3a6fd1e

Please sign in to comment.