Skip to content

Commit

Permalink
Backport PR #51669 on branch 2.0.x (ENH: Add CoW mechanism to replace_regex) (#51713)
Browse files Browse the repository at this point in the history

Backport PR #51669: ENH: Add CoW mechanism to replace_regex

Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and phofl committed Mar 1, 2023
1 parent 3c5fb2e commit 4ca3ca1
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 4 deletions.
21 changes: 18 additions & 3 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ def _replace_regex(
value,
inplace: bool = False,
mask=None,
using_cow: bool = False,
) -> list[Block]:
"""
Replace elements by the given value.
Expand All @@ -665,6 +666,8 @@ def _replace_regex(
Perform inplace modification.
mask : array-like of bool, optional
True indicate corresponding element is ignored.
using_cow: bool, default False
Specifying if copy on write is enabled.
Returns
-------
Expand All @@ -673,15 +676,27 @@ def _replace_regex(
if not self._can_hold_element(to_replace):
# i.e. only ObjectBlock, but could in principle include a
# String ExtensionBlock
if using_cow:
return [self.copy(deep=False)]
return [self] if inplace else [self.copy()]

rx = re.compile(to_replace)

new_values = self.values if inplace else self.values.copy()
if using_cow:
if inplace and not self.refs.has_reference():
refs = self.refs
new_values = self.values
else:
refs = None
new_values = self.values.copy()
else:
refs = None
new_values = self.values if inplace else self.values.copy()

replace_regex(new_values, rx, value, mask)

block = self.make_block(new_values)
return block.convert(copy=False)
block = self.make_block(new_values, refs=refs)
return block.convert(copy=False, using_cow=using_cow)

@final
def replace_list(
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ def replace(self: T, to_replace, value, inplace: bool) -> T:
)

def replace_regex(self, **kwargs):
    """
    Dispatch ``_replace_regex`` through ``self.apply``.

    Parameters
    ----------
    **kwargs
        Forwarded unchanged to ``self.apply``.

    Returns
    -------
    Whatever ``self.apply`` returns for ``"_replace_regex"``.
    """
    # The copy-on-write flag is resolved here, at call time, so callers do
    # not pass ``using_cow`` themselves.
    return self.apply("_replace_regex", **kwargs, using_cow=using_copy_on_write())

def replace_list(
self: T,
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/copy_view/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,51 @@ def test_replace(using_copy_on_write, replace_kwargs):
tm.assert_frame_equal(df, df_orig)


def test_replace_regex_inplace_refs(using_copy_on_write):
    """
    In-place regex replace on a frame after ``df[:]`` has been taken from it.

    With copy-on-write enabled, column "a" must no longer share memory with
    the array captured before the replace, block 0 must report no references,
    and the ``df[:]`` object must still equal the original data. Without
    copy-on-write, column "a" must still share memory with the captured array.
    """
    df = DataFrame({"a": ["aaa", "bbb"]})
    expected_view = df.copy()
    view = df[:]
    arr_before = get_array(df, "a")

    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)

    if not using_copy_on_write:
        assert np.shares_memory(arr_before, get_array(df, "a"))
        return

    assert not np.shares_memory(arr_before, get_array(df, "a"))
    assert df._mgr._has_no_reference(0)
    tm.assert_frame_equal(view, expected_view)


def test_replace_regex_inplace(using_copy_on_write):
    """
    In-place regex replace on a frame that nothing else was derived from.

    Column "a" must keep sharing memory with the array captured beforehand
    (and, under copy-on-write, block 0 must report no references). A later
    non-inplace regex replace must leave ``df`` unchanged and return a result
    whose column "a" does not share memory with ``df``.
    """
    df = DataFrame({"a": ["aaa", "bbb"]})
    arr_before = get_array(df, "a")

    df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    assert np.shares_memory(arr_before, get_array(df, "a"))

    snapshot = df.copy()
    replaced = df.replace(to_replace=r"^b.*$", value="new", regex=True)

    tm.assert_frame_equal(snapshot, df)
    assert not np.shares_memory(get_array(replaced, "a"), get_array(df, "a"))


def test_replace_regex_inplace_no_op(using_copy_on_write):
    """
    Regex replace on an integer column, in-place and then non-inplace.

    After the in-place call, column "a" must still share memory with the
    array captured beforehand (and, under copy-on-write, block 0 must report
    no references). After the non-inplace call, ``df`` must be unchanged;
    the result shares memory with ``df`` only when copy-on-write is enabled.
    """
    df = DataFrame({"a": [1, 2]})
    arr_before = get_array(df, "a")

    df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)

    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
    assert np.shares_memory(arr_before, get_array(df, "a"))

    snapshot = df.copy()
    replaced = df.replace(to_replace=r"^x.$", value="new", regex=True)
    tm.assert_frame_equal(snapshot, df)

    shares = np.shares_memory(get_array(replaced, "a"), get_array(df, "a"))
    # Sharing is expected exactly when copy-on-write is on.
    assert shares if using_copy_on_write else not shares


def test_replace_mask_all_false_second_block(using_copy_on_write):
df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
df_orig = df.copy()
Expand Down

0 comments on commit 4ca3ca1

Please sign in to comment.