Skip to content

Commit

Permalink
Backport PR #50778 on branch 2.0.x (PERF: Use generator expression for Blocks.replace_list) (#51714)
Browse files Browse the repository at this point in the history

Backport PR #50778: PERF: Use generator expression for Blocks.replace_list

Co-authored-by: Thomas Li <47963215+lithomas1@users.noreply.github.com>
  • Loading branch information
meeseeksmachine and lithomas1 committed Mar 1, 2023
1 parent 4ca3ca1 commit fb35381
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 12 deletions.
29 changes: 29 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,4 +386,33 @@ def time_to_numpy_copy(self):
self.ser.to_numpy(copy=True)


class Replace:
    """Benchmarks for ``Series.replace`` with list-like and dict arguments.

    Parametrized over ``num_to_replace``: how many values are sampled to
    build the ``to_replace`` / ``value`` inputs.
    """

    param_names = ["num_to_replace"]
    params = [100, 1000]

    def setup(self, num_to_replace):
        """Create a one-million-element float Series and the replacement inputs."""
        n_rows = 1_000_000
        self.arr = np.random.randn(n_rows)

        # Shuffled copy of the same data; replacement values are drawn from it.
        self.arr1 = self.arr.copy()
        np.random.shuffle(self.arr1)

        self.ser = Series(self.arr)

        # np.random.choice samples with replacement by default, so these
        # arrays may contain repeated entries.
        self.to_replace_list = np.random.choice(self.arr, num_to_replace)
        self.values_list = np.random.choice(self.arr1, num_to_replace)

        # The same (old, new) pairs in mapping form; repeated keys collapse,
        # keeping the last pairing.
        self.replace_dict = {
            old: new for old, new in zip(self.to_replace_list, self.values_list)
        }

    def time_replace_dict(self, num_to_replace):
        """Run ``Series.replace`` with the dict input (time_ benchmark)."""
        self.ser.replace(to_replace=self.replace_dict)

    def peakmem_replace_dict(self, num_to_replace):
        """Run ``Series.replace`` with the dict input (peakmem_ benchmark)."""
        self.ser.replace(to_replace=self.replace_dict)

    def time_replace_list(self, num_to_replace):
        """Run ``Series.replace`` with the paired arrays (time_ benchmark)."""
        self.ser.replace(to_replace=self.to_replace_list, value=self.values_list)

    def peakmem_replace_list(self, num_to_replace):
        """Run ``Series.replace`` with the paired arrays (peakmem_ benchmark)."""
        self.ser.replace(to_replace=self.to_replace_list, value=self.values_list)


from .pandas_vb_common import setup # noqa: F401 isort:skip
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1169,9 +1169,9 @@ Performance improvements
- Fixed a reference leak in :func:`read_hdf` (:issue:`37441`)
- Fixed a memory leak in :meth:`DataFrame.to_json` and :meth:`Series.to_json` when serializing datetimes and timedeltas (:issue:`40443`)
- Decreased memory usage in many :class:`DataFrameGroupBy` methods (:issue:`51090`)
- Performance improvement in :meth:`DataFrame.replace` and :meth:`Series.replace` when using a large dict for ``to_replace`` (:issue:`6697`)
- Memory improvement in :class:`StataReader` when reading seekable files (:issue:`48922`)


.. ---------------------------------------------------------------------------
.. _whatsnew_200.bug_fixes:

Expand Down
32 changes: 21 additions & 11 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -738,24 +738,34 @@ def replace_list(
if is_string_dtype(values.dtype):
# Calculate the mask once, prior to the call of comp
# in order to avoid repeating the same computations
mask = ~isna(values)
masks = [
compare_or_regex_search(values, s[0], regex=regex, mask=mask)
na_mask = ~isna(values)
masks: Iterable[npt.NDArray[np.bool_]] = (
extract_bool_array(
cast(
ArrayLike,
compare_or_regex_search(
values, s[0], regex=regex, mask=na_mask
),
)
)
for s in pairs
]
)
else:
# GH#38086 faster if we know we dont need to check for regex
masks = [missing.mask_missing(values, s[0]) for s in pairs]

masks = [extract_bool_array(x) for x in masks]
masks = (missing.mask_missing(values, s[0]) for s in pairs)
# Materialize if inplace = True, since the masks can change
# as we replace
if inplace:
masks = list(masks)

if using_cow and inplace:
# Don't set up refs here, otherwise we will think that we have
# references when we check again later
rb = [self]
else:
rb = [self if inplace else self.copy()]
for i, (src, dest) in enumerate(pairs):

for i, ((src, dest), mask) in enumerate(zip(pairs, masks)):
convert = i == src_len # only convert once at the end
new_rb: list[Block] = []

Expand All @@ -764,9 +774,9 @@ def replace_list(
# where to index into the mask
for blk_num, blk in enumerate(rb):
if len(rb) == 1:
m = masks[i]
m = mask
else:
mib = masks[i]
mib = mask
assert not isinstance(mib, bool)
m = mib[blk_num : blk_num + 1]

Expand All @@ -776,7 +786,7 @@ def replace_list(
result = blk._replace_coerce(
to_replace=src,
value=dest,
mask=m, # type: ignore[arg-type]
mask=m,
inplace=inplace,
regex=regex,
using_cow=using_cow,
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/series/methods/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,19 @@ def test_replace2(self):
assert (ser[6:10] == -1).all()
assert (ser[20:30] == -1).all()

@pytest.mark.parametrize("inplace", [True, False])
def test_replace_cascade(self, inplace):
    # GH #50778
    # A value produced by one replacement must not be picked up and
    # replaced again by a later (src, dest) pair.
    ser = pd.Series([1, 2, 3])
    expected = pd.Series([2, 3, 4])

    res = ser.replace([1, 2, 3], [2, 3, 4], inplace=inplace)

    # Check the mutated input when inplace, otherwise the returned Series.
    result = ser if inplace else res
    tm.assert_series_equal(result, expected)

def test_replace_with_dictlike_and_string_dtype(self, nullable_string_dtype):
# GH 32621, GH#44940
ser = pd.Series(["one", "two", np.nan], dtype=nullable_string_dtype)
Expand Down

0 comments on commit fb35381

Please sign in to comment.