Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: Improve replace perf #12745

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,30 @@ def time_replace_large_dict(self):
self.s.replace(self.to_rep, inplace=True)


class replace_convert(object):
    """Benchmark ``replace`` with dict mappings from ints to datetime-likes.

    Each ``time_*`` method replaces the integer values of a Series or a
    two-column DataFrame using a mapping whose values are ``pd.Timestamp``
    or ``pd.Timedelta`` objects.
    """
    goal_time = 0.5

    def setup(self):
        """Build the replacement mappings and the integer data to replace in."""
        self.n = 10 ** 3

        # Mappings from every int in [0, n) to a datetime-like scalar.
        self.to_ts = {i: pd.Timestamp(i) for i in range(self.n)}
        self.to_td = {i: pd.Timedelta(i) for i in range(self.n)}

        # A local, fixed-seed generator keeps the benchmark inputs identical
        # from run to run without touching NumPy's global random state.
        rng = np.random.RandomState(1234)
        self.s = Series(rng.randint(self.n, size=self.n))
        col_a = rng.randint(self.n, size=self.n)
        col_b = rng.randint(self.n, size=self.n)
        self.df = DataFrame({'A': col_a, 'B': col_b})

    def time_replace_series_timestamp(self):
        """Time replacing Series values via the int -> Timestamp mapping."""
        self.s.replace(self.to_ts)

    def time_replace_series_timedelta(self):
        """Time replacing Series values via the int -> Timedelta mapping."""
        self.s.replace(self.to_td)

    def time_replace_frame_timestamp(self):
        """Time replacing DataFrame values via the int -> Timestamp mapping."""
        self.df.replace(self.to_ts)

    def time_replace_frame_timedelta(self):
        """Time replacing DataFrame values via the int -> Timedelta mapping."""
        self.df.replace(self.to_td)


class replace_replacena(object):
goal_time = 0.2

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Highlights include:
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Improved performance of ``.replace()`` (:issue:`12745`)

.. _whatsnew_0192.bug_fixes:

Expand Down
23 changes: 15 additions & 8 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3477,20 +3477,27 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
res = self if inplace else self.copy()
for c, src in compat.iteritems(to_replace):
if c in value and c in self:
# object conversion is handled in
# series.replace which is called recursively
res[c] = res[c].replace(to_replace=src,
value=value[c],
inplace=False, regex=regex)
inplace=False,
regex=regex)
return None if inplace else res

# {'A': NA} -> 0
elif not is_list_like(value):
for k, src in compat.iteritems(to_replace):
if k in self:
new_data = new_data.replace(to_replace=src,
value=value,
filter=[k],
inplace=inplace,
regex=regex)
keys = [(k, src) for k, src in compat.iteritems(to_replace)
if k in self]
keys_len = len(keys) - 1
for i, (k, src) in enumerate(keys):
convert = i == keys_len
new_data = new_data.replace(to_replace=src,
value=value,
filter=[k],
inplace=inplace,
regex=regex,
convert=convert)
else:
raise TypeError('value argument must be scalar, dict, or '
'Series')
Expand Down
17 changes: 12 additions & 5 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,7 +622,6 @@ def replace(self, to_replace, value, inplace=False, filter=None,

original_to_replace = to_replace
mask = isnull(self.values)

# try to replace, if we raise an error, convert to ObjectBlock and
# retry
try:
Expand Down Expand Up @@ -1795,13 +1794,14 @@ def should_store(self, value):
return issubclass(value.dtype.type, np.bool_)

def replace(self, to_replace, value, inplace=False, filter=None,
regex=False, mgr=None):
regex=False, convert=True, mgr=None):
to_replace_values = np.atleast_1d(to_replace)
if not np.can_cast(to_replace_values, bool):
return self
return super(BoolBlock, self).replace(to_replace, value,
inplace=inplace, filter=filter,
regex=regex, mgr=mgr)
regex=regex, convert=convert,
mgr=mgr)


class ObjectBlock(Block):
Expand Down Expand Up @@ -3214,6 +3214,7 @@ def comp(s):
masks = [comp(s) for i, s in enumerate(src_list)]

result_blocks = []
src_len = len(src_list) - 1
for blk in self.blocks:

# it's possible to get multiple result blocks here
Expand All @@ -3223,8 +3224,9 @@ def comp(s):
new_rb = []
for b in rb:
if b.dtype == np.object_:
convert = i == src_len
result = b.replace(s, d, inplace=inplace, regex=regex,
mgr=mgr)
mgr=mgr, convert=convert)
new_rb = _extend_blocks(result, new_rb)
else:
# get our mask for this element, sized to this
Expand Down Expand Up @@ -4788,7 +4790,12 @@ def _putmask_smart(v, m, n):

# change the dtype
dtype, _ = _maybe_promote(n.dtype)
nv = v.astype(dtype)

if is_extension_type(v.dtype) and is_object_dtype(dtype):
nv = v.get_values(dtype)
else:
nv = v.astype(dtype)

try:
nv[m] = n[m]
except ValueError:
Expand Down