Skip to content

Commit

Permalink
merge master
Browse files Browse the repository at this point in the history
  • Loading branch information
charlesdong1991 committed Jun 12, 2019
1 parent 634577e commit aaf9be3
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 27 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.25.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ Other Deprecations
Use the public attributes :attr:`~RangeIndex.start`, :attr:`~RangeIndex.stop` and :attr:`~RangeIndex.step` instead (:issue:`26581`).
- The :meth:`Series.ftype`, :meth:`Series.ftypes` and :meth:`DataFrame.ftypes` methods are deprecated and will be removed in a future version.
Instead, use :meth:`Series.dtype` and :meth:`DataFrame.dtypes` (:issue:`26705`).

- :func:`Series.str.replace`, when ``pat`` is single special regex character (such as ``.|\`` etc) and regex is not defined, regex is by default ``False`` for now, but this might be deprecated in the future. (:issue:`24804`)

.. _whatsnew_0250.prior_deprecations:

Expand Down Expand Up @@ -605,6 +605,7 @@ Conversion

Strings
^^^^^^^
- Bug in :func:`Series.str.replace` not applying regex in patterns of length 1 (:issue:`24804`)

- Bug in the ``__name__`` attribute of several methods of :class:`Series.str`, which were set incorrectly (:issue:`23551`)
-
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/reshape/melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,8 @@ def melt_stub(df, stub, i, j, value_vars, sep):
newdf = melt(df, id_vars=i, value_vars=value_vars,
value_name=stub.rstrip(sep), var_name=j)
newdf[j] = Categorical(newdf[j])
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "",
regex=True)

# GH17627 Cast numerics suffixes to int/float
newdf[j] = to_numeric(newdf[j], errors='ignore')
Expand Down
16 changes: 13 additions & 3 deletions pandas/core/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ def str_endswith(arr, pat, na=np.nan):
return _na_map(f, arr, na, dtype=bool)


def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=None):
r"""
Replace occurrences of pattern/regex in the Series/Index with
some other string. Equivalent to :meth:`str.replace` or
Expand Down Expand Up @@ -452,9 +452,13 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
flags : int, default 0 (no flags)
- re module flags, e.g. re.IGNORECASE
- Cannot be set if `pat` is a compiled regex
regex : bool, default True
regex : boolean, default None
- If True, assumes the passed-in pattern is a regular expression.
- If False, treats the pattern as a literal string
- If `pat` is a single character and `regex` is not specified, `pat`
is interpreted as a string literal. If `pat` is also a regular
expression symbol, a warning is issued that in the future `pat`
will be interpreted as a regex, rather than a literal.
- Cannot be set to False if `pat` is a compiled regex or `repl` is
a callable.
Expand Down Expand Up @@ -561,7 +565,7 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
# add case flag, if provided
if case is False:
flags |= re.IGNORECASE
if is_compiled_re or len(pat) > 1 or flags or callable(repl):
if is_compiled_re or pat or flags or callable(repl):
n = n if n >= 0 else 0
compiled = re.compile(pat, flags=flags)
f = lambda x: compiled.sub(repl=repl, string=x, count=n)
Expand All @@ -574,6 +578,12 @@ def str_replace(arr, pat, repl, n=-1, case=None, flags=0, regex=True):
if callable(repl):
raise ValueError("Cannot use a callable replacement when "
"regex=False")
# if regex is default None, and a single special character is given
# in pat, still take it as a literal, and raise the Future warning
if regex is None and len(pat) == 1 and pat in list(r"[\^$.|?*+()]"):
warnings.warn("'{}' is interpreted as a literal in ".format(pat) +
"default, not regex. It will change in the future.",
FutureWarning)
f = lambda x: x.replace(pat, repl, n)

return _na_map(f, arr)
Expand Down
104 changes: 82 additions & 22 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from numpy.random import randint
import pytest


from pandas import DataFrame, Index, MultiIndex, Series, concat, isna, notna
import pandas.core.strings as strings
import pandas.util.testing as tm
Expand Down Expand Up @@ -892,27 +893,39 @@ def test_casemethods(self):
def test_replace(self):
values = Series(['fooBAD__barBAD', NA])

result = values.str.replace('BAD[_]*', '')
result = values.str.replace('BAD[_]*', '', regex=True)
exp = Series(['foobar', NA])
tm.assert_series_equal(result, exp)

result = values.str.replace('BAD[_]*', '', n=1)
result = values.str.replace('BAD[_]*', '', regex=True, n=1)
exp = Series(['foobarBAD', NA])
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
None, 1, 2.])

rs = Series(mixed).str.replace('BAD[_]*', '')
rs = Series(mixed).str.replace('BAD[_]*', '', regex=True)
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

# unicode
values = Series([u'fooBAD__barBAD', NA])

result = values.str.replace('BAD[_]*', '', regex=True)
exp = Series([u'foobar', NA])
tm.assert_series_equal(result, exp)

result = values.str.replace('BAD[_]*', '', n=1, regex=True)
exp = Series([u'foobarBAD', NA])
tm.assert_series_equal(result, exp)

# flags + unicode
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", flags=re.UNICODE)
result = values.str.replace(r"(?<=\w),(?=\w)", ", ", regex=True,
flags=re.UNICODE)
tm.assert_series_equal(result, exp)

# GH 13438
Expand All @@ -930,7 +943,7 @@ def test_replace_callable(self):

# test with callable
repl = lambda m: m.group(0).swapcase()
result = values.str.replace('[a-z][A-Z]{2}', repl, n=2)
result = values.str.replace('[a-z][A-Z]{2}', repl, n=2, regex=True)
exp = Series(['foObaD__baRbaD', NA])
tm.assert_series_equal(result, exp)

Expand All @@ -940,21 +953,21 @@ def test_replace_callable(self):

repl = lambda: None
with pytest.raises(TypeError, match=p_err):
values.str.replace('a', repl)
values.str.replace('a', repl, regex=True)

repl = lambda m, x: None
with pytest.raises(TypeError, match=p_err):
values.str.replace('a', repl)
values.str.replace('a', repl, regex=True)

repl = lambda m, x, y=None: None
with pytest.raises(TypeError, match=p_err):
values.str.replace('a', repl)
values.str.replace('a', repl, regex=True)

# test regex named groups
values = Series(['Foo Bar Baz', NA])
pat = r"(?P<first>\w+) (?P<middle>\w+) (?P<last>\w+)"
repl = lambda m: m.group('middle').swapcase()
result = values.str.replace(pat, repl)
result = values.str.replace(pat, repl, regex=True)
exp = Series(['bAR', NA])
tm.assert_series_equal(result, exp)

Expand All @@ -964,28 +977,39 @@ def test_replace_compiled_regex(self):

# test with compiled regex
pat = re.compile(r'BAD[_]*')
result = values.str.replace(pat, '')
result = values.str.replace(pat, '', regex=True)
exp = Series(['foobar', NA])
tm.assert_series_equal(result, exp)

result = values.str.replace(pat, '', n=1)
result = values.str.replace(pat, '', n=1, regex=True)
exp = Series(['foobarBAD', NA])
tm.assert_series_equal(result, exp)

# mixed
mixed = Series(['aBAD', NA, 'bBAD', True, datetime.today(), 'fooBAD',
None, 1, 2.])

rs = Series(mixed).str.replace(pat, '')
rs = Series(mixed).str.replace(pat, '', regex=True)
xp = Series(['a', NA, 'b', NA, NA, 'foo', NA, NA, NA])
assert isinstance(rs, Series)
tm.assert_almost_equal(rs, xp)

# unicode
values = Series([u'fooBAD__barBAD', NA])

result = values.str.replace(pat, '', regex=True)
exp = Series([u'foobar', NA])
tm.assert_series_equal(result, exp)

result = values.str.replace(pat, '', n=1, regex=True)
exp = Series([u'foobarBAD', NA])
tm.assert_series_equal(result, exp)

# flags + unicode
values = Series([b"abcd,\xc3\xa0".decode("utf-8")])
exp = Series([b"abcd, \xc3\xa0".decode("utf-8")])
pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE)
result = values.str.replace(pat, ", ")
result = values.str.replace(pat, ", ", regex=True)
tm.assert_series_equal(result, exp)

# case and flags provided to str.replace will have no effect
Expand All @@ -995,29 +1019,30 @@ def test_replace_compiled_regex(self):

with pytest.raises(ValueError,
match="case and flags cannot be"):
result = values.str.replace(pat, '', flags=re.IGNORECASE)
result = values.str.replace(pat, '', flags=re.IGNORECASE,
regex=True)

with pytest.raises(ValueError,
match="case and flags cannot be"):
result = values.str.replace(pat, '', case=False)
result = values.str.replace(pat, '', case=False, regex=True)

with pytest.raises(ValueError,
match="case and flags cannot be"):
result = values.str.replace(pat, '', case=True)
result = values.str.replace(pat, '', case=True, regex=True)

# test with callable
values = Series(['fooBAD__barBAD', NA])
repl = lambda m: m.group(0).swapcase()
pat = re.compile('[a-z][A-Z]{2}')
result = values.str.replace(pat, repl, n=2)
result = values.str.replace(pat, repl, n=2, regex=True)
exp = Series(['foObaD__baRbaD', NA])
tm.assert_series_equal(result, exp)

def test_replace_literal(self):
# GH16808 literal replace (regex=False vs regex=True)
values = Series(['f.o', 'foo', NA])
exp = Series(['bao', 'bao', NA])
result = values.str.replace('f.', 'ba')
result = values.str.replace('f.', 'ba', regex=True)
tm.assert_series_equal(result, exp)

exp = Series(['bao', 'foo', NA])
Expand Down Expand Up @@ -2710,6 +2735,7 @@ def test_partition_deprecation(self):
result = values.str.rpartition(pat='_')
tm.assert_frame_equal(result, expected)

@pytest.mark.filterwarnings("ignore: '|' is interpreted as a literal")
def test_pipe_failures(self):
# #2119
s = Series(['A|B|C'])
Expand All @@ -2719,7 +2745,7 @@ def test_pipe_failures(self):

tm.assert_series_equal(result, exp)

result = s.str.replace('|', ' ')
result = s.str.replace('|', ' ', regex=None)
exp = Series(['A B C'])

tm.assert_series_equal(result, exp)
Expand Down Expand Up @@ -2980,17 +3006,17 @@ def test_replace_moar(self):
s = Series(['A', 'B', 'C', 'Aaba', 'Baca', '', NA, 'CABA',
'dog', 'cat'])

result = s.str.replace('A', 'YYY')
result = s.str.replace('A', 'YYY', regex=True)
expected = Series(['YYY', 'B', 'C', 'YYYaba', 'Baca', '', NA,
'CYYYBYYY', 'dog', 'cat'])
assert_series_equal(result, expected)

result = s.str.replace('A', 'YYY', case=False)
result = s.str.replace('A', 'YYY', case=False, regex=True)
expected = Series(['YYY', 'B', 'C', 'YYYYYYbYYY', 'BYYYcYYY', '', NA,
'CYYYBYYY', 'dog', 'cYYYt'])
assert_series_equal(result, expected)

result = s.str.replace('^.a|dog', 'XX-XX ', case=False)
result = s.str.replace('^.a|dog', 'XX-XX ', case=False, regex=True)
expected = Series(['A', 'B', 'C', 'XX-XX ba', 'XX-XX ca', '', NA,
'XX-XX BA', 'XX-XX ', 'XX-XX t'])
assert_series_equal(result, expected)
Expand Down Expand Up @@ -3162,6 +3188,40 @@ def test_method_on_bytes(self):
match="Cannot use .str.cat with values of.*"):
lhs.str.cat(rhs)

@pytest.mark.filterwarnings("ignore: '.' is interpreted as a literal")
@pytest.mark.parametrize("regex, expected_array", [
(True, ['foofoofoo', 'foofoofoo']),
(False, ['abc', '123']),
(None, ['abc', '123'])
])
def test_replace_single_pattern(self, regex, expected_array):
values = Series(['abc', '123'])
# GH: 24804
result = values.str.replace('.', 'foo', regex=regex)
expected = Series(expected_array)
tm.assert_series_equal(result, expected)

@pytest.mark.parametrize("input_array, single_char, replace_char, "
"expect_array, warn",
[("a.c", ".", "b", "abc", True),
("a@c", "@", "at", "aatc", False)]
)
def test_replace_warning_single_character(self, input_array,
single_char, replace_char,
expect_array, warn):
# GH: 24804
values = Series([input_array])
if warn:
with tm.assert_produces_warning(FutureWarning,
check_stacklevel=False):
result = values.str.replace(single_char, replace_char,
regex=None)
else:
result = values.str.replace(single_char, replace_char)

expected = Series([expect_array])
tm.assert_series_equal(result, expected)

def test_casefold(self):
# GH25405
expected = Series(['ss', NA, 'case', 'ssd'])
Expand Down

0 comments on commit aaf9be3

Please sign in to comment.