Skip to content

Commit

Permalink
[ENH] Add "fullmatch" matching mode to Series.str [#32806] (#32807)
Browse files Browse the repository at this point in the history
  • Loading branch information
frreiss committed Mar 24, 2020
1 parent 08fce67 commit bed9103
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 12 deletions.
31 changes: 25 additions & 6 deletions doc/source/user_guide/text.rst
Expand Up @@ -641,21 +641,40 @@ You can check whether elements contain a pattern:
.. ipython:: python
pattern = r'[0-9][a-z]'
pd.Series(['1', '2', '3a', '3b', '03c'],
pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
dtype="string").str.contains(pattern)
Or whether elements match a pattern:

.. ipython:: python
pd.Series(['1', '2', '3a', '3b', '03c'],
pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
dtype="string").str.match(pattern)
The distinction between ``match`` and ``contains`` is strictness: ``match``
relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
.. versionadded:: 1.1.0

Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
an extra ``na`` argument so missing values can be considered True or False:
.. ipython:: python
pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
dtype="string").str.fullmatch(pattern)
.. note::

The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness:
``fullmatch`` tests whether the entire string matches the regular expression;
``match`` tests whether there is a match of the regular expression that begins
at the first character of the string; and ``contains`` tests whether there is
a match of the regular expression at any position within the string.

The corresponding functions in the ``re`` module for these three match modes are
`re.fullmatch <https://docs.python.org/3/library/re.html#re.fullmatch>`_,
`re.match <https://docs.python.org/3/library/re.html#re.match>`_, and
`re.search <https://docs.python.org/3/library/re.html#re.search>`_,
respectively.

Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and
``endswith`` take an extra ``na`` argument so missing values can be considered
True or False:

.. ipython:: python
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Expand Up @@ -69,6 +69,7 @@ Other enhancements
- `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
- :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
- Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
- :class:`Series.str` now has a ``fullmatch`` method that matches a regular expression against the entire string in each row of the series, similar to ``re.fullmatch`` (:issue:`32806`).
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
-

Expand Down
65 changes: 61 additions & 4 deletions pandas/core/strings.py
Expand Up @@ -2,15 +2,15 @@
from functools import wraps
import re
import textwrap
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union
import warnings

import numpy as np

import pandas._libs.lib as lib
import pandas._libs.missing as libmissing
import pandas._libs.ops as libops
from pandas._typing import ArrayLike, Dtype
from pandas._typing import ArrayLike, Dtype, Scalar
from pandas.util._decorators import Appender

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -787,9 +787,15 @@ def rep(x, r):
return result


def str_match(arr, pat, case=True, flags=0, na=np.nan):
def str_match(
arr: ArrayLike,
pat: Union[str, Pattern],
case: bool = True,
flags: int = 0,
na: Scalar = np.nan,
):
"""
Determine if each string matches a regular expression.
Determine if each string starts with a match of a regular expression.
Parameters
----------
Expand All @@ -808,6 +814,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan):
See Also
--------
fullmatch : Stricter matching that requires the entire string to match.
contains : Analogous, but less strict, relying on re.search instead of
re.match.
extract : Extract matched groups.
Expand All @@ -823,6 +830,50 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan):
return _na_map(f, arr, na, dtype=dtype)


def str_fullmatch(
    arr: ArrayLike,
    pat: Union[str, Pattern],
    case: bool = True,
    flags: int = 0,
    na: Scalar = np.nan,
):
    """
    Test whether the whole of each string matches a regular expression.

    .. versionadded:: 1.1.0

    Parameters
    ----------
    pat : str or compiled regular expression
        Character sequence or regular expression.
    case : bool, default True
        If True, matching is case sensitive; if False, ``re.IGNORECASE``
        is added to `flags`.
    flags : int, default 0 (no flags)
        Regex module flags, e.g. re.IGNORECASE.
    na : scalar, default NaN
        Fill value for missing values.

    Returns
    -------
    Series/array of boolean values

    See Also
    --------
    match : Looser test that only requires a match starting at the
        beginning of the string.
    extract : Extract matched groups.
    """
    # Fold the ``case`` switch into the regex flags before compiling.
    effective_flags = flags if case else flags | re.IGNORECASE
    compiled = re.compile(pat, flags=effective_flags)

    def _matches_entirely(value):
        # ``fullmatch`` yields a match object or None; reduce it to a bool.
        return compiled.fullmatch(value) is not None

    return _na_map(_matches_entirely, arr, na, dtype=bool)


def _get_single_group_name(rx):
try:
return list(rx.groupindex.keys()).pop()
Expand Down Expand Up @@ -2762,6 +2813,12 @@ def match(self, pat, case=True, flags=0, na=np.nan):
result = str_match(self._parent, pat, case=case, flags=flags, na=na)
return self._wrap_result(result, fill_value=na, returns_string=False)

@copy(str_fullmatch)
@forbid_nonstring_types(["bytes"])
def fullmatch(self, pat, case=True, flags=0, na=np.nan):
    # Delegate to the module-level implementation, then hand the raw result
    # to ``_wrap_result``, flagging it as a non-string result.
    matched = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na)
    return self._wrap_result(matched, fill_value=na, returns_string=False)

@copy(str_replace)
@forbid_nonstring_types(["bytes"])
def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/test_strings.py
Expand Up @@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right):
("join", (",",), {}),
("ljust", (10,), {}),
("match", ("a",), {}),
("fullmatch", ("a",), {}),
("normalize", ("NFC",), {}),
("pad", (10,), {}),
("partition", (" ",), {"expand": False}),
Expand Down Expand Up @@ -1176,9 +1177,9 @@ def test_match(self):
exp = Series([True, np.nan, False])
tm.assert_series_equal(result, exp)

values = Series(["fooBAD__barBAD", np.nan, "foo"])
values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
result = values.str.match(".*BAD[_]+.*BAD")
exp = Series([True, np.nan, False])
exp = Series([True, True, np.nan, False])
tm.assert_series_equal(result, exp)

# mixed
Expand Down Expand Up @@ -1208,6 +1209,22 @@ def test_match(self):
exp = Series([True, np.nan, np.nan])
tm.assert_series_equal(exp, res)

def test_fullmatch(self):
    # GH 32806
    pattern = ".*BAD[_]+.*BAD"
    raw = ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"]

    # Default dtype: the second entry only matches as a prefix, so it
    # must be rejected; the missing value is propagated as NaN.
    expected = Series([True, False, np.nan, False])
    observed = Series(raw).str.fullmatch(pattern)
    tm.assert_series_equal(observed, expected)

    # Make sure that the new string arrays work.
    # Result is nullable boolean with StringDtype.
    expected_nullable = Series([True, False, np.nan, False], dtype="boolean")
    observed_nullable = Series(raw, dtype="string").str.fullmatch(pattern)
    tm.assert_series_equal(observed_nullable, expected_nullable)

def test_extract_expand_None(self):
values = Series(["fooBAD__barBAD", np.nan, "foo"])
with pytest.raises(ValueError, match="expand must be True or False"):
Expand Down Expand Up @@ -3384,6 +3401,9 @@ def test_match_findall_flags(self):
result = data.str.match(pat, flags=re.IGNORECASE)
assert result[0]

result = data.str.fullmatch(pat, flags=re.IGNORECASE)
assert result[0]

result = data.str.findall(pat, flags=re.IGNORECASE)
assert result[0][0] == ("dave", "google", "com")

Expand Down

0 comments on commit bed9103

Please sign in to comment.