[ENH] Add "fullmatch" matching mode to Series.str [#32806] (#32807)

pandas-dev · Mar 24, 2020 · bed9103 · bed9103
1 parent 08fce67
commit bed9103
Show file tree

Hide file tree

Showing 4 changed files with 109 additions and 12 deletions.
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -641,21 +641,40 @@ You can check whether elements contain a pattern:
 .. ipython:: python
 
    pattern = r'[0-9][a-z]'
-   pd.Series(['1', '2', '3a', '3b', '03c'],
+   pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
              dtype="string").str.contains(pattern)
 
 Or whether elements match a pattern:
 
 .. ipython:: python
 
-   pd.Series(['1', '2', '3a', '3b', '03c'],
+   pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
              dtype="string").str.match(pattern)
 
-The distinction between ``match`` and ``contains`` is strictness: ``match``
-relies on strict ``re.match``, while ``contains`` relies on ``re.search``.
+.. versionadded:: 1.1.0
 
-Methods like ``match``, ``contains``, ``startswith``, and ``endswith`` take
-an extra ``na`` argument so missing values can be considered True or False:
+.. ipython:: python
+
+   pd.Series(['1', '2', '3a', '3b', '03c', '4dx'],
+             dtype="string").str.fullmatch(pattern)
+
+.. note::
+
+    The distinction between ``match``, ``fullmatch``, and ``contains`` is strictness:
+    ``fullmatch`` tests whether the entire string matches the regular expression;
+    ``match`` tests whether there is a match of the regular expression that begins
+    at the first character of the string; and ``contains`` tests whether there is
+    a match of the regular expression at any position within the string.
+
+    The corresponding functions in the ``re`` package for these three match modes are
+    `re.fullmatch <https://docs.python.org/3/library/re.html#re.fullmatch>`_,
+    `re.match <https://docs.python.org/3/library/re.html#re.match>`_, and
+    `re.search <https://docs.python.org/3/library/re.html#re.search>`_,
+    respectively.
+
+Methods like ``match``, ``fullmatch``, ``contains``, ``startswith``, and
+``endswith`` take an extra ``na`` argument so missing values can be considered
+True or False:
 
 .. ipython:: python
 

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -69,6 +69,7 @@ Other enhancements
 - `OptionError` is now exposed in `pandas.errors` (:issue:`27553`)
 - :func:`timedelta_range` will now infer a frequency when passed ``start``, ``stop``, and ``periods`` (:issue:`32377`)
 - Positional slicing on a :class:`IntervalIndex` now supports slices with ``step > 1`` (:issue:`31658`)
+- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
 - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
 -
 

diff --git a/pandas/core/strings.py b/pandas/core/strings.py
@@ -2,15 +2,15 @@
 from functools import wraps
 import re
 import textwrap
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Type, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Pattern, Type, Union
 import warnings
 
 import numpy as np
 
 import pandas._libs.lib as lib
 import pandas._libs.missing as libmissing
 import pandas._libs.ops as libops
-from pandas._typing import ArrayLike, Dtype
+from pandas._typing import ArrayLike, Dtype, Scalar
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import (
@@ -787,9 +787,15 @@ def rep(x, r):
         return result
 
 
-def str_match(arr, pat, case=True, flags=0, na=np.nan):
+def str_match(
+    arr: ArrayLike,
+    pat: Union[str, Pattern],
+    case: bool = True,
+    flags: int = 0,
+    na: Scalar = np.nan,
+):
     """
-    Determine if each string matches a regular expression.
+    Determine if each string starts with a match of a regular expression.
 
     Parameters
     ----------
@@ -808,6 +814,7 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan):
 
     See Also
     --------
+    fullmatch : Stricter matching that requires the entire string to match.
     contains : Analogous, but less strict, relying on re.search instead of
         re.match.
     extract : Extract matched groups.
@@ -823,6 +830,50 @@ def str_match(arr, pat, case=True, flags=0, na=np.nan):
     return _na_map(f, arr, na, dtype=dtype)
 
 
+def str_fullmatch(
+    arr: ArrayLike,
+    pat: Union[str, Pattern],
+    case: bool = True,
+    flags: int = 0,
+    na: Scalar = np.nan,
+):
+    """
+    Determine if each string entirely matches a regular expression.
+
+    .. versionadded:: 1.1.0
+
+    Parameters
+    ----------
+    pat : str
+        Character sequence or regular expression.
+    case : bool, default True
+        If True, case sensitive.
+    flags : int, default 0 (no flags)
+        Regex module flags, e.g. re.IGNORECASE.
+    na : default NaN
+        Fill value for missing values.
+
+    Returns
+    -------
+    Series/array of boolean values
+
+    See Also
+    --------
+    match : Similar, but also returns `True` when only a *prefix* of the string
+        matches the regular expression.
+    extract : Extract matched groups.
+    """
+    if not case:
+        flags |= re.IGNORECASE
+
+    regex = re.compile(pat, flags=flags)
+
+    dtype = bool
+    f = lambda x: regex.fullmatch(x) is not None
+
+    return _na_map(f, arr, na, dtype=dtype)
+
+
 def _get_single_group_name(rx):
     try:
         return list(rx.groupindex.keys()).pop()
@@ -2762,6 +2813,12 @@ def match(self, pat, case=True, flags=0, na=np.nan):
         result = str_match(self._parent, pat, case=case, flags=flags, na=na)
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
+    @copy(str_fullmatch)
+    @forbid_nonstring_types(["bytes"])
+    def fullmatch(self, pat, case=True, flags=0, na=np.nan):
+        result = str_fullmatch(self._parent, pat, case=case, flags=flags, na=na)
+        return self._wrap_result(result, fill_value=na, returns_string=False)
+
     @copy(str_replace)
     @forbid_nonstring_types(["bytes"])
     def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):

diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -41,6 +41,7 @@ def assert_series_or_index_equal(left, right):
     ("join", (",",), {}),
     ("ljust", (10,), {}),
     ("match", ("a",), {}),
+    ("fullmatch", ("a",), {}),
     ("normalize", ("NFC",), {}),
     ("pad", (10,), {}),
     ("partition", (" ",), {"expand": False}),
@@ -1176,9 +1177,9 @@ def test_match(self):
         exp = Series([True, np.nan, False])
         tm.assert_series_equal(result, exp)
 
-        values = Series(["fooBAD__barBAD", np.nan, "foo"])
+        values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
         result = values.str.match(".*BAD[_]+.*BAD")
-        exp = Series([True, np.nan, False])
+        exp = Series([True, True, np.nan, False])
         tm.assert_series_equal(result, exp)
 
         # mixed
@@ -1208,6 +1209,22 @@ def test_match(self):
         exp = Series([True, np.nan, np.nan])
         tm.assert_series_equal(exp, res)
 
+    def test_fullmatch(self):
+        # GH 32806
+        values = Series(["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"])
+        result = values.str.fullmatch(".*BAD[_]+.*BAD")
+        exp = Series([True, False, np.nan, False])
+        tm.assert_series_equal(result, exp)
+
+        # Make sure that the new string arrays work
+        string_values = Series(
+            ["fooBAD__barBAD", "BAD_BADleroybrown", np.nan, "foo"], dtype="string"
+        )
+        result = string_values.str.fullmatch(".*BAD[_]+.*BAD")
+        # Result is nullable boolean with StringDtype
+        string_exp = Series([True, False, np.nan, False], dtype="boolean")
+        tm.assert_series_equal(result, string_exp)
+
     def test_extract_expand_None(self):
         values = Series(["fooBAD__barBAD", np.nan, "foo"])
         with pytest.raises(ValueError, match="expand must be True or False"):
@@ -3384,6 +3401,9 @@ def test_match_findall_flags(self):
         result = data.str.match(pat, flags=re.IGNORECASE)
         assert result[0]
 
+        result = data.str.fullmatch(pat, flags=re.IGNORECASE)
+        assert result[0]
+
         result = data.str.findall(pat, flags=re.IGNORECASE)
         assert result[0][0] == ("dave", "google", "com")