From 74eaeeea1d107fdc6e6ebbfb3185247d5751438e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Fri, 12 Sep 2025 17:36:20 -0300 Subject: [PATCH] Backport PR #62283: BUG: fix pyarrow string regex replacement --- doc/source/whatsnew/v2.3.3.rst | 1 + pandas/core/arrays/_arrow_string_mixins.py | 14 ++++- pandas/core/arrays/string_arrow.py | 12 ++++- pandas/tests/strings/test_find_replace.py | 62 ++++++++++++++++++++++ 4 files changed, 86 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index cbde6f52d4472..aaed7544d9975 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -22,6 +22,7 @@ become the default string dtype in pandas 3.0. See Bug fixes ^^^^^^^^^ +- Fix bug in :meth:`Series.str.replace` where using named capture groups (e.g., ``\g<name>``) with the Arrow-backed dtype would raise an error (:issue:`57636`) - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 90de41ffb63fa..74c59dd465b52 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -168,10 +168,20 @@ def _str_replace( flags: int = 0, regex: bool = True, ) -> Self: - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + if ( + isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + or ( + isinstance(repl, str) + and (r"\g<" in repl or re.search(r"\\\d", repl) is not None) + ) + ): raise NotImplementedError( "replace is not supported with a re.Pattern, callable repl, " - "case=False, or flags!=0" + "case=False, flags!=0, or when the replacement string contains " + "named group references (\\g<...>, \\d+)" ) func = pc.replace_substring_regex if regex else pc.replace_substring diff --git 
a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 15d001699064a..f9fd74cbd76b1 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -419,7 +419,17 @@ def _str_replace( flags: int = 0, regex: bool = True, ): - if isinstance(pat, re.Pattern) or callable(repl) or not case or flags: + if ( + isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + or ( # substitution contains a named group pattern + # https://docs.python.org/3/library/re.html + isinstance(repl, str) + and (r"\g<" in repl or re.search(r"\\\d", repl) is not None) + ) + ): return super()._str_replace(pat, repl, n, case, flags, regex) return ArrowStringArrayMixin._str_replace( diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 3f57754af6e79..be59bc195b387 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -594,6 +594,68 @@ def test_replace_callable_raises(any_string_dtype, repl): values.str.replace("a", repl, regex=True) +@pytest.mark.parametrize( + "repl, expected_list", + [ + ( + r"\g<three> \g<two> \g<one>", + ["Three Two One", "Baz Bar Foo"], + ), + ( + r"\g<3> \g<2> \g<1>", + ["Three Two One", "Baz Bar Foo"], + ), + ( + r"\g<2>0", + ["Two0", "Bar0"], + ), + ( + r"\g<2>0 \1", + ["Two0 One", "Bar0 Foo"], + ), + ], + ids=[ + "named_groups_full_swap", + "numbered_groups_full_swap", + "single_group_with_literal", + "mixed_group_reference_with_literal", + ], +) +@pytest.mark.parametrize("use_compile", [True, False]) +def test_replace_named_groups_regex_swap( + any_string_dtype, use_compile, repl, expected_list +): + # GH#57636 + ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype) + pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + if use_compile: + pattern = re.compile(pattern) + result = ser.str.replace(pattern, repl, regex=True) + expected = Series(expected_list, dtype=any_string_dtype) + tm.assert_series_equal(result, 
expected) + + +@pytest.mark.parametrize( + "repl", + [ + r"\g<20>", + r"\20", + ], +) +@pytest.mark.parametrize("use_compile", [True, False]) +def test_replace_named_groups_regex_swap_expected_fail( + any_string_dtype, repl, use_compile +): + # GH#57636 + pattern = r"(?P<one>\w+) (?P<two>\w+) (?P<three>\w+)" + if use_compile: + pattern = re.compile(pattern) + ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype) + + with pytest.raises(re.error, match="invalid group reference"): + ser.str.replace(pattern, repl, regex=True) + + def test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype)