From 80eeb6360b19fad5deb90a0df7edd4085db442b3 Mon Sep 17 00:00:00 2001 From: zishan044 Date: Sat, 11 Oct 2025 17:27:24 +0600 Subject: [PATCH 1/6] BUG: fix regex numeric group replacement in PyArrow string arrays --- pandas/core/arrays/_arrow_string_mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index d4f4c5bdea0a0..48e47c44c148b 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -175,7 +175,7 @@ def _str_replace( or flags or ( isinstance(repl, str) - and (r"\g<" in repl or re.search(r"\\\d", repl) is not None) + and r"\g<" in repl # Block named group references (\g); numeric groups (\1) are supported by PyArrow ) ): raise NotImplementedError( From 6689722ec97a17af7178fef47ece06ce3080e896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 17:13:46 -0300 Subject: [PATCH 2/6] test(str): create test for non named group --- pandas/tests/strings/test_find_replace.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 6528c3ddb0d18..951984664b4f8 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -639,6 +639,20 @@ def test_replace_named_groups_regex_swap_expected_fail( ser.str.replace(pattern, repl, regex=True) +@pytest.mark.parametrize("use_compile", [True, False]) +def test_replace_non_named_group(any_string_dtype, use_compile): + ser = Series(["var.one[0]", "var.two[1]", "var.three[2]"], dtype=any_string_dtype) + pattern = r"\[(\d+)\]" + if use_compile: + pattern = re.compile(pattern) + repl = r"(\1)" + result = ser.str.replace(pattern, repl, regex=True) + expected = Series( + ["var.one(0)", "var.two(1)", "var.three(2)"], dtype=any_string_dtype + ) + tm.assert_series_equal(result, expected) + + def 
test_replace_callable_named_groups(any_string_dtype): # test regex named groups ser = Series(["Foo Bar Baz", np.nan], dtype=any_string_dtype) From a4f483f44f78dc5ca513022f065110bb2bb1d157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 19:02:58 -0300 Subject: [PATCH 3/6] fix: use arrow backend for "\\d" replacements --- pandas/core/arrays/_arrow_string_mixins.py | 7 ++----- pandas/core/arrays/string_arrow.py | 3 +-- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_arrow_string_mixins.py b/pandas/core/arrays/_arrow_string_mixins.py index 48e47c44c148b..e5e8ffe409788 100644 --- a/pandas/core/arrays/_arrow_string_mixins.py +++ b/pandas/core/arrays/_arrow_string_mixins.py @@ -173,15 +173,12 @@ def _str_replace( or callable(repl) or not case or flags - or ( - isinstance(repl, str) - and r"\g<" in repl # Block named group references (\g); numeric groups (\1) are supported by PyArrow - ) + or (isinstance(repl, str) and r"\g<" in repl) ): raise NotImplementedError( "replace is not supported with a re.Pattern, callable repl, " "case=False, flags!=0, or when the replacement string contains " - "named group references (\\g<...>, \\d+)" + "named group references (\\g<...>)" ) func = pc.replace_substring_regex if regex else pc.replace_substring diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d8c30c43e0046..7dd41cc0e9960 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -425,8 +425,7 @@ def _str_replace( or flags or ( # substitution contains a named group pattern # https://docs.python.org/3/library/re.html - isinstance(repl, str) - and (r"\g<" in repl or re.search(r"\\\d", repl) is not None) + isinstance(repl, str) and r"\g<" in repl ) ): return super()._str_replace(pat, repl, n, case, flags, regex) From 60d6c957fe41cd64bdc8e3f8f264b365ccbf1854 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 
2025 19:05:25 -0300 Subject: [PATCH 4/6] test: add tests for pyarrow \d replacement Also tests for ambiguous reference \d + another literal as digit. --- pandas/tests/strings/test_find_replace.py | 74 +++++++++++++++++++---- 1 file changed, 63 insertions(+), 11 deletions(-) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 951984664b4f8..5e9127d13e00b 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -9,6 +9,7 @@ import pandas as pd from pandas import ( Series, + StringDtype, _testing as tm, ) from pandas.tests.strings import ( @@ -584,6 +585,10 @@ def test_replace_callable_raises(any_string_dtype, repl): r"\g \g \g", ["Three Two One", "Baz Bar Foo"], ), + ( + r"\3 \2 \1", + ["Three Two One", "Baz Bar Foo"], + ), ( r"\g<3> \g<2> \g<1>", ["Three Two One", "Baz Bar Foo"], @@ -599,6 +604,7 @@ def test_replace_callable_raises(any_string_dtype, repl): ], ids=[ "named_groups_full_swap", + "numbered_groups_no_g_full_swap", "numbered_groups_full_swap", "single_group_with_literal", "mixed_group_reference_with_literal", @@ -623,33 +629,79 @@ def test_replace_named_groups_regex_swap( [ r"\g<20>", r"\20", + r"\40", + r"\4", ], ) @pytest.mark.parametrize("use_compile", [True, False]) def test_replace_named_groups_regex_swap_expected_fail( - any_string_dtype, repl, use_compile + any_string_dtype, repl, use_compile, request ): # GH#57636 + if ( + not use_compile + and r"\g" not in repl + and isinstance(any_string_dtype, StringDtype) + and any_string_dtype.storage == "pyarrow" + ): + # calls pyarrow method directly + if repl == r"\20": + mark = pytest.mark.xfail(reason="PyArrow interprets as group + literal") + request.applymarker(mark) + + pa = pytest.importorskip("pyarrow") + error_type = pa.ArrowInvalid + error_msg = r"only has \d parenthesized subexpressions" + else: + error_type = re.error + error_msg = "invalid group reference" + pattern = r"(?P\w+) (?P\w+) (?P\w+)" 
if use_compile: pattern = re.compile(pattern) ser = Series(["One Two Three", "Foo Bar Baz"], dtype=any_string_dtype) - with pytest.raises(re.error, match="invalid group reference"): + with pytest.raises(error_type, match=error_msg): ser.str.replace(pattern, repl, regex=True) -@pytest.mark.parametrize("use_compile", [True, False]) -def test_replace_non_named_group(any_string_dtype, use_compile): - ser = Series(["var.one[0]", "var.two[1]", "var.three[2]"], dtype=any_string_dtype) - pattern = r"\[(\d+)\]" - if use_compile: - pattern = re.compile(pattern) - repl = r"(\1)" +@pytest.mark.parametrize( + "pattern, repl", + [ + (r"(\w+) (\w+) (\w+)", r"\20"), + (r"(?P\w+) (?P\w+) (?P\w+)", r"\20"), + ], +) +def test_pyarrow_ambiguous_group_references(pyarrow_string_dtype, pattern, repl): + # GH#62653 + ser = Series(["One Two Three", "Foo Bar Baz"], dtype=pyarrow_string_dtype) + result = ser.str.replace(pattern, repl, regex=True) - expected = Series( - ["var.one(0)", "var.two(1)", "var.three(2)"], dtype=any_string_dtype + expected = Series(["Two0", "Bar0"], dtype=pyarrow_string_dtype) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize( + "pattern, repl, expected_list", + [ + ( + r"\[(?P\d+)\]", + r"(\1)", + ["var.one(0)", "var.two(1)", "var.three(2)"], + ), + ( + r"\[(\d+)\]", + r"(\1)", + ["var.one(0)", "var.two(1)", "var.three(2)"], + ), + ], +) +def test_pyarrow_backend_group_replacement(pattern, repl, expected_list): + ser = Series(["var.one[0]", "var.two[1]", "var.three[2]"]).convert_dtypes( + dtype_backend="pyarrow" ) + result = ser.str.replace(pattern, repl, regex=True) + expected = Series(expected_list).convert_dtypes(dtype_backend="pyarrow") tm.assert_series_equal(result, expected) From e714f586e3949ad7eb2be1095a4841e374dc303c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 19:19:26 -0300 Subject: [PATCH 5/6] docs(string): add bugfix entry to whatsnew for str.replace --- doc/source/whatsnew/v3.0.0.rst | 1 + 
1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 75b4c5c0fe14d..ac4056cc7bfd2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -1035,6 +1035,7 @@ Conversion Strings ^^^^^^^ +- Bug in :meth:`Series.str.replace` raising an error for valid numeric group references (``\1``, ``\2``, etc.) in the replacement string when the :class:`Series` has been converted to a PyArrow-backed dtype (:issue:`62653`) - Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for :class:`ArrowDtype` (:issue:`61485`) - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`) - Bug in multiplication with a :class:`StringDtype` incorrectly allowing multiplying by bools; explicitly cast to integers instead (:issue:`62595`) From 7411ce8250971d96c752e2bb4061dacfcd0f5668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Kothe?= Date: Sun, 26 Oct 2025 20:03:09 -0300 Subject: [PATCH 6/6] test(str.replace): skip if pyarrow isn't installed --- pandas/tests/strings/test_find_replace.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py index 5e9127d13e00b..12a3bd63cea90 100644 --- a/pandas/tests/strings/test_find_replace.py +++ b/pandas/tests/strings/test_find_replace.py @@ -696,6 +696,7 @@ def test_pyarrow_ambiguous_group_references(pyarrow_string_dtype, pattern, repl) ), ], ) +@td.skip_if_no("pyarrow") def test_pyarrow_backend_group_replacement(pattern, repl, expected_list): ser = Series(["var.one[0]", "var.two[1]", "var.three[2]"]).convert_dtypes( dtype_backend="pyarrow"