From a55d441515b27519a6f5fdfa43d620a682608943 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Sep 2025 10:06:40 +0200 Subject: [PATCH 1/2] BUG: avoid validation error for ufunc with string[python] array --- doc/source/whatsnew/v2.3.3.rst | 2 +- pandas/core/arrays/numpy_.py | 9 ++++++++ pandas/tests/arrays/string_/test_string.py | 27 ++++++++++++++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst index 0accd07eb366a..6c106841e970b 100644 --- a/doc/source/whatsnew/v2.3.3.rst +++ b/doc/source/whatsnew/v2.3.3.rst @@ -47,7 +47,7 @@ Bug fixes - Fix regression in ``~Series.str.contains``, ``~Series.str.match`` and ``~Series.str.fullmatch`` with a compiled regex and custom flags (:issue:`62240`) - Fix :meth:`Series.str.match` and :meth:`Series.str.fullmatch` not matching patterns with groups correctly for the Arrow-backed string dtype (:issue:`61072`) - +- Fix error being raised when using a numpy ufunc with a Python-backed string array (:issue:`40800`) Improvements and fixes for Copy-on-Write ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index ef64bda3dc504..620d0b8696470 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -234,6 +234,15 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # e.g. 
test_np_max_nested_tuples return result else: + if self.dtype.type is str: + # StringDtype + try: + # specify dtype to preserve storage/na_value + return type(self)(result, dtype=self.dtype) + except ValueError: + # if validation of the ufunc result fails (result does not hold strings) + # -> fallback to returning raw numpy array + return result # one return value; re-box array-like results return type(self)(result) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 1a71f6c41c4f1..d3effb7c33457 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -840,3 +840,30 @@ def test_string_array_view_type_error(): arr = pd.array(["a", "b", "c"], dtype="string") with pytest.raises(TypeError, match="Cannot change data-type for string array."): arr.view("i8") + + +@pytest.mark.parametrize("box", [pd.Series, pd.array]) +def test_numpy_array_ufunc(dtype, box): + arr = box(["a", "bb", "ccc"], dtype=dtype) + + # custom ufunc that works with string (object) input -> returning numeric + str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1) + result = str_len_ufunc(arr) + expected_cls = pd.Series if box is pd.Series else np.array + # TODO: should we infer int64 dtype here? 
+ expected = expected_cls([1, 2, 3], dtype=object) + tm.assert_equal(result, expected) + + # custom ufunc returning strings + str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1) + result = str_multiply_ufunc(arr) + expected = box(["aa", "bbbb", "cccccc"], dtype=dtype) + if dtype.storage == "pyarrow": + # TODO ArrowStringArray should also preserve the class / dtype + if box is pd.array: + expected = np.array(["aa", "bbbb", "cccccc"], dtype=object) + else: + # not specifying the dtype because the exact dtype is not yet preserved + expected = pd.Series(["aa", "bbbb", "cccccc"]) + + tm.assert_equal(result, expected) From 5bcad4ecbd3f107fad2063274da46e33cb825e40 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 29 Sep 2025 13:44:02 +0200 Subject: [PATCH 2/2] fix typing --- pandas/core/arrays/numpy_.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 620d0b8696470..fab51ffa56919 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -5,6 +5,7 @@ Any, Literal, Self, + cast, ) import numpy as np @@ -48,6 +49,7 @@ ) from pandas import Index + from pandas.arrays import StringArray class NumpyExtensionArray( @@ -234,8 +236,9 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): # e.g. test_np_max_nested_tuples return result else: - if self.dtype.type is str: + if self.dtype.type is str: # type: ignore[comparison-overlap] # StringDtype + self = cast("StringArray", self) try: # specify dtype to preserve storage/na_value return type(self)(result, dtype=self.dtype)