diff --git a/doc/release/upcoming_changes/22863.new_feature.rst b/doc/release/upcoming_changes/22863.new_feature.rst
index 88ec3f641574..3f45ed8342ff 100644
--- a/doc/release/upcoming_changes/22863.new_feature.rst
+++ b/doc/release/upcoming_changes/22863.new_feature.rst
@@ -1,7 +1,4 @@
-String dtype instances can be created from the string abstract dtype classes
-----------------------------------------------------------------------------
-It is now possible to create a string dtype instance with a size without
-using the string name of the dtype. For example, ``type(np.dtype('U'))(8)``
-will create a dtype that is equivalent to ``np.dtype('U8')``. This feature
-is most useful when writing generic code dealing with string dtype
-classes.
+String functions in np.char are compatible with NEP 42 custom dtypes
+--------------------------------------------------------------------
+Custom dtypes that represent unicode strings or byte strings can now be
+passed to the string functions in np.char.
diff --git a/doc/release/upcoming_changes/22963.new_feature.rst b/doc/release/upcoming_changes/22963.new_feature.rst
new file mode 100644
index 000000000000..88ec3f641574
--- /dev/null
+++ b/doc/release/upcoming_changes/22963.new_feature.rst
@@ -0,0 +1,7 @@
+String dtype instances can be created from the string abstract dtype classes
+----------------------------------------------------------------------------
+It is now possible to create a string dtype instance with a size without
+using the string name of the dtype. For example, ``type(np.dtype('U'))(8)``
+will create a dtype that is equivalent to ``np.dtype('U8')``. This feature
+is most useful when writing generic code dealing with string dtype
+classes.
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py index d312506ffed2..98db3d882a30 100644 --- a/numpy/core/defchararray.py +++ b/numpy/core/defchararray.py @@ -46,26 +46,29 @@ overrides.array_function_dispatch, module='numpy.char') -def _use_unicode(*args): - """ - Helper function for determining the output type of some string - operations. +def _is_unicode(arr): + """Returns True if arr is a string or a string array with a dtype that + represents a unicode string, otherwise returns False. - For an operation on two ndarrays, if at least one is unicode, the - result should be unicode. """ - for x in args: - if (isinstance(x, str) or - issubclass(numpy.asarray(x).dtype.type, unicode_)): - return unicode_ - return string_ + if (isinstance(arr, str) or + issubclass(numpy.asarray(arr).dtype.type, str)): + return True + return False + -def _to_string_or_unicode_array(result): +def _to_string_or_unicode_array(result, output_dtype_like=None): """ - Helper function to cast a result back into a string or unicode array - if an object array must be used as an intermediary. + Helper function to cast a result back into an array + with the appropriate dtype if an object array must be used + as an intermediary. """ - return numpy.asarray(result.tolist()) + ret = numpy.asarray(result.tolist()) + dtype = getattr(output_dtype_like, 'dtype', None) + if dtype is not None: + return ret.astype(type(dtype)(_get_num_chars(ret)), copy=False) + return ret + def _clean_args(*args): """ @@ -319,9 +322,19 @@ def add(x1, x2): arr1 = numpy.asarray(x1) arr2 = numpy.asarray(x2) out_size = _get_num_chars(arr1) + _get_num_chars(arr2) - dtype = _use_unicode(arr1, arr2) - return _vec_string(arr1, (dtype, out_size), '__add__', (arr2,)) + if type(arr1.dtype) != type(arr2.dtype): + # Enforce this for now. The solution to it will be implement add + # as a ufunc. 
It never worked right on Python 3: bytes + unicode gave + # nonsense unicode + bytes errored, and unicode + object used the + # object dtype itemsize as num chars (worked on short strings). + # bytes + void worked but promoting void->bytes is dubious also. + raise TypeError( + "np.char.add() requires both arrays of the same dtype kind, but " + f"got dtypes: '{arr1.dtype}' and '{arr2.dtype}' (the few cases " + "where this used to work often lead to incorrect results).") + + return _vec_string(arr1, type(arr1.dtype)(out_size), '__add__', (arr2,)) def _multiply_dispatcher(a, i): return (a,) @@ -371,7 +384,7 @@ def multiply(a, i): raise ValueError("Can only multiply by integers") out_size = _get_num_chars(a_arr) * max(int(i_arr.max()), 0) return _vec_string( - a_arr, (a_arr.dtype.type, out_size), '__mul__', (i_arr,)) + a_arr, type(a_arr.dtype)(out_size), '__mul__', (i_arr,)) def _mod_dispatcher(a, values): @@ -403,7 +416,7 @@ def mod(a, values): """ return _to_string_or_unicode_array( - _vec_string(a, object_, '__mod__', (values,))) + _vec_string(a, object_, '__mod__', (values,)), a) @array_function_dispatch(_unary_op_dispatcher) @@ -499,7 +512,7 @@ def center(a, width, fillchar=' '): if numpy.issubdtype(a_arr.dtype, numpy.string_): fillchar = asbytes(fillchar) return _vec_string( - a_arr, (a_arr.dtype.type, size), 'center', (width_arr, fillchar)) + a_arr, type(a_arr.dtype)(size), 'center', (width_arr, fillchar)) def _count_dispatcher(a, sub, start=None, end=None): @@ -723,7 +736,7 @@ def expandtabs(a, tabsize=8): """ return _to_string_or_unicode_array( - _vec_string(a, object_, 'expandtabs', (tabsize,))) + _vec_string(a, object_, 'expandtabs', (tabsize,)), a) @array_function_dispatch(_count_dispatcher) @@ -1043,7 +1056,7 @@ def join(sep, seq): """ return _to_string_or_unicode_array( - _vec_string(sep, object_, 'join', (seq,))) + _vec_string(sep, object_, 'join', (seq,)), seq) @@ -1084,7 +1097,7 @@ def ljust(a, width, fillchar=' '): if numpy.issubdtype(a_arr.dtype, 
numpy.string_): fillchar = asbytes(fillchar) return _vec_string( - a_arr, (a_arr.dtype.type, size), 'ljust', (width_arr, fillchar)) + a_arr, type(a_arr.dtype)(size), 'ljust', (width_arr, fillchar)) @array_function_dispatch(_unary_op_dispatcher) @@ -1218,7 +1231,7 @@ def partition(a, sep): """ return _to_string_or_unicode_array( - _vec_string(a, object_, 'partition', (sep,))) + _vec_string(a, object_, 'partition', (sep,)), a) def _replace_dispatcher(a, old, new, count=None): @@ -1263,8 +1276,7 @@ def replace(a, old, new, count=None): array(['The dwash was fresh', 'Thwas was it'], dtype='flags & NPY_DT_LEGACY) != 0) #define NPY_DT_is_abstract(dtype) (((dtype)->flags & NPY_DT_ABSTRACT) != 0) #define NPY_DT_is_parametric(dtype) (((dtype)->flags & NPY_DT_PARAMETRIC) != 0) +#define NPY_DT_is_user_defined(dtype) (((dtype)->type_num == -1)) /* * Macros for convenient classmethod calls, since these require diff --git a/numpy/core/src/multiarray/multiarraymodule.c b/numpy/core/src/multiarray/multiarraymodule.c index 5da3d66df196..94fa2a9092ac 100644 --- a/numpy/core/src/multiarray/multiarraymodule.c +++ b/numpy/core/src/multiarray/multiarraymodule.c @@ -3785,6 +3785,34 @@ format_longfloat(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) TrimMode_LeaveOneZero, -1, -1); } +/* + * returns 1 if array is a user-defined string dtype, sets an error and + * returns 0 otherwise + */ +static int _is_user_defined_string_array(PyArrayObject* array) +{ + if (NPY_DT_is_user_defined(PyArray_DESCR(array))) { + PyTypeObject* scalar_type = NPY_DTYPE(PyArray_DESCR(array))->scalar_type; + if (PyType_IsSubtype(scalar_type, &PyBytes_Type) || + PyType_IsSubtype(scalar_type, &PyUnicode_Type)) { + return 1; + } + else { + PyErr_SetString( + PyExc_TypeError, + "string comparisons are only allowed for dtypes with a " + "scalar type that is a subtype of str or bytes."); + return 0; + } + } + else { + PyErr_SetString( + PyExc_TypeError, + "string operation on non-string array"); + return 0; 
+ } +} + /* * The only purpose of this function is that it allows the "rstrip". @@ -3861,6 +3889,9 @@ compare_chararrays(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds) else { PyErr_SetString(PyExc_TypeError, "comparison of non-string arrays"); + Py_DECREF(newarr); + Py_DECREF(newoth); + return NULL; } Py_DECREF(newarr); Py_DECREF(newoth); @@ -4061,10 +4092,15 @@ _vec_string(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *NPY_UNUSED(kw method = PyObject_GetAttr((PyObject *)&PyUnicode_Type, method_name); } else { - PyErr_SetString(PyExc_TypeError, - "string operation on non-string array"); - Py_DECREF(type); - goto err; + if (_is_user_defined_string_array(char_array)) { + PyTypeObject* scalar_type = + NPY_DTYPE(PyArray_DESCR(char_array))->scalar_type; + method = PyObject_GetAttr((PyObject*)scalar_type, method_name); + } + else { + Py_DECREF(type); + goto err; + } } if (method == NULL) { Py_DECREF(type); diff --git a/numpy/core/tests/test_defchararray.py b/numpy/core/tests/test_defchararray.py index 22296604ee5c..8d92d97f7840 100644 --- a/numpy/core/tests/test_defchararray.py +++ b/numpy/core/tests/test_defchararray.py @@ -1,3 +1,5 @@ +import pytest + import numpy as np from numpy.core.multiarray import _vec_string from numpy.testing import ( @@ -670,3 +672,15 @@ def test_empty_indexing(): # empty chararray instead of a chararray with a single empty string in it. s = np.chararray((4,)) assert_(s[[]].size == 0) + + +@pytest.mark.parametrize(["dt1", "dt2"], + [("S", "U"), ("U", "S"), ("S", "O"), ("U", "O"), + ("S", "d"), ("S", "V")]) +def test_add_types(dt1, dt2): + arr1 = np.array([1234234], dtype=dt1) + # If the following fails, e.g. use a number and test "V" explicitly + arr2 = np.array([b"423"], dtype=dt2) + with pytest.raises(TypeError, + match=f".*same dtype kind.*{arr1.dtype}.*{arr2.dtype}"): + np.char.add(arr1, arr2)