diff --git a/numpy/_core/code_generators/generate_umath.py b/numpy/_core/code_generators/generate_umath.py index b72d13d11c6c..b64624702db7 100644 --- a/numpy/_core/code_generators/generate_umath.py +++ b/numpy/_core/code_generators/generate_umath.py @@ -1300,6 +1300,26 @@ def english_upper(s): docstrings.get('numpy._core.umath._zfill'), None, ), +'_partition_index': + Ufunc(3, 3, None, + docstrings.get('numpy._core.umath._partition_index'), + None, + ), +'_rpartition_index': + Ufunc(3, 3, None, + docstrings.get('numpy._core.umath._rpartition_index'), + None, + ), +'_partition': + Ufunc(2, 3, None, + docstrings.get('numpy._core.umath._partition'), + None, + ), +'_rpartition': + Ufunc(2, 3, None, + docstrings.get('numpy._core.umath._rpartition'), + None, + ), } def indent(st, spaces): diff --git a/numpy/_core/code_generators/ufunc_docstrings.py b/numpy/_core/code_generators/ufunc_docstrings.py index d214ffbccb55..a3e1965151f1 100644 --- a/numpy/_core/code_generators/ufunc_docstrings.py +++ b/numpy/_core/code_generators/ufunc_docstrings.py @@ -5028,3 +5028,184 @@ def add_newdoc(place, name, doc): array(['001', '-01', '+01'], dtype='>> x = np.array(["Numpy is nice!"]) + >>> np.strings.partition(x, " ") + (array(['Numpy'], dtype='>> a = np.array(['aAaAaA', ' aA ', 'abBABba']) + >>> np.strings.rpartition(a, 'A') + (array(['aAaAa', ' a', 'abB'], dtype='>> x = np.array(["Numpy is nice!"], dtype="T") + >>> np.strings.partition(x, " ") + (array(['Numpy'], dtype=StringDType()), + array([' '], dtype=StringDType()), + array(['is nice!'], dtype=StringDType())) + + """) + +add_newdoc('numpy._core.umath', '_rpartition', + """ + Partition each element in ``x1`` around the right-most separator, + ``x2``. + + For each element in ``x1``, split the element at the last + occurrence of ``x2`` at location ``x3``, and return a 3-tuple + containing the part before the separator, the separator itself, + and the part after the separator. 
If the separator is not found, + the third item of the tuple will contain the whole string, and + the first and second ones will be the empty string. + + Parameters + ---------- + x1 : array-like, with ``StringDType`` dtype + Input array + x2 : array-like, with ``StringDType`` dtype + Separator to split each string element in ``x1``. + + Returns + ------- + out : 3-tuple: + - ``StringDType`` array with the part before the separator + - ``StringDType`` array with the separator + - ``StringDType`` array with the part after the separator + + See Also + -------- + str.rpartition + + Examples + -------- + The ufunc is used most easily via ``np.strings.rpartition``, + which calls it directly; the ufunc locates the separator itself:: + + >>> a = np.array(['aAaAaA', ' aA ', 'abBABba'], dtype="T") + >>> np.strings.rpartition(a, 'A') + (array(['aAaAa', ' a', 'abB'], dtype=StringDType()), + array(['A', 'A', 'A'], dtype=StringDType()), + array(['', ' ', 'Bba'], dtype=StringDType())) + + """) diff --git a/numpy/_core/defchararray.py b/numpy/_core/defchararray.py index 44754a747cec..52a62791d382 100644 --- a/numpy/_core/defchararray.py +++ b/numpy/_core/defchararray.py @@ -17,16 +17,19 @@ """ import functools +import numpy as np from .._utils import set_module from .numerictypes import bytes_, str_, character from .numeric import ndarray, array as narray, asarray as asnarray from numpy._core.multiarray import compare_chararrays from numpy._core import overrides from numpy.strings import * -from numpy.strings import multiply as strings_multiply +from numpy.strings import ( + multiply as strings_multiply, + partition as strings_partition, + rpartition as strings_rpartition, +) from numpy._core.strings import ( - _partition as partition, - _rpartition as rpartition, _split as split, _rsplit as rsplit, _splitlines as splitlines, @@ -303,6 +306,88 @@ def multiply(a, i): raise ValueError("Can only multiply by integers") +def partition(a, sep): + """ + Partition each element in `a` around `sep`. 
+ + Calls :meth:`str.partition` element-wise. + + For each element in `a`, split the element at the first + occurrence of `sep`, and return 3 strings containing the part + before the separator, the separator itself, and the part after + the separator. If the separator is not found, return 3 strings + containing the string itself, followed by two empty strings. + + Parameters + ---------- + a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype + Input array + sep : {str, unicode} + Separator to split each string element in `a`. + + Returns + ------- + out : ndarray + Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype, + depending on input types. The output array will have an extra + dimension with 3 elements per input element. + + Examples + -------- + >>> x = np.array(["Numpy is nice!"]) + >>> np.char.partition(x, " ") + array([['Numpy', ' ', 'is nice!']], dtype='>> a = np.array(['aAaAaA', ' aA ', 'abBABba']) + >>> np.char.rpartition(a, 'A') + array([['aAaAa', 'A', ''], + [' a', 'A', ' '], + ['abB', 'A', 'Bba']], dtype=' buf, npy_int64 width, Buffer out) } +template +static inline void +string_partition(Buffer buf1, Buffer buf2, npy_int64 idx, + Buffer out1, Buffer out2, Buffer out3, + npy_intp *final_len1, npy_intp *final_len2, npy_intp *final_len3, + STARTPOSITION pos) +{ + // StringDType uses a ufunc that implements the find-part as well + assert(enc != ENCODING::UTF8); + + size_t len1 = buf1.num_codepoints(); + size_t len2 = buf2.num_codepoints(); + + if (len2 == 0) { + npy_gil_error(PyExc_ValueError, "empty separator"); + *final_len1 = *final_len2 = *final_len3 = -1; + return; + } + + if (idx < 0) { + if (pos == STARTPOSITION::FRONT) { + buf1.buffer_memcpy(out1, len1); + *final_len1 = len1; + *final_len2 = *final_len3 = 0; + } + else { + buf1.buffer_memcpy(out3, len1); + *final_len1 = *final_len2 = 0; + *final_len3 = len1; + } + return; + } + + buf1.buffer_memcpy(out1, idx); + *final_len1 = idx; + buf2.buffer_memcpy(out2, len2); + 
*final_len2 = len2; + (buf1 + idx + len2).buffer_memcpy(out3, len1 - idx - len2); + *final_len3 = len1 - idx - len2; +} + + #endif /* _NPY_CORE_SRC_UMATH_STRING_BUFFER_H_ */ diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 858493471f09..2bc4ce20acd6 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -582,6 +582,57 @@ string_zfill_loop(PyArrayMethod_Context *context, } +template +static int +string_partition_index_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + STARTPOSITION startposition = *(STARTPOSITION *)(context->method->static_data); + int elsize1 = context->descriptors[0]->elsize; + int elsize2 = context->descriptors[1]->elsize; + int outsize1 = context->descriptors[3]->elsize; + int outsize2 = context->descriptors[4]->elsize; + int outsize3 = context->descriptors[5]->elsize; + + char *in1 = data[0]; + char *in2 = data[1]; + char *in3 = data[2]; + char *out1 = data[3]; + char *out2 = data[4]; + char *out3 = data[5]; + + npy_intp N = dimensions[0]; + + while (N--) { + Buffer buf1(in1, elsize1); + Buffer buf2(in2, elsize2); + Buffer outbuf1(out1, outsize1); + Buffer outbuf2(out2, outsize2); + Buffer outbuf3(out3, outsize3); + + npy_intp final_len1, final_len2, final_len3; + string_partition(buf1, buf2, *(npy_int64 *)in3, outbuf1, outbuf2, outbuf3, + &final_len1, &final_len2, &final_len3, startposition); + if (final_len1 < 0 || final_len2 < 0 || final_len3 < 0) { + return -1; + } + outbuf1.buffer_fill_with_zeros_after_index(final_len1); + outbuf2.buffer_fill_with_zeros_after_index(final_len2); + outbuf3.buffer_fill_with_zeros_after_index(final_len3); + + in1 += strides[0]; + in2 += strides[1]; + in3 += strides[2]; + out1 += strides[3]; + out2 += strides[4]; + out3 += strides[5]; + } + + return 0; +} + + /* Resolve descriptors & promoter functions */ static 
NPY_CASTING @@ -947,6 +998,55 @@ string_zfill_resolve_descriptors( } +static int +string_partition_promoter(PyObject *NPY_UNUSED(ufunc), + PyArray_DTypeMeta *const op_dtypes[], PyArray_DTypeMeta *const signature[], + PyArray_DTypeMeta *new_op_dtypes[]) +{ + Py_INCREF(op_dtypes[0]); + new_op_dtypes[0] = op_dtypes[0]; + Py_INCREF(op_dtypes[1]); + new_op_dtypes[1] = op_dtypes[1]; + + new_op_dtypes[2] = NPY_DT_NewRef(&PyArray_Int64DType); + + Py_INCREF(op_dtypes[0]); + new_op_dtypes[3] = op_dtypes[0]; + Py_INCREF(op_dtypes[0]); + new_op_dtypes[4] = op_dtypes[0]; + Py_INCREF(op_dtypes[0]); + new_op_dtypes[5] = op_dtypes[0]; + return 0; +} + + +static NPY_CASTING +string_partition_resolve_descriptors( + PyArrayMethodObject *self, + PyArray_DTypeMeta *const NPY_UNUSED(dtypes[3]), + PyArray_Descr *const given_descrs[3], + PyArray_Descr *loop_descrs[3], + npy_intp *NPY_UNUSED(view_offset)) +{ + if (!given_descrs[3] || !given_descrs[4] || !given_descrs[5]) { + PyErr_Format(PyExc_TypeError, + "The '%s' ufunc requires the 'out' keyword to be set. The " + "python wrapper in numpy.strings can be used without the " + "out keyword.", self->name); + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + for (int i = 0; i < 6; i++) { + loop_descrs[i] = NPY_DT_CALL_ensure_canonical(given_descrs[i]); + if (!loop_descrs[i]) { + return _NPY_ERROR_OCCURRED_IN_CAST; + } + } + + return NPY_NO_CASTING; +} + + /* * Machinery to add the string loops to the existing ufuncs. 
*/ @@ -1228,7 +1328,7 @@ init_mixed_type_ufunc(PyObject *umath, const char *name, int nin, int nout, NPY_NO_EXPORT int init_string_ufuncs(PyObject *umath) { - NPY_TYPES dtypes[] = {NPY_STRING, NPY_STRING, NPY_STRING, NPY_STRING, NPY_STRING}; + NPY_TYPES dtypes[] = {NPY_STRING, NPY_STRING, NPY_STRING, NPY_STRING, NPY_STRING, NPY_STRING}; if (init_comparison(umath) < 0) { return -1; @@ -1599,6 +1699,34 @@ init_string_ufuncs(PyObject *umath) return -1; } + dtypes[0] = dtypes[1] = dtypes[3] = dtypes[4] = dtypes[5] = NPY_OBJECT; + dtypes[2] = NPY_INT64; + + const char *partition_names[] = {"_partition_index", "_rpartition_index"}; + + static STARTPOSITION partition_startpositions[] = { + STARTPOSITION::FRONT, STARTPOSITION::BACK + }; + + for (int i = 0; i < 2; i++) { + if (init_ufunc( + umath, partition_names[i], 3, 3, dtypes, ENCODING::ASCII, + string_partition_index_loop, + string_partition_resolve_descriptors, &partition_startpositions[i]) < 0) { + return -1; + } + if (init_ufunc( + umath, partition_names[i], 3, 3, dtypes, ENCODING::UTF32, + string_partition_index_loop, + string_partition_resolve_descriptors, &partition_startpositions[i]) < 0) { + return -1; + } + if (init_promoter(umath, partition_names[i], 3, 3, + string_partition_promoter) < 0) { + return -1; + } + } + return 0; } diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index 1f24bec59c63..2380fa9495bd 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -1888,6 +1888,181 @@ zfill_strided_loop(PyArrayMethod_Context *context, return -1; } +static NPY_CASTING +string_partition_resolve_descriptors( + PyArrayMethodObject *self, + PyArray_DTypeMeta *const NPY_UNUSED(dtypes[3]), + PyArray_Descr *const given_descrs[3], + PyArray_Descr *loop_descrs[3], + npy_intp *NPY_UNUSED(view_offset)) +{ + if (given_descrs[2] || given_descrs[3] || given_descrs[4]) { + PyErr_Format(PyExc_TypeError, "The 
StringDType '%s' ufunc does not " + "currently support the 'out' keyword", self->name); + return (NPY_CASTING)-1; + } + for (int i=0; i<2; i++) { + Py_INCREF(given_descrs[i]); + loop_descrs[i] = given_descrs[i]; + } + PyArray_StringDTypeObject *adescr = (PyArray_StringDTypeObject *)given_descrs[0]; + for (int i=2; i<5; i++) { + loop_descrs[i] = (PyArray_Descr *)new_stringdtype_instance( + adescr->na_object, adescr->coerce); + if (loop_descrs[i] == NULL) { + return (NPY_CASTING)-1; + } + } + + return NPY_NO_CASTING; +} + +NPY_NO_EXPORT int +string_partition_strided_loop( + PyArrayMethod_Context *context, + char *const data[], + npy_intp const dimensions[], + npy_intp const strides[], + NpyAuxData *NPY_UNUSED(auxdata)) +{ + STARTPOSITION startposition = *(STARTPOSITION *)(context->method->static_data); + int fastsearch_direction = + startposition == STARTPOSITION::FRONT ? FAST_SEARCH : FAST_RSEARCH; + + npy_intp N = dimensions[0]; + + char *in1 = data[0]; + char *in2 = data[1]; + char *out1 = data[2]; + char *out2 = data[3]; + char *out3 = data[4]; + + npy_intp in1_stride = strides[0]; + npy_intp in2_stride = strides[1]; + npy_intp out1_stride = strides[2]; + npy_intp out2_stride = strides[3]; + npy_intp out3_stride = strides[4]; + + npy_string_allocator *allocators[5] = {}; + NpyString_acquire_allocators(5, context->descriptors, allocators); + npy_string_allocator *in1allocator = allocators[0]; + npy_string_allocator *in2allocator = allocators[1]; + npy_string_allocator *out1allocator = allocators[2]; + npy_string_allocator *out2allocator = allocators[3]; + npy_string_allocator *out3allocator = allocators[4]; + + PyArray_StringDTypeObject *idescr = + (PyArray_StringDTypeObject *)context->descriptors[0]; + int has_string_na = idescr->has_string_na; + const npy_static_string *default_string = &idescr->default_string; + + while (N--) { + const npy_packed_static_string *i1ps = (npy_packed_static_string *)in1; + npy_static_string i1s = {0, NULL}; + const 
npy_packed_static_string *i2ps = (npy_packed_static_string *)in2; + npy_static_string i2s = {0, NULL}; + + int i1_isnull = NpyString_load(in1allocator, i1ps, &i1s); + int i2_isnull = NpyString_load(in2allocator, i2ps, &i2s); + + if (i1_isnull == -1 || i2_isnull == -1) { + npy_gil_error(PyExc_MemoryError, "Failed to load string in %s", + ((PyUFuncObject *)context->caller)->name); + goto fail; + } + else if (NPY_UNLIKELY(i1_isnull || i2_isnull)) { + if (!has_string_na) { + npy_gil_error(PyExc_ValueError, + "Null values are not supported in %s", + ((PyUFuncObject *)context->caller)->name); + goto fail; + } + else { + if (i1_isnull) { + i1s = *default_string; + } + if (i2_isnull) { + i2s = *default_string; + } + } + } + + if (i2s.size == 0) { + npy_gil_error(PyExc_ValueError, "empty separator"); + goto fail; + } + + npy_intp idx = fastsearch((char *)i1s.buf, i1s.size, (char *)i2s.buf, i2s.size, -1, + fastsearch_direction); + + npy_intp out1_size, out2_size, out3_size; + + if (idx == -1) { + if (startposition == STARTPOSITION::FRONT) { + out1_size = i1s.size; + out2_size = out3_size = 0; + } + else { + out1_size = out2_size = 0; + out3_size = i1s.size; + } + } + else { + out1_size = idx; + out2_size = i2s.size; + out3_size = i1s.size - out2_size - out1_size; + } + + npy_packed_static_string *o1ps = (npy_packed_static_string *)out1; + npy_static_string o1s = {0, NULL}; + npy_packed_static_string *o2ps = (npy_packed_static_string *)out2; + npy_static_string o2s = {0, NULL}; + npy_packed_static_string *o3ps = (npy_packed_static_string *)out3; + npy_static_string o3s = {0, NULL}; + + if (load_new_string(o1ps, &o1s, out1_size, out1allocator, + ((PyUFuncObject *)context->caller)->name) == -1) { + goto fail; + } + if (load_new_string(o2ps, &o2s, out2_size, out2allocator, + ((PyUFuncObject *)context->caller)->name) == -1) { + goto fail; + } + if (load_new_string(o3ps, &o3s, out3_size, out3allocator, + ((PyUFuncObject *)context->caller)->name) == -1) { + goto fail; + } + + if 
(idx == -1) { + if (startposition == STARTPOSITION::FRONT) { + memcpy((char *)o1s.buf, i1s.buf, out1_size); + } + else { + memcpy((char *)o3s.buf, i1s.buf, out3_size); + } + } + else { + memcpy((char *)o1s.buf, i1s.buf, out1_size); + memcpy((char *)o2s.buf, i2s.buf, out2_size); + memcpy((char *)o3s.buf, i1s.buf + out1_size + out2_size, out3_size); + } + + in1 += in1_stride; + in2 += in2_stride; + out1 += out1_stride; + out2 += out2_stride; + out3 += out3_stride; + } + + NpyString_release_allocators(5, allocators); + return 0; + + fail: + + NpyString_release_allocators(5, allocators); + return -1; +} + NPY_NO_EXPORT int string_inputs_promoter( PyObject *ufunc_obj, PyArray_DTypeMeta *const op_dtypes[], @@ -2645,5 +2820,28 @@ init_stringdtype_ufuncs(PyObject *umath) return -1; } + PyArray_DTypeMeta *partition_dtypes[] = { + &PyArray_StringDType, + &PyArray_StringDType, + &PyArray_StringDType, + &PyArray_StringDType, + &PyArray_StringDType + }; + + const char *partition_names[] = {"_partition", "_rpartition"}; + + static STARTPOSITION partition_startpositions[] = { + STARTPOSITION::FRONT, STARTPOSITION::BACK + }; + + for (int i=0; i<2; i++) { + if (init_ufunc(umath, partition_names[i], partition_dtypes, + string_partition_resolve_descriptors, + string_partition_strided_loop, 2, 3, NPY_NO_CASTING, + (NPY_ARRAYMETHOD_FLAGS) 0, &partition_startpositions[i]) < 0) { + return -1; + } + } + return 0; } diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py index c79c7db494ff..8707bed2ffbb 100644 --- a/numpy/_core/strings.py +++ b/numpy/_core/strings.py @@ -41,6 +41,10 @@ _ljust, _rjust, _zfill, + _partition, + _partition_index, + _rpartition, + _rpartition_index, ) @@ -51,7 +55,7 @@ "isupper", "istitle", "isdecimal", "isnumeric", "str_len", "find", "rfind", "index", "rindex", "count", "startswith", "endswith", "lstrip", "rstrip", "strip", "replace", "expandtabs", "center", "ljust", "rjust", - "zfill", + "zfill", "partition", "rpartition", # _vec_string - Will gradually 
become ufuncs as well "upper", "lower", "swapcase", "capitalize", "title", @@ -60,7 +64,7 @@ "mod", "decode", "encode", "translate", # Removed from namespace until behavior has been crystalized - # "join", "split", "rsplit", "splitlines", "partition", "rpartition", + # "join", "split", "rsplit", "splitlines", ] @@ -1315,72 +1319,98 @@ def _splitlines(a, keepends=None): a, np.object_, 'splitlines', _clean_args(keepends)) -def _partition(a, sep): +def partition(a, sep): """ - Partition each element in `a` around `sep`. + Partition each element in ``a`` around ``sep``. - Calls :meth:`str.partition` element-wise. - - For each element in `a`, split the element as the first - occurrence of `sep`, and return 3 strings containing the part + For each element in ``a``, split the element at the first + occurrence of ``sep``, and return a 3-tuple containing the part before the separator, the separator itself, and the part after - the separator. If the separator is not found, return 3 strings - containing the string itself, followed by two empty strings. + the separator. If the separator is not found, the first item of + the tuple will contain the whole string, and the second and third + ones will be the empty string. Parameters ---------- a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype Input array - sep : {str, unicode} - Separator to split each string element in `a`. + sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype + Separator to split each string element in ``a``. Returns ------- - out : ndarray - Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype, - depending on input types. The output array will have an extra - dimension with 3 elements per input element. 
+ out : 3-tuple: + - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the + part before the separator + - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the + separator + - array with ``StringDType``, ``bytes_`` or ``str_`` dtype with the + part after the separator - Examples - -------- - >>> x = np.array(["Numpy is nice!"]) - >>> np.strings.partition(x, " ") # doctest: +SKIP - array([['Numpy', ' ', 'is nice!']], dtype='>> x = np.array(["Numpy is nice!"]) + >>> np.strings.partition(x, " ") + (array(['Numpy'], dtype='>> a = np.array(['aAaAaA', ' aA ', 'abBABba']) - >>> np.strings.rpartition(a, 'A') # doctest: +SKIP - array([['aAaAa', 'A', ''], # doctest: +SKIP - [' a', 'A', ' '], # doctest: +SKIP - ['abB', 'A', 'Bba']], dtype='>> np.strings.rpartition(a, 'A') + (array(['aAaAa', ' a', 'abB'], dtype='