diff --git a/numpy/_core/code_generators/generate_umath.py b/numpy/_core/code_generators/generate_umath.py index b72d13d11c6c..6d3fc13d7a50 100644 --- a/numpy/_core/code_generators/generate_umath.py +++ b/numpy/_core/code_generators/generate_umath.py @@ -1300,6 +1300,16 @@ def english_upper(s): docstrings.get('numpy._core.umath._zfill'), None, ), +'_partition': + Ufunc(2, 3, None, + docstrings.get('numpy._core.umath._partition'), + None, + ), +'_rpartition': + Ufunc(2, 3, None, + docstrings.get('numpy._core.umath._rpartition'), + None, + ), } def indent(st, spaces): diff --git a/numpy/_core/code_generators/ufunc_docstrings.py b/numpy/_core/code_generators/ufunc_docstrings.py index d214ffbccb55..531c3763b464 100644 --- a/numpy/_core/code_generators/ufunc_docstrings.py +++ b/numpy/_core/code_generators/ufunc_docstrings.py @@ -5028,3 +5028,85 @@ def add_newdoc(place, name, doc): array(['001', '-01', '+01'], dtype='>> x = np.array(["Numpy is nice!"]) + >>> np.strings.partition(x, " ") + array([['Numpy', ' ', 'is nice!']], dtype='>> a = np.array(['aAaAaA', ' aA ', 'abBABba']) + >>> np.strings.rpartition(a, 'A') + array([['aAaAa', 'A', ''], + [' a', 'A', ' '], + ['abB', 'A', 'Bba']], dtype='>> x = np.array(["Numpy is nice!"]) + >>> np.strings.partition(x, " ") # doctest: +SKIP + array([['Numpy', ' ', 'is nice!']], dtype='>> a = np.array(['aAaAaA', ' aA ', 'abBABba']) + >>> np.strings.rpartition(a, 'A') # doctest: +SKIP + array([['aAaAa', 'A', ''], # doctest: +SKIP + [' a', 'A', ' '], # doctest: +SKIP + ['abB', 'A', 'Bba']], dtype=' buf, npy_int64 width, Buffer out) } +template +static inline npy_bool +string_partition(Buffer buf1, Buffer buf2, + Buffer out1, Buffer out2, + npy_intp *final_len1, npy_intp *final_len2, + STARTPOSITION pos) +{ + size_t len1 = buf1.num_codepoints(); + size_t len2 = buf2.num_codepoints(); + + if (len2 == 0) { + npy_gil_error(PyExc_ValueError, "empty separator"); + *final_len1 = *final_len2 = -1; + return false; + } + + if (len1 < len2) { + buf1.buffer_memcpy(out1, len1); + *final_len1 = len1; + *final_len2 = 0; + return false; + } + + npy_intp idx; + switch(enc) { + case ENCODING::UTF8: + assert(0); // TODO + break; + case ENCODING::ASCII: + idx = fastsearch(buf1.buf, len1, buf2.buf, len2, -1, + pos == STARTPOSITION::FRONT ? FAST_SEARCH : FAST_RSEARCH); + break; + case ENCODING::UTF32: + idx = fastsearch((npy_ucs4 *)buf1.buf, len1, (npy_ucs4 *)buf2.buf, len2, -1, + pos == STARTPOSITION::FRONT ? FAST_SEARCH : FAST_RSEARCH); + break; + } + + if (idx < 0) { + if (pos == STARTPOSITION::FRONT) { + buf1.buffer_memcpy(out1, len1); + *final_len1 = len1; + *final_len2 = 0; + } + else { + buf1.buffer_memcpy(out2, len1); + *final_len1 = 0; + *final_len2 = len1; + } + return false; + } + + buf1.buffer_memcpy(out1, idx); + *final_len1 = idx; + (buf1 + idx + len2).buffer_memcpy(out2, len1 - idx - len2); + *final_len2 = len1 - idx - len2; + return true; +} + + #endif /* _NPY_CORE_SRC_UMATH_STRING_BUFFER_H_ */ diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 337c8f65ba1e..380f4206889c 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -582,6 +582,52 @@ string_zfill_loop(PyArrayMethod_Context *context, } +template +static int +string_partition_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + STARTPOSITION startposition = *(STARTPOSITION *)(context->method->static_data); + int elsize1 = context->descriptors[0]->elsize; + int elsize2 = context->descriptors[1]->elsize; + int outsize1 = context->descriptors[2]->elsize; + int outsize2 = context->descriptors[4]->elsize; + + char *in1 = data[0]; + char *in2 = data[1]; + char *out1 = data[2]; + char *out2 = data[3]; + char *out3 = data[4]; + + npy_intp N = dimensions[0]; + + while (N--) { + Buffer buf1(in1, elsize1); + Buffer buf2(in2, elsize2); + Buffer outbuf1(out1, outsize1); + Buffer outbuf2(out3, outsize2); + + npy_intp final_len1, final_len2; + *(npy_bool *) out2 = string_partition(buf1, buf2, outbuf1, outbuf2, + &final_len1, &final_len2, startposition); + if (final_len1 < 0 || final_len2 < 0) { + return -1; + } + outbuf1.buffer_fill_with_zeros_after_index(final_len1); + outbuf2.buffer_fill_with_zeros_after_index(final_len2); + + in1 += strides[0]; + in2 += strides[1]; + out1 += strides[2]; + out2 += strides[3]; + out3 += strides[4]; + } + + return 0; +} + + /* Resolve descriptors & promoter functions */ static NPY_CASTING @@ -947,6 +993,82 @@ string_zfill_resolve_descriptors( } +static int +string_partition_promoter(PyObject *NPY_UNUSED(ufunc), + PyArray_DTypeMeta *op_dtypes[], PyArray_DTypeMeta *signature[], + PyArray_DTypeMeta *new_op_dtypes[]) +{ + Py_INCREF(op_dtypes[0]); + new_op_dtypes[0] = op_dtypes[0]; + Py_INCREF(op_dtypes[1]); + new_op_dtypes[1] = op_dtypes[1]; + + + Py_INCREF(op_dtypes[0]); + new_op_dtypes[2] = op_dtypes[0]; + new_op_dtypes[3] = NPY_DT_NewRef(&PyArray_BoolDType); + Py_INCREF(op_dtypes[4]); + new_op_dtypes[4] = op_dtypes[0]; + return 0; +} + + +static NPY_CASTING +string_partition_resolve_descriptors( + PyArrayMethodObject *NPY_UNUSED(self), + PyArray_DTypeMeta *NPY_UNUSED(dtypes[3]), + PyArray_Descr *given_descrs[3], + PyArray_Descr *loop_descrs[3], + npy_intp *NPY_UNUSED(view_offset)) +{ + if (given_descrs[2] == NULL) { + PyErr_SetString( + PyExc_TypeError, + "The 'out' kwarg is necessary. Use numpy.strings without it."); + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + if (given_descrs[4] == NULL) { + PyErr_SetString( + PyExc_TypeError, + "The 'out' kwarg is necessary. Use numpy.strings without it."); + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + loop_descrs[0] = NPY_DT_CALL_ensure_canonical(given_descrs[0]); + if (loop_descrs[0] == NULL) { + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + loop_descrs[1] = NPY_DT_CALL_ensure_canonical(given_descrs[1]); + if (loop_descrs[1] == NULL) { + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + loop_descrs[2] = NPY_DT_CALL_ensure_canonical(given_descrs[2]); + if (loop_descrs[2] == NULL) { + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + if (loop_descrs[3] == NULL) { + loop_descrs[3] = PyArray_DescrFromType(NPY_BOOL); + } + else { + loop_descrs[3] = NPY_DT_CALL_ensure_canonical(given_descrs[3]); + } + if (loop_descrs[3] == NULL) { + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + loop_descrs[4] = NPY_DT_CALL_ensure_canonical(given_descrs[4]); + if (loop_descrs[4] == NULL) { + return _NPY_ERROR_OCCURRED_IN_CAST; + } + + return NPY_NO_CASTING; +} + + /* * Machinery to add the string loops to the existing ufuncs. */ @@ -1599,6 +1721,34 @@ init_string_ufuncs(PyObject *umath) return -1; } + dtypes[0] = dtypes[1] = dtypes[2] = dtypes[4] = NPY_OBJECT; + dtypes[3] = NPY_BOOL; + + const char *partition_names[] = {"_partition", "_rpartition"}; + + static STARTPOSITION partition_startpositions[] = { + STARTPOSITION::FRONT, STARTPOSITION::BACK + }; + + for (int i = 0; i < 2; i++) { + if (init_ufunc( + umath, partition_names[i], 2, 3, dtypes, ENCODING::ASCII, + string_partition_loop, + string_partition_resolve_descriptors, &partition_startpositions[i]) < 0) { + return -1; + } + if (init_ufunc( + umath, partition_names[i], 2, 3, dtypes, ENCODING::UTF32, + string_partition_loop, + string_partition_resolve_descriptors, &partition_startpositions[i]) < 0) { + return -1; + } + if (init_promoter(umath, partition_names[i], 2, 3, + string_partition_promoter) < 0) { + return -1; + } + } + return 0; } diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py index c79c7db494ff..259460c3f63c 100644 --- a/numpy/_core/strings.py +++ b/numpy/_core/strings.py @@ -41,6 +41,8 @@ _ljust, _rjust, _zfill, + _partition, + _rpartition, ) @@ -51,7 +53,7 @@ "isupper", "istitle", "isdecimal", "isnumeric", "str_len", "find", "rfind", "index", "rindex", "count", "startswith", "endswith", "lstrip", "rstrip", "strip", "replace", "expandtabs", "center", "ljust", "rjust", - "zfill", + "zfill", "partition", "rpartition", # _vec_string - Will gradually become ufuncs as well "upper", "lower", "swapcase", "capitalize", "title", @@ -60,7 +62,7 @@ "mod", "decode", "encode", "translate", # Removed from namespace until behavior has been crystalized - # "join", "split", "rsplit", "splitlines", "partition", "rpartition", + # "join", "split", "rsplit", "splitlines", ] @@ -1315,72 +1317,90 @@ def _splitlines(a, keepends=None): a, np.object_, 'splitlines', _clean_args(keepends)) -def _partition(a, sep): +def partition(a, sep): """ - Partition each element in `a` around `sep`. + Partition each element in ``a`` around ``sep``. - Calls :meth:`str.partition` element-wise. - - For each element in `a`, split the element as the first - occurrence of `sep`, and return 3 strings containing the part - before the separator, the separator itself, and the part after - the separator. If the separator is not found, return 3 strings - containing the string itself, followed by two empty strings. + For each element in ``a``, split the element at the first + occurrence of ``sep``, and return a 3-tuple containing the part + before the separator, a boolean signifying whether the separator + was found, and the part after the separator. If the separator is + not found, the first part will contain the whole string, + the boolean will be false, and the third part will be the empty + string. Parameters ---------- a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype Input array - sep : {str, unicode} - Separator to split each string element in `a`. + sep : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype + Separator to split each string element in ``a``. Returns ------- out : ndarray - Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype, - depending on input types. The output array will have an extra - dimension with 3 elements per input element. + 3-tuple: + - ``StringDType``, ``bytes_`` or ``str_`` dtype string with the part + before the separator + - ``bool_`` dtype, whether the separator was found + - ``StringDType``, ``bytes_`` or ``str_`` dtype string with the part + after the separator - Examples - -------- - >>> x = np.array(["Numpy is nice!"]) - >>> np.strings.partition(x, " ") # doctest: +SKIP - array([['Numpy', ' ', 'is nice!']], dtype='>> x = np.array(["Numpy is nice!"]) + >>> np.strings.partition(x, " ") + array([['Numpy', ' ', 'is nice!']], dtype='>> a = np.array(['aAaAaA', ' aA ', 'abBABba']) - >>> np.strings.rpartition(a, 'A') # doctest: +SKIP - array([['aAaAa', 'A', ''], # doctest: +SKIP - [' a', 'A', ' '], # doctest: +SKIP - ['abB', 'A', 'Bba']], dtype='>> np.strings.rpartition(a, 'A') + array([['aAaAa', 'A', ''], + [' a', 'A', ' '], + ['abB', 'A', 'Bba']], dtype='