Skip to content

Commit

Permalink
ENH: Add partition/rpartition ufunc for string dtypes (#26082)
Browse files Browse the repository at this point in the history
* ENH: Add partition/rpartition ufunc for string dtypes

Closes #25993.

* Fix doctests

* Fix docstrings in ufunc_docstrings.py as well

* Return array with the separators // optimize using find ufunc results

* Address feedback

* Fix chararray __array_finalize__

* ENH: add stringdtype partition/rpartition

* BUG: remove unnecessary size_t cast

* BUG: fix error handling and resource cleanup

* MNT: refactor so stringdtype can combine find and partition

* MNT: update signatures to reflect const API changes

* MNT: simplify fastsearch call

* MNT: move variable binding out of inner loop

* Fix error message about out; fix promoter

* Remove unused import in defchararray; add assertion

* BUG: don't use a user-provided descriptor to initialize a new stringdtype view

* MNT: back out attempted fix for stringdtype view problem

* MNT: address code review comments

---------

Co-authored-by: Nathan Goldbaum <nathan.goldbaum@gmail.com>
  • Loading branch information
lysnikolaou and ngoldbaum committed Mar 26, 2024
1 parent 6d4a0f7 commit e1bf1d6
Show file tree
Hide file tree
Showing 9 changed files with 850 additions and 49 deletions.
20 changes: 20 additions & 0 deletions numpy/_core/code_generators/generate_umath.py
Expand Up @@ -1300,6 +1300,26 @@ def english_upper(s):
docstrings.get('numpy._core.umath._zfill'),
None,
),
'_partition_index':
Ufunc(3, 3, None,
docstrings.get('numpy._core.umath._partition_index'),
None,
),
'_rpartition_index':
Ufunc(3, 3, None,
docstrings.get('numpy._core.umath._rpartition_index'),
None,
),
'_partition':
Ufunc(2, 3, None,
docstrings.get('numpy._core.umath._partition'),
None,
),
'_rpartition':
Ufunc(2, 3, None,
docstrings.get('numpy._core.umath._rpartition'),
None,
),
}

def indent(st, spaces):
Expand Down
181 changes: 181 additions & 0 deletions numpy/_core/code_generators/ufunc_docstrings.py
Expand Up @@ -5028,3 +5028,184 @@ def add_newdoc(place, name, doc):
array(['001', '-01', '+01'], dtype='<U3')
""")

# Docstring text for the private ``_partition_index`` ufunc.  As described in
# the text below, this variant does not search for the separator itself: the
# caller passes the already-computed separator index as the third input
# (``x3``), and ``np.strings.partition`` is the intended public entry point.
add_newdoc('numpy._core.umath', '_partition_index',
"""
Partition each element in ``x1`` around ``x2``, at precomputed
index ``x3``.
For each element in ``x1``, split the element at the first
occurrence of ``x2`` at location ``x3``, and return a 3-tuple
containing the part before the separator, the separator itself,
and the part after the separator. If the separator is not found,
the first item of the tuple will contain the whole string, and
the second and third ones will be the empty string.
Parameters
----------
x1 : array-like, with ``bytes_``, or ``str_`` dtype
Input array
x2 : array-like, with ``bytes_``, or ``str_`` dtype
Separator to split each string element in ``x1``.
x3 : array-like, with any integer dtype
The indices of the separator (<0 to indicate the separator is not
present).
Returns
-------
out : 3-tuple:
- array with ``bytes_`` or ``str_`` dtype with the part before the
separator
- array with ``bytes_`` or ``str_`` dtype with the separator
- array with ``bytes_`` or ``str_`` dtype with the part after the
separator
See Also
--------
str.partition
Examples
--------
The ufunc is used most easily via ``np.strings.partition``,
which calls it after calculating the indices::
>>> x = np.array(["Numpy is nice!"])
>>> np.strings.partition(x, " ")
(array(['Numpy'], dtype='<U5'),
array([' '], dtype='<U1'),
array(['is nice!'], dtype='<U8'))
""")

# Docstring text for the private ``_rpartition_index`` ufunc, the right-most
# counterpart of ``_partition_index``.  As described in the text below, the
# separator index (``x3``) is supplied by the caller, and when the separator
# is absent the whole string ends up in the *third* output, not the first.
add_newdoc('numpy._core.umath', '_rpartition_index',
"""
Partition each element in ``x1`` around the right-most separator,
``x2``, at precomputed index ``x3``.
For each element in ``x1``, split the element at the last
occurrence of ``x2`` at location ``x3``, and return a 3-tuple
containing the part before the separator, the separator itself,
and the part after the separator. If the separator is not found,
the third item of the tuple will contain the whole string, and
the first and second ones will be the empty string.
Parameters
----------
x1 : array-like, with ``bytes_``, or ``str_`` dtype
Input array
x2 : array-like, with ``bytes_``, or ``str_`` dtype
Separator to split each string element in ``x1``.
x3 : array-like, with any integer dtype
The indices of the separator (<0 to indicate the separator is not
present).
Returns
-------
out : 3-tuple:
- array with ``bytes_`` or ``str_`` dtype with the part before the
separator
- array with ``bytes_`` or ``str_`` dtype with the separator
- array with ``bytes_`` or ``str_`` dtype with the part after the
separator
See Also
--------
str.rpartition
Examples
--------
The ufunc is used most easily via ``np.strings.rpartition``,
which calls it after calculating the indices::
>>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> np.strings.rpartition(a, 'A')
(array(['aAaAa', ' a', 'abB'], dtype='<U5'),
array(['A', 'A', 'A'], dtype='<U1'),
array(['', ' ', 'Bba'], dtype='<U3'))
""")

# Docstring text for the private ``_partition`` ufunc.  Per the text below it
# is the ``StringDType`` variant: it takes only the string and the separator
# (no precomputed index input) and returns the three parts as a 3-tuple.
add_newdoc('numpy._core.umath', '_partition',
"""
Partition each element in ``x1`` around ``x2``.
For each element in ``x1``, split the element at the first
occurrence of ``x2`` and return a 3-tuple containing the part before
the separator, the separator itself, and the part after the
separator. If the separator is not found, the first item of the
tuple will contain the whole string, and the second and third ones
will be the empty string.
Parameters
----------
x1 : array-like, with ``StringDType`` dtype
Input array
x2 : array-like, with ``StringDType`` dtype
Separator to split each string element in ``x1``.
Returns
-------
out : 3-tuple:
- ``StringDType`` array with the part before the separator
- ``StringDType`` array with the separator
- ``StringDType`` array with the part after the separator
See Also
--------
str.partition
Examples
--------
The ufunc is used most easily via ``np.strings.partition``,
which calls it under the hood::
>>> x = np.array(["Numpy is nice!"], dtype="T")
>>> np.strings.partition(x, " ")
(array(['Numpy'], dtype=StringDType()),
array([' '], dtype=StringDType()),
array(['is nice!'], dtype=StringDType()))
""")

# Docstring text for the private ``_rpartition`` ufunc (the ``StringDType``
# variant).  It takes only two inputs, ``x1`` and ``x2``; there is no
# precomputed-index input, so the text must not refer to ``x3``.
add_newdoc('numpy._core.umath', '_rpartition',
    """
    Partition each element in ``x1`` around the right-most separator,
    ``x2``.

    For each element in ``x1``, split the element at the last
    occurrence of ``x2`` and return a 3-tuple containing the part
    before the separator, the separator itself, and the part after
    the separator. If the separator is not found, the third item of
    the tuple will contain the whole string, and the first and second
    ones will be the empty string.

    Parameters
    ----------
    x1 : array-like, with ``StringDType`` dtype
        Input array
    x2 : array-like, with ``StringDType`` dtype
        Separator to split each string element in ``x1``.

    Returns
    -------
    out : 3-tuple:
        - ``StringDType`` array with the part before the separator
        - ``StringDType`` array with the separator
        - ``StringDType`` array with the part after the separator

    See Also
    --------
    str.rpartition

    Examples
    --------
    The ufunc is used most easily via ``np.strings.rpartition``,
    which calls it under the hood::

    >>> a = np.array(['aAaAaA', ' aA ', 'abBABba'], dtype="T")
    >>> np.strings.rpartition(a, 'A')
    (array(['aAaAa', ' a', 'abB'], dtype=StringDType()),
     array(['A', 'A', 'A'], dtype=StringDType()),
     array(['', ' ', 'Bba'], dtype=StringDType()))

    """)
93 changes: 89 additions & 4 deletions numpy/_core/defchararray.py
Expand Up @@ -17,16 +17,19 @@
"""
import functools

import numpy as np
from .._utils import set_module
from .numerictypes import bytes_, str_, character
from .numeric import ndarray, array as narray, asarray as asnarray
from numpy._core.multiarray import compare_chararrays
from numpy._core import overrides
from numpy.strings import *
from numpy.strings import multiply as strings_multiply
from numpy.strings import (
multiply as strings_multiply,
partition as strings_partition,
rpartition as strings_rpartition,
)
from numpy._core.strings import (
_partition as partition,
_rpartition as rpartition,
_split as split,
_rsplit as rsplit,
_splitlines as splitlines,
Expand Down Expand Up @@ -303,6 +306,88 @@ def multiply(a, i):
raise ValueError("Can only multiply by integers")


def partition(a, sep):
"""
Partition each element in `a` around `sep`.
Calls :meth:`str.partition` element-wise.
For each element in `a`, split the element as the first
occurrence of `sep`, and return 3 strings containing the part
before the separator, the separator itself, and the part after
the separator. If the separator is not found, return 3 strings
containing the string itself, followed by two empty strings.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array
sep : {str, unicode}
Separator to split each string element in `a`.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types. The output array will have an extra
dimension with 3 elements per input element.
Examples
--------
>>> x = np.array(["Numpy is nice!"])
>>> np.char.partition(x, " ")
array([['Numpy', ' ', 'is nice!']], dtype='<U8')
See Also
--------
str.partition
"""
return np.stack(strings_partition(a, sep), axis=-1)


def rpartition(a, sep):
"""
Partition (split) each element around the right-most separator.
Calls :meth:`str.rpartition` element-wise.
For each element in `a`, split the element as the last
occurrence of `sep`, and return 3 strings containing the part
before the separator, the separator itself, and the part after
the separator. If the separator is not found, return 3 strings
containing the string itself, followed by two empty strings.
Parameters
----------
a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
Input array
sep : str or unicode
Right-most separator to split each element in array.
Returns
-------
out : ndarray
Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
depending on input types. The output array will have an extra
dimension with 3 elements per input element.
See Also
--------
str.rpartition
Examples
--------
>>> a = np.array(['aAaAaA', ' aA ', 'abBABba'])
>>> np.char.rpartition(a, 'A')
array([['aAaAa', 'A', ''],
[' a', 'A', ' '],
['abB', 'A', 'Bba']], dtype='<U5')
"""
return np.stack(strings_rpartition(a, sep), axis=-1)


@set_module("numpy.char")
class chararray(ndarray):
"""
Expand Down Expand Up @@ -487,7 +572,7 @@ def __array_wrap__(self, arr, context=None, return_scalar=False):

def __array_finalize__(self, obj):
# The b is a special case because it is used for reconstructing.
if self.dtype.char not in 'SUbc':
if self.dtype.char not in 'VSUbc':
raise ValueError("Can only create a chararray from string data.")

def __getitem__(self, obj):
Expand Down
42 changes: 42 additions & 0 deletions numpy/_core/src/umath/string_buffer.h
Expand Up @@ -1593,4 +1593,46 @@ string_zfill(Buffer<enc> buf, npy_int64 width, Buffer<enc> out)
}


/*
 * Split `buf1` into three pieces around the separator `buf2`, given the
 * already-computed position `idx` of the separator inside `buf1`.
 *
 * The pieces are copied into `out1` (before), `out2` (separator) and `out3`
 * (after), and their lengths are stored through `final_len1..3`.
 *
 * - An empty separator raises ValueError and sets all three lengths to -1.
 * - A negative `idx` means "separator not found": the whole input is copied
 *   to `out1` when `pos` is STARTPOSITION::FRONT, otherwise to `out3`, and
 *   the other two lengths are set to 0.
 *
 * NOTE(review): `idx` is not range-checked against the length of `buf1`
 * here; presumably callers guarantee `idx + len(buf2) <= len(buf1)` --
 * confirm at the call site.
 */
template <ENCODING enc>
static inline void
string_partition(Buffer<enc> buf1, Buffer<enc> buf2, npy_int64 idx,
                 Buffer<enc> out1, Buffer<enc> out2, Buffer<enc> out3,
                 npy_intp *final_len1, npy_intp *final_len2, npy_intp *final_len3,
                 STARTPOSITION pos)
{
    // StringDType uses a ufunc that implements the find-part as well
    assert(enc != ENCODING::UTF8);

    size_t str_len = buf1.num_codepoints();
    size_t sep_len = buf2.num_codepoints();

    // Error path: an empty separator is rejected before anything is copied.
    if (sep_len == 0) {
        npy_gil_error(PyExc_ValueError, "empty separator");
        *final_len3 = -1;
        *final_len2 = -1;
        *final_len1 = -1;
        return;
    }

    // Found path: copy head, separator and tail into their outputs.
    if (idx >= 0) {
        // `auto` keeps the exact arithmetic type of the original expression.
        auto tail_len = str_len - idx - sep_len;

        buf1.buffer_memcpy(out1, idx);
        *final_len1 = idx;
        buf2.buffer_memcpy(out2, sep_len);
        *final_len2 = sep_len;
        (buf1 + idx + sep_len).buffer_memcpy(out3, tail_len);
        *final_len3 = tail_len;
        return;
    }

    // Not-found path: the whole string goes to the first output for a
    // front search and to the last output otherwise.
    if (pos == STARTPOSITION::FRONT) {
        buf1.buffer_memcpy(out1, str_len);
        *final_len1 = str_len;
        *final_len3 = 0;
        *final_len2 = 0;
    }
    else {
        buf1.buffer_memcpy(out3, str_len);
        *final_len2 = 0;
        *final_len1 = 0;
        *final_len3 = str_len;
    }
}


#endif /* _NPY_CORE_SRC_UMATH_STRING_BUFFER_H_ */

0 comments on commit e1bf1d6

Please sign in to comment.