ENH: Add partition/rpartition ufunc for string dtypes (#26082)

* ENH: Add partition/rpartition ufunc for string dtypes Closes #25993. * Fix doctests * Fix docstrings in ufunc_docstrings.py as well * Return array with the separators // optimize using find ufunc results * Address feedback * Fix chararray __array_finalize__ * ENH: add stringdtype partition/rpartition * BUG: remove unnecessary size_t cast * BUG: fix error handling and resource cleanup * MNT: refactor so stringdtype can combine find and partition * MNT: update signatures to reflect const API changes * MNT: simplify fastsearch call * MNT: move variable binding out of inner loop * Fix error message about out; fix promoter * Remove unused import in defchararray; add assertion * BUG: don't use a user-provided descriptor to initialize a new stringdtype view * MNT: back out attempted fix for stringdtype view problem * MNT: address code review comments --------- Co-authored-by: Nathan Goldbaum <nathan.goldbaum@gmail.com>
numpy · Mar 26, 2024 · e1bf1d6 · e1bf1d6
1 parent 6d4a0f7
commit e1bf1d6
Show file tree

Hide file tree

Showing 9 changed files with 850 additions and 49 deletions.
diff --git a/numpy/_core/code_generators/generate_umath.py b/numpy/_core/code_generators/generate_umath.py
@@ -1300,6 +1300,26 @@ def english_upper(s):
           docstrings.get('numpy._core.umath._zfill'),
           None,
           ),
+'_partition_index':
+    Ufunc(3, 3, None,
+          docstrings.get('numpy._core.umath._partition_index'),
+          None,
+          ),
+'_rpartition_index':
+    Ufunc(3, 3, None,
+          docstrings.get('numpy._core.umath._rpartition_index'),
+          None,
+          ),
+'_partition':
+    Ufunc(2, 3, None,
+          docstrings.get('numpy._core.umath._partition'),
+          None,
+          ),
+'_rpartition':
+    Ufunc(2, 3, None,
+          docstrings.get('numpy._core.umath._rpartition'),
+          None,
+          ),
 }
 
 def indent(st, spaces):

diff --git a/numpy/_core/code_generators/ufunc_docstrings.py b/numpy/_core/code_generators/ufunc_docstrings.py
@@ -5028,3 +5028,184 @@ def add_newdoc(place, name, doc):
     array(['001', '-01', '+01'], dtype='<U3')
 
     """)
+
+add_newdoc('numpy._core.umath', '_partition_index',
+    """
+    Partition each element in ``x1`` around ``x2``, at precomputed
+    index ``x3``.
+
+    For each element in ``x1``, split the element at the first
+    occurrence of ``x2`` at location ``x3``, and return a 3-tuple
+    containing the part before the separator, the separator itself,
+    and the part after the separator. If the separator is not found,
+    the first item of the tuple will contain the whole string, and
+    the second and third ones will be the empty string.
+
+    Parameters
+    ----------
+    x1 : array-like, with ``bytes_`` or ``str_`` dtype
+        Input array
+    x2 : array-like, with ``bytes_`` or ``str_`` dtype
+        Separator to split each string element in ``x1``.
+    x3 : array-like, with any integer dtype
+        The indices of the separator (<0 to indicate the separator is not
+        present).
+
+    Returns
+    -------
+    out : 3-tuple:
+        - array with ``bytes_`` or ``str_`` dtype with the part before the
+          separator
+        - array with ``bytes_`` or ``str_`` dtype with the separator
+        - array with ``bytes_`` or ``str_`` dtype with the part after the
+          separator
+
+    See Also
+    --------
+    str.partition
+
+    Examples
+    --------
+    The ufunc is used most easily via ``np.strings.partition``,
+    which calls it after calculating the indices::
+
+    >>> x = np.array(["Numpy is nice!"])
+    >>> np.strings.partition(x, " ")
+    (array(['Numpy'], dtype='<U5'),
+     array([' '], dtype='<U1'),
+     array(['is nice!'], dtype='<U8'))
+
+    """)
+
+add_newdoc('numpy._core.umath', '_rpartition_index',
+    """
+    Partition each element in ``x1`` around the right-most separator,
+    ``x2``, at precomputed index ``x3``.
+
+    For each element in ``x1``, split the element at the last
+    occurrence of ``x2`` at location ``x3``, and return a 3-tuple
+    containing the part before the separator, the separator itself,
+    and the part after the separator. If the separator is not found,
+    the third item of the tuple will contain the whole string, and
+    the first and second ones will be the empty string.
+
+    Parameters
+    ----------
+    x1 : array-like, with ``bytes_`` or ``str_`` dtype
+        Input array
+    x2 : array-like, with ``bytes_`` or ``str_`` dtype
+        Separator to split each string element in ``x1``.
+    x3 : array-like, with any integer dtype
+        The indices of the separator (<0 to indicate the separator is not
+        present).
+
+    Returns
+    -------
+    out : 3-tuple:
+        - array with ``bytes_`` or ``str_`` dtype with the part before the
+          separator
+        - array with ``bytes_`` or ``str_`` dtype with the separator
+        - array with ``bytes_`` or ``str_`` dtype with the part after the
+          separator
+
+    See Also
+    --------
+    str.rpartition
+
+    Examples
+    --------
+    The ufunc is used most easily via ``np.strings.rpartition``,
+    which calls it after calculating the indices::
+
+    >>> a = np.array(['aAaAaA', '  aA  ', 'abBABba'])
+    >>> np.strings.rpartition(a, 'A')
+    (array(['aAaAa', '  a', 'abB'], dtype='<U5'),
+     array(['A', 'A', 'A'], dtype='<U1'),
+     array(['', '  ', 'Bba'], dtype='<U3'))
+
+    """)
+
+add_newdoc('numpy._core.umath', '_partition',
+    """
+    Partition each element in ``x1`` around ``x2``.
+
+    For each element in ``x1``, split the element at the first
+    occurrence of ``x2`` and return a 3-tuple containing the part before
+    the separator, the separator itself, and the part after the
+    separator. If the separator is not found, the first item of the
+    tuple will contain the whole string, and the second and third ones
+    will be the empty string.
+
+    Parameters
+    ----------
+    x1 : array-like, with ``StringDType`` dtype
+        Input array
+    x2 : array-like, with ``StringDType`` dtype
+        Separator to split each string element in ``x1``.
+
+    Returns
+    -------
+    out : 3-tuple:
+        - ``StringDType`` array with the part before the separator
+        - ``StringDType`` array with the separator
+        - ``StringDType`` array with the part after the separator
+
+    See Also
+    --------
+    str.partition
+
+    Examples
+    --------
+    The ufunc is used most easily via ``np.strings.partition``,
+    which calls it under the hood::
+
+    >>> x = np.array(["Numpy is nice!"], dtype="T")
+    >>> np.strings.partition(x, " ")
+    (array(['Numpy'], dtype=StringDType()),
+     array([' '], dtype=StringDType()),
+     array(['is nice!'], dtype=StringDType()))
+
+    """)
+
+add_newdoc('numpy._core.umath', '_rpartition',
+    """
+    Partition each element in ``x1`` around the right-most separator,
+    ``x2``.
+
+    For each element in ``x1``, split the element at the last
+    occurrence of ``x2``, and return a 3-tuple
+    containing the part before the separator, the separator itself,
+    and the part after the separator. If the separator is not found,
+    the third item of the tuple will contain the whole string, and
+    the first and second ones will be the empty string.
+
+    Parameters
+    ----------
+    x1 : array-like, with ``StringDType`` dtype
+        Input array
+    x2 : array-like, with ``StringDType`` dtype
+        Separator to split each string element in ``x1``.
+
+    Returns
+    -------
+    out : 3-tuple:
+        - ``StringDType`` array with the part before the separator
+        - ``StringDType`` array with the separator
+        - ``StringDType`` array with the part after the separator
+
+    See Also
+    --------
+    str.rpartition
+
+    Examples
+    --------
+    The ufunc is used most easily via ``np.strings.rpartition``,
+    which calls it under the hood::
+
+    >>> a = np.array(['aAaAaA', '  aA  ', 'abBABba'], dtype="T")
+    >>> np.strings.rpartition(a, 'A')
+    (array(['aAaAa', '  a', 'abB'], dtype=StringDType()),
+     array(['A', 'A', 'A'], dtype=StringDType()),
+     array(['', '  ', 'Bba'], dtype=StringDType()))
+
+    """)
diff --git a/numpy/_core/defchararray.py b/numpy/_core/defchararray.py
@@ -17,16 +17,19 @@
 """
 import functools
 
+import numpy as np
 from .._utils import set_module
 from .numerictypes import bytes_, str_, character
 from .numeric import ndarray, array as narray, asarray as asnarray
 from numpy._core.multiarray import compare_chararrays
 from numpy._core import overrides
 from numpy.strings import *
-from numpy.strings import multiply as strings_multiply
+from numpy.strings import (
+    multiply as strings_multiply,
+    partition as strings_partition,
+    rpartition as strings_rpartition,
+)
 from numpy._core.strings import (
-    _partition as partition,
-    _rpartition as rpartition,
     _split as split,
     _rsplit as rsplit,
     _splitlines as splitlines,
@@ -303,6 +306,88 @@ def multiply(a, i):
         raise ValueError("Can only multiply by integers")
 
 
+def partition(a, sep):
+    """
+    Partition each element in `a` around `sep`.
+
+    Calls :meth:`str.partition` element-wise.
+
+    For each element in `a`, split the element at the first
+    occurrence of `sep`, and return 3 strings containing the part
+    before the separator, the separator itself, and the part after
+    the separator. If the separator is not found, return 3 strings
+    containing the string itself, followed by two empty strings.
+
+    Parameters
+    ----------
+    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
+        Input array
+    sep : {str, unicode}
+        Separator to split each string element in `a`.
+
+    Returns
+    -------
+    out : ndarray
+        Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
+        depending on input types. The output array will have an extra
+        dimension with 3 elements per input element.
+
+    Examples
+    --------
+    >>> x = np.array(["Numpy is nice!"])
+    >>> np.char.partition(x, " ")
+    array([['Numpy', ' ', 'is nice!']], dtype='<U8')
+    
+    See Also
+    --------
+    str.partition
+
+    """
+    return np.stack(strings_partition(a, sep), axis=-1)
+
+
+def rpartition(a, sep):
+    """
+    Partition (split) each element around the right-most separator.
+
+    Calls :meth:`str.rpartition` element-wise.
+
+    For each element in `a`, split the element at the last
+    occurrence of `sep`, and return 3 strings containing the part
+    before the separator, the separator itself, and the part after
+    the separator. If the separator is not found, return 3 strings
+    containing two empty strings, followed by the string itself.
+
+    Parameters
+    ----------
+    a : array-like, with ``StringDType``, ``bytes_``, or ``str_`` dtype
+        Input array
+    sep : str or unicode
+        Right-most separator to split each element in array.
+
+    Returns
+    -------
+    out : ndarray
+        Output array of ``StringDType``, ``bytes_`` or ``str_`` dtype,
+        depending on input types. The output array will have an extra
+        dimension with 3 elements per input element.
+
+    See Also
+    --------
+    str.rpartition
+
+    Examples
+    --------
+    >>> a = np.array(['aAaAaA', '  aA  ', 'abBABba'])
+    >>> np.char.rpartition(a, 'A')
+    array([['aAaAa', 'A', ''],
+       ['  a', 'A', '  '],
+       ['abB', 'A', 'Bba']], dtype='<U5')
+
+    """
+    return np.stack(strings_rpartition(a, sep), axis=-1)
+
+
 @set_module("numpy.char")
 class chararray(ndarray):
     """
@@ -487,7 +572,7 @@ def __array_wrap__(self, arr, context=None, return_scalar=False):
 
     def __array_finalize__(self, obj):
         # The b is a special case because it is used for reconstructing.
-        if self.dtype.char not in 'SUbc':
+        if self.dtype.char not in 'VSUbc':
             raise ValueError("Can only create a chararray from string data.")
 
     def __getitem__(self, obj):

diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h
@@ -1593,4 +1593,46 @@ string_zfill(Buffer<enc> buf, npy_int64 width, Buffer<enc> out)
 }
 
 
+template <ENCODING enc>
+static inline void
+string_partition(Buffer<enc> buf1, Buffer<enc> buf2, npy_int64 idx,
+                 Buffer<enc> out1, Buffer<enc> out2, Buffer<enc> out3,
+                 npy_intp *final_len1, npy_intp *final_len2, npy_intp *final_len3,
+                 STARTPOSITION pos)
+{
+    // StringDType uses a ufunc that implements the find-part as well
+    assert(enc != ENCODING::UTF8);
+
+    size_t len1 = buf1.num_codepoints();
+    size_t len2 = buf2.num_codepoints();
+
+    if (len2 == 0) {
+        npy_gil_error(PyExc_ValueError, "empty separator");
+        *final_len1 = *final_len2 = *final_len3 = -1;
+        return;
+    }
+
+    if (idx < 0) {
+        if (pos == STARTPOSITION::FRONT) {
+            buf1.buffer_memcpy(out1, len1);
+            *final_len1 = len1;
+            *final_len2 = *final_len3 = 0;
+        }
+        else {
+            buf1.buffer_memcpy(out3, len1);
+            *final_len1 = *final_len2 = 0;
+            *final_len3 = len1;
+        }
+        return;
+    }
+
+    buf1.buffer_memcpy(out1, idx);
+    *final_len1 = idx;
+    buf2.buffer_memcpy(out2, len2);
+    *final_len2 = len2;
+    (buf1 + idx + len2).buffer_memcpy(out3, len1 - idx - len2);
+    *final_len3 = len1 - idx - len2;
+}
+
+
 #endif /* _NPY_CORE_SRC_UMATH_STRING_BUFFER_H_ */