From fdb306219a69da782486b7304a5203923aa99d38 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Thu, 8 Feb 2024 13:30:30 +0100 Subject: [PATCH 1/9] ENH: Add islower/isupper/istitle ufuncs for unicode/bytes dtypes --- numpy/_core/code_generators/generate_umath.py | 15 ++ .../_core/code_generators/ufunc_docstrings.py | 91 ++++++++++ numpy/_core/src/umath/string_buffer.h | 162 ++++++++++++++++++ numpy/_core/src/umath/string_ufuncs.cpp | 111 ++++++++++++ numpy/_core/strings.py | 114 +----------- numpy/_core/tests/test_strings.py | 111 ++++++++++++ 6 files changed, 497 insertions(+), 107 deletions(-) diff --git a/numpy/_core/code_generators/generate_umath.py b/numpy/_core/code_generators/generate_umath.py index c4e0c0facfff..50c0ea673d37 100644 --- a/numpy/_core/code_generators/generate_umath.py +++ b/numpy/_core/code_generators/generate_umath.py @@ -1170,6 +1170,21 @@ def english_upper(s): docstrings.get('numpy._core.umath.isspace'), None, ), +'islower': + Ufunc(1, 1, False_, + docstrings.get('numpy._core.umath.islower'), + None, + ), +'isupper': + Ufunc(1, 1, False_, + docstrings.get('numpy._core.umath.isupper'), + None, + ), +'istitle': + Ufunc(1, 1, False_, + docstrings.get('numpy._core.umath.istitle'), + None, + ), 'isdecimal': Ufunc(1, 1, False_, docstrings.get('numpy._core.umath.isdecimal'), diff --git a/numpy/_core/code_generators/ufunc_docstrings.py b/numpy/_core/code_generators/ufunc_docstrings.py index 461549367084..a081856f44b6 100644 --- a/numpy/_core/code_generators/ufunc_docstrings.py +++ b/numpy/_core/code_generators/ufunc_docstrings.py @@ -4441,6 +4441,97 @@ def add_newdoc(place, name, doc): """) +add_newdoc('numpy._core.umath', 'islower', + """ + Returns true for each element if all cased characters in the + string are lowercase and there is at least one cased character, + false otherwise. 
+ + Parameters + ---------- + x : array_like, with `np.bytes_` or `np.str_` dtype + $PARAMS + + Returns + ------- + out : ndarray + Output array of bools + $OUT_SCALAR_1 + + See Also + -------- + str.islower + + Examples + -------- + >>> np.strings.islower("GHC") + array(False) + >>> np.strings.islower("ghc") + array(True) + + """) + +add_newdoc('numpy._core.umath', 'isupper', + """ + Returns true for each element if all cased characters in the + string are uppercase and there is at least one cased character, false + otherwise. + + Parameters + ---------- + x : array_like, with `np.bytes_` or `np.str_` dtype + $PARAMS + + Returns + ------- + out : ndarray + Output array of bools + $OUT_SCALAR_1 + + See Also + -------- + str.isupper + + Examples + -------- + >>> np.strings.isupper("GHC") + array(True) + >>> a = np.array(["hello", "HELLO", "Hello"]) + >>> np.strings.isupper(a) + array([False, True, False]) + + """) + +add_newdoc('numpy._core.umath', 'istitle', + """ + Returns true for each element if the element is a titlecased + string and there is at least one character, false otherwise. 
+ + Parameters + ---------- + x : array_like, with `np.bytes_` or `np.str_` dtype + $PARAMS + + Returns + ------- + out : ndarray + Output array of bools + $OUT_SCALAR_1 + + See Also + -------- + str.istitle + + Examples + -------- + >>> np.strings.istitle("Numpy Is Great") + array(True) + + >>> np.strings.istitle("Numpy is great") + array(False) + + """) + add_newdoc('numpy._core.umath', 'isdecimal', """ For each element, return True if there are only decimal diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h index 45139237ff5d..c3bc75e3a759 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -26,6 +26,9 @@ enum class IMPLEMENTED_UNARY_FUNCTIONS { ISDECIMAL, ISDIGIT, ISSPACE, + ISLOWER, + ISUPPER, + ISTITLE, ISNUMERIC, STR_LEN, }; @@ -136,6 +139,81 @@ codepoint_isspace(npy_ucs4 code) return Py_UNICODE_ISSPACE(code); } +template +inline bool +codepoint_islower(npy_ucs4 code); + +template<> +inline bool +codepoint_islower(npy_ucs4 code) +{ + return Py_ISLOWER((char) code); +} + +template<> +inline bool +codepoint_islower(npy_ucs4 code) +{ + return Py_UNICODE_ISLOWER(code); +} + +template<> +inline bool +codepoint_islower(npy_ucs4 code) +{ + return Py_UNICODE_ISLOWER(code); +} + +template +inline bool +codepoint_isupper(npy_ucs4 code); + +template<> +inline bool +codepoint_isupper(npy_ucs4 code) +{ + return Py_ISUPPER((char) code); +} + +template<> +inline bool +codepoint_isupper(npy_ucs4 code) +{ + return Py_UNICODE_ISUPPER(code); +} + +template<> +inline bool +codepoint_isupper(npy_ucs4 code) +{ + return Py_UNICODE_ISUPPER(code); +} + +template +inline bool +codepoint_istitle(npy_ucs4); + +template<> +inline bool +codepoint_istitle(npy_ucs4 code) +{ + return false; +} + +template<> +inline bool +codepoint_istitle(npy_ucs4 code) +{ + return Py_UNICODE_ISTITLE(code); +} + +template<> +inline bool +codepoint_istitle(npy_ucs4 code) +{ + return Py_UNICODE_ISTITLE(code); +} + inline 
bool codepoint_isnumeric(npy_ucs4 code) { @@ -389,6 +467,84 @@ struct Buffer { return unary_loop(); } + inline bool + islower() + { + size_t len = num_codepoints(); + if (len == 0) { + return false; + } + + Buffer tmp = *this; + bool cased = 0; + for (size_t i = 0; i < len; i++) { + if (codepoint_isupper(*tmp) || codepoint_istitle(*tmp)) { + return false; + } + else if (!cased && codepoint_islower(*tmp)) { + cased = true; + } + tmp++; + } + return cased; + } + + inline bool + isupper() + { + size_t len = num_codepoints(); + if (len == 0) { + return false; + } + + Buffer tmp = *this; + bool cased = 0; + for (size_t i = 0; i < len; i++) { + if (codepoint_islower(*tmp) || codepoint_istitle(*tmp)) { + return false; + } + else if (!cased && codepoint_isupper(*tmp)) { + cased = true; + } + tmp++; + } + return cased; + } + + inline bool + istitle() + { + size_t len = num_codepoints(); + if (len == 0) { + return false; + } + + Buffer tmp = *this; + bool cased = false; + bool previous_is_cased = false; + for (size_t i = 0; i < len; i++) { + if (codepoint_isupper(*tmp) || codepoint_istitle(*tmp)) { + if (previous_is_cased) { + return false; + } + previous_is_cased = true; + cased = true; + } + else if (codepoint_islower(*tmp)) { + if (!previous_is_cased) { + return false; + } + previous_is_cased = true; + cased = true; + } + else { + previous_is_cased = false; + } + tmp++; + } + return cased; + } + inline bool isnumeric() { @@ -466,6 +622,12 @@ struct call_buffer_member_function { return codepoint_isspace(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::STR_LEN: return (T)buf.num_codepoints(); + case IMPLEMENTED_UNARY_FUNCTIONS::ISLOWER: + return (T)buf.islower(); + case IMPLEMENTED_UNARY_FUNCTIONS::ISUPPER: + return (T)buf.isupper(); + case IMPLEMENTED_UNARY_FUNCTIONS::ISTITLE: + return (T)buf.istitle(); case IMPLEMENTED_UNARY_FUNCTIONS::ISNUMERIC: return codepoint_isnumeric(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::ISDECIMAL: diff --git a/numpy/_core/src/umath/string_ufuncs.cpp 
b/numpy/_core/src/umath/string_ufuncs.cpp index a9ebd929e4a2..07fe1b81fd2d 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -283,6 +283,84 @@ string_isspace_loop(PyArrayMethod_Context *context, } +template +static int +string_islower_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + int elsize = context->descriptors[0]->elsize; + + char *in = data[0]; + char *out = data[1]; + + npy_intp N = dimensions[0]; + + while (N--) { + Buffer buf(in, elsize); + npy_bool res = buf.islower(); + *(npy_bool *)out = res; + + in += strides[0]; + out += strides[1]; + } + + return 0; +} + + +template +static int +string_isupper_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + int elsize = context->descriptors[0]->elsize; + + char *in = data[0]; + char *out = data[1]; + + npy_intp N = dimensions[0]; + + while (N--) { + Buffer buf(in, elsize); + npy_bool res = buf.isupper(); + *(npy_bool *)out = res; + + in += strides[0]; + out += strides[1]; + } + + return 0; +} + + +template +static int +string_istitle_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + int elsize = context->descriptors[0]->elsize; + + char *in = data[0]; + char *out = data[1]; + + npy_intp N = dimensions[0]; + + while (N--) { + Buffer buf(in, elsize); + npy_bool res = buf.istitle(); + *(npy_bool *)out = res; + + in += strides[0]; + out += strides[1]; + } + + return 0; +} + + static int string_isdecimal_loop(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], @@ -1255,6 +1333,39 @@ init_string_ufuncs(PyObject *umath) return -1; } + if (init_ufunc( + umath, "islower", "templated_string_islower", 1, 1, dtypes, + string_islower_loop, 
NULL) < 0) { + return -1; + } + if (init_ufunc( + umath, "islower", "templated_string_islower", 1, 1, dtypes, + string_islower_loop, NULL) < 0) { + return -1; + } + + if (init_ufunc( + umath, "isupper", "templated_string_isupper", 1, 1, dtypes, + string_isupper_loop, NULL) < 0) { + return -1; + } + if (init_ufunc( + umath, "isupper", "templated_string_isupper", 1, 1, dtypes, + string_isupper_loop, NULL) < 0) { + return -1; + } + + if (init_ufunc( + umath, "istitle", "templated_string_istitle", 1, 1, dtypes, + string_istitle_loop, NULL) < 0) { + return -1; + } + if (init_ufunc( + umath, "istitle", "templated_string_istitle", 1, 1, dtypes, + string_istitle_loop, NULL) < 0) { + return -1; + } + if (init_ufunc( umath, "isdecimal", "templated_string_isdecimal", 1, 1, dtypes, string_isdecimal_loop, NULL) < 0) { diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py index 31ed004d78c7..6aa3da77f90a 100644 --- a/numpy/_core/strings.py +++ b/numpy/_core/strings.py @@ -13,6 +13,9 @@ isalpha, isdigit, isspace, + islower, + isupper, + istitle, isdecimal, isnumeric, str_len, @@ -34,12 +37,12 @@ __all__ = [ # UFuncs "equal", "not_equal", "less", "less_equal", "greater", "greater_equal", - "add", "isalpha", "isdigit", "isspace", "isdecimal", "isnumeric", - "str_len", "find", "rfind", "count", "startswith", "endswith", - "lstrip", "rstrip", "strip", "replace", + "add", "isalpha", "isdigit", "isspace", "islower", "isupper", "istitle", + "isdecimal", "isnumeric", "str_len", "find", "rfind", "count", + "startswith", "endswith", "lstrip", "rstrip", "strip", "replace", # _vec_string - Will gradually become ufuncs as well - "isalnum", "islower", "istitle", "isupper", "multiply", "mod", "index", + "isalnum", "multiply", "mod", "index", "rindex", "decode", "encode", "expandtabs", "center", "ljust", "rjust", "zfill", "upper", "lower", "swapcase", "capitalize", "title", "join", "split", "rsplit", "splitlines", @@ -125,109 +128,6 @@ def isalnum(a): return _vec_string(a, np.bool, 
'isalnum') -def islower(a): - """ - Returns true for each element if all cased characters in the - string are lowercase and there is at least one cased character, - false otherwise. - - Calls :meth:`str.islower` element-wise. - - For 8-bit strings, this method is locale-dependent. - - Parameters - ---------- - a : array_like, with `np.bytes_` or `np.str_` dtype - - Returns - ------- - out : ndarray - Output array of bools - - See Also - -------- - str.islower - - Examples - -------- - >>> np.strings.islower("GHC") - array(False) - >>> np.strings.islower("ghc") - array(True) - - """ - return _vec_string(a, np.bool, 'islower') - - -def istitle(a): - """ - Returns true for each element if the element is a titlecased - string and there is at least one character, false otherwise. - - Call :meth:`str.istitle` element-wise. - - For 8-bit strings, this method is locale-dependent. - - Parameters - ---------- - a : array_like, with `np.bytes_` or `np.str_` dtype - - Returns - ------- - out : ndarray - Output array of bools - - See Also - -------- - str.istitle - - Examples - -------- - >>> np.strings.istitle("Numpy Is Great") - array(True) - - >>> np.strings.istitle("Numpy is great") - array(False) - - """ - return _vec_string(a, np.bool, 'istitle') - - -def isupper(a): - """ - Return true for each element if all cased characters in the - string are uppercase and there is at least one character, false - otherwise. - - Call :meth:`str.isupper` element-wise. - - For 8-bit strings, this method is locale-dependent. 
- - Parameters - ---------- - a : array_like, with `np.bytes_` or `np.str_` dtype - - Returns - ------- - out : ndarray - Output array of bools - - See Also - -------- - str.isupper - - Examples - -------- - >>> np.strings.isupper("GHC") - array(True) - >>> a = np.array(["hello", "HELLO", "Hello"]) - >>> np.strings.isupper(a) - array([False, True, False]) - - """ - return _vec_string(a, np.bool, 'isupper') - - def multiply(a, i): """ Return (a * i), that is string multiple concatenation, diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index f6f7a0ced0dd..efe24658025d 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -167,6 +167,61 @@ def test_isspace(self, in_, out, dt): in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.isspace(in_), out) + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', True), + ('A', False), + ('\n', False), + ('abc', True), + ('aBc', False), + ('abc\n', True), + ]) + def test_islower(self, in_, out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.islower(in_), out) + + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', False), + ('A', True), + ('\n', False), + ('ABC', True), + ('AbC', False), + ('ABC\n', True), + ]) + def test_isupper(self, in_, out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.isupper(in_), out) + + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', False), + ('A', True), + ('\n', False), + ('A Titlecased Line', True), + ('A\nTitlecased Line', True), + ('A Titlecased, Line', True), + ('Not a capitalized String', False), + ('Not\ta Titlecase String', False), + ('Not--a Titlecase String', False), + ('NOT', False), + ]) + def test_istitle(self, in_, 
out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.istitle(in_), out) + @pytest.mark.parametrize("in_,out", [ ("", 0), ("abc", 3), @@ -617,6 +672,62 @@ def test_replace_unicode(self, dt): assert_array_equal(np.strings.replace(buf, "<", "<", MAX), "...\u043c......<") + @pytest.mark.parametrize("dt", ["U", "T"]) + @pytest.mark.parametrize("in_,out", [ + ('\u1FFc', False), + ('\u2167', False), + ('\U00010401', False), + ('\U00010427', False), + ('\U0001F40D', False), + ('\U0001F46F', False), + ('\u2177', True), + ('\U00010429', True), + ('\U0001044E', True), + ]) + def test_islower_unicode(self, in_, out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + assert_array_equal(np.strings.islower(in_), out) + + @pytest.mark.parametrize("dt", ["U", "T"]) + @pytest.mark.parametrize("in_,out", [ + ('\u1FFc', False), + ('\u2167', True), + ('\U00010401', True), + ('\U00010427', True), + ('\U0001F40D', False), + ('\U0001F46F', False), + ('\u2177', False), + ('\U00010429', False), + ('\U0001044E', False), + ]) + def test_isupper_unicode(self, in_, out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + assert_array_equal(np.strings.isupper(in_), out) + + @pytest.mark.parametrize("dt", ["U", "T"]) + @pytest.mark.parametrize("in_,out", [ + ('\u1FFc', True), + ('Greek \u1FFcitlecases ...', True), + ('\U00010401\U00010429', True), + ('\U00010427\U0001044E', True), + ('\U00010429', False), + ('\U0001044E', False), + ('\U0001F40D', False), + ('\U0001F46F', False), + ]) + def test_istitle_unicode(self, in_, out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + assert_array_equal(np.strings.istitle(in_), out) + def check_itemsize(n_elem, dt): if dt 
== "T": From 4c440cacceaa426afb15d7c5f292828c5a005a38 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 13 Feb 2024 11:45:20 +0100 Subject: [PATCH 2/9] Use unary loop with templating instead of different loops for each ufunc --- numpy/_core/src/umath/string_buffer.h | 8 - numpy/_core/src/umath/string_ufuncs.cpp | 381 ++++++------------------ 2 files changed, 95 insertions(+), 294 deletions(-) diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h index c3bc75e3a759..b9f460f3d0b8 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -620,14 +620,6 @@ struct call_buffer_member_function { return codepoint_isdigit(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::ISSPACE: return codepoint_isspace(*buf); - case IMPLEMENTED_UNARY_FUNCTIONS::STR_LEN: - return (T)buf.num_codepoints(); - case IMPLEMENTED_UNARY_FUNCTIONS::ISLOWER: - return (T)buf.islower(); - case IMPLEMENTED_UNARY_FUNCTIONS::ISUPPER: - return (T)buf.isupper(); - case IMPLEMENTED_UNARY_FUNCTIONS::ISTITLE: - return (T)buf.istitle(); case IMPLEMENTED_UNARY_FUNCTIONS::ISNUMERIC: return codepoint_isnumeric(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::ISDECIMAL: diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 07fe1b81fd2d..219d22e253a9 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -21,58 +21,6 @@ #include "string_buffer.h" -template -static inline void -string_add(Buffer buf1, Buffer buf2, Buffer out) -{ - size_t len1 = buf1.num_codepoints(); - size_t len2 = buf2.num_codepoints(); - buf1.buffer_memcpy(out, len1); - buf2.buffer_memcpy(out + len1, len2); - out.buffer_fill_with_zeros_after_index(len1 + len2); -} - - -static inline npy_bool -string_isdecimal(Buffer buf) -{ - size_t len = buf.num_codepoints(); - - if (len == 0) { - return (npy_bool) 0; - } - - for (size_t i = 0; i < len; i++) { - npy_bool isdecimal = (npy_bool) 
Py_UNICODE_ISDECIMAL(*buf); - if (!isdecimal) { - return isdecimal; - } - buf++; - } - return (npy_bool) 1; -} - - -static inline npy_bool -string_isnumeric(Buffer buf) -{ - size_t len = buf.num_codepoints(); - - if (len == 0) { - return (npy_bool) 0; - } - - for (size_t i = 0; i < len; i++) { - npy_bool isnumeric = (npy_bool) Py_UNICODE_ISNUMERIC(*buf); - if (!isnumeric) { - return isnumeric; - } - buf++; - } - return (npy_bool) 1; -} - - /* * Helper for templating, avoids warnings about uncovered switch paths. */ @@ -151,143 +99,48 @@ string_comparison_loop(PyArrayMethod_Context *context, } -template -static int -string_add_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - int elsize1 = context->descriptors[0]->elsize; - int elsize2 = context->descriptors[1]->elsize; - int outsize = context->descriptors[2]->elsize; - - char *in1 = data[0]; - char *in2 = data[1]; - char *out = data[2]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf1(in1, elsize1); - Buffer buf2(in2, elsize2); - Buffer outbuf(out, outsize); - string_add(buf1, buf2, outbuf); - - in1 += strides[0]; - in2 += strides[1]; - out += strides[2]; - } - - return 0; -} - - -template -static int -string_len_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - int elsize = context->descriptors[0]->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf(in, elsize); - *(npy_intp *)out = buf.num_codepoints(); - - in += strides[0]; - out += strides[1]; - } - - return 0; -} - - -template -static int -string_isalpha_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - PyArray_Descr *descr = context->descriptors[0]; - int elsize = 
descr->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf(in, elsize); - *(npy_bool *)out = (npy_bool) buf.isalpha(); - - in += strides[0]; - out += strides[1]; - } - - return 0; -} - - -template -static int -string_isdigit_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - int elsize = context->descriptors[0]->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf(in, elsize); - *(npy_bool *)out = (npy_bool) buf.isdigit(); - - in += strides[0]; - out += strides[1]; - } - - return 0; -} - - -template -static int -string_isspace_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - int elsize = context->descriptors[0]->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf(in, elsize); - *(npy_bool *)out = (npy_bool) buf.isspace(); - - in += strides[0]; - out += strides[1]; +template +struct call_buffer_unary_function { + void operator()(const char *buffer, size_t size, char *out) { + Buffer buf((char *)buffer, size); + switch (f) { + case IMPLEMENTED_UNARY_FUNCTIONS::ISALPHA: + *(T *)out = buf.isalpha(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISDECIMAL: + *(T *)out = buf.isdecimal(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISDIGIT: + *(T *)out = buf.isdigit(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISNUMERIC: + *(T *)out = buf.isnumeric(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISSPACE: + *(T *)out = buf.isspace(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISLOWER: + *(T *)out = buf.islower(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISUPPER: + *(T *)out = buf.isupper(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISTITLE: + *(T *)out = 
buf.istitle(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::STR_LEN: + *(T *)out = buf.num_codepoints(); + break; + } } - - return 0; -} +}; -template +template static int -string_islower_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +string_unary_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) { int elsize = context->descriptors[0]->elsize; @@ -297,9 +150,8 @@ string_islower_loop(PyArrayMethod_Context *context, npy_intp N = dimensions[0]; while (N--) { - Buffer buf(in, elsize); - npy_bool res = buf.islower(); - *(npy_bool *)out = res; + call_buffer_unary_function cbuf; + cbuf(in, (size_t) elsize, out); in += strides[0]; out += strides[1]; @@ -310,101 +162,42 @@ string_islower_loop(PyArrayMethod_Context *context, template -static int -string_isupper_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +static inline void +string_add(Buffer buf1, Buffer buf2, Buffer out) { - int elsize = context->descriptors[0]->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf(in, elsize); - npy_bool res = buf.isupper(); - *(npy_bool *)out = res; - - in += strides[0]; - out += strides[1]; - } - - return 0; + size_t len1 = buf1.num_codepoints(); + size_t len2 = buf2.num_codepoints(); + buf1.buffer_memcpy(out, len1); + buf2.buffer_memcpy(out + len1, len2); + out.buffer_fill_with_zeros_after_index(len1 + len2); } template static int -string_istitle_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - int elsize = context->descriptors[0]->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - 
- while (N--) { - Buffer buf(in, elsize); - npy_bool res = buf.istitle(); - *(npy_bool *)out = res; - - in += strides[0]; - out += strides[1]; - } - - return 0; -} - - -static int -string_isdecimal_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) -{ - int elsize = context->descriptors[0]->elsize; - - char *in = data[0]; - char *out = data[1]; - - npy_intp N = dimensions[0]; - - while (N--) { - Buffer buf(in, elsize); - npy_bool res = string_isdecimal(buf); - *(npy_bool *)out = res; - - in += strides[0]; - out += strides[1]; - } - - return 0; -} - - -static int -string_isnumeric_loop(PyArrayMethod_Context *context, - char *const data[], npy_intp const dimensions[], - npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +string_add_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) { - int elsize = context->descriptors[0]->elsize; + int elsize1 = context->descriptors[0]->elsize; + int elsize2 = context->descriptors[1]->elsize; + int outsize = context->descriptors[2]->elsize; - char *in = data[0]; - char *out = data[1]; + char *in1 = data[0]; + char *in2 = data[1]; + char *out = data[2]; npy_intp N = dimensions[0]; while (N--) { - Buffer buf(in, elsize); - npy_bool res = string_isnumeric(buf); - *(npy_bool *)out = res; + Buffer buf1(in1, elsize1); + Buffer buf2(in2, elsize2); + Buffer outbuf(out, outsize); + string_add(buf1, buf2, outbuf); - in += strides[0]; - out += strides[1]; + in1 += strides[0]; + in2 += strides[1]; + out += strides[2]; } return 0; @@ -1195,12 +988,14 @@ init_string_ufuncs(PyObject *umath) dtypes[1] = NPY_DEFAULT_INT; if (init_ufunc( umath, "str_len", "templated_string_len", 1, 1, dtypes, - string_len_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "str_len", "templated_string_len", 1, 1, dtypes, - 
string_len_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } @@ -1208,12 +1003,14 @@ init_string_ufuncs(PyObject *umath) dtypes[1] = NPY_BOOL; if (init_ufunc( umath, "isalpha", "templated_string_isalpha", 1, 1, dtypes, - string_isalpha_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isalpha", "templated_string_isalpha", 1, 1, dtypes, - string_isalpha_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } @@ -1313,68 +1110,80 @@ init_string_ufuncs(PyObject *umath) dtypes[1] = NPY_BOOL; if (init_ufunc( umath, "isdigit", "templated_string_isdigit", 1, 1, dtypes, - string_isdigit_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isdigit", "templated_string_isdigit", 1, 1, dtypes, - string_isdigit_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isspace", "templated_string_isspace", 1, 1, dtypes, - string_isspace_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isspace", "templated_string_isspace", 1, 1, dtypes, - string_isspace_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "islower", "templated_string_islower", 1, 1, dtypes, - string_islower_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "islower", "templated_string_islower", 1, 1, dtypes, - string_islower_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isupper", "templated_string_isupper", 1, 1, dtypes, - string_isupper_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isupper", "templated_string_isupper", 1, 1, dtypes, - string_isupper_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "istitle", "templated_string_istitle", 1, 1, dtypes, - string_istitle_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) 
{ return -1; } if (init_ufunc( umath, "istitle", "templated_string_istitle", 1, 1, dtypes, - string_istitle_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isdecimal", "templated_string_isdecimal", 1, 1, dtypes, - string_isdecimal_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } if (init_ufunc( umath, "isnumeric", "templated_string_isnumeric", 1, 1, dtypes, - string_isnumeric_loop, NULL) < 0) { + string_unary_loop, + NULL) < 0) { return -1; } From b9db87c65d8d05daba592a81c7dd59891e66507b Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Tue, 13 Feb 2024 12:53:00 +0100 Subject: [PATCH 3/9] Add isalnum ufuncs for str and bytes --- numpy/_core/code_generators/generate_umath.py | 5 +++ .../_core/code_generators/ufunc_docstrings.py | 28 +++++++++++++ numpy/_core/src/common/numpyos.c | 2 +- numpy/_core/src/common/numpyos.h | 3 ++ numpy/_core/src/umath/string_buffer.h | 34 ++++++++++++++++ numpy/_core/src/umath/string_ufuncs.cpp | 16 ++++++++ numpy/_core/strings.py | 39 ++----------------- numpy/_core/tests/test_strings.py | 36 +++++++++++++++++ 8 files changed, 127 insertions(+), 36 deletions(-) diff --git a/numpy/_core/code_generators/generate_umath.py b/numpy/_core/code_generators/generate_umath.py index 50c0ea673d37..89977a19206f 100644 --- a/numpy/_core/code_generators/generate_umath.py +++ b/numpy/_core/code_generators/generate_umath.py @@ -1170,6 +1170,11 @@ def english_upper(s): docstrings.get('numpy._core.umath.isspace'), None, ), +'isalnum': + Ufunc(1, 1, False_, + docstrings.get('numpy._core.umath.isalnum'), + None, + ), 'islower': Ufunc(1, 1, False_, docstrings.get('numpy._core.umath.islower'), diff --git a/numpy/_core/code_generators/ufunc_docstrings.py b/numpy/_core/code_generators/ufunc_docstrings.py index a081856f44b6..eedd18b2939b 100644 --- a/numpy/_core/code_generators/ufunc_docstrings.py +++ b/numpy/_core/code_generators/ufunc_docstrings.py @@ -4441,6 +4441,34 @@ def add_newdoc(place, 
name, doc): """) +add_newdoc('numpy._core.umath', 'isalnum', + """ + Returns true for each element if all characters in the string are + alphanumeric and there is at least one character, false otherwise. + + Parameters + ---------- + x : array_like, with `np.bytes_` or `np.str_` dtype + $PARAMS + + Returns + ------- + out : ndarray + Output array of bool + $OUT_SCALAR_1 + + See Also + -------- + str.isalnum + + Examples + -------- + >>> a = np.array(['a', '1', 'a1', '(', '']) + >>> np.strings.isalnum(a) + array([ True, True, True, False, False]) + + """) + add_newdoc('numpy._core.umath', 'islower', """ Returns true for each element if all cased characters in the diff --git a/numpy/_core/src/common/numpyos.c b/numpy/_core/src/common/numpyos.c index 057595593e51..4b589966591e 100644 --- a/numpy/_core/src/common/numpyos.c +++ b/numpy/_core/src/common/numpyos.c @@ -381,7 +381,7 @@ NumPyOS_ascii_isdigit(char c) * * Same as isalnum under C locale */ -static int +NPY_NO_EXPORT int NumPyOS_ascii_isalnum(char c) { return NumPyOS_ascii_isdigit(c) || NumPyOS_ascii_isalpha(c); diff --git a/numpy/_core/src/common/numpyos.h b/numpy/_core/src/common/numpyos.h index 96c167190222..2b2b88bdc954 100644 --- a/numpy/_core/src/common/numpyos.h +++ b/numpy/_core/src/common/numpyos.h @@ -41,6 +41,9 @@ NumPyOS_ascii_isalpha(char c); NPY_NO_EXPORT int NumPyOS_ascii_isdigit(char c); +NPY_NO_EXPORT int +NumPyOS_ascii_isalnum(char c); + /* Convert a string to an int in an arbitrary base */ NPY_NO_EXPORT npy_longlong NumPyOS_strtoll(const char *str, char **endptr, int base); diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h index b9f460f3d0b8..1c143e644e7c 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -26,6 +26,7 @@ enum class IMPLEMENTED_UNARY_FUNCTIONS { ISDECIMAL, ISDIGIT, ISSPACE, + ISALNUM, ISLOWER, ISUPPER, ISTITLE, @@ -139,6 +140,31 @@ codepoint_isspace(npy_ucs4 code) return 
Py_UNICODE_ISSPACE(code); } +template +inline bool +codepoint_isalnum(npy_ucs4 code); + +template<> +inline bool +codepoint_isalnum(npy_ucs4 code) +{ + return NumPyOS_ascii_isalnum(code); +} + +template<> +inline bool +codepoint_isalnum(npy_ucs4 code) +{ + return Py_UNICODE_ISALNUM(code); +} + +template<> +inline bool +codepoint_isalnum(npy_ucs4 code) +{ + return Py_UNICODE_ISALNUM(code); +} + template inline bool codepoint_islower(npy_ucs4 code); @@ -467,6 +493,12 @@ struct Buffer { return unary_loop(); } + inline bool + isalnum() + { + return unary_loop(); + } + inline bool islower() { @@ -620,6 +652,8 @@ struct call_buffer_member_function { return codepoint_isdigit(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::ISSPACE: return codepoint_isspace(*buf); + case IMPLEMENTED_UNARY_FUNCTIONS::ISALNUM: + return codepoint_isalnum(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::ISNUMERIC: return codepoint_isnumeric(*buf); case IMPLEMENTED_UNARY_FUNCTIONS::ISDECIMAL: diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 219d22e253a9..1d0dca84a8d3 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -119,6 +119,9 @@ struct call_buffer_unary_function { case IMPLEMENTED_UNARY_FUNCTIONS::ISSPACE: *(T *)out = buf.isspace(); break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISALNUM: + *(T *)out = buf.isalnum(); + break; case IMPLEMENTED_UNARY_FUNCTIONS::ISLOWER: *(T *)out = buf.islower(); break; @@ -1134,6 +1137,19 @@ init_string_ufuncs(PyObject *umath) return -1; } + if (init_ufunc( + umath, "isalnum", "templated_string_isalnum", 1, 1, dtypes, + string_unary_loop, + NULL) < 0) { + return -1; + } + if (init_ufunc( + umath, "isalnum", "templated_string_isalnum", 1, 1, dtypes, + string_unary_loop, + NULL) < 0) { + return -1; + } + if (init_ufunc( umath, "islower", "templated_string_islower", 1, 1, dtypes, string_unary_loop, diff --git a/numpy/_core/strings.py b/numpy/_core/strings.py index 
6aa3da77f90a..99d2ef5e2b3e 100644 --- a/numpy/_core/strings.py +++ b/numpy/_core/strings.py @@ -13,6 +13,7 @@ isalpha, isdigit, isspace, + isalnum, islower, isupper, istitle, @@ -37,12 +38,12 @@ __all__ = [ # UFuncs "equal", "not_equal", "less", "less_equal", "greater", "greater_equal", - "add", "isalpha", "isdigit", "isspace", "islower", "isupper", "istitle", - "isdecimal", "isnumeric", "str_len", "find", "rfind", "count", + "add", "isalpha", "isdigit", "isspace", "isalnum", "islower", "isupper", + "istitle", "isdecimal", "isnumeric", "str_len", "find", "rfind", "count", "startswith", "endswith", "lstrip", "rstrip", "strip", "replace", # _vec_string - Will gradually become ufuncs as well - "isalnum", "multiply", "mod", "index", + "multiply", "mod", "index", "rindex", "decode", "encode", "expandtabs", "center", "ljust", "rjust", "zfill", "upper", "lower", "swapcase", "capitalize", "title", "join", "split", "rsplit", "splitlines", @@ -96,38 +97,6 @@ def _clean_args(*args): return newargs -def isalnum(a): - """ - Returns true for each element if all characters in the string are - alphanumeric and there is at least one character, false otherwise. - - Calls :meth:`str.isalnum` element-wise. - - For 8-bit strings, this method is locale-dependent. 
- - Parameters - ---------- - a : array_like, with `np.bytes_` or `np.str_` dtype - - Returns - ------- - out : ndarray - Output array of str or unicode, depending on input type - - See Also - -------- - str.isalnum - - Examples - -------- - >>> a = np.array(['a', '1', 'a1', '(', '']) - >>> np.strings.isalnum(a) - array([ True, True, True, False, False]) - - """ - return _vec_string(a, np.bool, 'isalnum') - - def multiply(a, i): """ Return (a * i), that is string multiple concatenation, diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index efe24658025d..091fa7388eb9 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -167,6 +167,24 @@ def test_isspace(self, in_, out, dt): in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.isspace(in_), out) + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', True), + ('A', True), + ('\n', False), + ('123abc456', True), + ('a1b3c', True), + ('aBc000 ', False), + ('abc\n', False), + ]) + def test_isalnum(self, in_, out, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.isalnum(in_), out) + @pytest.mark.parametrize("in_,out", [ ('', False), ('a', True), @@ -672,6 +690,24 @@ def test_replace_unicode(self, dt): assert_array_equal(np.strings.replace(buf, "<", "<", MAX), "...\u043c......<") + @pytest.mark.parametrize("dt", ["U", "T"]) + @pytest.mark.parametrize("in_", [ + '\U00010401', + '\U00010427', + '\U00010429', + '\U0001044E', + '\U0001D7F6', + '\U00011066', + '\U000104A0', + '\U0001F107', + ]) + def test_isalnum_unicode(self, in_, dt): + # TODO: Remove this + if dt == "T": + pytest.xfail( + "StringDType support to be added in a follow-up commit") + assert_array_equal(np.strings.isalnum(in_), True) + @pytest.mark.parametrize("dt", ["U", "T"]) @pytest.mark.parametrize("in_,out", [ ('\u1FFc', False), From 
b9202ca464e0b7cca14f800ad0cdc384c7bc1310 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Wed, 14 Feb 2024 12:01:20 +0100 Subject: [PATCH 4/9] Refactor test to multiple parametrized classes --- numpy/_core/tests/test_strings.py | 207 +++++++++++++++--------------- 1 file changed, 100 insertions(+), 107 deletions(-) diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index 091fa7388eb9..312495e414af 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -167,79 +167,6 @@ def test_isspace(self, in_, out, dt): in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.isspace(in_), out) - @pytest.mark.parametrize("in_,out", [ - ('', False), - ('a', True), - ('A', True), - ('\n', False), - ('123abc456', True), - ('a1b3c', True), - ('aBc000 ', False), - ('abc\n', False), - ]) - def test_isalnum(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") - in_ = np.array(in_, dtype=dt) - assert_array_equal(np.strings.isalnum(in_), out) - - @pytest.mark.parametrize("in_,out", [ - ('', False), - ('a', True), - ('A', False), - ('\n', False), - ('abc', True), - ('aBc', False), - ('abc\n', True), - ]) - def test_islower(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") - in_ = np.array(in_, dtype=dt) - assert_array_equal(np.strings.islower(in_), out) - - @pytest.mark.parametrize("in_,out", [ - ('', False), - ('a', False), - ('A', True), - ('\n', False), - ('ABC', True), - ('AbC', False), - ('ABC\n', True), - ]) - def test_isupper(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") - in_ = np.array(in_, dtype=dt) - assert_array_equal(np.strings.isupper(in_), out) - - @pytest.mark.parametrize("in_,out", [ - ('', False), - ('a', False), - ('A', True), - ('\n', False), - 
('A Titlecased Line', True), - ('A\nTitlecased Line', True), - ('A Titlecased, Line', True), - ('Not a capitalized String', False), - ('Not\ta Titlecase String', False), - ('Not--a Titlecase String', False), - ('NOT', False), - ]) - def test_istitle(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") - in_ = np.array(in_, dtype=dt) - assert_array_equal(np.strings.istitle(in_), out) - @pytest.mark.parametrize("in_,out", [ ("", 0), ("abc", 3), @@ -641,8 +568,78 @@ def test_replace(self, buf, old, new, count, res, dt): assert_array_equal(np.strings.replace(buf, old, new, count), res) -class TestUnicodeOnlyMethods: - @pytest.mark.parametrize("dt", ["U", "T"]) +@pytest.mark.parametrize("dt", [ + "S", + "U", + pytest.param("T", marks=pytest.mark.xfail( + reason="StringDType support not implemented yet", strict=True)), +]) +class TestMethosWithoutStringDTypeSupport: + """ + Tests shoud be moved to `TestMethods` once StringDType support is + implemeted + """ + + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', True), + ('A', True), + ('\n', False), + ('123abc456', True), + ('a1b3c', True), + ('aBc000 ', False), + ('abc\n', False), + ]) + def test_isalnum(self, in_, out, dt): + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.isalnum(in_), out) + + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', True), + ('A', False), + ('\n', False), + ('abc', True), + ('aBc', False), + ('abc\n', True), + ]) + def test_islower(self, in_, out, dt): + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.islower(in_), out) + + @pytest.mark.parametrize("in_,out", [ + ('', False), + ('a', False), + ('A', True), + ('\n', False), + ('ABC', True), + ('AbC', False), + ('ABC\n', True), + ]) + def test_isupper(self, in_, out, dt): + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.isupper(in_), out) + + @pytest.mark.parametrize("in_,out", [ + ('', False), + 
('a', False), + ('A', True), + ('\n', False), + ('A Titlecased Line', True), + ('A\nTitlecased Line', True), + ('A Titlecased, Line', True), + ('Not a capitalized String', False), + ('Not\ta Titlecase String', False), + ('Not--a Titlecase String', False), + ('NOT', False), + ]) + def test_istitle(self, in_, out, dt): + in_ = np.array(in_, dtype=dt) + assert_array_equal(np.strings.istitle(in_), out) + + +@pytest.mark.parametrize("dt", ["U", "T"]) +class TestMethodsWithUnicode: @pytest.mark.parametrize("in_,out", [ ("", False), ("a", False), @@ -658,12 +655,6 @@ def test_isdecimal_unicode(self, in_, out, dt): buf = np.array(in_, dtype=dt) assert_array_equal(np.strings.isdecimal(buf), out) - def test_isdecimal_bytes(self): - in_ = np.array(b"1") - with assert_raises(TypeError): - np.strings.isdecimal(in_) - - @pytest.mark.parametrize("dt", ["U", "T"]) @pytest.mark.parametrize("in_,out", [ ("", False), ("a", False), @@ -679,18 +670,23 @@ def test_isnumeric_unicode(self, in_, out, dt): buf = np.array(in_, dtype=dt) assert_array_equal(np.strings.isnumeric(buf), out) - def test_isnumeric_bytes(self): - in_ = np.array(b"1") - with assert_raises(TypeError): - np.strings.isnumeric(in_) - - @pytest.mark.parametrize("dt", ["U", "T"]) def test_replace_unicode(self, dt): buf = np.array("...\u043c......<", dtype=dt) assert_array_equal(np.strings.replace(buf, "<", "<", MAX), "...\u043c......<") - @pytest.mark.parametrize("dt", ["U", "T"]) + +@pytest.mark.parametrize("dt", [ + "U", + pytest.param("T", marks=pytest.mark.xfail( + reason="StringDType support not implemented yet", strict=True)), +]) +class TestMethodsWithoutStringDTypeSupportWithUnicode: + """ + Tests shoud be moved to `TestMethods` once StringDType support is + implemeted + """ + @pytest.mark.parametrize("in_", [ '\U00010401', '\U00010427', @@ -702,13 +698,9 @@ def test_replace_unicode(self, dt): '\U0001F107', ]) def test_isalnum_unicode(self, in_, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - 
"StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.isalnum(in_), True) - @pytest.mark.parametrize("dt", ["U", "T"]) @pytest.mark.parametrize("in_,out", [ ('\u1FFc', False), ('\u2167', False), @@ -721,13 +713,9 @@ def test_isalnum_unicode(self, in_, dt): ('\U0001044E', True), ]) def test_islower_unicode(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.islower(in_), out) - @pytest.mark.parametrize("dt", ["U", "T"]) @pytest.mark.parametrize("in_,out", [ ('\u1FFc', False), ('\u2167', True), @@ -740,13 +728,9 @@ def test_islower_unicode(self, in_, out, dt): ('\U0001044E', False), ]) def test_isupper_unicode(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.isupper(in_), out) - @pytest.mark.parametrize("dt", ["U", "T"]) @pytest.mark.parametrize("in_,out", [ ('\u1FFc', True), ('Greek \u1FFcitlecases ...', True), @@ -758,13 +742,22 @@ def test_isupper_unicode(self, in_, out, dt): ('\U0001F46F', False), ]) def test_istitle_unicode(self, in_, out, dt): - # TODO: Remove this - if dt == "T": - pytest.xfail( - "StringDType support to be added in a follow-up commit") + in_ = np.array(in_, dtype=dt) assert_array_equal(np.strings.istitle(in_), out) +class TestUnicodeOnlyMethodsRaiseWithBytes: + def test_isdecimal_raises(self): + in_ = np.array(b"1") + with assert_raises(TypeError): + np.strings.isdecimal(in_) + + def test_isnumeric_bytes(self): + in_ = np.array(b"1") + with assert_raises(TypeError): + np.strings.isnumeric(in_) + + def check_itemsize(n_elem, dt): if dt == "T": return np.dtype(dt).itemsize From f324332a29c1d7e927592f4db53a5ca75e6382fb Mon Sep 17 00:00:00 2001 From: Nathan Goldbaum Date: Wed, 14 Feb 2024 12:49:55 
-0700 Subject: [PATCH 5/9] ENH: implement isalnum, istitle, isupper, islower for stringdtype --- numpy/_core/src/umath/stringdtype_ufuncs.cpp | 92 ++++++++++++++++++++ numpy/_core/tests/test_stringdtype.py | 8 +- numpy/_core/tests/test_strings.py | 25 ------ 3 files changed, 96 insertions(+), 29 deletions(-) diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index 647aa4d71ef3..1aa687adc000 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -1034,6 +1034,18 @@ struct call_buffer_function { case IMPLEMENTED_UNARY_FUNCTIONS::STR_LEN: *(T *)out = buf.num_codepoints(); break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISALNUM: + *(T *)out = buf.isalnum(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISTITLE: + *(T *)out = buf.istitle(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISUPPER: + *(T *)out = buf.isupper(); + break; + case IMPLEMENTED_UNARY_FUNCTIONS::ISLOWER: + *(T *)out = buf.islower(); + break; } } }; @@ -1172,6 +1184,58 @@ string_isspace_strided_loop(PyArrayMethod_Context *context, char *const data[], } +static const char isalnum_name[] = "isalnum"; + +static int +string_isalnum_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], + npy_intp const strides[], + NpyAuxData *auxdata) +{ + return string_bool_output_unary_strided_loop( + context, data, dimensions, strides, auxdata); +} + + +static const char istitle_name[] = "istitle"; + +static int +string_istitle_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], + npy_intp const strides[], + NpyAuxData *auxdata) +{ + return string_bool_output_unary_strided_loop( + context, data, dimensions, strides, auxdata); +} + + +static const char islower_name[] = "islower"; + +static int +string_islower_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], + npy_intp const strides[], 
+ NpyAuxData *auxdata) +{ + return string_bool_output_unary_strided_loop( + context, data, dimensions, strides, auxdata); +} + + +static const char isupper_name[] = "isupper"; + +static int +string_isupper_strided_loop(PyArrayMethod_Context *context, char *const data[], + npy_intp const dimensions[], + npy_intp const strides[], + NpyAuxData *auxdata) +{ + return string_bool_output_unary_strided_loop( + context, data, dimensions, strides, auxdata); +} + + static int string_strlen_strided_loop(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], @@ -2821,6 +2885,34 @@ init_stringdtype_ufuncs(PyObject *umath) return -1; } + if (init_ufunc(umath, "isalnum", bool_output_dtypes, + &string_bool_output_resolve_descriptors, + &string_isalnum_strided_loop, 1, 1, NPY_NO_CASTING, + (NPY_ARRAYMETHOD_FLAGS) 0) < 0) { + return -1; + } + + if (init_ufunc(umath, "istitle", bool_output_dtypes, + &string_bool_output_resolve_descriptors, + &string_istitle_strided_loop, 1, 1, NPY_NO_CASTING, + (NPY_ARRAYMETHOD_FLAGS) 0) < 0) { + return -1; + } + + if (init_ufunc(umath, "isupper", bool_output_dtypes, + &string_bool_output_resolve_descriptors, + &string_isupper_strided_loop, 1, 1, NPY_NO_CASTING, + (NPY_ARRAYMETHOD_FLAGS) 0) < 0) { + return -1; + } + + if (init_ufunc(umath, "islower", bool_output_dtypes, + &string_bool_output_resolve_descriptors, + &string_islower_strided_loop, 1, 1, NPY_NO_CASTING, + (NPY_ARRAYMETHOD_FLAGS) 0) < 0) { + return -1; + } + PyArray_DTypeMeta *intp_output_dtypes[] = { &PyArray_StringDType, &PyArray_IntpDType diff --git a/numpy/_core/tests/test_stringdtype.py b/numpy/_core/tests/test_stringdtype.py index 9e562de81cfc..b14c41586113 100644 --- a/numpy/_core/tests/test_stringdtype.py +++ b/numpy/_core/tests/test_stringdtype.py @@ -952,15 +952,15 @@ def unicode_array(): "upper", "isnumeric", "isdecimal", + "isalnum", + "islower", + "istitle", + "isupper", ] UNIMPLEMENTED_VEC_STRING_FUNCTIONS = [ "capitalize", "expandtabs", - "isalnum", 
- "islower", - "istitle", - "isupper", "lower", "splitlines", "swapcase", diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index 312495e414af..57940a118a94 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -567,19 +567,6 @@ def test_replace(self, buf, old, new, count, res, dt): res = np.array(res, dtype=dt) assert_array_equal(np.strings.replace(buf, old, new, count), res) - -@pytest.mark.parametrize("dt", [ - "S", - "U", - pytest.param("T", marks=pytest.mark.xfail( - reason="StringDType support not implemented yet", strict=True)), -]) -class TestMethosWithoutStringDTypeSupport: - """ - Tests shoud be moved to `TestMethods` once StringDType support is - implemeted - """ - @pytest.mark.parametrize("in_,out", [ ('', False), ('a', True), @@ -675,18 +662,6 @@ def test_replace_unicode(self, dt): assert_array_equal(np.strings.replace(buf, "<", "<", MAX), "...\u043c......<") - -@pytest.mark.parametrize("dt", [ - "U", - pytest.param("T", marks=pytest.mark.xfail( - reason="StringDType support not implemented yet", strict=True)), -]) -class TestMethodsWithoutStringDTypeSupportWithUnicode: - """ - Tests shoud be moved to `TestMethods` once StringDType support is - implemeted - """ - @pytest.mark.parametrize("in_", [ '\U00010401', '\U00010427', From a9711bb7f713a2237a2c6a6640997e71b81ff5b7 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 16 Feb 2024 11:15:24 +0100 Subject: [PATCH 6/9] Separate str_len / bool methods; use static_data approach --- numpy/_core/src/umath/string_ufuncs.cpp | 224 ++++++++---------------- 1 file changed, 75 insertions(+), 149 deletions(-) diff --git a/numpy/_core/src/umath/string_ufuncs.cpp b/numpy/_core/src/umath/string_ufuncs.cpp index 2ecdaa527985..588deaf3001e 100644 --- a/numpy/_core/src/umath/string_ufuncs.cpp +++ b/numpy/_core/src/umath/string_ufuncs.cpp @@ -99,52 +99,41 @@ string_comparison_loop(PyArrayMethod_Context *context, } -template -struct 
call_buffer_unary_function { - void operator()(const char *buffer, size_t size, char *out) { - Buffer buf((char *)buffer, size); - switch (f) { - case IMPLEMENTED_UNARY_FUNCTIONS::ISALPHA: - *(T *)out = buf.isalpha(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISDECIMAL: - *(T *)out = buf.isdecimal(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISDIGIT: - *(T *)out = buf.isdigit(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISNUMERIC: - *(T *)out = buf.isnumeric(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISSPACE: - *(T *)out = buf.isspace(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISALNUM: - *(T *)out = buf.isalnum(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISLOWER: - *(T *)out = buf.islower(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISUPPER: - *(T *)out = buf.isupper(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::ISTITLE: - *(T *)out = buf.istitle(); - break; - case IMPLEMENTED_UNARY_FUNCTIONS::STR_LEN: - *(T *)out = buf.num_codepoints(); - break; - } +template +static int +string_str_len_loop(PyArrayMethod_Context *context, + char *const data[], npy_intp const dimensions[], + npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) +{ + int elsize = context->descriptors[0]->elsize; + + char *in = data[0]; + char *out = data[1]; + + npy_intp N = dimensions[0]; + + while (N--) { + Buffer buf(in, elsize); + *(npy_intp *)out = buf.num_codepoints(); + + in += strides[0]; + out += strides[1]; } -}; + + return 0; +} -template +template +using buffer_method = bool (Buffer::*)(); + +template static int string_unary_loop(PyArrayMethod_Context *context, char *const data[], npy_intp const dimensions[], npy_intp const strides[], NpyAuxData *NPY_UNUSED(auxdata)) { + buffer_method is_it = *(buffer_method *)(context->method->static_data); int elsize = context->descriptors[0]->elsize; char *in = data[0]; @@ -153,8 +142,8 @@ string_unary_loop(PyArrayMethod_Context *context, npy_intp N = dimensions[0]; while (N--) { - call_buffer_unary_function cbuf; - 
cbuf(in, (size_t) elsize, out); + Buffer buf(in, elsize); + *(npy_bool *)out = (buf.*is_it)(); in += strides[0]; out += strides[1]; @@ -939,30 +928,61 @@ init_string_ufuncs(PyObject *umath) dtypes[1] = NPY_DEFAULT_INT; if (init_ufunc( umath, "str_len", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL, NULL) < 0) { + string_str_len_loop, NULL, NULL) < 0) { return -1; } if (init_ufunc( umath, "str_len", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL, NULL) < 0) { + string_str_len_loop, NULL, NULL) < 0) { return -1; } dtypes[0] = NPY_OBJECT; dtypes[1] = NPY_BOOL; - if (init_ufunc( - umath, "isalpha", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL, NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "isalpha", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL, NULL) < 0) { - return -1; + + const char *unary_buffer_method_names[] = { + "isalpha", "isalnum", "isdigit", "isspace", "islower", + "isupper", "istitle", "isdecimal", "isnumeric", + }; + + static buffer_method unary_buffer_ascii_methods[] = { + &Buffer::isalpha, + &Buffer::isalnum, + &Buffer::isdigit, + &Buffer::isspace, + &Buffer::islower, + &Buffer::isupper, + &Buffer::istitle, + }; + + static buffer_method unary_buffer_utf32_methods[] = { + &Buffer::isalpha, + &Buffer::isalnum, + &Buffer::isdigit, + &Buffer::isspace, + &Buffer::islower, + &Buffer::isupper, + &Buffer::istitle, + &Buffer::isdecimal, + &Buffer::isnumeric, + }; + + for (int i = 0; i < 9; i++) { + if (i < 7) { // isdecimal & isnumeric do not support ASCII + if (init_ufunc( + umath, unary_buffer_method_names[i], 1, 1, dtypes, ENCODING::ASCII, + string_unary_loop, NULL, + &unary_buffer_ascii_methods[i]) < 0) { + return -1; + } + } + + if (init_ufunc( + umath, unary_buffer_method_names[i], 1, 1, dtypes, ENCODING::UTF32, + string_unary_loop, NULL, + &unary_buffer_utf32_methods[i]) < 0) { + return -1; + } } dtypes[0] = dtypes[1] = NPY_OBJECT; @@ -1060,100 +1080,6 @@ init_string_ufuncs(PyObject *umath) 
return -1; } - dtypes[0] = NPY_OBJECT; - dtypes[1] = NPY_BOOL; - if (init_ufunc( - umath, "isdigit", "templated_string_isdigit", 1, 1, dtypes, ENCODING::ASCII - string_unary_loop, - NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "isdigit", "templated_string_isdigit", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "isspace", "templated_string_isspace", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "isspace", "templated_string_isspace", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "isalnum", "templated_string_isalnum", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "isalnum", "templated_string_isalnum", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "islower", "templated_string_islower", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "islower", "templated_string_islower", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "isupper", "templated_string_isupper", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "isupper", "templated_string_isupper", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "istitle", "templated_string_istitle", 1, 1, dtypes, ENCODING::ASCII, - string_unary_loop, - NULL) < 0) { - return -1; - } - if (init_ufunc( - umath, "istitle", "templated_string_istitle", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "isdecimal", "templated_string_isdecimal", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - 
NULL) < 0) { - return -1; - } - - if (init_ufunc( - umath, "isnumeric", "templated_string_isnumeric", 1, 1, dtypes, ENCODING::UTF32, - string_unary_loop, - NULL) < 0) { - return -1; - } - dtypes[0] = dtypes[1] = NPY_OBJECT; if (init_ufunc( umath, "_lstrip_whitespace", 1, 1, dtypes, ENCODING::ASCII, From 2bc79d796f66312de7ba051ac3bf55c1b355b031 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 16 Feb 2024 11:50:27 +0100 Subject: [PATCH 7/9] Implement NumPyOS_ascii_islower/isupper and use those --- numpy/_core/src/common/numpyos.c | 22 ++++++++++++++++++++++ numpy/_core/src/common/numpyos.h | 6 ++++++ numpy/_core/src/umath/string_buffer.h | 4 ++-- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/numpy/_core/src/common/numpyos.c b/numpy/_core/src/common/numpyos.c index 4b589966591e..38c77616daa2 100644 --- a/numpy/_core/src/common/numpyos.c +++ b/numpy/_core/src/common/numpyos.c @@ -387,6 +387,28 @@ NumPyOS_ascii_isalnum(char c) return NumPyOS_ascii_isdigit(c) || NumPyOS_ascii_isalpha(c); } +/* + * NumPyOS_ascii_islower: + * + * Same as islower under C locale + */ +NPY_NO_EXPORT int +NumPyOS_ascii_islower(char c) +{ + return c >= 'a' && c <= 'z'; +} + +/* + * NumPyOS_ascii_isupper: + * + * Same as isupper under C locale + */ +NPY_NO_EXPORT int +NumPyOS_ascii_isupper(char c) +{ + return c >= 'A' && c <= 'Z'; +} + /* * NumPyOS_ascii_tolower: diff --git a/numpy/_core/src/common/numpyos.h b/numpy/_core/src/common/numpyos.h index 2b2b88bdc954..980b8f6c0578 100644 --- a/numpy/_core/src/common/numpyos.h +++ b/numpy/_core/src/common/numpyos.h @@ -44,6 +44,12 @@ NumPyOS_ascii_isdigit(char c); NPY_NO_EXPORT int NumPyOS_ascii_isalnum(char c); +NPY_NO_EXPORT int +NumPyOS_ascii_islower(char c); + +NPY_NO_EXPORT int +NumPyOS_ascii_isupper(char c); + /* Convert a string to an int in an arbitrary base */ NPY_NO_EXPORT npy_longlong NumPyOS_strtoll(const char *str, char **endptr, int base); diff --git a/numpy/_core/src/umath/string_buffer.h 
b/numpy/_core/src/umath/string_buffer.h index b6002b0fddd1..ca8a5b197a51 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -174,7 +174,7 @@ template<> inline bool codepoint_islower(npy_ucs4 code) { - return Py_ISLOWER((char) code); + return NumPyOS_ascii_islower(code); } template<> @@ -199,7 +199,7 @@ template<> inline bool codepoint_isupper(npy_ucs4 code) { - return Py_ISUPPER((char) code); + return NumPyOS_ascii_isupper(code); } template<> From 1cc073381266a7dabe68ebe31384c68906e33a77 Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 16 Feb 2024 18:59:36 +0100 Subject: [PATCH 8/9] Address feedback; remove unnecessary assignment & change typedef to using --- numpy/_core/src/umath/string_buffer.h | 1 - numpy/_core/src/umath/stringdtype_ufuncs.cpp | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/numpy/_core/src/umath/string_buffer.h b/numpy/_core/src/umath/string_buffer.h index ca8a5b197a51..a3be7b149e9f 100644 --- a/numpy/_core/src/umath/string_buffer.h +++ b/numpy/_core/src/umath/string_buffer.h @@ -567,7 +567,6 @@ struct Buffer { if (!previous_is_cased) { return false; } - previous_is_cased = true; cased = true; } else { diff --git a/numpy/_core/src/umath/stringdtype_ufuncs.cpp b/numpy/_core/src/umath/stringdtype_ufuncs.cpp index 74b210ac091b..e064235762b3 100644 --- a/numpy/_core/src/umath/stringdtype_ufuncs.cpp +++ b/numpy/_core/src/umath/stringdtype_ufuncs.cpp @@ -626,7 +626,7 @@ string_intp_output_resolve_descriptors( return NPY_NO_CASTING; } -typedef bool (Buffer::*utf8_buffer_method)(); +using utf8_buffer_method = bool (Buffer::*)(); static int string_bool_output_unary_strided_loop( From 2214aeed4e5bb78a542eada3a7753c844468511e Mon Sep 17 00:00:00 2001 From: Lysandros Nikolaou Date: Fri, 16 Feb 2024 22:34:05 +0100 Subject: [PATCH 9/9] Mark some is* tests as xfail under PyPy on Windows --- numpy/_core/tests/test_strings.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 
insertions(+), 6 deletions(-) diff --git a/numpy/_core/tests/test_strings.py b/numpy/_core/tests/test_strings.py index 00357e67d49c..e6dd3552010e 100644 --- a/numpy/_core/tests/test_strings.py +++ b/numpy/_core/tests/test_strings.py @@ -1,9 +1,10 @@ +import sys import pytest import operator import numpy as np -from numpy.testing import assert_array_equal, assert_raises +from numpy.testing import assert_array_equal, assert_raises, IS_PYPY COMPARISONS = [ @@ -719,7 +720,10 @@ def test_replace_unicode(self, dt): '\U0001D7F6', '\U00011066', '\U000104A0', - '\U0001F107', + pytest.param('\U0001F107', marks=pytest.mark.xfail( + sys.platform == 'win32' and IS_PYPY, + reason="PYPY bug in Py_UNICODE_ISALNUM", + strict=True)), ]) def test_isalnum_unicode(self, in_, dt): in_ = np.array(in_, dtype=dt) @@ -733,7 +737,10 @@ def test_isalnum_unicode(self, in_, dt): ('\U0001F40D', False), ('\U0001F46F', False), ('\u2177', True), - ('\U00010429', True), + pytest.param('\U00010429', True, marks=pytest.mark.xfail( + sys.platform == 'win32' and IS_PYPY, + reason="PYPY bug in Py_UNICODE_ISLOWER", + strict=True)), ('\U0001044E', True), ]) def test_islower_unicode(self, in_, out, dt): @@ -748,7 +755,10 @@ def test_islower_unicode(self, in_, out, dt): ('\U0001F40D', False), ('\U0001F46F', False), ('\u2177', False), - ('\U00010429', False), + pytest.param('\U00010429', False, marks=pytest.mark.xfail( + sys.platform == 'win32' and IS_PYPY, + reason="PYPY bug in Py_UNICODE_ISUPPER", + strict=True)), ('\U0001044E', False), ]) def test_isupper_unicode(self, in_, out, dt): @@ -758,9 +768,15 @@ def test_isupper_unicode(self, in_, out, dt): @pytest.mark.parametrize("in_,out", [ ('\u1FFc', True), ('Greek \u1FFcitlecases ...', True), - ('\U00010401\U00010429', True), + pytest.param('\U00010401\U00010429', True, marks=pytest.mark.xfail( + sys.platform == 'win32' and IS_PYPY, + reason="PYPY bug in Py_UNICODE_ISISTITLE", + strict=True)), ('\U00010427\U0001044E', True), - ('\U00010429', False), + 
pytest.param('\U00010429', False, marks=pytest.mark.xfail( + sys.platform == 'win32' and IS_PYPY, + reason="PYPY bug in Py_UNICODE_ISISTITLE", + strict=True)), ('\U0001044E', False), ('\U0001F40D', False), ('\U0001F46F', False),