Merge pull request #15385 from eric-wieser/fix-unicode-ucs2

BUG, MAINT: Stop using the error-prone deprecated Py_UNICODE apis
numpy · Feb 14, 2020 · 1f9ab28 · 1f9ab28
2 parents 491f41a + d0b7b66
commit 1f9ab28
Show file tree

Hide file tree

Showing 12 changed files with 180 additions and 268 deletions.
diff --git a/doc/release/upcoming_changes/15385.new_feature.rst b/doc/release/upcoming_changes/15385.new_feature.rst
@@ -0,0 +1,5 @@
+``np.str_`` scalars now support the buffer protocol
+---------------------------------------------------
+``np.str_`` arrays are always stored as UCS4, so the corresponding scalars
+now expose this through the buffer interface, meaning
+``memoryview(np.str_('test'))`` now works.
diff --git a/numpy/core/defchararray.py b/numpy/core/defchararray.py
@@ -2679,25 +2679,6 @@ class adds the following functionality:
             itemsize = len(obj)
         shape = len(obj) // itemsize
 
-        if unicode:
-            if sys.maxunicode == 0xffff:
-                # On a narrow Python build, the buffer for Unicode
-                # strings is UCS2, which doesn't match the buffer for
-                # NumPy Unicode types, which is ALWAYS UCS4.
-                # Therefore, we need to convert the buffer.  On Python
-                # 2.6 and later, we can use the utf_32 codec.  Earlier
-                # versions don't have that codec, so we convert to a
-                # numerical array that matches the input buffer, and
-                # then use NumPy to convert it to UCS4.  All of this
-                # should happen in native endianness.
-                obj = obj.encode('utf_32')
-            else:
-                obj = str(obj)
-        else:
-            # Let the default Unicode -> string encoding (if any) take
-            # precedence.
-            obj = bytes(obj)
-
         return chararray(shape, itemsize=itemsize, unicode=unicode,
                          buffer=obj, order=order)
 

diff --git a/numpy/core/include/numpy/arrayscalars.h b/numpy/core/include/numpy/arrayscalars.h
@@ -135,7 +135,13 @@ typedef struct {
 } PyScalarObject;
 
 #define PyStringScalarObject PyStringObject
-#define PyUnicodeScalarObject PyUnicodeObject
+#define PyStringScalarObject PyStringObject
+typedef struct {
+        /* note that the PyObject_HEAD macro lives right here */
+        PyUnicodeObject base;
+        Py_UCS4 *obval;
+} PyUnicodeScalarObject;
+
 
 typedef struct {
         PyObject_VAR_HEAD

diff --git a/numpy/core/src/common/ucsnarrow.c b/numpy/core/src/common/ucsnarrow.c
@@ -16,76 +16,12 @@
 #include "ctors.h"
 
 /*
- * Functions only needed on narrow builds of Python for converting back and
- * forth between the NumPy Unicode data-type (always 4-bytes) and the
- * Python Unicode scalar (2-bytes on a narrow build).
- */
-
-/*
- * The ucs2 buffer must be large enough to hold 2*ucs4length characters
- * due to the use of surrogate pairs.
+ * This file originally contained functions only needed on narrow builds of
+ * Python for converting back and forth between the NumPy Unicode data-type
+ * (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
  *
- * The return value is the number of ucs2 bytes used-up which
- * is ucs4length + number of surrogate pairs found.
- *
- * Values above 0xffff are converted to surrogate pairs.
+ * This "narrow" interface is now deprecated in Python and unused in NumPy.
  */
-NPY_NO_EXPORT int
-PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length)
-{
-    int i;
-    int numucs2 = 0;
-    npy_ucs4 chr;
-    for (i = 0; i < ucs4length; i++) {
-        chr = *ucs4++;
-        if (chr > 0xffff) {
-            numucs2++;
-            chr -= 0x10000L;
-            *ucs2++ = 0xD800 + (Py_UNICODE) (chr >> 10);
-            *ucs2++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
-        }
-        else {
-            *ucs2++ = (Py_UNICODE) chr;
-        }
-        numucs2++;
-    }
-    return numucs2;
-}
-
-
-/*
- * This converts a UCS2 buffer of the given length to UCS4 buffer.
- * It converts up to ucs4len characters of UCS2
- *
- * It returns the number of characters converted which can
- * be less than ucs2len if there are surrogate pairs in ucs2.
- *
- * The return value is the actual size of the used part of the ucs4 buffer.
- */
-NPY_NO_EXPORT int
-PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len)
-{
-    int i;
-    npy_ucs4 chr;
-    Py_UNICODE ch;
-    int numchars=0;
-
-    for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) {
-        ch = *ucs2++;
-        if (ch >= 0xd800 && ch <= 0xdfff) {
-            /* surrogate pair */
-            chr = ((npy_ucs4)(ch-0xd800)) << 10;
-            chr += *ucs2++ + 0x2400;  /* -0xdc00 + 0x10000 */
-            i++;
-        }
-        else {
-            chr = (npy_ucs4) ch;
-        }
-        *ucs4++ = chr;
-        numchars++;
-    }
-    return numchars;
-}
 
 /*
  * Returns a PyUnicodeObject initialized from a buffer containing
@@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
     Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
     npy_ucs4 const *src = (npy_ucs4 const *)src_char;
     npy_ucs4 *buf = NULL;
-    PyUnicodeObject *ret;
 
     /* swap and align if needed */
     if (swap || align) {
         buf = (npy_ucs4 *)malloc(size);
         if (buf == NULL) {
             PyErr_NoMemory();
-            goto fail;
+            return NULL;
         }
         memcpy(buf, src, size);
         if (swap) {
@@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
     while (ucs4len > 0 && src[ucs4len - 1] == 0) {
         ucs4len--;
     }
-
-    /* produce PyUnicode object */
-#ifdef Py_UNICODE_WIDE
-    {
-        ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src,
-                                                       (Py_ssize_t) ucs4len);
-        if (ret == NULL) {
-            goto fail;
-        }
-    }
-#else
-    {
-        Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len;
-        Py_ssize_t ucs2len;
-        Py_UNICODE *tmp;
-
-        if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) {
-            PyErr_NoMemory();
-            goto fail;
-        }
-        ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len);
-        ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len);
-        free(tmp);
-        if (ret == NULL) {
-            goto fail;
-        }
-    }
-#endif
-
-    if (buf) {
-        free(buf);
-    }
+    PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
+        PyUnicode_4BYTE_KIND, src, ucs4len);
+    free(buf);
     return ret;
-
-fail:
-    if (buf) {
-        free(buf);
-    }
-    return NULL;
 }
diff --git a/numpy/core/src/common/ucsnarrow.h b/numpy/core/src/common/ucsnarrow.h
@@ -1,12 +1,6 @@
 #ifndef _NPY_UCSNARROW_H_
 #define _NPY_UCSNARROW_H_
 
-NPY_NO_EXPORT int
-PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length);
-
-NPY_NO_EXPORT int
-PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len);
-
 NPY_NO_EXPORT PyUnicodeObject *
 PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);
 

diff --git a/numpy/core/src/multiarray/arraytypes.c.src b/numpy/core/src/multiarray/arraytypes.c.src
@@ -450,12 +450,6 @@ static int
 UNICODE_setitem(PyObject *op, void *ov, void *vap)
 {
     PyArrayObject *ap = vap;
-    PyObject *temp;
-    Py_UNICODE *ptr;
-    int datalen;
-#ifndef Py_UNICODE_WIDE
-    char *buffer;
-#endif
 
     if (PyArray_IsZeroDim(op)) {
         return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
@@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
                 "setting an array element with a sequence");
         return -1;
     }
+
+    PyObject *temp;
     if (PyBytes_Check(op)) {
         /* Try to decode from ASCII */
         temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
@@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
     else if ((temp=PyObject_Str(op)) == NULL) {
         return -1;
     }
-    ptr = PyUnicode_AS_UNICODE(temp);
-    if ((ptr == NULL) || (PyErr_Occurred())) {
+
+    /* truncate if needed */
+    Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
+    Py_ssize_t actual_len = PyUnicode_GetLength(temp);
+    if (actual_len < 0) {
         Py_DECREF(temp);
         return -1;
     }
-    datalen = PyUnicode_GET_DATA_SIZE(temp);
+    if (actual_len > max_len) {
+        Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
+        if (temp == NULL) {
+            return -1;
+        }
+        actual_len = max_len;
+    }
 
-#ifdef Py_UNICODE_WIDE
-    memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
-#else
+    Py_ssize_t num_bytes = actual_len * 4;
+
+    char *buffer;
     if (!PyArray_ISALIGNED(ap)) {
-        buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
+        buffer = PyArray_malloc(num_bytes);
         if (buffer == NULL) {
             Py_DECREF(temp);
             PyErr_NoMemory();
@@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
     else {
         buffer = ov;
     }
-    datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
-            datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
-    datalen <<= 2;
+    if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
+        PyArray_free(buffer);
+        Py_DECREF(temp);
+        return -1;
+    }
+
     if (!PyArray_ISALIGNED(ap)) {
-        memcpy(ov, buffer, datalen);
+        memcpy(ov, buffer, num_bytes);
         PyArray_free(buffer);
     }
-#endif
+
     /* Fill in the rest of the space with 0 */
-    if (PyArray_DESCR(ap)->elsize > datalen) {
-        memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
+    if (PyArray_DESCR(ap)->elsize > num_bytes) {
+        memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
     }
     if (PyArray_ISBYTESWAPPED(ap)) {
-        byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
+        byte_swap_vector(ov, actual_len, 4);
     }
     Py_DECREF(temp);
     return 0;
@@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
     return nonz;
 }
 
-#ifdef Py_UNICODE_WIDE
-#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
-#else
-#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
-#endif
-
 static npy_bool
 UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
 {
@@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
         if (*ip == '\0') {
             seen_null = NPY_TRUE;
         }
-        else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
+        else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
             nonz = NPY_TRUE;
             break;
         }

diff --git a/numpy/core/src/multiarray/buffer.c b/numpy/core/src/multiarray/buffer.c
@@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags)
     descr = PyArray_DescrFromScalar(self);
     view->buf = (void *)scalar_value(self, descr);
     elsize = descr->elsize;
-#ifndef Py_UNICODE_WIDE
-    if (descr->type_num == NPY_UNICODE) {
-        elsize >>= 1;
-    }
-#endif
     view->len = elsize;
     if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
         elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */

diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
@@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery(
         PyObject *obj, PyArray_Descr *last_dtype, int string_type)
 {
     int itemsize;
-    PyObject *temp;
 
     if (string_type == NPY_STRING) {
-        if ((temp = PyObject_Str(obj)) == NULL) {
+        PyObject *temp = PyObject_Str(obj);
+        if (temp == NULL) {
             return NULL;
         }
+        /* assume that when we do the encoding elsewhere we'll use ASCII */
         itemsize = PyUnicode_GetLength(temp);
+        Py_DECREF(temp);
+        if (itemsize < 0) {
+            return NULL;
+        }
     }
     else if (string_type == NPY_UNICODE) {
-        if ((temp = PyObject_Str(obj)) == NULL) {
+        PyObject *temp = PyObject_Str(obj);
+        if (temp == NULL) {
             return NULL;
         }
-        itemsize = PyUnicode_GET_DATA_SIZE(temp);
-#ifndef Py_UNICODE_WIDE
-        itemsize <<= 1;
-#endif
+        itemsize = PyUnicode_GetLength(temp);
+        Py_DECREF(temp);
+        if (itemsize < 0) {
+            return NULL;
+        }
+        itemsize *= 4;  /* convert UCS4 codepoints to bytes */
     }
     else {
         return NULL;
     }
-    Py_DECREF(temp);
     if (last_dtype != NULL &&
             last_dtype->type_num == string_type &&
             last_dtype->elsize >= itemsize) {
@@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
 
     /* Check if it's a Unicode string */
     if (PyUnicode_Check(obj)) {
-        int itemsize = PyUnicode_GET_DATA_SIZE(obj);
-#ifndef Py_UNICODE_WIDE
-        itemsize <<= 1;
-#endif
+        int itemsize = PyUnicode_GetLength(obj);
+        if (itemsize < 0) {
+            goto fail;
+        }
+        itemsize *= 4;
 
         /*
          * If it's already a big enough unicode object,