Skip to content

Commit

Permalink
Merge pull request #15385 from eric-wieser/fix-unicode-ucs2
Browse files Browse the repository at this point in the history
BUG, MAINT: Stop using the error-prone deprecated Py_UNICODE apis
  • Loading branch information
seberg committed Feb 14, 2020
2 parents 491f41a + d0b7b66 commit 1f9ab28
Show file tree
Hide file tree
Showing 12 changed files with 180 additions and 268 deletions.
5 changes: 5 additions & 0 deletions doc/release/upcoming_changes/15385.new_feature.rst
@@ -0,0 +1,5 @@
``np.str_`` scalars now support the buffer protocol
---------------------------------------------------
``np.str_`` arrays are always stored as UCS4, so the corresponding scalars
now expose this through the buffer interface, meaning
``memoryview(np.str_('test'))`` now works.
19 changes: 0 additions & 19 deletions numpy/core/defchararray.py
Expand Up @@ -2679,25 +2679,6 @@ class adds the following functionality:
itemsize = len(obj)
shape = len(obj) // itemsize

if unicode:
if sys.maxunicode == 0xffff:
# On a narrow Python build, the buffer for Unicode
# strings is UCS2, which doesn't match the buffer for
# NumPy Unicode types, which is ALWAYS UCS4.
# Therefore, we need to convert the buffer. On Python
# 2.6 and later, we can use the utf_32 codec. Earlier
# versions don't have that codec, so we convert to a
# numerical array that matches the input buffer, and
# then use NumPy to convert it to UCS4. All of this
# should happen in native endianness.
obj = obj.encode('utf_32')
else:
obj = str(obj)
else:
# Let the default Unicode -> string encoding (if any) take
# precedence.
obj = bytes(obj)

return chararray(shape, itemsize=itemsize, unicode=unicode,
buffer=obj, order=order)

Expand Down
8 changes: 7 additions & 1 deletion numpy/core/include/numpy/arrayscalars.h
Expand Up @@ -135,7 +135,13 @@ typedef struct {
} PyScalarObject;

#define PyStringScalarObject PyStringObject
#define PyUnicodeScalarObject PyUnicodeObject
#define PyStringScalarObject PyStringObject
/*
 * Scalar object layout for unicode scalars: a regular PyUnicodeObject
 * placed first, followed by one extra pointer to Py_UCS4 data.
 */
typedef struct {
/* note that the PyObject_HEAD macro lives right here */
PyUnicodeObject base;
/*
 * NOTE(review): presumably the UCS4 representation of the string that is
 * exposed through the buffer protocol; who allocates and frees it is not
 * visible from this header -- confirm against the scalar implementation.
 */
Py_UCS4 *obval;
} PyUnicodeScalarObject;


typedef struct {
PyObject_VAR_HEAD
Expand Down
116 changes: 8 additions & 108 deletions numpy/core/src/common/ucsnarrow.c
Expand Up @@ -16,76 +16,12 @@
#include "ctors.h"

/*
* Functions only needed on narrow builds of Python for converting back and
* forth between the NumPy Unicode data-type (always 4-bytes) and the
* Python Unicode scalar (2-bytes on a narrow build).
*/

/*
* The ucs2 buffer must be large enough to hold 2*ucs4length characters
* due to the use of surrogate pairs.
* This file originally contained functions only needed on narrow builds of
* Python for converting back and forth between the NumPy Unicode data-type
* (always 4-bytes) and the Python Unicode scalar (2-bytes on a narrow build).
*
* The return value is the number of UCS2 code units written (not bytes),
* which is ucs4length + number of surrogate pairs found.
*
* Values above 0xffff are converted to surrogate pairs.
* This "narrow" interface is now deprecated in Python and unused in NumPy.
*/
/*
 * Encode ucs4length UCS4 code points from ucs4 into the ucs2 buffer.
 *
 * Code points above 0xffff are written as two units (a surrogate pair),
 * everything else as a single unit, so the caller must provide room for
 * up to 2 * ucs4length units.
 *
 * Returns the number of units written to ucs2.
 */
NPY_NO_EXPORT int
PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 const *ucs4, int ucs4length)
{
    Py_UNICODE *out = ucs2;
    int idx;

    for (idx = 0; idx < ucs4length; idx++) {
        npy_ucs4 code = ucs4[idx];

        /* Fits in a single unit: copy it through unchanged. */
        if (code <= 0xffff) {
            *out++ = (Py_UNICODE) code;
            continue;
        }

        /* Split into a surrogate pair: top 10 bits, then low 10 bits. */
        code -= 0x10000L;
        *out++ = 0xD800 + (Py_UNICODE) (code >> 10);
        *out++ = 0xDC00 + (Py_UNICODE) (code & 0x03FF);
    }
    /* Units written == distance the output cursor has advanced. */
    return (int) (out - ucs2);
}


/*
 * Decode a UCS2 (UTF-16 code unit) buffer into a UCS4 buffer.
 *
 * ucs2    - input buffer holding ucs2len code units
 * ucs4    - output buffer with room for at least ucs4len code points
 * ucs2len - number of code units available in ucs2
 * ucs4len - maximum number of code points to write to ucs4
 *
 * A high surrogate (0xD800-0xDBFF) immediately followed by a low surrogate
 * (0xDC00-0xDFFF) is combined into a single code point above 0xffff.
 * Unpaired surrogates, including a high surrogate that is the last unit of
 * the input, are copied through unchanged; the input is never read past
 * ucs2len units.
 *
 * Returns the number of code points written to ucs4, which can be less
 * than ucs2len if there are surrogate pairs in ucs2.
 */
NPY_NO_EXPORT int
PyUCS2Buffer_AsUCS4(Py_UNICODE const *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len)
{
    int i;
    int numchars = 0;

    for (i = 0; (i < ucs2len) && (numchars < ucs4len); i++) {
        npy_ucs4 chr = (npy_ucs4) ucs2[i];

        /*
         * Only look at the following unit when it actually exists; the
         * bounds check prevents reading past the end of the input when the
         * buffer ends in a lone high surrogate.
         */
        if (chr >= 0xd800 && chr <= 0xdbff && (i + 1) < ucs2len) {
            npy_ucs4 low = (npy_ucs4) ucs2[i + 1];

            if (low >= 0xdc00 && low <= 0xdfff) {
                chr = 0x10000 + ((chr - 0xd800) << 10) + (low - 0xdc00);
                i++;  /* the low surrogate has been consumed as well */
            }
        }
        ucs4[numchars++] = chr;
    }
    return numchars;
}

/*
* Returns a PyUnicodeObject initialized from a buffer containing
Expand All @@ -112,14 +48,13 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
Py_ssize_t ucs4len = size / sizeof(npy_ucs4);
npy_ucs4 const *src = (npy_ucs4 const *)src_char;
npy_ucs4 *buf = NULL;
PyUnicodeObject *ret;

/* swap and align if needed */
if (swap || align) {
buf = (npy_ucs4 *)malloc(size);
if (buf == NULL) {
PyErr_NoMemory();
goto fail;
return NULL;
}
memcpy(buf, src, size);
if (swap) {
Expand All @@ -132,43 +67,8 @@ PyUnicode_FromUCS4(char const *src_char, Py_ssize_t size, int swap, int align)
while (ucs4len > 0 && src[ucs4len - 1] == 0) {
ucs4len--;
}

/* produce PyUnicode object */
#ifdef Py_UNICODE_WIDE
{
ret = (PyUnicodeObject *)PyUnicode_FromUnicode((Py_UNICODE const*)src,
(Py_ssize_t) ucs4len);
if (ret == NULL) {
goto fail;
}
}
#else
{
Py_ssize_t tmpsiz = 2 * sizeof(Py_UNICODE) * ucs4len;
Py_ssize_t ucs2len;
Py_UNICODE *tmp;

if ((tmp = (Py_UNICODE *)malloc(tmpsiz)) == NULL) {
PyErr_NoMemory();
goto fail;
}
ucs2len = PyUCS2Buffer_FromUCS4(tmp, src, ucs4len);
ret = (PyUnicodeObject *)PyUnicode_FromUnicode(tmp, (Py_ssize_t) ucs2len);
free(tmp);
if (ret == NULL) {
goto fail;
}
}
#endif

if (buf) {
free(buf);
}
PyUnicodeObject *ret = (PyUnicodeObject *)PyUnicode_FromKindAndData(
PyUnicode_4BYTE_KIND, src, ucs4len);
free(buf);
return ret;

fail:
if (buf) {
free(buf);
}
return NULL;
}
6 changes: 0 additions & 6 deletions numpy/core/src/common/ucsnarrow.h
@@ -1,12 +1,6 @@
#ifndef _NPY_UCSNARROW_H_
#define _NPY_UCSNARROW_H_

NPY_NO_EXPORT int
PyUCS2Buffer_FromUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs4length);

NPY_NO_EXPORT int
PyUCS2Buffer_AsUCS4(Py_UNICODE *ucs2, npy_ucs4 *ucs4, int ucs2len, int ucs4len);

NPY_NO_EXPORT PyUnicodeObject *
PyUnicode_FromUCS4(char *src, Py_ssize_t size, int swap, int align);

Expand Down
58 changes: 30 additions & 28 deletions numpy/core/src/multiarray/arraytypes.c.src
Expand Up @@ -450,12 +450,6 @@ static int
UNICODE_setitem(PyObject *op, void *ov, void *vap)
{
PyArrayObject *ap = vap;
PyObject *temp;
Py_UNICODE *ptr;
int datalen;
#ifndef Py_UNICODE_WIDE
char *buffer;
#endif

if (PyArray_IsZeroDim(op)) {
return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
Expand All @@ -466,6 +460,8 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
"setting an array element with a sequence");
return -1;
}

PyObject *temp;
if (PyBytes_Check(op)) {
/* Try to decode from ASCII */
temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
Expand All @@ -476,18 +472,27 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
else if ((temp=PyObject_Str(op)) == NULL) {
return -1;
}
ptr = PyUnicode_AS_UNICODE(temp);
if ((ptr == NULL) || (PyErr_Occurred())) {

/* truncate if needed */
Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
Py_ssize_t actual_len = PyUnicode_GetLength(temp);
if (actual_len < 0) {
Py_DECREF(temp);
return -1;
}
datalen = PyUnicode_GET_DATA_SIZE(temp);
if (actual_len > max_len) {
Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
if (temp == NULL) {
return -1;
}
actual_len = max_len;
}

#ifdef Py_UNICODE_WIDE
memcpy(ov, ptr, PyArray_MIN(PyArray_DESCR(ap)->elsize, datalen));
#else
Py_ssize_t num_bytes = actual_len * 4;

char *buffer;
if (!PyArray_ISALIGNED(ap)) {
buffer = PyArray_malloc(PyArray_DESCR(ap)->elsize);
buffer = PyArray_malloc(num_bytes);
if (buffer == NULL) {
Py_DECREF(temp);
PyErr_NoMemory();
Expand All @@ -497,20 +502,23 @@ UNICODE_setitem(PyObject *op, void *ov, void *vap)
else {
buffer = ov;
}
datalen = PyUCS2Buffer_AsUCS4(ptr, (npy_ucs4 *)buffer,
datalen >> 1, PyArray_DESCR(ap)->elsize >> 2);
datalen <<= 2;
if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
PyArray_free(buffer);
Py_DECREF(temp);
return -1;
}

if (!PyArray_ISALIGNED(ap)) {
memcpy(ov, buffer, datalen);
memcpy(ov, buffer, num_bytes);
PyArray_free(buffer);
}
#endif

/* Fill in the rest of the space with 0 */
if (PyArray_DESCR(ap)->elsize > datalen) {
memset((char*)ov + datalen, 0, (PyArray_DESCR(ap)->elsize - datalen));
if (PyArray_DESCR(ap)->elsize > num_bytes) {
memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
}
if (PyArray_ISBYTESWAPPED(ap)) {
byte_swap_vector(ov, PyArray_DESCR(ap)->elsize >> 2, 4);
byte_swap_vector(ov, actual_len, 4);
}
Py_DECREF(temp);
return 0;
Expand Down Expand Up @@ -2650,12 +2658,6 @@ STRING_nonzero (char *ip, PyArrayObject *ap)
return nonz;
}

#ifdef Py_UNICODE_WIDE
#define PyArray_UCS4_ISSPACE Py_UNICODE_ISSPACE
#else
#define PyArray_UCS4_ISSPACE(ch) Py_STRING_ISSPACE((char)ch)
#endif

static npy_bool
UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
{
Expand All @@ -2681,7 +2683,7 @@ UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
if (*ip == '\0') {
seen_null = NPY_TRUE;
}
else if (seen_null || !PyArray_UCS4_ISSPACE(*ip)) {
else if (seen_null || !Py_UNICODE_ISSPACE(*ip)) {
nonz = NPY_TRUE;
break;
}
Expand Down
5 changes: 0 additions & 5 deletions numpy/core/src/multiarray/buffer.c
Expand Up @@ -832,11 +832,6 @@ gentype_getbuffer(PyObject *self, Py_buffer *view, int flags)
descr = PyArray_DescrFromScalar(self);
view->buf = (void *)scalar_value(self, descr);
elsize = descr->elsize;
#ifndef Py_UNICODE_WIDE
if (descr->type_num == NPY_UNICODE) {
elsize >>= 1;
}
#endif
view->len = elsize;
if (PyArray_IsScalar(self, Datetime) || PyArray_IsScalar(self, Timedelta)) {
elsize = 1; /* descr->elsize,char is 8,'M', but we return 1,'B' */
Expand Down
32 changes: 20 additions & 12 deletions numpy/core/src/multiarray/common.c
Expand Up @@ -130,27 +130,34 @@ PyArray_DTypeFromObjectStringDiscovery(
PyObject *obj, PyArray_Descr *last_dtype, int string_type)
{
int itemsize;
PyObject *temp;

if (string_type == NPY_STRING) {
if ((temp = PyObject_Str(obj)) == NULL) {
PyObject *temp = PyObject_Str(obj);
if (temp == NULL) {
return NULL;
}
/* assume that when we do the encoding elsewhere we'll use ASCII */
itemsize = PyUnicode_GetLength(temp);
Py_DECREF(temp);
if (itemsize < 0) {
return NULL;
}
}
else if (string_type == NPY_UNICODE) {
if ((temp = PyObject_Str(obj)) == NULL) {
PyObject *temp = PyObject_Str(obj);
if (temp == NULL) {
return NULL;
}
itemsize = PyUnicode_GET_DATA_SIZE(temp);
#ifndef Py_UNICODE_WIDE
itemsize <<= 1;
#endif
itemsize = PyUnicode_GetLength(temp);
Py_DECREF(temp);
if (itemsize < 0) {
return NULL;
}
itemsize *= 4; /* convert UCS4 codepoints to bytes */
}
else {
return NULL;
}
Py_DECREF(temp);
if (last_dtype != NULL &&
last_dtype->type_num == string_type &&
last_dtype->elsize >= itemsize) {
Expand Down Expand Up @@ -258,10 +265,11 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,

/* Check if it's a Unicode string */
if (PyUnicode_Check(obj)) {
int itemsize = PyUnicode_GET_DATA_SIZE(obj);
#ifndef Py_UNICODE_WIDE
itemsize <<= 1;
#endif
int itemsize = PyUnicode_GetLength(obj);
if (itemsize < 0) {
goto fail;
}
itemsize *= 4;

/*
* If it's already a big enough unicode object,
Expand Down

0 comments on commit 1f9ab28

Please sign in to comment.