Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

BUG: Fix ticket #1990.

When an array is created from a sequence that mixes numeric (boolean, int,
float, complex) and string (bytes, str, unicode) values, the resulting array
has a string dtype. However, only the string values were being used to choose
the string length, so the string representations of the numeric values could
be truncated.
  • Loading branch information...
commit 91f87e1f613630ff0ad9864017f059afcd6e57f1 1 parent d878ad9
Bryan Van de Ven authored charris committed
View
25 numpy/compat/py3k.py
@@ -13,32 +13,45 @@
import io
bytes = bytes
unicode = str
- asunicode = str
+
+ def asunicode(s):
+ if isinstance(s, bytes):
+ return s.decode('latin1')
+ return str(s)
+
def asbytes(s):
if isinstance(s, bytes):
return s
- return s.encode('latin1')
+ return str(s).encode('latin1')
+
def asstr(s):
- if isinstance(s, str):
- return s
- return s.decode('latin1')
+ if isinstance(s, bytes):
+ return s.decode('latin1')
+ return str(s)
+
def isfileobj(f):
return isinstance(f, (io.FileIO, io.BufferedReader))
+
def open_latin1(filename, mode='r'):
return open(filename, mode=mode, encoding='iso-8859-1')
+
strchar = 'U'
+
else:
bytes = str
unicode = unicode
asbytes = str
asstr = str
strchar = 'S'
+
def isfileobj(f):
return isinstance(f, file)
+
def asunicode(s):
if isinstance(s, unicode):
return s
- return s.decode('ascii')
+ return str(s).decode('ascii')
+
def open_latin1(filename, mode='r'):
return open(filename, mode=mode)
View
141 numpy/core/src/multiarray/common.c
@@ -68,6 +68,15 @@ _use_default_type(PyObject *op)
#endif
/*
+ * These constants are returned by PyArray_DTypeFromObjectHelper to signal that
+ * the recursive dtype determination encountered a string or unicode type, and
+ * that the recursive search must be restarted so that string representation
+ * lengths can be computed for all scalar types.
+ */
+#define RETRY_WITH_STRING 1
+#define RETRY_WITH_UNICODE 2
+
+/*
* Recursively examines the object to determine an appropriate dtype
* to use for converting to an ndarray.
*
@@ -88,10 +97,33 @@ _use_default_type(PyObject *op)
*
* Returns 0 on success, -1 on failure.
*/
-NPY_NO_EXPORT int
+ NPY_NO_EXPORT int
PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
PyArray_Descr **out_dtype)
{
+ int res;
+
+ res = PyArray_DTypeFromObjectHelper(obj, maxdims, out_contains_na,
+ out_dtype, 0);
+ if (res == RETRY_WITH_STRING) {
+ res = PyArray_DTypeFromObjectHelper(obj, maxdims, out_contains_na,
+ out_dtype, NPY_STRING);
+ if (res == RETRY_WITH_UNICODE) {
+ res = PyArray_DTypeFromObjectHelper(obj, maxdims,
+ out_contains_na, out_dtype, NPY_UNICODE);
+ }
+ }
+ else if (res == RETRY_WITH_UNICODE) {
+ res = PyArray_DTypeFromObjectHelper(obj, maxdims, out_contains_na,
+ out_dtype, NPY_UNICODE);
+ }
+ return res;
+}
+
+NPY_NO_EXPORT int
+PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims, int *out_contains_na,
+ PyArray_Descr **out_dtype, int string_type)
+{
int i, size;
PyArray_Descr *dtype = NULL;
PyObject *ip;
@@ -103,6 +135,7 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
if (PyArray_Check(obj)) {
/* Check for any NAs in the array */
int containsna = PyArray_ContainsNA((PyArrayObject *)obj, NULL, NULL);
+
if (containsna == -1) {
goto fail;
}
@@ -116,9 +149,43 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
/* Check if it's a NumPy scalar */
if (PyArray_IsScalar(obj, Generic)) {
- dtype = PyArray_DescrFromScalar(obj);
- if (dtype == NULL) {
- goto fail;
+ int itemsize;
+ PyObject *temp;
+
+ if (!string_type) {
+ dtype = PyArray_DescrFromScalar(obj);
+ if (dtype == NULL) {
+ goto fail;
+ }
+ }
+ else {
+ if (string_type == NPY_STRING) {
+ if ((temp = PyObject_Str(obj)) == NULL) {
+ return -1;
+ }
+ itemsize = PyString_GET_SIZE(temp);
+ }
+ else if (string_type == NPY_UNICODE) {
+#if defined(NPY_PY3K)
+ if ((temp = PyObject_Str(obj)) == NULL) {
+#else
+ if ((temp = PyObject_Unicode(obj)) == NULL) {
+#endif
+ return -1;
+ }
+ itemsize = PyUnicode_GET_DATA_SIZE(temp);
+ }
+ Py_DECREF(temp);
+ if (*out_dtype != NULL &&
+ (*out_dtype)->type_num == string_type &&
+ (*out_dtype)->elsize >= itemsize) {
+ return 0;
+ }
+ dtype = PyArray_DescrNewFromType(string_type);
+ if (dtype == NULL) {
+ goto fail;
+ }
+ dtype->elsize = itemsize;
}
goto promote_types;
}
@@ -126,6 +193,41 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
/* Check if it's a Python scalar */
dtype = _array_find_python_scalar_type(obj);
if (dtype != NULL) {
+ int itemsize;
+ PyObject *temp;
+
+ if (string_type) {
+ if (string_type == NPY_STRING) {
+ if ((temp = PyObject_Str(obj)) == NULL) {
+ return -1;
+ }
+ itemsize = PyString_GET_SIZE(temp);
+ }
+ else if (string_type == NPY_UNICODE) {
+#if defined(NPY_PY3K)
+ if ((temp = PyObject_Str(obj)) == NULL) {
+#else
+ if ((temp = PyObject_Unicode(obj)) == NULL) {
+#endif
+ return -1;
+ }
+ itemsize = PyUnicode_GET_DATA_SIZE(temp);
+#ifndef Py_UNICODE_WIDE
+ itemsize <<= 1;
+#endif
+ }
+ Py_DECREF(temp);
+ if (*out_dtype != NULL &&
+ (*out_dtype)->type_num == string_type &&
+ (*out_dtype)->elsize >= itemsize) {
+ return 0;
+ }
+ dtype = PyArray_DescrNewFromType(string_type);
+ if (dtype == NULL) {
+ goto fail;
+ }
+ dtype->elsize = itemsize;
+ }
goto promote_types;
}
@@ -313,15 +415,21 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
}
/* Recursive call for each sequence item */
for (i = 0; i < size; ++i) {
+ int res;
ip = PySequence_GetItem(obj, i);
- if (ip==NULL) {
+ if (ip == NULL) {
goto fail;
}
- if (PyArray_DTypeFromObject(ip, maxdims - 1,
- out_contains_na, out_dtype) < 0) {
+ res = PyArray_DTypeFromObjectHelper(ip, maxdims - 1,
+ out_contains_na, out_dtype, string_type);
+ if (res < 0) {
Py_DECREF(ip);
goto fail;
}
+ else if (res > 0) {
+ Py_DECREF(ip);
+ return res;
+ }
Py_DECREF(ip);
}
@@ -331,6 +439,12 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
promote_types:
/* Set 'out_dtype' if it's NULL */
if (*out_dtype == NULL) {
+ if (!string_type && dtype->type_num == NPY_STRING) {
+ return RETRY_WITH_STRING;
+ }
+ if (!string_type && dtype->type_num == NPY_UNICODE) {
+ return RETRY_WITH_UNICODE;
+ }
*out_dtype = dtype;
return 0;
}
@@ -342,6 +456,16 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
return -1;
}
Py_DECREF(*out_dtype);
+ if (!string_type &&
+ res_dtype->type_num == NPY_UNICODE &&
+ (*out_dtype)->type_num != NPY_UNICODE) {
+ return RETRY_WITH_UNICODE;
+ }
+ if (!string_type &&
+ res_dtype->type_num == NPY_STRING &&
+ (*out_dtype)->type_num != NPY_STRING) {
+ return RETRY_WITH_STRING;
+ }
*out_dtype = res_dtype;
return 0;
}
@@ -352,6 +476,9 @@ PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
return -1;
}
+#undef RETRY_WITH_STRING
+#undef RETRY_WITH_UNICODE
+
/* new reference */
NPY_NO_EXPORT PyArray_Descr *
_array_typedescr_fromstr(char *str)
View
4 numpy/core/src/multiarray/common.h
@@ -27,6 +27,10 @@ NPY_NO_EXPORT int
PyArray_DTypeFromObject(PyObject *obj, int maxdims, int *out_contains_na,
PyArray_Descr **out_dtype);
+NPY_NO_EXPORT int
+PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims, int *out_contains_na,
+ PyArray_Descr **out_dtype, int string_status);
+
/*
* Returns NULL without setting an exception if no scalar is matched, a
* new dtype reference otherwise.
View
16 numpy/core/tests/test_regression.py
@@ -1643,5 +1643,21 @@ def test_search_sorted_invalid_arguments(self):
x = np.arange(0, 4, dtype='datetime64[D]')
assert_raises(TypeError, x.searchsorted, 1)
+ def test_string_truncation(self):
+ # Ticket #1990 - Data can be truncated in creation of an array from a
+ # mixed sequence of numeric values and strings
+ for val in [True, 1234, 123.4, complex(1, 234)]:
+ for tostr in [asunicode, asbytes]:
+ b = np.array([val, tostr('xx')])
+ assert_equal(tostr(b[0]), tostr(val))
+ b = np.array([tostr('xx'), val])
+ assert_equal(tostr(b[1]), tostr(val))
+
+ # test also with longer strings
+ b = np.array([val, tostr('xxxxxxxxxx')])
+ assert_equal(tostr(b[0]), tostr(val))
+ b = np.array([tostr('xxxxxxxxxx'), val])
+ assert_equal(tostr(b[1]), tostr(val))
+
if __name__ == "__main__":
run_module_suite()
Please sign in to comment.
Something went wrong with that request. Please try again.