From 37a7e69cea0ba518adc4328aac92ac7d925ed843 Mon Sep 17 00:00:00 2001 From: Kieran O'Mahony Date: Tue, 26 Apr 2016 14:51:06 -0400 Subject: [PATCH] BUG: fix json segfaults closes #11473 closes #10778 closes #11299 Author: Kieran O'Mahony Closes #12802 from Komnomnomnom/json-seg-faults and squashes the following commits: b14d0df [Kieran O'Mahony] CLN: rename json test inline with others af006a4 [Kieran O'Mahony] BUG: fix json segfaults --- doc/source/whatsnew/v0.18.1.txt | 8 +- .../io/tests/{test_json => json}/__init__.py | 0 .../data/tsframe_iso_v012.json | 0 .../data/tsframe_v012.json | 0 pandas/io/tests/{ => json}/test_json_norm.py | 0 .../tests/{test_json => json}/test_pandas.py | 93 ++++++++ .../tests/{test_json => json}/test_ujson.py | 22 +- pandas/src/ujson/python/objToJSON.c | 199 +++++++++++++++--- setup.py | 4 +- 9 files changed, 284 insertions(+), 42 deletions(-) rename pandas/io/tests/{test_json => json}/__init__.py (100%) rename pandas/io/tests/{test_json => json}/data/tsframe_iso_v012.json (100%) rename pandas/io/tests/{test_json => json}/data/tsframe_v012.json (100%) rename pandas/io/tests/{ => json}/test_json_norm.py (100%) rename pandas/io/tests/{test_json => json}/test_pandas.py (90%) rename pandas/io/tests/{test_json => json}/test_ujson.py (98%) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index c4a4f03e98fc0..7357aff679fbc 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -149,6 +149,7 @@ Other Enhancements - ``pd.read_csv()`` now supports opening files using xz compression, via extension inference or explicit ``compression='xz'`` is specified; ``xz`` compressions is also supported by ``DataFrame.to_csv`` in the same way (:issue:`11852`) - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`). 
- ``pd.read_msgpack()`` now supports serializing and de-serializing categoricals with msgpack (:issue:`12573`) +- ``.to_json()`` now supports ``NDFrames`` that contain categorical and sparse data (:issue:`10778`) - ``interpolate()`` now supports ``method='akima'`` (:issue:`7588`). - ``pd.read_excel()`` now accepts path objects (e.g. ``pathlib.Path``, ``py.path.local``) for the file path, in line with other ``read_*`` functions (:issue:`12655`) - ``Index.take`` now handles ``allow_fill`` and ``fill_value`` consistently (:issue:`12631`) @@ -398,8 +399,6 @@ Deprecations - - .. _whatsnew_0181.performance: Performance Improvements @@ -443,6 +442,11 @@ Bug Fixes - Bug in correctly raising a ``ValueError`` in ``.resample(..).fillna(..)`` when passing a non-string (:issue:`12952`) - Bug fixes in various encoding and header processing issues in ``pd.read_sas()`` (:issue:`12659`, :issue:`12654`, :issue:`12647`, :issue:`12809`) - Bug in ``pd.crosstab()`` where would silently ignore ``aggfunc`` if ``values=None`` (:issue:`12569`). +- Potential segfault in ``DataFrame.to_json`` when serialising ``datetime.time`` (:issue:`11473`). +- Potential segfault in ``DataFrame.to_json`` when attempting to serialise a 0d array (:issue:`11299`). +- Segfault in ``to_json`` when attempting to serialise a ``DataFrame`` or ``Series`` with non-ndarray values (:issue:`10778`). 
+ + - Bug in consistency of ``.name`` on ``.groupby(..).apply(..)`` cases (:issue:`12363`) diff --git a/pandas/io/tests/test_json/__init__.py b/pandas/io/tests/json/__init__.py similarity index 100% rename from pandas/io/tests/test_json/__init__.py rename to pandas/io/tests/json/__init__.py diff --git a/pandas/io/tests/test_json/data/tsframe_iso_v012.json b/pandas/io/tests/json/data/tsframe_iso_v012.json similarity index 100% rename from pandas/io/tests/test_json/data/tsframe_iso_v012.json rename to pandas/io/tests/json/data/tsframe_iso_v012.json diff --git a/pandas/io/tests/test_json/data/tsframe_v012.json b/pandas/io/tests/json/data/tsframe_v012.json similarity index 100% rename from pandas/io/tests/test_json/data/tsframe_v012.json rename to pandas/io/tests/json/data/tsframe_v012.json diff --git a/pandas/io/tests/test_json_norm.py b/pandas/io/tests/json/test_json_norm.py similarity index 100% rename from pandas/io/tests/test_json_norm.py rename to pandas/io/tests/json/test_json_norm.py diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/json/test_pandas.py similarity index 90% rename from pandas/io/tests/test_json/test_pandas.py rename to pandas/io/tests/json/test_pandas.py index af897aeeee419..70fef01c0a3ea 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/json/test_pandas.py @@ -821,6 +821,99 @@ def my_handler_raises(obj): DataFrame({'a': [1, 2, object()]}).to_json, default_handler=my_handler_raises) + def test_categorical(self): + # GH4377 df.to_json segfaults with non-ndarray blocks + df = DataFrame({"A": ["a", "b", "c", "a", "b", "b", "a"]}) + df["B"] = df["A"] + expected = df.to_json() + + df["B"] = df["A"].astype('category') + self.assertEqual(expected, df.to_json()) + + s = df["A"] + sc = df["B"] + self.assertEqual(s.to_json(), sc.to_json()) + + def test_datetime_tz(self): + # GH4377 df.to_json segfaults with non-ndarray blocks + tz_range = pd.date_range('20130101', periods=3, tz='US/Eastern') + tz_naive = 
tz_range.tz_convert('utc').tz_localize(None) + + df = DataFrame({ + 'A': tz_range, + 'B': pd.date_range('20130101', periods=3)}) + + df_naive = df.copy() + df_naive['A'] = tz_naive + expected = df_naive.to_json() + self.assertEqual(expected, df.to_json()) + + stz = Series(tz_range) + s_naive = Series(tz_naive) + self.assertEqual(stz.to_json(), s_naive.to_json()) + + def test_sparse(self): + # GH4377 df.to_json segfaults with non-ndarray blocks + df = pd.DataFrame(np.random.randn(10, 4)) + df.ix[:8] = np.nan + + sdf = df.to_sparse() + expected = df.to_json() + self.assertEqual(expected, sdf.to_json()) + + s = pd.Series(np.random.randn(10)) + s.ix[:8] = np.nan + ss = s.to_sparse() + + expected = s.to_json() + self.assertEqual(expected, ss.to_json()) + + def test_tz_is_utc(self): + exp = '"2013-01-10T05:00:00.000Z"' + + ts = Timestamp('2013-01-10 05:00:00Z') + self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) + dt = ts.to_datetime() + self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) + + ts = Timestamp('2013-01-10 00:00:00', tz='US/Eastern') + self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) + dt = ts.to_datetime() + self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) + + ts = Timestamp('2013-01-10 00:00:00-0500') + self.assertEqual(exp, pd.json.dumps(ts, iso_dates=True)) + dt = ts.to_datetime() + self.assertEqual(exp, pd.json.dumps(dt, iso_dates=True)) + + def test_tz_range_is_utc(self): + exp = '["2013-01-01T05:00:00.000Z","2013-01-02T05:00:00.000Z"]' + dfexp = ('{"DT":{' + '"0":"2013-01-01T05:00:00.000Z",' + '"1":"2013-01-02T05:00:00.000Z"}}') + + tz_range = pd.date_range('2013-01-01 05:00:00Z', periods=2) + self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True)) + dti = pd.DatetimeIndex(tz_range) + self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True)) + df = DataFrame({'DT': dti}) + self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + + tz_range = pd.date_range('2013-01-01 00:00:00', periods=2, + tz='US/Eastern') + 
self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True)) + dti = pd.DatetimeIndex(tz_range) + self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True)) + df = DataFrame({'DT': dti}) + self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + + tz_range = pd.date_range('2013-01-01 00:00:00-0500', periods=2) + self.assertEqual(exp, pd.json.dumps(tz_range, iso_dates=True)) + dti = pd.DatetimeIndex(tz_range) + self.assertEqual(exp, pd.json.dumps(dti, iso_dates=True)) + df = DataFrame({'DT': dti}) + self.assertEqual(dfexp, pd.json.dumps(df, iso_dates=True)) + if __name__ == '__main__': import nose diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/json/test_ujson.py similarity index 98% rename from pandas/io/tests/test_json/test_ujson.py rename to pandas/io/tests/json/test_ujson.py index f5efb54099ddd..babcd910a2edd 100644 --- a/pandas/io/tests/test_json/test_ujson.py +++ b/pandas/io/tests/json/test_ujson.py @@ -23,7 +23,6 @@ import numpy as np from numpy.testing import (assert_array_almost_equal_nulp, assert_approx_equal) -import pytz from pandas import DataFrame, Series, Index, NaT, DatetimeIndex import pandas.util.testing as tm @@ -365,15 +364,30 @@ def test_encodeTimeConversion(self): datetime.time(), datetime.time(1, 2, 3), datetime.time(10, 12, 15, 343243), - datetime.time(10, 12, 15, 343243, pytz.utc), - # datetime.time(10, 12, 15, 343243, dateutil.tz.gettz('UTC')), # - # this segfaults! No idea why. 
] for test in tests: output = ujson.encode(test) expected = '"%s"' % test.isoformat() self.assertEqual(expected, output) + def test_encodeTimeConversion_pytz(self): + # GH11473 to_json segfaults with timezone-aware datetimes + tm._skip_if_no_pytz() + import pytz + test = datetime.time(10, 12, 15, 343243, pytz.utc) + output = ujson.encode(test) + expected = '"%s"' % test.isoformat() + self.assertEqual(expected, output) + + def test_encodeTimeConversion_dateutil(self): + # GH11473 to_json segfaults with timezone-aware datetimes + tm._skip_if_no_dateutil() + import dateutil + test = datetime.time(10, 12, 15, 343243, dateutil.tz.tzutc()) + output = ujson.encode(test) + expected = '"%s"' % test.isoformat() + self.assertEqual(expected, output) + def test_nat(self): input = NaT assert ujson.encode(input) == 'null', "Expected null" diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c index dcb509be696dc..2f8ac0077d92e 100644 --- a/pandas/src/ujson/python/objToJSON.c +++ b/pandas/src/ujson/python/objToJSON.c @@ -232,6 +232,90 @@ static TypeContext* createTypeContext(void) return pc; } +static PyObject* get_values(PyObject *obj) +{ + PyObject *values = PyObject_GetAttrString(obj, "values"); + PRINTMARK(); + + if (values && !PyArray_CheckExact(values)) + { + if (PyObject_HasAttrString(values, "values")) + { + PyObject *subvals = get_values(values); + PyErr_Clear(); + PRINTMARK(); + // subvals are sometimes missing a dimension + if (subvals) + { + PyArrayObject *reshape = (PyArrayObject*) subvals; + PyObject *shape = PyObject_GetAttrString(obj, "shape"); + PyArray_Dims dims; + PRINTMARK(); + + if (!shape || !PyArray_IntpConverter(shape, &dims)) + { + subvals = NULL; + } + else + { + subvals = PyArray_Newshape(reshape, &dims, NPY_ANYORDER); + PyDimMem_FREE(dims.ptr); + } + Py_DECREF(reshape); + Py_XDECREF(shape); + } + Py_DECREF(values); + values = subvals; + } + else + { + PRINTMARK(); + Py_DECREF(values); + values = NULL; + } + } + + if 
(!values && PyObject_HasAttrString(obj, "get_values")) + { + PRINTMARK(); + values = PyObject_CallMethod(obj, "get_values", NULL); + if (values && !PyArray_CheckExact(values)) + { + PRINTMARK(); + Py_DECREF(values); + values = NULL; + } + } + + if (!values) + { + PyObject *typeRepr = PyObject_Repr((PyObject*) Py_TYPE(obj)); + PyObject *repr; + PRINTMARK(); + if (PyObject_HasAttrString(obj, "dtype")) + { + PyObject *dtype = PyObject_GetAttrString(obj, "dtype"); + repr = PyObject_Repr(dtype); + Py_DECREF(dtype); + } + else + { + repr = PyString_FromString(""); + } + + PyErr_Format(PyExc_ValueError, + "%s or %s are not JSON serializable yet", + PyString_AS_STRING(repr), + PyString_AS_STRING(typeRepr)); + Py_DECREF(repr); + Py_DECREF(typeRepr); + + return NULL; + } + + return values; +} + static PyObject* get_sub_attr(PyObject *obj, char *attr, char *subAttr) { PyObject *tmp = PyObject_GetAttrString(obj, attr); @@ -456,7 +540,12 @@ static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_ str = PyObject_CallMethod(obj, "isoformat", NULL); if (str == NULL) { PRINTMARK(); - PyErr_SetString(PyExc_ValueError, "Failed to convert time"); + *outLen = 0; + if (!PyErr_Occurred()) + { + PyErr_SetString(PyExc_ValueError, "Failed to convert time"); + } + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; return NULL; } if (PyUnicode_Check(str)) @@ -465,9 +554,11 @@ static void *PyTimeToJSON(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_ str = PyUnicode_AsUTF8String(str); Py_DECREF(tmp); } + + GET_TC(tc)->newObj = str; + + *outLen = PyString_GET_SIZE(str); outValue = (void *) PyString_AS_STRING (str); - *outLen = strlen ((char *) outValue); - Py_DECREF(str); return outValue; } @@ -997,13 +1088,15 @@ void PdBlock_iterBegin(JSOBJ _obj, JSONTypeContext *tc) goto BLKRET; } - tmp = PyObject_GetAttrString(block, "values"); + tmp = get_values(block); if (!tmp) { + ((JSONObjectEncoder*) tc->encoder)->errorMsg = ""; Py_DECREF(block); GET_TC(tc)->iterNext = 
NpyArr_iterNextNone; goto BLKRET; } + values = PyArray_Transpose((PyArrayObject*) tmp, NULL); Py_DECREF(tmp); if (!values) @@ -1421,7 +1514,11 @@ int Index_iterNext(JSOBJ obj, JSONTypeContext *tc) if (index == 1) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) + { + return 0; + } } else { @@ -1491,7 +1588,11 @@ int Series_iterNext(JSOBJ obj, JSONTypeContext *tc) if (index == 2) { memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) + { + return 0; + } } else { @@ -1565,7 +1666,11 @@ int DataFrame_iterNext(JSOBJ obj, JSONTypeContext *tc) memcpy(GET_TC(tc)->cStr, "data", sizeof(char)*5); if (is_simple_frame(obj)) { - GET_TC(tc)->itemValue = PyObject_GetAttrString(obj, "values"); + GET_TC(tc)->itemValue = get_values(obj); + if (!GET_TC(tc)->itemValue) + { + return 0; + } } else { @@ -1814,7 +1919,7 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) { - PyObject *obj, *exc, *toDictFunc, *tmpObj, *getValuesFunc; + PyObject *obj, *exc, *toDictFunc, *tmpObj, *values; TypeContext *pc; PyObjectEncoder *enc; double val; @@ -2067,20 +2172,14 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) } else if (PyArray_Check(obj) && PyArray_CheckScalar(obj)) { - #if PY_MAJOR_VERSION >= 3 - PyErr_Format( - PyExc_TypeError, - "%R (0d array) is not JSON serializable at the moment", - obj - ); - #else - PyErr_Format( + tmpObj = PyObject_Repr(obj); + PyErr_Format( PyExc_TypeError, "%s (0d array) is not JSON serializable at the moment", - PyString_AsString(PyObject_Repr(obj)) - ); - #endif - return; + PyString_AS_STRING(tmpObj) + ); + Py_DECREF(tmpObj); + goto INVALID; } ISITERABLE: @@ -2099,19 +2198,16 @@ 
void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } - getValuesFunc = PyObject_GetAttrString(obj, "get_values"); - if (getValuesFunc) + pc->newObj = get_values(obj); + if (pc->newObj) { PRINTMARK(); tc->type = JT_ARRAY; - pc->newObj = PyObject_CallObject(getValuesFunc, NULL); pc->iterBegin = NpyArr_iterBegin; pc->iterEnd = NpyArr_iterEnd; pc->iterNext = NpyArr_iterNext; pc->iterGetValue = NpyArr_iterGetValue; pc->iterGetName = NpyArr_iterGetName; - - Py_DECREF(getValuesFunc); } else { @@ -2135,14 +2231,29 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) return; } - pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->newObj = get_values(obj); + if (!pc->newObj) + { + goto INVALID; + } if (enc->outputFormat == INDEX || enc->outputFormat == COLUMNS) { PRINTMARK(); tc->type = JT_OBJECT; + tmpObj = PyObject_GetAttrString(obj, "index"); + if (!tmpObj) + { + goto INVALID; + } + values = get_values(tmpObj); + Py_DECREF(tmpObj); + if (!values) + { + goto INVALID; + } pc->columnLabelsLen = PyArray_DIM(pc->newObj, 0); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(PyObject_GetAttrString(obj, "index"), "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->columnLabelsLen); if (!pc->columnLabels) { goto INVALID; @@ -2227,7 +2338,11 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) pc->iterNext = NpyArr_iterNext; pc->iterGetName = NpyArr_iterGetName; - pc->newObj = PyObject_GetAttrString(obj, "values"); + pc->newObj = get_values(obj); + if (!pc->newObj) + { + goto INVALID; + } } else { @@ -2253,8 +2368,14 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } + values = get_values(tmpObj); + if (!values) + { + Py_DECREF(tmpObj); + goto INVALID; + } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) 
PyObject_GetAttrString(tmpObj, "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { @@ -2271,13 +2392,15 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) { goto INVALID; } - pc->rowLabelsLen = PyObject_Size(tmpObj); - pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(tmpObj, "values"), (JSONObjectEncoder*) enc, pc->rowLabelsLen); - Py_DECREF(tmpObj); - if (!pc->rowLabels) + values = get_values(tmpObj); + if (!values) { + Py_DECREF(tmpObj); goto INVALID; } + pc->rowLabelsLen = PyObject_Size(tmpObj); + pc->rowLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->rowLabelsLen); + Py_DECREF(tmpObj); tmpObj = (enc->outputFormat == INDEX ? PyObject_GetAttrString(obj, "columns") : PyObject_GetAttrString(obj, "index")); if (!tmpObj) { @@ -2285,8 +2408,16 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc) pc->rowLabels = NULL; goto INVALID; } + values = get_values(tmpObj); + if (!values) + { + Py_DECREF(tmpObj); + NpyArr_freeLabels(pc->rowLabels, pc->rowLabelsLen); + pc->rowLabels = NULL; + goto INVALID; + } pc->columnLabelsLen = PyObject_Size(tmpObj); - pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) PyObject_GetAttrString(tmpObj, "values"), (JSONObjectEncoder*) enc, pc->columnLabelsLen); + pc->columnLabels = NpyArr_encodeLabels((PyArrayObject*) values, (JSONObjectEncoder*) enc, pc->columnLabelsLen); Py_DECREF(tmpObj); if (!pc->columnLabels) { diff --git a/setup.py b/setup.py index 2a2aee72ec8e3..29d6ce2ab5b46 100755 --- a/setup.py +++ b/setup.py @@ -583,7 +583,7 @@ def pxd(name): 'pandas.tseries.tests', 'pandas.types', 'pandas.io.tests', - 'pandas.io.tests.test_json', + 'pandas.io.tests.json', 'pandas.stats.tests', 'pandas.msgpack' ], @@ -602,7 +602,7 @@ def pxd(name): 'tests/sas/data/*.sas7bdat', 
'tests/data/*.html', 'tests/data/html_encoding/*.html', - 'tests/test_json/data/*.json'], + 'tests/json/data/*.json'], 'pandas.tools': ['tests/*.csv'], 'pandas.tests': ['data/*.pickle', 'data/*.csv'],