diff --git a/doc/source/io.rst b/doc/source/io.rst
index 0fabfa7077a95..9a893fb18cc8e 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -1054,8 +1054,9 @@ with optional parameters:
 - ``double_precision`` : The number of decimal places to use when encoding floating point values, default 10.
 - ``force_ascii`` : force encoded string to be ASCII, default True.
 - ``date_unit`` : The time unit to encode to, governs timestamp and ISO8601 precision. One of 's', 'ms', 'us' or 'ns' for seconds, milliseconds, microseconds and nanoseconds respectively. Default 'ms'.
+- ``default_handler`` : The handler to call if an object cannot otherwise be converted to a suitable format for JSON. Takes a single argument, which is the object to convert, and returns a serialisable object.
 
-Note NaN's, NaT's and None will be converted to null and datetime objects will be converted based on the date_format and date_unit parameters.
+Note that ``NaN``'s, ``NaT``'s and ``None`` will be converted to ``null`` and ``datetime`` objects will be converted based on the ``date_format`` and ``date_unit`` parameters.
 
 .. ipython:: python
 
@@ -1098,6 +1099,48 @@ Writing to a file, with a date index and a date column
   dfj2.to_json('test.json')
   open('test.json').read()
 
+If the JSON serialiser cannot handle the container contents directly, it will fall back in the following manner:
+
+- if a ``toDict`` method is defined by the unrecognised object, then that
+  will be called and its returned ``dict`` will be JSON serialised.
+- if a ``default_handler`` has been passed to ``to_json``, that will
+  be called to convert the object.
+- otherwise an attempt is made to convert the object to a ``dict`` by
+  parsing its contents. However, if the object is complex this will often
+  fail with an ``OverflowError``.
+
+Your best bet when encountering an ``OverflowError`` during serialisation
+is to specify a ``default_handler``. For example, ``timedelta`` can cause
+problems:
+
+.. ipython:: python
+   :suppress:
+
+   from datetime import timedelta
+   dftd = DataFrame([timedelta(23), timedelta(seconds=5), 42])
+
+.. code-block:: ipython
+
+   In [141]: from datetime import timedelta
+
+   In [142]: dftd = DataFrame([timedelta(23), timedelta(seconds=5), 42])
+
+   In [143]: dftd.to_json()
+
+   ---------------------------------------------------------------------------
+   OverflowError                             Traceback (most recent call last)
+   OverflowError: Maximum recursion level reached
+
+which can be dealt with by specifying a simple ``default_handler``:
+
+.. ipython:: python
+
+   dftd.to_json(default_handler=str)
+
+   def my_handler(obj):
+       return obj.total_seconds()
+   dftd.to_json(default_handler=my_handler)
+
 Reading JSON
 ~~~~~~~~~~~~
diff --git a/doc/source/release.rst b/doc/source/release.rst
index 3e072da164ab2..661e55f21e3ee 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -233,6 +233,8 @@ API Changes
   - added ``date_unit`` parameter to specify resolution of timestamps. Options
     are seconds, milliseconds, microseconds and nanoseconds. (:issue:`4362`, :issue:`4498`).
+  - added ``default_handler`` parameter to allow a callable to be passed which will be
+    responsible for handling otherwise unserialisable objects.
 - ``Index`` and ``MultiIndex`` changes (:issue:`4039`):
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 5ac9d12de8a9a..d8a03cef16c9e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -707,7 +707,8 @@ def __setstate__(self, state):
     # I/O Methods
 
     def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
-                double_precision=10, force_ascii=True, date_unit='ms'):
+                double_precision=10, force_ascii=True, date_unit='ms',
+                default_handler=None):
         """
         Convert the object to a JSON string.
 
@@ -728,18 +729,21 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
 
             * DataFrame
 
              - default is 'columns'
-              - allowed values are: {'split','records','index','columns','values'}
+              - allowed values are:
+                {'split','records','index','columns','values'}
 
            * The format of the JSON string
 
-              - split : dict like {index -> [index], columns -> [columns], data -> [values]}
-              - records : list like [{column -> value}, ... , {column -> value}]
+              - split : dict like
+                {index -> [index], columns -> [columns], data -> [values]}
+              - records : list like
+                [{column -> value}, ... , {column -> value}]
               - index : dict like {index -> {column -> value}}
              - columns : dict like {column -> {index -> value}}
               - values : just the values array
-        date_format : type of date conversion (epoch = epoch milliseconds, iso = ISO8601)
-            default is epoch
+        date_format : type of date conversion, epoch or iso
+            epoch = epoch milliseconds, iso = ISO8601, default is epoch
         double_precision : The number of decimal places to use when
             encoding floating point values, default 10.
         force_ascii : force encoded string to be ASCII, default True.
@@ -747,6 +751,10 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
             The time unit to encode to, governs timestamp and ISO8601
             precision.  One of 's', 'ms', 'us', 'ns' for second, millisecond,
             microsecond, and nanosecond respectively.
+        default_handler : callable, default None
+            Handler to call if an object cannot otherwise be converted to a
+            suitable format for JSON. Should receive a single argument, which
+            is the object to convert, and return a serialisable object.
 
         Returns
         -------
@@ -761,7 +769,8 @@ def to_json(self, path_or_buf=None, orient=None, date_format='epoch',
                        date_format=date_format,
                        double_precision=double_precision,
                        force_ascii=force_ascii,
-                       date_unit=date_unit)
+                       date_unit=date_unit,
+                       default_handler=default_handler)
 
     def to_hdf(self, path_or_buf, key, **kwargs):
         """ activate the HDFStore
diff --git a/pandas/io/json.py b/pandas/io/json.py
index 497831f597681..c81064d1c0516 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -17,19 +17,21 @@
 dumps = _json.dumps
 
 ### interface to/from ###
 
+
 def to_json(path_or_buf, obj, orient=None, date_format='epoch',
-            double_precision=10, force_ascii=True, date_unit='ms'):
+            double_precision=10, force_ascii=True, date_unit='ms',
+            default_handler=None):
 
     if isinstance(obj, Series):
         s = SeriesWriter(
             obj, orient=orient, date_format=date_format,
             double_precision=double_precision, ensure_ascii=force_ascii,
-            date_unit=date_unit).write()
+            date_unit=date_unit, default_handler=default_handler).write()
     elif isinstance(obj, DataFrame):
         s = FrameWriter(
             obj, orient=orient, date_format=date_format,
             double_precision=double_precision, ensure_ascii=force_ascii,
-            date_unit=date_unit).write()
+            date_unit=date_unit, default_handler=default_handler).write()
     else:
         raise NotImplementedError
 
@@ -45,7 +47,7 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
 class Writer(object):
 
     def __init__(self, obj, orient, date_format, double_precision,
-                 ensure_ascii, date_unit):
+                 ensure_ascii, date_unit, default_handler=None):
         self.obj = obj
 
         if orient is None:
@@ -56,6 +58,7 @@ def __init__(self, obj, orient, date_format, double_precision,
         self.double_precision = double_precision
         self.ensure_ascii = ensure_ascii
         self.date_unit = date_unit
+        self.default_handler = default_handler
 
         self.is_copy = False
         self._format_axes()
@@ -70,7 +73,9 @@ def write(self):
             double_precision=self.double_precision,
             ensure_ascii=self.ensure_ascii,
             date_unit=self.date_unit,
-            iso_dates=self.date_format == 'iso')
+            iso_dates=self.date_format == 'iso',
+            default_handler=self.default_handler)
+
 
 class SeriesWriter(Writer):
     _default_orient = 'index'
@@ -121,13 +126,17 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
           - default is ``'columns'``
           - allowed values are: {'split','records','index','columns','values'}
-          - The DataFrame index must be unique for orients 'index' and 'columns'.
-          - The DataFrame columns must be unique for orients 'index', 'columns', and 'records'.
+          - The DataFrame index must be unique for orients 'index' and
+            'columns'.
+          - The DataFrame columns must be unique for orients 'index',
+            'columns', and 'records'.
 
         * The format of the JSON string
 
-          - split : dict like ``{index -> [index], columns -> [columns], data -> [values]}``
-          - records : list like ``[{column -> value}, ... , {column -> value}]``
+          - split : dict like
+            ``{index -> [index], columns -> [columns], data -> [values]}``
+          - records : list like
+            ``[{column -> value}, ... , {column -> value}]``
           - index : dict like ``{index -> {column -> value}}``
           - columns : dict like ``{column -> {index -> value}}``
           - values : just the values array
@@ -384,7 +393,6 @@ class SeriesParser(Parser):
     _default_orient = 'index'
     _split_keys = ('name', 'index', 'data')
 
-
     def _parse_no_numpy(self):
 
         json = self.json
@@ -542,7 +550,7 @@ def is_ok(col):
 #----------------------------------------------------------------------
 # JSON normalization routines
 
-def nested_to_record(ds,prefix="",level=0):
+def nested_to_record(ds, prefix="", level=0):
     """a simplified json_normalize
 
     converts a nested dict into a flat dict ("record"), unlike json_normalize,
@@ -557,7 +565,8 @@ def nested_to_record(ds,prefix="",level=0):
     d - dict or list of dicts, matching `ds`
 
     Example:
-    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),nested=dict(e=dict(c=1,d=2),d=2)))
+    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
+                             nested=dict(e=dict(c=1,d=2),d=2)))
     Out[52]:
     {'dict1.c': 1,
      'dict1.d': 2,
@@ -567,7 +576,7 @@ def nested_to_record(ds,prefix="",level=0):
      'nested.e.d': 2}
     """
     singleton = False
-    if isinstance(ds,dict):
+    if isinstance(ds, dict):
         ds = [ds]
         singleton = True
 
@@ -575,23 +584,23 @@ def nested_to_record(ds,prefix="",level=0):
     for d in ds:
 
         new_d = copy.deepcopy(d)
-        for k,v in d.items():
+        for k, v in d.items():
             # each key gets renamed with prefix
             if level == 0:
                 newkey = str(k)
             else:
-                newkey = prefix+'.'+ str(k)
+                newkey = prefix + '.' + str(k)
 
             # only dicts gets recurse-flattend
             # only at level>1 do we rename the rest of the keys
-            if not isinstance(v,dict):
-                if level!=0: # so we skip copying for top level, common case
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
                     v = new_d.pop(k)
-                    new_d[newkey]= v
+                    new_d[newkey] = v
                 continue
             else:
                 v = new_d.pop(k)
-                new_d.update(nested_to_record(v,newkey,level+1))
+                new_d.update(nested_to_record(v, newkey, level+1))
         new_ds.append(new_d)
 
     if singleton:
@@ -663,13 +672,14 @@ def _pull_field(js, spec):
         data = [data]
 
     if record_path is None:
-        if any([isinstance(x,dict) for x in compat.itervalues(data[0])]):
+        if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
             # naive normalization, this is idempotent for flat records
             # and potentially will inflate the data considerably for
             # deeply nested structures:
             #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
             #
-            # TODO: handle record value which are lists, at least error reasonabley
+            # TODO: handle record values which are lists, at least error
+            # reasonably
             data = nested_to_record(data)
             return DataFrame(data)
     elif not isinstance(record_path, list):
diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py
index dea7f2b079cef..8c7d89641bdd4 100644
--- a/pandas/io/tests/test_json/test_pandas.py
+++ b/pandas/io/tests/test_json/test_pandas.py
@@ -575,3 +575,16 @@ def test_url(self):
             url = 'http://search.twitter.com/search.json?q=pandas%20python'
             result = read_json(url)
+
+    def test_default_handler(self):
+        from datetime import timedelta
+        frame = DataFrame([timedelta(23), timedelta(seconds=5)])
+        self.assertRaises(OverflowError, frame.to_json)
+        expected = DataFrame([str(timedelta(23)), str(timedelta(seconds=5))])
+        assert_frame_equal(
+            expected, pd.read_json(frame.to_json(default_handler=str)))
+
+        def my_handler_raises(obj):
+            raise TypeError
+        self.assertRaises(
+            TypeError, frame.to_json, default_handler=my_handler_raises)
diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py
index 13ccf0bbd1742..4eb5b94ccf091 100644
--- a/pandas/io/tests/test_json/test_ujson.py
+++ b/pandas/io/tests/test_json/test_ujson.py
@@ -836,6 +836,51 @@ def toDict(self):
         dec = ujson.decode(output)
         self.assertEquals(dec, d)
 
+    def test_defaultHandler(self):
+
+        class _TestObject(object):
+
+            def __init__(self, val):
+                self.val = val
+
+            @property
+            def recursive_attr(self):
+                return _TestObject("recursive_attr")
+
+            def __str__(self):
+                return str(self.val)
+
+        self.assertRaises(OverflowError, ujson.encode, _TestObject("foo"))
+        self.assertEquals('"foo"', ujson.encode(_TestObject("foo"),
+                                                default_handler=str))
+
+        def my_handler(obj):
+            return "foobar"
+        self.assertEquals('"foobar"', ujson.encode(_TestObject("foo"),
+                                                   default_handler=my_handler))
+
+        def my_handler_raises(obj):
+            raise TypeError("I raise for anything")
+        with tm.assertRaisesRegexp(TypeError, "I raise for anything"):
+            ujson.encode(_TestObject("foo"), default_handler=my_handler_raises)
+
+        def my_int_handler(obj):
+            return 42
+        self.assertEquals(
+            42, ujson.decode(ujson.encode(_TestObject("foo"),
+                                          default_handler=my_int_handler)))
+
+        def my_obj_handler(obj):
+            return datetime.datetime(2013, 2, 3)
+        self.assertEquals(
+            ujson.decode(ujson.encode(datetime.datetime(2013, 2, 3))),
+            ujson.decode(ujson.encode(_TestObject("foo"),
+                                      default_handler=my_obj_handler)))
+
+        l = [_TestObject("foo"), _TestObject("bar")]
+        self.assertEquals(json.loads(json.dumps(l, default=str)),
+                          ujson.decode(ujson.encode(l, default_handler=str)))
+
 
 class NumpyJSONTests(TestCase):
diff --git a/pandas/src/ujson/python/objToJSON.c b/pandas/src/ujson/python/objToJSON.c
index aefddd7e47bcb..50010f4e7641a 100644
--- a/pandas/src/ujson/python/objToJSON.c
+++ b/pandas/src/ujson/python/objToJSON.c
@@ -120,6 +120,8 @@ typedef struct __PyObjectEncoder
   // output format style for pandas data types
   int outputFormat;
   int originalOutputFormat;
+
+  PyObject *defaultHandler;
 } PyObjectEncoder;
 
 #define GET_TC(__ptrtc) ((TypeContext *)((__ptrtc)->prv))
@@ -256,6 +258,7 @@ static void *PandasDateTimeStructToJSON(pandas_datetimestruct *dts, JSONTypeCont
     {
       PRINTMARK();
      PyErr_SetString(PyExc_ValueError, "Could not convert datetime value to string");
+      ((JSONObjectEncoder*) tc->encoder)->errorMsg = "";
       PyObject_Free(GET_TC(tc)->cStr);
       return NULL;
     }
@@ -1160,7 +1163,7 @@ char** NpyArr_encodeLabels(PyArrayObject* labels, JSONObjectEncoder* enc, npy_in
 
 void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc)
 {
-  PyObject *obj, *exc, *toDictFunc;
+  PyObject *obj, *exc, *toDictFunc, *defaultObj;
   TypeContext *pc;
   PyObjectEncoder *enc;
   double val;
@@ -1630,6 +1633,23 @@ void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc)
 
   PyErr_Clear();
 
+  if (enc->defaultHandler)
+  {
+    PRINTMARK();
+    defaultObj = PyObject_CallFunctionObjArgs(enc->defaultHandler, obj, NULL);
+    if (defaultObj == NULL || PyErr_Occurred())
+    {
+      if (!PyErr_Occurred())
+      {
+        PyErr_SetString(PyExc_TypeError, "Failed to execute default handler");
+      }
+      goto INVALID;
+    }
+    encode (defaultObj, enc, NULL, 0);
+    Py_DECREF(defaultObj);
+    goto INVALID;
+  }
+
   PRINTMARK();
   tc->type = JT_OBJECT;
   pc->iterBegin = Dir_iterBegin;
@@ -1716,7 +1736,7 @@ char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
 
 PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
 {
-  static char *kwlist[] = { "obj", "ensure_ascii", "double_precision", "encode_html_chars", "orient", "date_unit", "iso_dates", NULL};
"ensure_ascii", "double_precision", "encode_html_chars", "orient", "date_unit", "iso_dates", "default_handler", NULL}; char buffer[65536]; char *ret; @@ -1728,6 +1748,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) char *sOrient = NULL; char *sdateFormat = NULL; PyObject *oisoDates = 0; + PyObject *odefHandler = 0; PyObjectEncoder pyEncoder = { @@ -1759,10 +1780,11 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) pyEncoder.datetimeIso = 0; pyEncoder.datetimeUnit = PANDAS_FR_ms; pyEncoder.outputFormat = COLUMNS; + pyEncoder.defaultHandler = 0; PRINTMARK(); - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssO", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, &oisoDates)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OiOssOO", kwlist, &oinput, &oensureAscii, &idoublePrecision, &oencodeHTMLChars, &sOrient, &sdateFormat, &oisoDates, &odefHandler)) { return NULL; } @@ -1851,6 +1873,16 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs) } + if (odefHandler != NULL && odefHandler != Py_None) + { + if (!PyCallable_Check(odefHandler)) + { + PyErr_SetString (PyExc_TypeError, "Default handler is not callable"); + return NULL; + } + pyEncoder.defaultHandler = odefHandler; + } + pyEncoder.originalOutputFormat = pyEncoder.outputFormat; PRINTMARK(); ret = JSON_EncodeObject (oinput, encoder, buffer, sizeof (buffer));