From 197906d47417273743c7bf709e9cb037a5db6a96 Mon Sep 17 00:00:00 2001 From: Carl Friedrich Bolz-Tereick Date: Wed, 13 Nov 2019 15:00:54 +0100 Subject: [PATCH 1/2] memoize stringly keys of objects When unpacking msgpack objects, store the keys that appear into a memo dictionary to make them unique. This is useful, because for most sizable msgpack files the same keys appear again and again, since many objects have the same "shape" (set of keys). A similar optimization is done in most json deserializers, eg in CPython: https://github.com/python/cpython/blob/d89cea15ad37e873003fc74ec2c77660ab620b00/Modules/_json.c#L717 My totally unscientific results: I tried this on two big msgpack files, a wikidata dump (92 MiB) and a dump of reddit comments (596 MiB). I am reporting time spent deserializing and memory use of the resulting data structure. I've included json deserialization numbers as a comparison. The results I get on my old-ish laptop are: wikidata time memory CPython 3.7.5 before 3.42s 1279 MiB CPython 3.7.5 after 3.43s 883 MiB PyPy3 7.2 before 6.44s 1380 MiB PyPy3 7.2 after 4.98s 965 MiB CPython 3.7.5 json 4.13s 887 MiB PyPy3 7.2 json 3.54s 958 MiB reddit CPython 3.7.5 before 5.62s 3412 MiB CPython 3.7.5 after 5.20s 1754 MiB PyPy3 7.2 before 14.72s 3782 MiB PyPy3 7.2 after 8.37s 2086 MiB CPython 3.7.5 json 8.64s 1753 MiB PyPy3 7.2 json 10.52s 2052 MiB For wikidata, there is only a memory improvement on CPython, the time stays the same. For all other three variants (all of reddit, pypy on wikidata) both time and memory improve significantly. The reason for the memory improvements are due to the memoizing, time improves due to better cache locality due to the smaller working set, and less time spent in GC in the case of PyPy. --- msgpack/_unpacker.pyx | 11 ++++++++--- msgpack/fallback.py | 4 ++++ msgpack/unpack.h | 16 ++++++++++++++++ msgpack/unpack_template.h | 1 + test/test_unpack.py | 9 +++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/msgpack/_unpacker.pyx b/msgpack/_unpacker.pyx index 3727f50c..0935d15a 100644 --- a/msgpack/_unpacker.pyx +++ b/msgpack/_unpacker.pyx @@ -38,6 +38,7 @@ cdef extern from "unpack.h": Py_ssize_t max_array_len Py_ssize_t max_map_len Py_ssize_t max_ext_len + PyObject* memo ctypedef struct unpack_context: msgpack_user user @@ -61,7 +62,7 @@ cdef inline init_ctx(unpack_context *ctx, const char* encoding, const char* unicode_errors, Py_ssize_t max_str_len, Py_ssize_t max_bin_len, Py_ssize_t max_array_len, Py_ssize_t max_map_len, - Py_ssize_t max_ext_len): + Py_ssize_t max_ext_len, object d): unpack_init(ctx) ctx.user.use_list = use_list ctx.user.raw = raw @@ -101,6 +102,9 @@ cdef inline init_ctx(unpack_context *ctx, ctx.user.encoding = encoding ctx.user.unicode_errors = unicode_errors + Py_INCREF(d) + ctx.user.memo = d + def default_read_extended_type(typecode, data): raise NotImplementedError("Cannot decode extended type with typecode=%d" % typecode) @@ -195,9 +199,10 @@ def unpackb(object packed, object object_hook=None, object list_hook=None, max_ext_len = buf_len try: + memo = {} init_ctx(&ctx, object_hook, object_pairs_hook, list_hook, ext_hook, use_list, raw, strict_map_key, cenc, cerr, - max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len) + max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len, memo) ret = unpack_construct(&ctx, buf, buf_len, &off) finally: if new_protocol: @@ -404,7 +409,7 @@ cdef class Unpacker(object): init_ctx(&self.ctx, object_hook, object_pairs_hook, list_hook, ext_hook, use_list, raw, strict_map_key, cenc, cerr, max_str_len, max_bin_len, max_array_len, - max_map_len, max_ext_len) + max_map_len, max_ext_len, {}) def feed(self, object next_bytes): """Append `next_bytes` to internal buffer.""" diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 3836e830..9cb813fb 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -321,6 +321,8 @@ def __init__(self, file_like=None, read_size=0, use_list=True, raw=True, strict_ self._max_ext_len = max_ext_len self._stream_offset = 0 + self._memo = {} # used to memoize keys to reduce memory consumption + if list_hook is not None and not callable(list_hook): raise TypeError('`list_hook` is not callable') if object_hook is not None and not callable(object_hook): @@ -656,6 +658,8 @@ def _unpack(self, execute=EX_CONSTRUCT): key = self._unpack(EX_CONSTRUCT) if self._strict_map_key and type(key) not in (unicode, bytes): raise ValueError("%s is not allowed for map key" % str(type(key))) + if type(key) in (unicode, bytes): + key = self._memo.setdefault(key, key) ret[key] = self._unpack(EX_CONSTRUCT) if self._object_hook is not None: ret = self._object_hook(ret) diff --git a/msgpack/unpack.h b/msgpack/unpack.h index 85dbbed5..49d8a5ad 100644 --- a/msgpack/unpack.h +++ b/msgpack/unpack.h @@ -30,6 +30,7 @@ typedef struct unpack_user { const char *encoding; const char *unicode_errors; Py_ssize_t max_str_len, max_bin_len, max_array_len, max_map_len, max_ext_len; + PyObject *memo; } unpack_user; typedef PyObject* msgpack_unpack_object; @@ -193,6 +194,21 @@ static inline int unpack_callback_map_item(unpack_user* u, unsigned int current, PyErr_Format(PyExc_ValueError, "%.100s is not allowed for map key", Py_TYPE(k)->tp_name); return -1; } + if (PyUnicode_CheckExact(k) || PyBytes_CheckExact(k)) { + PyObject *memokey = PyDict_GetItem(u->memo, k); + if (memokey != NULL) { + Py_INCREF(memokey); + Py_DECREF(k); + k = memokey; + } + else { + if (PyDict_SetItem(u->memo, k, k) < 0) { + Py_DECREF(k); + Py_DECREF(v); + return -1; + } + } + } if (u->has_pairs_hook) { msgpack_unpack_object item = PyTuple_Pack(2, k, v); if (!item) diff --git a/msgpack/unpack_template.h b/msgpack/unpack_template.h index 9924b9c6..398df42f 100644 --- a/msgpack/unpack_template.h +++ b/msgpack/unpack_template.h @@ -73,6 +73,7 @@ static inline PyObject* unpack_data(unpack_context* ctx) static inline void unpack_clear(unpack_context *ctx) { Py_CLEAR(ctx->stack[0].obj); + Py_CLEAR(ctx->user.memo); } template diff --git a/test/test_unpack.py b/test/test_unpack.py index 00a10612..dbaa3240 100644 --- a/test/test_unpack.py +++ b/test/test_unpack.py @@ -65,6 +65,15 @@ def _hook(self, code, data): assert unpacker.unpack() == {'a': ExtType(2, b'321')} +def test_unpacker_shares_stringly_keys(): + f = BytesIO(packb([{" a ?!": 1}, {" a ?!": 2}])) + unpacker = Unpacker(f) + d1, d2 = unpacker.unpack() + key1, = d1 + key2, = d2 + assert key1 is key2 + + if __name__ == '__main__': test_unpack_array_header_from_file() test_unpacker_hook_refcnt() From 3b97d0cf46381f5d9cff817265aebdfd9bb23c44 Mon Sep 17 00:00:00 2001 From: Carl Friedrich Bolz-Tereick Date: Wed, 13 Nov 2019 17:54:43 +0100 Subject: [PATCH 2/2] express logic more clearly, without repeated checks --- msgpack/fallback.py | 4 ++-- msgpack/unpack.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/msgpack/fallback.py b/msgpack/fallback.py index 9cb813fb..127d6c1f 100644 --- a/msgpack/fallback.py +++ b/msgpack/fallback.py @@ -656,10 +656,10 @@ def _unpack(self, execute=EX_CONSTRUCT): ret = {} for _ in xrange(n): key = self._unpack(EX_CONSTRUCT) - if self._strict_map_key and type(key) not in (unicode, bytes): - raise ValueError("%s is not allowed for map key" % str(type(key))) if type(key) in (unicode, bytes): key = self._memo.setdefault(key, key) + elif self._strict_map_key: + raise ValueError("%s is not allowed for map key" % str(type(key))) ret[key] = self._unpack(EX_CONSTRUCT) if self._object_hook is not None: ret = self._object_hook(ret) diff --git a/msgpack/unpack.h b/msgpack/unpack.h index 49d8a5ad..6eba18a0 100644 --- a/msgpack/unpack.h +++ b/msgpack/unpack.h @@ -190,10 +190,6 @@ static inline int unpack_callback_map(unpack_user* u, unsigned int n, msgpack_un static inline int unpack_callback_map_item(unpack_user* u, unsigned int current, msgpack_unpack_object* c, msgpack_unpack_object k, msgpack_unpack_object v) { - if (u->strict_map_key && !PyUnicode_CheckExact(k) && !PyBytes_CheckExact(k)) { - PyErr_Format(PyExc_ValueError, "%.100s is not allowed for map key", Py_TYPE(k)->tp_name); - return -1; - } if (PyUnicode_CheckExact(k) || PyBytes_CheckExact(k)) { PyObject *memokey = PyDict_GetItem(u->memo, k); if (memokey != NULL) { @@ -209,6 +205,10 @@ static inline int unpack_callback_map_item(unpack_user* u, unsigned int current, } } } + else if (u->strict_map_key) { + PyErr_Format(PyExc_ValueError, "%.100s is not allowed for map key", Py_TYPE(k)->tp_name); + return -1; + } if (u->has_pairs_hook) { msgpack_unpack_object item = PyTuple_Pack(2, k, v); if (!item)