Merge pull request #198 from mahmoud/perf

Perf
mahmoud · Sep 18, 2020 · 55bbcd5 · 55bbcd5
2 parents d99ccaa + 404abae
commit 55bbcd5
Show file tree

Hide file tree

Showing 6 changed files with 137 additions and 23 deletions.
diff --git a/.tox-coveragerc b/.tox-coveragerc
@@ -6,6 +6,7 @@ source =
 omit =
    */flycheck_*.py
    */chainmap_backport.py
+   */perf_report.py
 
 [paths]
 source =

diff --git a/glom/core.py b/glom/core.py
@@ -625,6 +625,9 @@ def __init__(self, *path_parts):
                 path_t = _t_child(path_t, 'P', part)
         self.path_t = path_t
 
+    _CACHE = {}
+    _MAX_CACHE = 10000
+
     @classmethod
     def from_text(cls, text):
         """Make a Path from .-delimited text:
@@ -633,7 +636,11 @@ def from_text(cls, text):
         Path('a', 'b', 'c')
 
         """
-        return cls(*text.split('.'))
+        if text not in cls._CACHE:
+            if len(cls._CACHE) > cls._MAX_CACHE:
+                return cls(*text.split('.'))
+            cls._CACHE[text] = cls(*text.split('.'))
+        return cls._CACHE[text]
 
     def glomit(self, target, scope):
         # The entrypoint for the Path extension
@@ -987,9 +994,13 @@ def _trace(self, target, spec, scope):
             scope[glom] = scope[Inspect]
         if self.echo:
             print('---')
+            # TODO: switch from scope[Path] to the Target-Spec format trace above
+            # ... but maybe be smart about only printing deltas instead of the whole
+            # thing
             print('path:  ', scope[Path] + [spec])
             print('target:', target)
         if self.breakpoint:
+            # TODO: real debugger here?
             self.breakpoint()
         try:
             ret = scope[Inspect](target, spec, scope)
@@ -1807,6 +1818,7 @@ class TargetRegistry(object):
     def __init__(self, register_default_types=True):
         self._op_type_map = {}
         self._op_type_tree = {}  # see _register_fuzzy_type for details
+        self._type_cache = {}
 
         self._op_auto_map = OrderedDict()  # op name to function that returns handler function
 
@@ -1825,22 +1837,26 @@ def get_handler(self, op, obj, path=None, raise_exc=True):
         """
         ret = False
         obj_type = type(obj)
-        type_map = self.get_type_map(op)
-        if type_map:
-            try:
-                ret = type_map[obj_type]
-            except KeyError:
-                type_tree = self._op_type_tree.get(op, {})
-                closest = self._get_closest_type(obj, type_tree=type_tree)
-                if closest is None:
-                    ret = False
-                else:
-                    ret = type_map[closest]
+        cache_key = (obj_type, op)
+        if cache_key not in self._type_cache:
+            type_map = self.get_type_map(op)
+            if type_map:
+                try:
+                    ret = type_map[obj_type]
+                except KeyError:
+                    type_tree = self._op_type_tree.get(op, {})
+                    closest = self._get_closest_type(obj, type_tree=type_tree)
+                    if closest is None:
+                        ret = False
+                    else:
+                        ret = type_map[closest]
 
-        if ret is False and raise_exc:
-            raise UnregisteredTarget(op, obj_type, type_map=type_map, path=path)
+            if ret is False and raise_exc:
+                raise UnregisteredTarget(op, obj_type, type_map=type_map, path=path)
 
-        return ret
+            self._type_cache[cache_key] = ret
+
+        return self._type_cache[cache_key]
 
     def get_type_map(self, op):
         try:
@@ -1928,6 +1944,8 @@ def register(self, target_type, **kwargs):
             for op_name in new_op_map:
                 self._register_fuzzy_type(op_name, target_type)
 
+        self._type_cache = {}  # reset type cache
+
         return
 
     def register_op(self, op_name, auto_func=None, exact=False):
@@ -2119,21 +2137,23 @@ def _has_callable_glomit(obj):
 
 def _glom(target, spec, scope):
     parent = scope
+    pmap = parent.maps[0]
     scope = scope.new_child({
         T: target,
         Spec: spec,
         UP: parent,
         CHILD_ERRORS: [],
+        MODE: pmap[MODE],
     })
-    parent[LAST_CHILD_SCOPE] = scope
+    pmap[LAST_CHILD_SCOPE] = scope
 
     try:
-        if isinstance(spec, TType):  # must go first, due to callability
+        if type(spec) is TType:  # must go first, due to callability
             return _t_eval(target, spec, scope)
         elif _has_callable_glomit(spec):
             return spec.glomit(target, scope)
 
-        return scope[MODE](target, spec, scope)
+        return scope.maps[0][MODE](target, spec, scope)
     except Exception as e:
         scope.maps[1][CHILD_ERRORS].append(scope)
         scope.maps[0][CUR_ERROR] = e
@@ -2147,6 +2167,8 @@ def _glom(target, spec, scope):
 
 
 def AUTO(target, spec, scope):
+    if type(spec) is str:  # shortcut to make deep-get use case faster
+        return _t_eval(target, Path.from_text(spec).path_t, scope)
     if isinstance(spec, dict):
         return _handle_dict(target, spec, scope)
     elif isinstance(spec, list):

diff --git a/glom/matching.py b/glom/matching.py
@@ -1003,9 +1003,6 @@ def glomit(self, target, scope):
                          type(target).__name__))
 
         if errs:
-            # TODO: due to the usage of basic path (not a Path
-            # object), the format can be a bit inconsistent here
-            # (e.g., 'a.b' and ['a', 'b'])
             raise CheckError(errs, self, scope[Path])
         return ret
 

diff --git a/glom/test/perf_report.py b/glom/test/perf_report.py
@@ -0,0 +1,103 @@
+"""
+slow gloms that came up organically, used as performance metrics
+"""
+import time
+import gc
+
+import attr
+
+from glom import glom, T
+
+
+
+# list-of-dicts spec that names each source attribute with a plain string
+STR_SPEC = [{
+    'id': ('id', str),
+    'name': 'short_name',
+    'external_id': 'external_id',
+    'created_date': 'created_date',
+}]
+
+
+# the same fields as STR_SPEC, spelled with T attribute access instead of strings
+T_SPEC = [{
+    'id': (T.id, str),
+    'name': T.short_name,
+    'external_id': T.external_id,
+    'created_date': T.created_date,
+}]
+
+
+def func(data):
+    """Hand-written equivalent of the specs above; the baseline passed to ratio()."""
+    return [{
+            'id': str(t.id),
+            'name': t.short_name,
+            'external_id': t.external_id,
+            'created_date': t.created_date
+        } for t in data]
+
+
+def setup_list_of_dict(num=100):
+    """
+    a common use case is list-of-dicts object processing
+    to prepare internal objects for JSON serialization
+
+    returns a list of *num* attrs-generated ``Obj`` instances (the
+    "internal objects") for the specs above to turn into dicts
+    """
+    Obj = attr.make_class(
+        'Obj', ['id', 'short_name', 'external_id', 'created_date'])
+
+    data = [
+        Obj(i, 'name' + str(i), 'external' + str(i), 'now') for i in range(num)]
+
+    return data
+
+
+def run(spec, data):
+    """Glom *data* with *spec* once and print the mean microseconds per object."""
+    # perf_counter is the monotonic, high-resolution clock meant for timing
+    # (time.time can jump with system clock adjustments); it also matches
+    # the perf_counter_ns clock used by ratio()
+    start = time.perf_counter()
+    glom(data, spec)
+    end = time.perf_counter()
+    print("{} us per object".format((end - start) / len(data) * 1e6))
+
+
+def ratio(spec, func, data):
+    """Return the ratio of time spent in glom(data, spec) to time in func(data).
+
+    Both are timed over 10 interleaved rounds; the two fastest and two
+    slowest samples of each are discarded before comparing.
+    """
+    glom_dur = []
+    py_dur = []
+    for _ in range(10):
+        t1 = time.perf_counter_ns()
+        glom(data, spec)
+        t2 = time.perf_counter_ns()
+        func(data)
+        t3 = time.perf_counter_ns()
+        glom_dur.append(t2 - t1)
+        py_dur.append(t3 - t2)
+
+    # both trimmed sums cover the same number of samples, so the ratio of
+    # the sums equals the ratio of the trimmed means
+    glom_total = sum(sorted(glom_dur)[2:-2])
+    py_total = sum(sorted(py_dur)[2:-2])
+
+    return 1.0 * glom_total / py_total
+
+
+if __name__ == "__main__":
+    data = setup_list_of_dict(100000)
+    run(STR_SPEC, data)
+    run(STR_SPEC, data)
+    print(ratio(STR_SPEC, func, setup_list_of_dict(1000)))
+    print(ratio(STR_SPEC, func, setup_list_of_dict(1000)))
+
+
+# suggest using scalene to profile with:
+# $ scalene glom/test/perf_report.py --profile-all --reduced-profile --cpu-only --outfile SCALENE-CPU.txt
diff --git a/glom/test/test_path_and_t.py b/glom/test/test_path_and_t.py
@@ -236,3 +236,17 @@ def test_t_dunders():
     assert 'use T.__("name__")' in str(exc_info.value)
 
     assert glom(1, T.__('class__')) is int
+
+
+def test_path_cache():
+    """Path.from_text memoizes parsed paths until the cache limit is exceeded."""
+    # while the cache has room, repeated lookups return the very same object
+    assert Path.from_text('a.b.c') is Path.from_text('a.b.c')
+    pre = Path._MAX_CACHE
+    Path._MAX_CACHE = 0
+    try:
+        # the cache now counts as full, so every call builds a fresh Path
+        assert Path.from_text('d.e.f') is not Path.from_text('d.e.f')
+    finally:
+        # restore the class-level limit so later tests still get caching
+        Path._MAX_CACHE = pre
diff --git a/glom/test/test_target_types.py b/glom/test/test_target_types.py
@@ -73,9 +73,9 @@ def test_types_bare():
     with pytest.raises(UnregisteredTarget) as exc_info:
         glommer.glom({'test': [{'hi': 'hi'}]}, ('test', ['hi']))
     # feel free to update the "(at ['test'])" part to improve path display
-    assert str(exc_info.value).find(
+    assert (
         "target type 'list' not registered for 'iterate', "
-        "expected one of registered types: (dict) (at ['test'])") != -1
+        "expected one of registered types: (dict)" in str(exc_info.value))
     return