Commit 310b694
Properly index Python method calls, chained calls, and nested calls. Close #594.
erikrose committed Oct 3, 2016
2 parents 07fc57e + c672218
Showing 5 changed files with 143 additions and 63 deletions.
4 changes: 2 additions & 2 deletions dxr/plugins/python/analysis.py
@@ -10,7 +10,7 @@

from dxr.build import unicode_contents
from dxr.plugins.python.utils import (ClassFunctionVisitorMixin,
- convert_node_to_name, package_for_module,
+ convert_node_to_fullname, package_for_module,
path_to_module, ast_parse)


@@ -192,7 +192,7 @@ def visit_ClassDef(self, node):
class_path = self.abs_module_name + '.' + node.name
bases = []
for base in node.bases:
- base_name = convert_node_to_name(base)
+ base_name = convert_node_to_fullname(base)
if base_name:
bases.append((self.abs_module_name, base_name))
self.tree_analysis.base_classes[class_path] = bases
149 changes: 99 additions & 50 deletions dxr/plugins/python/indexers.py
@@ -85,6 +85,14 @@ def __init__(self, file_to_index, tree_analysis):
self.needles = []
self.refs = []

+ def visit_Name(self, node):
+ self.file_to_index.advance_node(node)
+ self.generic_visit(node)
+
+ def visit_Attribute(self, node):
+ self.file_to_index.advance_node(node)
+ self.generic_visit(node)
+
def visit_FunctionDef(self, node):
# Index the function itself for the function: filter.
start, end = self.file_to_index.get_node_start_end(node)
@@ -195,7 +203,7 @@ def visitor(self):
"""
if not self._visitor:
- self.node_start_table, self.call_start_table = self.analyze_tokens()
+ self.node_start_table = self.analyze_tokens()
self._visitor = IndexingNodeVisitor(self, self.tree_analysis)
syntax_tree = ast_parse(self.contents)
self._visitor.visit(syntax_tree)
@@ -222,71 +230,103 @@ def refs(self):
return self.visitor.refs

def analyze_tokens(self):
"""Split the file into tokens and analyze them for data needed
for indexing.
"""Split the file into tokens and return a table mapping utf-8 bytestring
offsets to lists of unicode offsets for these tokens.
"""
- # Run the file contents through the tokenizer, both as unicode
- # and as a utf-8 encoded string. This will allow us to build
- # up a mapping between the byte offset and the character offset.
+ # Run the file contents through the tokenizer, both as unicode and as a utf-8
+ # encoded string. This will allow us to build up a mapping between the byte
+ # offset and the character offset.
token_gen = tokenize.generate_tokens(StringIO(self.contents).readline)
utf8_token_gen = tokenize.generate_tokens(
StringIO(self.contents.encode('utf-8')).readline)

- # These are a mapping from the utf-8 byte starting points provided by
- # the ast nodes, to the unicode character offset tuples for both the
- # start and the end points.
+ # The ast nodes provide their locations as an index into a utf-8 encoded
+ # bytestring, which of course won't match up when we are indexing into a
+ # unicode string if there are multi-byte characters involved, so we need to
+ # provide a conversion table from utf-8 to unicode indexes. Unfortunately,
+ # attribute ast nodes wind up with the same lineno and col_offset as the node
+ # they are attributes of, making this a bit tricky, so we need to group
+ # together offsets that are part of the same chain into a list. This list
+ # can then be used to yield successive real offsets for the attributes as we
+ # get ast nodes that repeat the same lineno and col_offset.

+ # This table will contain, e.g. {(42, 4): [((42, 8), (42, 14)), ...], ...}
node_start_table = {}
- call_start_table = {}

- node_type, node_start = None, None
+ # However, if we have a sequence of attribute name tokens inside, say,
+ # method call parens or indexing brackets, the ast nodes for each of
+ # these will contain the location of the first token of the sequence
+ # inside, not the first of the outside sequence, e.g. for the line
+ #
+ # a.b(d.e.f).c
+ #
+ # the nodes for 'd', 'e', and 'f' will each have the location of 'd',
+ # not 'a'. Therefore, we need to push and pop the actual locations
+ # gleaned from the tokenizer onto a series of stacks that are aware of
+ # how many parens (or whichever) deep the token is. So for the above
+ # example, the paren_stack would look like this by the time we process 'f'
+ #
+ # {0: ((42, 4), [((42, 4), (42, 5)),   # node 'a'
+ #                ((42, 6), (42, 7))]), # node 'b'
+ #  1: ((42, 8), [((42, 8), (42, 9)),   # node 'd'
+ #                ((42, 10), (42, 11)), # node 'e'
+ #                ((42, 12), (42, 13))])} # node 'f'
+ #
+ # These lists that are being built up as the second item of each pair
+ # in the dict will then be popped out of the dict at the closing paren,
+ # bracket, or brace, and will have already been added to the
+ # longer-term node_start_table described above. In this example, node
+ # 'c' would then be appended to the existing list at level 0, which
+ # would then terminate and be popped off in turn.

paren_level, paren_stack = 0, {}

for unicode_token, utf8_token in izip(token_gen, utf8_token_gen):
+ # start and end here are themselves tuples of (lineno, col_offset), as is
+ # utf8_start.
tok_type, tok_name, start, end, _ = unicode_token
utf8_start = utf8_token[2]

if tok_type == token.NAME:
# AST nodes for classes and functions point to the position of
# their 'def' and 'class' tokens. To get the position of their
- # names, we look for 'def' and 'class' tokens and store the
- # position of the token immediately following them.
- if node_start and node_type == 'definition':
- node_start_table[node_start[0]] = (start, end)
- node_type, node_start = None, None
- continue
-
- if tok_name in ('def', 'class'):
- node_type, node_start = 'definition', (utf8_start, start)
- continue
-
- # Record all name nodes in the token table. Currently unused,
- # but will be needed for recording variable references.
- node_start_table[utf8_start] = (start, end)
- node_type, node_start = 'name', (utf8_start, start)
+ # names, we start the queue for the current parenthesis level
+ # at the byte offset for the keyword token, but only start
+ # pushing character offsets once we're past the keyword.
+ queue_start, node_queue = paren_stack.setdefault(paren_level, (utf8_start, []))
+ node_start_table.setdefault(queue_start, node_queue)
+
+ if tok_name not in ('def', 'class'):
+ node_queue.append((start, end))
+
+ continue

elif tok_type == token.OP:
- # In order to properly capture the start and end of function
- # calls, we need to keep track of the parens. Put the
- # starting positions on a stack (here implemented with a dict
- # so that it can be sparse), but only if the previous node was
- # a name.
- if tok_name == '(':
- if node_type == 'name':
- paren_stack[paren_level] = node_start
+ # Delimiters (parens, brackets, and braces) start a new context
+ # where the node for following name tokens will no longer be
+ # tied to the position of the head of the current queue. So,
+ # keep track of the current context with a stack, here
+ # implemented with a dict keyed on the current paren level so
+ # that it can be sparse.
+ if tok_name in '([{':
paren_level += 1
- elif tok_name == ')':
+ elif tok_name in '}])':
+ # The items for the current paren level are popped off (and
+ # below) without doing anything with them, since the
+ # reference to the list is already in node_start_table.
+ paren_stack.pop(paren_level, None)
paren_level -= 1
- if paren_level in paren_stack:
- call_start = paren_stack.pop(paren_level)
- call_start_table[call_start[0]] = (call_start[1], end)
+ elif tok_name == '.':
+ # Attribute access. Don't reset, stay at the same level.
+ pass
else:
+ paren_stack.pop(paren_level, None)

- node_type, node_start = None, None
+ elif tok_type == token.NEWLINE:
+ paren_stack.pop(paren_level, None)

- else:
- node_type, node_start = None, None

- return node_start_table, call_start_table
+ return node_start_table

def get_node_start_end(self, node):
"""Return start and end positions within the file for the given
@@ -295,14 +335,23 @@ def get_node_start_end(self, node):
"""
loc = node.lineno, node.col_offset

- if isinstance(node, ast.ClassDef) or isinstance(node, ast.FunctionDef):
- start, end = self.node_start_table.get(loc, (None, None))
- elif isinstance(node, ast.Call):
- start, end = self.call_start_table.get(loc, (None, None))
- else:
- start, end = None, None
+ if isinstance(node, (ast.ClassDef, ast.FunctionDef, ast.Call)):
+ loc_list = self.node_start_table.get(loc, [])
+ if loc_list:
+ return loc_list[-1]
+
+ return None, None

- return start, end
+ def advance_node(self, node):
+ """Destructively change which actual token offset we'll get on a call
+ to get_node_start_end.
+ """
+ loc = node.lineno, node.col_offset
+ try:
+ self.node_start_table[loc].pop()
+ except (KeyError, IndexError):
+ pass


def file_needle(needle_type, name, qualname=None):
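The queue-and-advance machinery above exists because CPython's ast module assigns every Name and Attribute node in a chain the position of the chain's head. A minimal standalone sketch of the problem (not part of the commit; Python 2, like the plugin itself):

    import ast

    # Every node in each chain of 'a.b(d.e.f).c' reports its chain's head:
    # 'a', 'b', and 'c' all claim (1, 0); 'd', 'e', and 'f' all claim (1, 4).
    tree = ast.parse('a.b(d.e.f).c')
    for node in ast.walk(tree):
        if isinstance(node, (ast.Name, ast.Attribute)):
            name = node.id if isinstance(node, ast.Name) else node.attr
            print name, (node.lineno, node.col_offset)

Because the visitor reaches the outermost node of a chain first, get_node_start_end hands out the last queued span and advance_node pops it, so each successive visit_Name or visit_Attribute call lines up with its own token.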
38 changes: 28 additions & 10 deletions dxr/plugins/python/tests/test_callers.py
@@ -27,40 +27,40 @@ def inner_call_foo():
""")

def test_called_once(self):
- self.found_line_eq('callers:called_once', '<b>called_once()</b>', 3)
+ self.found_line_eq('callers:called_once', '<b>called_once</b>()', 3)

def test_called_multiple_times(self):
self.found_lines_eq('callers:called_multiple', [
- ('<b>called_multiple()</b>', 6),
- ('<b>called_multiple()</b>', 7),
+ ('<b>called_multiple</b>()', 6),
+ ('<b>called_multiple</b>()', 7),
])

def test_called_in_several_functions(self):
self.found_lines_eq('callers:called_in_separate_functions', [
- ('<b>called_in_separate_functions()</b>', 10),
- ('<b>called_in_separate_functions()</b>', 13),
+ ('<b>called_in_separate_functions</b>()', 10),
+ ('<b>called_in_separate_functions</b>()', 13),
])

def test_called_in_inner_function(self):
"""Make sure a call within an inner function matches the inner
function only.
"""
- self.found_line_eq('callers:foo', '<b>foo()</b>', 17)
+ self.found_line_eq('callers:foo', '<b>foo</b>()', 17)

def test_called_in_outer_function(self):
"""Make sure inner function definitions do not affect other
calls in the outer function.
"""
- self.found_line_eq('callers:bar', '<b>bar()</b>', 18)
+ self.found_line_eq('callers:bar', '<b>bar</b>()', 18)

def test_called_outside_of_function(self):
"""Make sure calls that take place at the top level in a module are
still recorded.
"""
- self.found_line_eq('callers:outer_call_bar', '<b>outer_call_bar()</b>', 20)
+ self.found_line_eq('callers:outer_call_bar', '<b>outer_call_bar</b>()', 20)


class CallersMethodTests(PythonSingleFileTestCase):
@@ -77,10 +77,28 @@ def method(self):
foo = Foo()
foo.method()
+ a.b().c().d
+ f(g(h()))
""")

def test_class_method_called(self):
- self.found_nothing('callers:class_method')
+ self.found_line_eq('callers:class_method', 'Foo.<b>class_method</b>()', 10)

+ def test_class_called(self):
+ self.found_line_eq('callers:Foo', 'foo = <b>Foo</b>()', 12)

def test_method_called(self):
- self.found_nothing('callers:method')
+ self.found_line_eq('callers:method', 'foo.<b>method</b>()', 13)

+ def test_chain_of_calls(self):
+ self.found_nothing('callers:a')
+ self.found_line_eq('callers:b', 'a.<b>b</b>().c().d', 15)
+ self.found_line_eq('callers:c', 'a.b().<b>c</b>().d', 15)
+ self.found_nothing('callers:d')
+
+ def test_sequence_of_nested_calls(self):
+ self.found_line_eq('callers:f', '<b>f</b>(g(h()))', 17)
+ self.found_line_eq('callers:g', 'f(<b>g</b>(h()))', 17)
+ self.found_line_eq('callers:h', 'f(g(<b>h</b>()))', 17)
2 changes: 1 addition & 1 deletion dxr/plugins/python/tests/test_unicode.py
@@ -47,4 +47,4 @@ def test_call_offsets(self):
characters, not bytes.
"""
- self.found_line_eq('callers:kilroy', u'print u"どうもありがとう " + <b>kilroy()</b>', 9)
+ self.found_line_eq('callers:kilroy', u'print u"どうもありがとう " + <b>kilroy</b>()', 9)
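The fix keeps working here because analyze_tokens tokenizes the file twice, once as unicode and once as utf-8 bytes. A quick standalone illustration of why (Python 2; kilroy is borrowed from the test above and is only tokenized, never executed):

    import tokenize
    from StringIO import StringIO

    source = u'print u"\u3069\u3046\u3082" + kilroy()\n'

    # Tokenize the same line as characters and as utf-8 bytes.
    char_starts = [tok[2] for tok in
                   tokenize.generate_tokens(StringIO(source).readline)]
    byte_starts = [tok[2] for tok in
                   tokenize.generate_tokens(
                       StringIO(source.encode('utf-8')).readline)]

    # The columns diverge after the string literal: each Japanese character
    # is one character but three utf-8 bytes, so 'kilroy' starts six columns
    # later in the byte pass. Zipping the two token streams, as
    # analyze_tokens does, maps the byte offsets ast reports to the
    # character offsets the rendered page needs.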
13 changes: 13 additions & 0 deletions dxr/plugins/python/utils.py
@@ -48,6 +48,19 @@ def convert_node_to_name(node):
"""Convert an AST node to a name if possible. Return None if we
can't (such as function calls).
"""
if isinstance(node, ast.Name):
return node.id
elif isinstance(node, ast.Attribute):
return node.attr
else:
return None


+ def convert_node_to_fullname(node):
+ """Convert an AST node to a full dotted name if possible. Return None
+ if we can't (such as function calls).
+ """
+ if isinstance(node, ast.Name):
+ return node.id
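The diff view truncates convert_node_to_fullname after the ast.Name case. One plausible completion, offered purely as a sketch (this recursive version is an assumption, not necessarily the committed code), builds the dotted path by recursing through ast.Attribute nodes:

    import ast

    def convert_node_to_fullname_sketch(node):
        # Hypothetical: return 'bar.Base' for an Attribute chain, None for
        # anything we can't name statically (e.g. calls), per the docstring.
        if isinstance(node, ast.Name):
            return node.id
        elif isinstance(node, ast.Attribute):
            value_name = convert_node_to_fullname_sketch(node.value)
            if value_name:
                return value_name + '.' + node.attr
        return None

This is what the analysis.py change at the top of the commit is after: for a class defined as class Foo(bar.Base), convert_node_to_name yields only 'Base', while a fullname conversion records 'bar.Base'.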
