This repository has been archived by the owner on Oct 13, 2021. It is now read-only.

Commit

Merge pull request #554 from pelmers/less-memory
Reduce peak memory
pelmers committed Jun 3, 2016
2 parents a85c20a + 91cdd60 commit 575e3de
Showing 8 changed files with 78 additions and 49 deletions.
7 changes: 4 additions & 3 deletions docs/source/configuration.rst
@@ -84,8 +84,9 @@ containing the config file.
working directory).

``skip_stages``
Build/indexing stages to skip, for debugging: ``build``, ``index``, or
both, whitespace-separated. Default: none
Build/indexing/clean stages to skip, for debugging: ``build``, ``index``,
``clean``, or any combination, whitespace-separated. Either of ``build`` or
``index`` implies ``clean``. Default: none
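For example, to iterate on later stages without rebuilding or re-indexing
a tree (a sketch: only ``skip_stages`` itself is the key documented here;
the section name and other keys are illustrative)::

    [sometree]
    source_folder = /code/sometree
    # Skipping either stage alone would already imply ``clean``.
    skip_stages = build index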

``temp_folder``
A ``format()``-style template for deciding where to store temporary files
@@ -252,4 +253,4 @@ value that contains ``#``, place it in triple quotes::

A single surrounding pair of single or double quotes will end up as part of the
value at the moment, due to an apparent `bug in configobj
<https://github.com/DiffSK/configobj/issues/97>`_.
<https://github.com/DiffSK/configobj/issues/97>`_.
49 changes: 29 additions & 20 deletions dxr/build.py
@@ -2,37 +2,32 @@
from errno import ENOENT
from fnmatch import fnmatchcase
from itertools import chain, izip, repeat
from operator import attrgetter
import os
from os import stat, mkdir, makedirs
from os.path import dirname, islink, relpath, join, split
from os import stat, makedirs
from os.path import islink, relpath, join, split
from shutil import rmtree
import subprocess
import sys
from sys import exc_info
from traceback import format_exc
from uuid import uuid1

from binaryornot.helpers import is_binary_string
from concurrent.futures import as_completed, ProcessPoolExecutor
from click import progressbar
from flask import current_app
from funcy import merge, ichunks, first, suppress
import jinja2
from more_itertools import chunked
from ordereddict import OrderedDict
from funcy import ichunks, first, suppress
from pyelasticsearch import (ElasticSearch, ElasticHttpNotFoundError,
IndexAlreadyExistsError, bulk_chunks, Timeout,
ConnectionError)

import dxr
from dxr.app import make_app, dictify_links
from dxr.config import FORMAT
from dxr.es import UNINDEXED_STRING, UNANALYZED_STRING, TREE, create_index_and_wait
from dxr.exceptions import BuildError
from dxr.filters import LINE, FILE
from dxr.lines import es_lines, finished_tags
from dxr.mime import decode_data, icon
from dxr.query import filter_menu_items
from dxr.mime import decode_data, is_binary_image
from dxr.utils import (open_log, deep_update, append_update,
append_update_by_line, append_by_line, bucket)
from dxr.vcs import VcsCache
@@ -197,7 +192,7 @@ def delete_index_quietly(es, index):

skip_indexing = 'index' in config.skip_stages
skip_build = 'build' in config.skip_stages
skip_cleanup = skip_indexing or skip_build
skip_cleanup = skip_indexing or skip_build or 'clean' in config.skip_stages

# Create and/or clear out folders:
ensure_folder(tree.object_folder, tree.source_folder != tree.object_folder)
@@ -359,9 +354,9 @@ def _unignored_folders(folders, source_path, ignore_filenames, ignore_paths):
yield folder


def file_contents(path, encoding_guess): # TODO: Make accessible to TreeToIndex.post_build.
"""Return the unicode contents of a file if we can figure out a decoding.
Otherwise, return the contents as a string.
def unicode_contents(path, encoding_guess): # TODO: Make accessible to TreeToIndex.post_build.
"""Return the unicode contents of a file if we can figure out a decoding,
or else None.
:arg path: A sufficient path to the file
:arg encoding_guess: A guess at the encoding of the file, to be applied if
@@ -370,9 +365,15 @@ def file_contents(path, encoding_guess): # TODO: Make accessible to TreeToIndex
"""
# Read the binary contents of the file.
with open(path, 'rb') as source_file:
contents = source_file.read() # always str
_, contents = decode_data(contents, encoding_guess)
return contents # unicode if we were able to decode, str if not
initial_portion = source_file.read(4096)
if not is_binary_string(initial_portion):
# Move the cursor back to the start of the file.
source_file.seek(0)
decoded, contents = decode_data(source_file.read(),
encoding_guess,
can_be_binary=False)
if decoded:
return contents
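
A sketch of the calling convention this changes (hypothetical path; the old
version fell back to returning the raw str for undecodable files):

    text = unicode_contents('/some/tree/main.py', 'utf-8')
    if text is None:
        # Binary or undecodable: no by-line indexing is possible.
        pass
    else:
        lines = text.splitlines()  # universal newlines, as the framework expects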


def unignored(folder, ignore_paths, ignore_filenames, want_folders=False):
@@ -418,7 +419,6 @@ def raise_(exc):
for f in folders:
yield join(root, f)


def index_file(tree, tree_indexers, path, es, index):
"""Index a single file into ES, and build a static HTML representation of it.
@@ -433,7 +433,7 @@ def index_file(tree, tree_indexers, path, es, index):
"""
try:
contents = file_contents(path, tree.source_encoding)
contents = unicode_contents(path, tree.source_encoding)
except IOError as exc:
if exc.errno == ENOENT and islink(path):
# It's just a bad symlink (or a symlink that was swiped out
@@ -474,7 +474,11 @@ def index_file(tree, tree_indexers, path, es, index):
file_to_index.annotations_by_line())

def docs():
"""Yield documents for bulk indexing."""
"""Yield documents for bulk indexing.
Big Warning: docs also clears the contents of all elements of
needles_by_line because they will no longer be used.
"""
# Index a doc of type 'file' so we can build folder listings.
# At the moment, we send to ES in the same worker that does the
# indexing. We could interpose an external queueing system, but I'm
@@ -523,6 +527,11 @@ def docs():
total['annotations'] = annotations_for_this_line
yield es.index_op(total)

# Because needles_by_line holds a reference, total is not
# garbage collected. Since we won't use it again, we can clear
# the contents, saving substantial memory on long files.
total.clear()

# Indexing a 277K-line file all in one request makes ES time out (>60s),
# so we chunk it up. 300 docs is optimal according to the benchmarks in
# https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs like
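The ``total.clear()`` trick generalizes: a long-lived list pins its element
dicts, but emptying each dict in place after use releases its contents even
though the empty shells survive. A standalone sketch of the pattern:

    needles_by_line = [{'n': i, 'payload': 'x' * 10000} for i in xrange(3)]
    for total in needles_by_line:
        # ... yield es.index_op(total) would happen here ...
        total.clear()  # payload freed; the list now pins only empty dicts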
29 changes: 23 additions & 6 deletions dxr/indexers.py
@@ -131,7 +131,7 @@ def file_to_index(self, path, contents):
:arg path: A path to the file to index, relative to the tree's source
folder
:arg contents: What's in the file: unicode if we managed to guess an
encoding and decode it, str otherwise
encoding and decode it, None otherwise
Return None if there is no indexing to do on the file.
@@ -175,11 +175,11 @@ def __init__(self, path, contents, plugin_name, tree, file_properties=None,
source folder. Such a file might not exist on disk. This is useful
mostly as a hint for syntax coloring.
:arg contents: What's in the file: unicode if we knew or successfully
guessed an encoding, str otherwise. Don't return any by-line data
for strs; the framework won't have succeeded in breaking up the
guessed an encoding, None otherwise. Don't return any by-line data
for None; the framework won't have succeeded in breaking up the
file by line for display, so there will be no useful UI for those
data to support. In fact, most skimmers won't be able to do
anything useful with strs at all. For unicode, split the file into
anything useful with None at all. For unicode, split the file into
lines using universal newlines (``unicode.splitlines()`` with no
params); that's what the rest of the framework expects.
:arg tree: The :class:`~dxr.config.TreeConfig` of the tree to which
@@ -344,8 +344,8 @@ def __init__(self, path, contents, plugin_name, tree):
:arg path: A path to the file to index, relative to the tree's source
folder
:arg contents: What's in the file: unicode if we managed to guess at an
encoding and decode it, str otherwise. Don't return any by-line
data for strs; the framework won't have succeeded in breaking up
encoding and decode it, None otherwise. Don't return any by-line
data for None; the framework won't have succeeded in breaking up
the file by line for display, so there will be no useful UI for
those data to support. Think more along the lines of returning
EXIF data to search by for a JPEG. For unicode, split the file into
@@ -563,3 +563,20 @@ def iterable_per_line(triples):
for line_num in xrange(1, last_line)]

# If this has to be generic so we can use it on annotations_by_line as well, pass in a key function that extracts the line number and maybe another that constructs the return value.

def iterable_per_line_sorted(triples):
"""Yield iterables of (key, value mapping), one for each line, where triples are sorted already."""
last_row = 1
last_row_kvs = []
for k, v, extent in triples:
if extent.start.row == last_row:
last_row_kvs.append((k, v))
else:
yield last_row_kvs
# Yield empty lists for any skipped lines.
for _ in xrange(last_row + 1, extent.start.row):
yield []
last_row_kvs = [(k, v)]
last_row = extent.start.row
# Emit anything on the last line.
yield last_row_kvs
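
A worked sketch of the new generator's contract, with namedtuple stand-ins
for DXR's Extent and Position types:

    from collections import namedtuple

    Position = namedtuple('Position', ['row', 'col'])
    Extent = namedtuple('Extent', ['start', 'end'])

    triples = [('py_var', {'name': 'x'}, Extent(Position(1, 0), Position(1, 1))),
               ('py_var', {'name': 'y'}, Extent(Position(3, 0), Position(3, 1)))]
    list(iterable_per_line_sorted(triples))
    # -> [[('py_var', {'name': 'x'})], [], [('py_var', {'name': 'y'})]]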
15 changes: 6 additions & 9 deletions dxr/lines.py
@@ -464,15 +464,12 @@ def finished_tags(lines, refs, regions):
"""
# Plugins return unicode offsets, not byte ones.

# Get start and endpoints of intervals:
tags = list(tag_boundaries(chain(refs, regions)))

tags.extend(line_boundaries(lines))

# Sorting is actually not a significant use of time in an actual indexing
# run.
tags.sort(key=nesting_order) # balanced_tags undoes this, but we tolerate
# that in html_lines().
# balanced_tags undoes the sorting, but we tolerate that in html_lines().
# Remark: this sort is the memory peak, but it is not a significant use of
# time in an indexing run.
tags = sorted(chain(tag_boundaries(chain(refs, regions)),
line_boundaries(lines)),
key=nesting_order)
remove_overlapping_refs(tags)
return balanced_tags(tags)

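The rewrite is behavior-preserving: sorted() over the chained generators
yields the same single sorted list the old build-extend-sort sequence did,
e.g.:

    from itertools import chain
    a = (x for x in [3, 1])
    b = (x for x in [2, 0])
    sorted(chain(a, b))  # [0, 1, 2, 3]: one list built, one sort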
12 changes: 7 additions & 5 deletions dxr/mime.py
@@ -14,11 +14,13 @@ def icon(path, is_binary=False):
return class_name


def decode_data(data, encoding_guess):
"""Given str data, return an (is_text, data) tuple, where data is returned
as unicode if we think it's text and were able to determine an encoding for
it."""
if not is_binary_string(data[:1024]):
def decode_data(data, encoding_guess, can_be_binary=True):
"""Given string data, return an (is_text, data) tuple, where data is
returned as unicode if we think it's text and were able to determine an
encoding for it.
If can_be_binary is False, then skip the initial is_binary check.
"""
if not (can_be_binary and is_binary_string(data[:1024])):
try:
# Try our default encoding.
data = data.decode(encoding_guess)
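From the caller's side, the new flag looks like this (a sketch assuming only
the signature shown above):

    is_text, data = decode_data('hello', 'utf-8')  # sniffs the first 1024 bytes
    # Caller already sniffed the file (as unicode_contents now does with its
    # 4096-byte pre-check), so skip the redundant binary check:
    is_text, data = decode_data('hello', 'utf-8', can_be_binary=False)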
4 changes: 4 additions & 0 deletions dxr/plugins/core.py
@@ -464,6 +464,10 @@ def needles(self):
# We store the contents of textual images twice so that they can
# both show up in searches and be previewed in the browser.
if is_binary_image(self.path) or is_textual_image(self.path):
# If the file was binary, then contents are None, so read it here.
if self.contents is None:
with open(self.absolute_path(), 'rb') as image_file:
self.contents = image_file.read()
bytestring = (self.contents.encode('utf-8') if self.contains_text()
else self.contents)
yield 'raw_data', b64encode(bytestring)
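The same lazy-read fallback in isolation (a hypothetical helper; per the new
convention, ``contents`` is unicode for decodable text and None for binary):

    from base64 import b64encode

    def raw_data_needle(path, contents):
        if contents is None:  # binary file: its bytes were never kept in memory
            with open(path, 'rb') as image_file:
                contents = image_file.read()
        bytestring = (contents.encode('utf-8')
                      if isinstance(contents, unicode) else contents)
        return 'raw_data', b64encode(bytestring)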
7 changes: 3 additions & 4 deletions dxr/plugins/js/indexers.py
@@ -6,8 +6,8 @@

from dxr.plugins.js.refs import PLUGIN_NAME, QualifiedRef
import dxr.indexers
from dxr.indexers import (Extent, Position, iterable_per_line,
with_start_and_end, split_into_lines)
from dxr.indexers import (Extent, Position, iterable_per_line_sorted,
with_start_and_end)
from dxr.utils import cumulative_sum


@@ -46,7 +46,6 @@ def post_build(self):
def file_to_index(self, path, contents):
return FileToIndex(path, contents, self.plugin_name, self.tree)


class FileToIndex(dxr.indexers.FileToIndex):
def __init__(self, path, contents, plugin_name, tree):
super(FileToIndex, self).__init__(path, contents, plugin_name, tree)
@@ -95,7 +94,7 @@ def all_needles():
typ += '_ref'
yield self.build_needle(typ, row, start, end, line.name, line.sym)

return iterable_per_line(with_start_and_end(all_needles()))
return iterable_per_line_sorted(with_start_and_end(all_needles()))

def refs(self):
for line in self.lines:
4 changes: 2 additions & 2 deletions dxr/plugins/python/analysis.py
@@ -8,7 +8,7 @@
from collections import defaultdict
from warnings import warn

from dxr.build import file_contents
from dxr.build import unicode_contents
from dxr.plugins.python.utils import (ClassFunctionVisitorMixin,
convert_node_to_name, package_for_module,
path_to_module, ast_parse)
@@ -50,7 +50,7 @@ def _analyze_file(self, path, encoding):
"""
try:
syntax_tree = ast_parse(file_contents(path, encoding))
syntax_tree = ast_parse(unicode_contents(path, encoding))
except (IOError, SyntaxError, TypeError, UnicodeDecodeError) as error:
rel_path = os.path.relpath(path, self.source_folder)
warn('Failed to analyze {filename} due to error "{error}".'.format(
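Worth noting: the existing ``TypeError`` in the except clause covers the new
return convention, since compiling ``None`` — what ``unicode_contents`` yields
for binary or undecodable files — raises it, so such files fall through to the
warning. A stdlib illustration (assuming ``ast_parse`` wraps ``ast.parse``):

    import ast
    try:
        ast.parse(None)  # what a binary/undecodable file now feeds in
    except TypeError as error:
        print 'Failed to analyze due to error "{0}".'.format(error)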
