This repository has been archived by the owner on Oct 13, 2021. It is now read-only.

Commit

Merge pull request #554 from pelmers/less-memory
Reduce peak memory
pelmers committed Jun 3, 2016
2 parents a85c20a + 91cdd60 commit 575e3de
Showing 8 changed files with 78 additions and 49 deletions.
7 changes: 4 additions & 3 deletions docs/source/configuration.rst
@@ -84,8 +84,9 @@ containing the config file.
working directory).

``skip_stages``
Build/indexing stages to skip, for debugging: ``build``, ``index``, or
both, whitespace-separated. Default: none
Build/indexing/clean stages to skip, for debugging: ``build``, ``index``,
``clean``, or any combination, whitespace-separated. Either of ``build`` or
``index`` implies ``clean``. Default: none
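For example, to iterate on later stages without rebuilding or re-indexing
a tree (a sketch: only ``skip_stages`` itself is the key documented here;
the section name and other keys are illustrative)::

    [sometree]
    source_folder = /code/sometree
    # Skipping either stage alone would already imply ``clean``.
    skip_stages = build index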

``temp_folder``
A ``format()``-style template for deciding where to store temporary files
@@ -252,4 +253,4 @@ value that contains ``#``, place it in triple quotes::

A single surrounding pair of single or double quotes will end up as part of the
value at the moment, due to an apparent `bug in configobj
<https://github.com/DiffSK/configobj/issues/97>`_.
<https://github.com/DiffSK/configobj/issues/97>`_.
49 changes: 29 additions & 20 deletions dxr/build.py
@@ -2,37 +2,32 @@
from errno import ENOENT
from fnmatch import fnmatchcase
from itertools import chain, izip, repeat
from operator import attrgetter
import os
from os import stat, mkdir, makedirs
from os.path import dirname, islink, relpath, join, split
from os import stat, makedirs
from os.path import islink, relpath, join, split
from shutil import rmtree
import subprocess
import sys
from sys import exc_info
from traceback import format_exc
from uuid import uuid1

from binaryornot.helpers import is_binary_string
from concurrent.futures import as_completed, ProcessPoolExecutor
from click import progressbar
from flask import current_app
from funcy import merge, ichunks, first, suppress
import jinja2
from more_itertools import chunked
from ordereddict import OrderedDict
from funcy import ichunks, first, suppress
from pyelasticsearch import (ElasticSearch, ElasticHttpNotFoundError,
IndexAlreadyExistsError, bulk_chunks, Timeout,
ConnectionError)

import dxr
from dxr.app import make_app, dictify_links
from dxr.config import FORMAT
from dxr.es import UNINDEXED_STRING, UNANALYZED_STRING, TREE, create_index_and_wait
from dxr.exceptions import BuildError
from dxr.filters import LINE, FILE
from dxr.lines import es_lines, finished_tags
from dxr.mime import decode_data, icon
from dxr.query import filter_menu_items
from dxr.mime import decode_data, is_binary_image
from dxr.utils import (open_log, deep_update, append_update,
append_update_by_line, append_by_line, bucket)
from dxr.vcs import VcsCache
@@ -197,7 +192,7 @@ def delete_index_quietly(es, index):

skip_indexing = 'index' in config.skip_stages
skip_build = 'build' in config.skip_stages
skip_cleanup = skip_indexing or skip_build
skip_cleanup = skip_indexing or skip_build or 'clean' in config.skip_stages

# Create and/or clear out folders:
ensure_folder(tree.object_folder, tree.source_folder != tree.object_folder)
@@ -359,9 +354,9 @@ def _unignored_folders(folders, source_path, ignore_filenames, ignore_paths):
yield folder


def file_contents(path, encoding_guess): # TODO: Make accessible to TreeToIndex.post_build.
"""Return the unicode contents of a file if we can figure out a decoding.
Otherwise, return the contents as a string.
def unicode_contents(path, encoding_guess): # TODO: Make accessible to TreeToIndex.post_build.
"""Return the unicode contents of a file if we can figure out a decoding,
or else None.
:arg path: A sufficient path to the file
:arg encoding_guess: A guess at the encoding of the file, to be applied if
@@ -370,9 +365,15 @@ def file_contents(path, encoding_guess): # TODO: Make accessible to TreeToIndex
"""
# Read the binary contents of the file.
with open(path, 'rb') as source_file:
contents = source_file.read() # always str
_, contents = decode_data(contents, encoding_guess)
return contents # unicode if we were able to decode, str if not
initial_portion = source_file.read(4096)
if not is_binary_string(initial_portion):
# Move the cursor back to the start of the file.
source_file.seek(0)
decoded, contents = decode_data(source_file.read(),
encoding_guess,
can_be_binary=False)
if decoded:
return contents
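
A sketch of the calling convention this changes (hypothetical path; the old
version fell back to returning the raw str for undecodable files):

    text = unicode_contents('/some/tree/main.py', 'utf-8')
    if text is None:
        # Binary or undecodable: no by-line indexing is possible.
        pass
    else:
        lines = text.splitlines()  # universal newlines, as the framework expects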


def unignored(folder, ignore_paths, ignore_filenames, want_folders=False):
@@ -418,7 +419,6 @@ def raise_(exc):
for f in folders:
yield join(root, f)


def index_file(tree, tree_indexers, path, es, index):
"""Index a single file into ES, and build a static HTML representation of it.
@@ -433,7 +433,7 @@ def index_file(tree, tree_indexers, path, es, index):
"""
try:
contents = file_contents(path, tree.source_encoding)
contents = unicode_contents(path, tree.source_encoding)
except IOError as exc:
if exc.errno == ENOENT and islink(path):
# It's just a bad symlink (or a symlink that was swiped out
@@ -474,7 +474,11 @@ def index_file(tree, tree_indexers, path, es, index):
file_to_index.annotations_by_line())

def docs():
"""Yield documents for bulk indexing."""
"""Yield documents for bulk indexing.
Big Warning: docs also clears the contents of all elements of
needles_by_line because they will no longer be used.
"""
# Index a doc of type 'file' so we can build folder listings.
# At the moment, we send to ES in the same worker that does the
# indexing. We could interpose an external queueing system, but I'm
@@ -523,6 +527,11 @@ def docs():
total['annotations'] = annotations_for_this_line
yield es.index_op(total)

# Because needles_by_line holds a reference, total is not
# garbage collected. Since we won't use it again, we can clear
# the contents, saving substantial memory on long files.
total.clear()

# Indexing a 277K-line file all in one request makes ES time out (>60s),
# so we chunk it up. 300 docs is optimal according to the benchmarks in
# https://bugzilla.mozilla.org/show_bug.cgi?id=1122685. So large docs like
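The ``total.clear()`` trick generalizes: a long-lived list pins its element
dicts, but emptying each dict in place after use releases its contents even
though the empty shells survive. A standalone sketch of the pattern:

    needles_by_line = [{'n': i, 'payload': 'x' * 10000} for i in xrange(3)]
    for total in needles_by_line:
        # ... yield es.index_op(total) would happen here ...
        total.clear()  # payload freed; the list now pins only empty dicts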
29 changes: 23 additions & 6 deletions dxr/indexers.py
@@ -131,7 +131,7 @@ def file_to_index(self, path, contents):
:arg path: A path to the file to index, relative to the tree's source
folder
:arg contents: What's in the file: unicode if we managed to guess an
encoding and decode it, str otherwise
encoding and decode it, None otherwise
Return None if there is no indexing to do on the file.
@@ -175,11 +175,11 @@ def __init__(self, path, contents, plugin_name, tree, file_properties=None,
source folder. Such a file might not exist on disk. This is useful
mostly as a hint for syntax coloring.
:arg contents: What's in the file: unicode if we knew or successfully
guessed an encoding, str otherwise. Don't return any by-line data
for strs; the framework won't have succeeded in breaking up the
guessed an encoding, None otherwise. Don't return any by-line data
for None; the framework won't have succeeded in breaking up the
file by line for display, so there will be no useful UI for those
data to support. In fact, most skimmers won't be able to do
anything useful with strs at all. For unicode, split the file into
anything useful with None at all. For unicode, split the file into
lines using universal newlines (``unicode.splitlines()`` with no
params); that's what the rest of the framework expects.
:arg tree: The :class:`~dxr.config.TreeConfig` of the tree to which
@@ -344,8 +344,8 @@ def __init__(self, path, contents, plugin_name, tree):
:arg path: A path to the file to index, relative to the tree's source
folder
:arg contents: What's in the file: unicode if we managed to guess at an
encoding and decode it, str otherwise. Don't return any by-line
data for strs; the framework won't have succeeded in breaking up
encoding and decode it, None otherwise. Don't return any by-line
data for None; the framework won't have succeeded in breaking up
the file by line for display, so there will be no useful UI for
those data to support. Think more along the lines of returning
EXIF data to search by for a JPEG. For unicode, split the file into
@@ -563,3 +563,20 @@ def iterable_per_line(triples):
for line_num in xrange(1, last_line)]

# If this has to be generic so we can use it on annotations_by_line as well, pass in a key function that extracts the line number and maybe another that constructs the return value.

def iterable_per_line_sorted(triples):
"""Yield iterables of (key, value mapping), one for each line, where triples are sorted already."""
last_row = 1
last_row_kvs = []
for k, v, extent in triples:
if extent.start.row == last_row:
last_row_kvs.append((k, v))
else:
yield last_row_kvs
# Yield empty lists for any skipped lines.
for _ in xrange(last_row + 1, extent.start.row):
yield []
last_row_kvs = [(k, v)]
last_row = extent.start.row
# Emit anything on the last line.
yield last_row_kvs
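
A worked sketch of the new generator's contract, with namedtuple stand-ins
for DXR's Extent and Position types:

    from collections import namedtuple

    Position = namedtuple('Position', ['row', 'col'])
    Extent = namedtuple('Extent', ['start', 'end'])

    triples = [('py_var', {'name': 'x'}, Extent(Position(1, 0), Position(1, 1))),
               ('py_var', {'name': 'y'}, Extent(Position(3, 0), Position(3, 1)))]
    list(iterable_per_line_sorted(triples))
    # -> [[('py_var', {'name': 'x'})], [], [('py_var', {'name': 'y'})]]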
15 changes: 6 additions & 9 deletions dxr/lines.py
@@ -464,15 +464,12 @@ def finished_tags(lines, refs, regions):
"""
# Plugins return unicode offsets, not byte ones.

# Get start and endpoints of intervals:
tags = list(tag_boundaries(chain(refs, regions)))

tags.extend(line_boundaries(lines))

# Sorting is actually not a significant use of time in an actual indexing
# run.
tags.sort(key=nesting_order) # balanced_tags undoes this, but we tolerate
# that in html_lines().
# balanced_tags undoes the sorting, but we tolerate that in html_lines().
# Remark: this sort is the memory peak, but it is not a significant use of
# time in an indexing run.
tags = sorted(chain(tag_boundaries(chain(refs, regions)),
line_boundaries(lines)),
key=nesting_order)
remove_overlapping_refs(tags)
return balanced_tags(tags)

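The rewrite is behavior-preserving: sorted() over the chained generators
yields the same single sorted list the old build-extend-sort sequence did,
e.g.:

    from itertools import chain
    a = (x for x in [3, 1])
    b = (x for x in [2, 0])
    sorted(chain(a, b))  # [0, 1, 2, 3]: one list built, one sort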
12 changes: 7 additions & 5 deletions dxr/mime.py
@@ -14,11 +14,13 @@ def icon(path, is_binary=False):
return class_name


def decode_data(data, encoding_guess):
"""Given str data, return an (is_text, data) tuple, where data is returned
as unicode if we think it's text and were able to determine an encoding for
it."""
if not is_binary_string(data[:1024]):
def decode_data(data, encoding_guess, can_be_binary=True):
"""Given string data, return an (is_text, data) tuple, where data is
returned as unicode if we think it's text and were able to determine an
encoding for it.
If can_be_binary is False, then skip the initial is_binary check.
"""
if not (can_be_binary and is_binary_string(data[:1024])):
try:
# Try our default encoding.
data = data.decode(encoding_guess)
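From the caller's side, the new flag looks like this (a sketch assuming only
the signature shown above):

    is_text, data = decode_data('hello', 'utf-8')  # sniffs the first 1024 bytes
    # Caller already sniffed the file (as unicode_contents now does with its
    # 4096-byte pre-check), so skip the redundant binary check:
    is_text, data = decode_data('hello', 'utf-8', can_be_binary=False)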
4 changes: 4 additions & 0 deletions dxr/plugins/core.py
@@ -464,6 +464,10 @@ def needles(self):
# We store the contents of textual images twice so that they can
# both show up in searches and be previewed in the browser.
if is_binary_image(self.path) or is_textual_image(self.path):
# If the file was binary, then contents are None, so read it here.
if self.contents is None:
with open(self.absolute_path(), 'rb') as image_file:
self.contents = image_file.read()
bytestring = (self.contents.encode('utf-8') if self.contains_text()
else self.contents)
yield 'raw_data', b64encode(bytestring)
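The same lazy-read fallback in isolation (a hypothetical helper; per the new
convention, ``contents`` is unicode for decodable text and None for binary):

    from base64 import b64encode

    def raw_data_needle(path, contents):
        if contents is None:  # binary file: its bytes were never kept in memory
            with open(path, 'rb') as image_file:
                contents = image_file.read()
        bytestring = (contents.encode('utf-8')
                      if isinstance(contents, unicode) else contents)
        return 'raw_data', b64encode(bytestring)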
7 changes: 3 additions & 4 deletions dxr/plugins/js/indexers.py
@@ -6,8 +6,8 @@

from dxr.plugins.js.refs import PLUGIN_NAME, QualifiedRef
import dxr.indexers
from dxr.indexers import (Extent, Position, iterable_per_line,
with_start_and_end, split_into_lines)
from dxr.indexers import (Extent, Position, iterable_per_line_sorted,
with_start_and_end)
from dxr.utils import cumulative_sum


@@ -46,7 +46,6 @@ def post_build(self):
def file_to_index(self, path, contents):
return FileToIndex(path, contents, self.plugin_name, self.tree)


class FileToIndex(dxr.indexers.FileToIndex):
def __init__(self, path, contents, plugin_name, tree):
super(FileToIndex, self).__init__(path, contents, plugin_name, tree)
@@ -95,7 +94,7 @@ def all_needles():
typ += '_ref'
yield self.build_needle(typ, row, start, end, line.name, line.sym)

return iterable_per_line(with_start_and_end(all_needles()))
return iterable_per_line_sorted(with_start_and_end(all_needles()))

def refs(self):
for line in self.lines:
4 changes: 2 additions & 2 deletions dxr/plugins/python/analysis.py
@@ -8,7 +8,7 @@
from collections import defaultdict
from warnings import warn

from dxr.build import file_contents
from dxr.build import unicode_contents
from dxr.plugins.python.utils import (ClassFunctionVisitorMixin,
convert_node_to_name, package_for_module,
path_to_module, ast_parse)
@@ -50,7 +50,7 @@ def _analyze_file(self, path, encoding):
"""
try:
syntax_tree = ast_parse(file_contents(path, encoding))
syntax_tree = ast_parse(unicode_contents(path, encoding))
except (IOError, SyntaxError, TypeError, UnicodeDecodeError) as error:
rel_path = os.path.relpath(path, self.source_folder)
warn('Failed to analyze {filename} due to error "{error}".'.format(
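Worth noting: the existing ``TypeError`` in the except clause covers the new
return convention, since compiling ``None`` — what ``unicode_contents`` yields
for binary or undecodable files — raises it, so such files fall through to the
warning. A stdlib illustration (assuming ``ast_parse`` wraps ``ast.parse``):

    import ast
    try:
        ast.parse(None)  # what a binary/undecodable file now feeds in
    except TypeError as error:
        print 'Failed to analyze due to error "{0}".'.format(error)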
