Merge reduction of memory use in JS plugin. Close #553.
erikrose committed Jun 1, 2016
2 parents 3cb7441 + 3e5f0e4 commit 97b3f38
Showing 4 changed files with 76 additions and 73 deletions.
5 changes: 0 additions & 5 deletions dxr/indexers.py
@@ -487,8 +487,6 @@ def span_to_lines((kv, span)):
        warn('Bad Extent: end.row < start.row: %s < %s' %
             (span.end.row, span.start.row))
    else:
-        num_rows = span.end.row - span.start.row
-
        # TODO: There are a lot of Nones used as slice bounds below. Do we
        # ever translate them back into char offsets? If not, does the
        # highlighter or anything else choke on them?
@@ -501,7 +499,6 @@ def span_to_lines((kv, span)):
        yield (kv, 0, span.end.col), span.end.row


-
def split_into_lines(triples):
"""Split a bunch of (key, mapping, extent) triples into more triples
than those, with each one contained in a line.
@@ -519,8 +516,6 @@ def _split_one((key, mapping, extent)):
            warn('Bad extent: end.row < start.row: %s < %s' %
                 (extent.end.row, extent.start.row))
        else:
-            num_rows = extent.end.row - extent.start.row
-
            # TODO: There are a lot of Nones used as slice bounds below. Do we
            # ever translate them back into char offsets? If not, does the
            # highlighter or anything else choke on them?
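For context, span_to_lines and _split_one each expand one multi-line extent into a span per line, so the num_rows value computed above was never used and could simply be dropped. Here is a minimal sketch of that first/middle/last split, using plain tuples in place of DXR's span objects (a None bound means "to the end of the line", per the TODO above):

    def split_span(kv, start_row, start_col, end_row, end_col):
        # First line: from the span's start column to the end of the line.
        yield (kv, start_col, None), start_row
        # Middle lines: the entire line.
        for row in xrange(start_row + 1, end_row):
            yield (kv, 0, None), row
        # Last line: from column 0 up to the span's end column.
        yield (kv, 0, end_col), end_row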
40 changes: 23 additions & 17 deletions dxr/plugins/js/analyze_js/analyze_file.js
@@ -329,7 +329,7 @@ const Analyzer = {
        break;

      default:
-        console.log(`In ${fileIndex}, Unexpected statement: ${stmt.type} ${JSON.stringify(stmt)}`);
+        console.warn(`In ${fileIndex}, Unexpected statement: ${stmt.type} ${JSON.stringify(stmt)}`);
        break;
    }
  },
@@ -388,7 +388,7 @@ const Analyzer = {

  // Handle an expression by dispatching based on its type.
  expression(expr) {
-    if (!expr) console.log(Error().stack);
+    if (!expr) console.warn(Error().stack);

    switch (expr.type) {
      case "Identifier":
@@ -613,8 +613,8 @@ const Analyzer = {
        break;

      default:
-        console.log(Error().stack);
-        console.log(`In ${fileIndex}, Unexpected expression ${expr.type}: ${JSON.stringify(expr)}`);
+        console.warn(Error().stack);
+        console.warn(`In ${fileIndex}, Unexpected expression ${expr.type}: ${JSON.stringify(expr)}`);
        break;
    }
  },
@@ -635,7 +635,7 @@ const Analyzer = {
  // Handle a pattern-matching assignment by dispatching on type.
  pattern(pat) {
    if (!pat) {
-      console.log(Error().stack);
+      console.warn(Error().stack);
    }

    switch (pat.type) {
@@ -671,17 +671,16 @@ const Analyzer = {
        break;

      default:
-        console.log(`In ${fileIndex}, Unexpected pattern: ${pat.type} ${JSON.stringify(pat)}`);
+        console.warn(`In ${fileIndex}, Unexpected pattern: ${pat.type} ${JSON.stringify(pat)}`);
        break;
    }
  }
};

// Attempt to comment out some mozilla-specific preprocessor headers.
-function preprocess(text, comment)
+function preprocess(lines, comment)
{
  let substitution = false;
-  const lines = text.split("\n");
  const preprocessedLines = [];
  const branches = [true];
  for (let i = 0; i < lines.length; i++) {
@@ -729,27 +728,34 @@ function preprocess(text, comment)
    }
  }

-  return preprocessedLines.join("\n");
+  return preprocessedLines.join('\n');
}

function analyzeJS(filepath, relpath, tempFilepath)
{
  fileIndex = relpath;
  nextSymId = 0;
  outLines = [];
-  const text = preprocess(String(fs.readFileSync(filepath)), line => "//" + line);
+  const text = String(fs.readFileSync(filepath));
+  const lines = text.split('\n');
+  // With files this large we currently risk running out of memory in the
+  // indexer, so we skip them. TODO: fix the issue and disable this check.
+  if (lines.length >= 100000) {
+    console.warn(`Skipping ${filepath} because length of ${lines.length} exceeds limit.`);
+    return;
+  }
  try {
-    const ast = esprima.parse(text,
-                              {loc: true,
-                               source: path.basename(filepath),
-                               line: 1,
-                               tolerant: true,
-                               sourceType: "script"});
+    const ast = esprima.parse(preprocess(lines, line => "//" + line),
+                              {loc: true,
+                               source: path.basename(filepath),
+                               line: 1,
+                               tolerant: true,
+                               sourceType: "script"});
    if (ast) {
      Analyzer.program(ast);
    }
  } catch (e) {
-    console.log(fileIndex, e.name, e.message);
+    console.error(fileIndex, e.name, e.message);
  }
  fs.writeFileSync(tempFilepath, outLines.join('\n'));
}
4 changes: 2 additions & 2 deletions dxr/plugins/js/analyze_js/analyze_tree.js
@@ -61,8 +61,8 @@ function main() {
      const tempPath = path.join(tempRoot, pathSegment);
      ensurePath(tempPath);
      analyzeFile(fullPath,
-                path.join(pathSegment, stat.name),
-                path.join(tempPath, stat.name + '.data'));
+                  path.join(pathSegment, stat.name),
+                  path.join(tempPath, stat.name + '.data'));
    }
    next();
  });
100 changes: 51 additions & 49 deletions dxr/plugins/js/indexers.py
@@ -1,3 +1,4 @@
+from collections import namedtuple
from itertools import imap
import json
import subprocess
@@ -10,39 +11,19 @@
from dxr.utils import cumulative_sum


-class ReadAnalysis(object):
-    def __init__(self, tree, lines, contents):
-        self.needles = []
-        self.refs = []
-        # Build map of line number -> byte offset to use for emitting refs.
-        self.offsets = list(cumulative_sum(imap(len, contents.splitlines(True))))
-        for line in lines:
-            row, (start, end) = line['loc']
-            qref = QualifiedRef(tree, (line['sym'], line['name'], line['type']), qualname=line['sym'])
-            typ = line['type']
-            if line['kind'] == 'use':
-                typ += '_ref'
-            self.yield_needle(typ, row, start, end, line['name'], line['sym'])
-            self.yield_ref(row, start, end, qref)
-
-    def yield_ref(self, row, start, end, ref):
-        offset = self.row_to_offset(row)
-        self.refs.append((offset + start, offset + end, ref))
-
-    def row_to_offset(self, line):
-        """Return the byte offset in the file of given line number.
-        """
-        return self.offsets[line - 1]
+AnalysisSchema = namedtuple('AnalysisSchema', ['loc', 'kind', 'type', 'name', 'sym'])

-    def yield_needle(self, filter_name, line, start, end, name, qualname=None):
-        """Add needle for qualified filter_name from line:start
-        to line:end with given name and qualname.
-        """
-        # If qualname is not provided, then use name.
-        mapping = {'name': name, 'qualname': qualname or name}
-        self.needles.append((PLUGIN_NAME + '_' + filter_name,
-                             mapping,
-                             Extent(Position(row=line, col=start), Position(row=line, col=end))))

+def to_analysis(line):
+    """Convert a json-parsed line into an AnalysisSchema.
+    """
+    row, col = line['loc'].split(':', 1)
+    if '-' in col:
+        col = tuple(map(int, col.split('-', 1)))
+    else:
+        col = int(col), int(col)
+    line['loc'] = int(row), col
+    return AnalysisSchema(**line)


class TreeToIndex(dxr.indexers.TreeToIndex):
@@ -72,31 +53,52 @@ def __init__(self, path, contents, plugin_name, tree):
        self.analysis_path = join(join(join(tree.temp_folder, 'plugins/js'),
                                       relpath(dirname(self.absolute_path()), tree.source_folder)),
                                  basename(path) + '.data')
-        lines = []
+        # All lines from the analysis output file.
+        self.lines = []
+        # Map of line number -> byte offset to use for emitting refs.
+        self.offsets = []
        if self.is_interesting():
            with open(self.analysis_path) as analysis:
-                lines = self.parse_analysis(analysis.readlines())
-            lines = sorted(lines, key=lambda x: x['loc'])
-            self.analyzer = ReadAnalysis(tree, lines, contents)
+                self.lines = sorted((self.parse_analysis(line) for line in analysis), key=lambda x: x.loc)
+            self.offsets = list(cumulative_sum(imap(len, contents.splitlines(True))))

    def is_interesting(self):
        return exists(self.analysis_path)

-    def parse_analysis(self, lines):
-        def parse_loc(line):
-            if 'loc' in line:
-                row, col = line['loc'].split(':', 1)
-                if '-' in col:
-                    col = tuple(map(int, col.split('-', 1)))
-                else:
-                    col = int(col), int(col)
-                line['loc'] = int(row), col
-            return line
+    def parse_analysis(self, line):
+        """Convert a JSON line string into an AnalysisSchema object.
+        """
+        return json.loads(line, object_hook=to_analysis)

-        return (parse_loc(json.loads(line)) for line in lines)
+    def build_ref(self, row, start, end, ref):
+        """Create a 3-tuple from given line, start and end columns, and ref.
+        """
+        # Offset table is 0-indexed, line numbers are 1-indexed.
+        offset = self.offsets[row - 1]
+        return offset + start, offset + end, ref

+    def build_needle(self, filter_name, line, start, end, name, qualname=None):
+        """Create a needle mapping for the given filter, line, start and end
+        columns, and name.
+        """
+        # If qualname is not provided, then use name.
+        mapping = {'name': name, 'qualname': qualname or name}
+        return (PLUGIN_NAME + '_' + filter_name, mapping,
+                Extent(Position(row=line, col=start), Position(row=line, col=end)))

    def needles_by_line(self):
-        return iterable_per_line(with_start_and_end(split_into_lines(self.analyzer.needles)))
+        def all_needles():
+            for line in self.lines:
+                row, (start, end) = line.loc
+                typ = line.type
+                if line.kind == 'use':
+                    typ += '_ref'
+                yield self.build_needle(typ, row, start, end, line.name, line.sym)
+
+        return iterable_per_line(with_start_and_end(all_needles()))

    def refs(self):
-        return self.analyzer.refs
+        for line in self.lines:
+            row, (start, end) = line.loc
+            qref = QualifiedRef(self.tree, (line.sym, line.name, line.type), qualname=line.sym)
+            yield self.build_ref(row, start, end, qref)

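To make the new data flow concrete, here is a small, self-contained sketch of how one line of analyzer output becomes an AnalysisSchema and then an absolute-offset ref span. The to_analysis helper is the one added above; the sample JSON line and file contents are invented for illustration, and a local running sum stands in for dxr.utils.cumulative_sum, which the code above assumes yields each line's starting byte offset (beginning at 0):

    import json
    from collections import namedtuple

    AnalysisSchema = namedtuple('AnalysisSchema', ['loc', 'kind', 'type', 'name', 'sym'])

    def to_analysis(line):
        # Parse a "row:start-end" (or bare "row:col") loc into (row, (start, end)).
        row, col = line['loc'].split(':', 1)
        if '-' in col:
            col = tuple(map(int, col.split('-', 1)))
        else:
            col = int(col), int(col)
        line['loc'] = int(row), col
        return AnalysisSchema(**line)

    # A hypothetical analyzer line: a use of `foo` on row 2, columns 4-7.
    raw = '{"loc": "2:4-7", "kind": "use", "type": "prop", "name": "foo", "sym": "#1"}'
    record = json.loads(raw, object_hook=to_analysis)

    # Byte offset at which each line starts, as build_ref expects.
    contents = 'var a;\nuse foo here;\n'
    offsets, total = [], 0
    for text in contents.splitlines(True):
        offsets.append(total)
        total += len(text)

    row, (start, end) = record.loc
    offset = offsets[row - 1]  # offset table is 0-indexed, rows are 1-indexed
    print offset + start, offset + end  # absolute span of the `foo` ref: 11 14

Sorting the parsed records by loc, as the new __init__ does, then works naturally, because the (row, (start, end)) tuples compare lexicographically.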