
Commit

Quit globbing to find the CSVs for clang. Close #517.
This takes 100 minutes off the moz-central index time (of about 4 hours). It also, counterintuitively, drops peak RAM use of a 0-worker run by 50MB.
erikrose committed Jun 7, 2016
2 parents 731327c + 131b9f6 commit 64b0f05
Showing 3 changed files with 46 additions and 26 deletions.
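In outline, the patch swaps a per-source-file glob over the plugin's temp folder for a single up-front directory listing. A minimal sketch of the two strategies, assuming hypothetical folder and file names (an illustration of the idea, not the patch itself):

    from collections import defaultdict
    from glob import glob
    from hashlib import sha1
    from os import listdir
    from os.path import join
    import tempfile

    folder = tempfile.mkdtemp()    # stands in for the clang plugin's temp folder
    source_path = 'dom/foo.cpp'    # hypothetical source file being indexed
    path_hash = sha1(source_path.encode('utf-8')).hexdigest()

    # Before: one glob per source file. Each glob scans the whole folder, and
    # it runs once for every one of tens of thousands of files.
    old_way = glob(join(folder, '%s.*.csv' % path_hash))

    # After: list the folder once, then answer each file's query with an O(1)
    # dict lookup keyed on the sha1 of the source path.
    csv_map = defaultdict(list)
    for name in listdir(folder):
        if name.endswith('.csv'):
            csv_map[name.split('.')[0]].append(name[:-4])  # strip '.csv'
    new_way = csv_map[path_hash]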
30 changes: 15 additions & 15 deletions dxr/plugins/clang/condense.py
100644 → 100755
@@ -7,8 +7,6 @@
 """
 import csv
 from functools import partial
-from glob import glob
-from hashlib import sha1
 from itertools import chain, izip
 from os.path import join
 
@@ -265,13 +263,13 @@ def condense(lines, dispatch_table, predicate=lambda kind, fields: True):
     return ret
 
 
-def lines_from_csvs(folder, file_glob):
-    """Return an iterable of lines from all CSVs matching a glob.
+def lines_from_csvs(folder, csv_names):
+    """Return an iterable of lines from the union of many CSV files.
 
     All lines are lists of strings.
 
     :arg folder: The folder in which to look for CSVs
-    :arg file_glob: A glob matching one or more CSVs in the folder
+    :arg csv_names: The names of the *.csv files within the folder
 
     """
     def lines_from_csv(path):
@@ -280,13 +278,12 @@ def lines_from_csv(path):
         for line in csv.reader(file):
             yield line
 
-    # This globbing is stupid but actually not that slow: a few tenths of a
-    # second on a dir of 97K files in VirtualBox. That said, it does add up.
-    paths = glob(join(folder, file_glob))
-    return chain.from_iterable(lines_from_csv(p) for p in paths)
+    return chain.from_iterable(lines_from_csv(join(folder, '%s.csv' % name))
+                               for name in csv_names)
 
 
-def condense_file(csv_folder, file_path, overrides, overriddens, parents, children):
+def condense_file(csv_folder, file_path, overrides, overriddens, parents,
+                  children, csv_names):
     """Return a dict representing an analysis of one source file.
 
     This is phase 2: the file-at-a-time phase.
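For reference, a self-contained sketch of how the reworked lines_from_csvs is meant to be driven, using a made-up folder and CSV name (the real names follow the pattern '<path sha1>.<content hash>'):

    import csv
    import tempfile
    from itertools import chain
    from os.path import join

    def lines_from_csvs(folder, csv_names):
        # Same shape as the patched function: explicit names, no globbing.
        def lines_from_csv(path):
            with open(path) as file:
                for line in csv.reader(file):
                    yield line
        return chain.from_iterable(lines_from_csv(join(folder, '%s.csv' % name))
                                   for name in csv_names)

    folder = tempfile.mkdtemp()
    with open(join(folder, 'deadbeef.0123.csv'), 'w') as f:
        f.write('ref,qualname,Foo::bar()\n')  # made-up row
    for row in lines_from_csvs(folder, ['deadbeef.0123']):
        print(row)  # ['ref', 'qualname', 'Foo::bar()']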
@@ -306,6 +303,8 @@ def condense_file(csv_folder, file_path, overrides, overriddens, parents, childr
         parents
     :arg children: A dict whose keys are class or struct qualnames that have
         children
+    :arg csv_names: An iterable of names of CSV files within ``csv_folder`` to
+        process, minus their ".csv" extensions
 
     """
     process_maybe_function_for_override = partial(process_maybe_function,
@@ -317,18 +316,19 @@ def condense_file(csv_folder, file_path, overrides, overriddens, parents, childr
                       'ref': process_maybe_function_for_override,
                       'decldef': process_maybe_function_for_override,
                       'type': partial(process_maybe_impl, parents, children)}
 
-    return condense(lines_from_csvs(csv_folder,
-                                    '{0}.*.csv'.format(sha1(file_path).hexdigest())),
+    return condense(lines_from_csvs(csv_folder, csv_names),
                     dispatch_table)
 
 
-def condense_global(csv_folder):
+def condense_global(csv_folder, csv_names):
     """Perform the whole-program data gathering necessary to emit "overridden"
     and subclass-related needles.
 
     This is phase 1: the whole-program phase.
 
+    :arg csv_names: An iterable of the names of CSV files (minus their
+        extensions) in ``csv_folder``
     """
     def listify_keys(d):
         """For a dict having values that are sets, turn those into lists."""
@@ -346,7 +346,7 @@ def listify_keys(d):
     # containing overriddenname}. Ignore the direct return value and collect
     # what we want via the partials.
     condense(
-        lines_from_csvs(csv_folder, '*.csv'),
+        lines_from_csvs(csv_folder, csv_names),
         {'impl': partial(process_impl, parents, children),
          'func_override': partial(process_override, overrides, overriddens)},
         predicate=lambda kind, fields: (kind == 'func_override' or
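condense_global still reads every CSV; only the discovery mechanism changed. A sketch of how the two phases might draw on one precomputed map (map contents invented):

    from itertools import chain

    # {path sha1: [CSV names minus extension]}, built once after the build.
    csv_map = {'deadbeef': ['deadbeef.0123', 'deadbeef.4567'],
               'cafef00d': ['cafef00d.89ab']}

    # Phase 1 (condense_global) wants every CSV in one stream; phase 2
    # (condense_file) wants only the names for a single source file.
    all_names = chain.from_iterable(csv_map.values())
    one_file_names = csv_map['deadbeef']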
6 changes: 1 addition & 5 deletions dxr/plugins/clang/dxr-index.cpp
@@ -1413,11 +1413,7 @@ class DXRIndexAction : public PluginASTAction {
 
     // The temp directory for this plugin's output.
     const char *tmp = getenv("DXR_CXX_CLANG_TEMP_FOLDER");
-    std::string tmpdir;
-    if (tmp)
-      tmpdir = tmp;
-    else
-      tmpdir = output;
+    std::string tmpdir = tmp ? tmp : output;
     char *abs_tmpdir = realpath(tmpdir.c_str(), nullptr);
     if (!abs_tmpdir) {
       DiagnosticsEngine &D = CI.getDiagnostics();
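The C++ hunk above is a pure cleanup: the if/else collapses into a ternary with identical behavior, falling back to the build's output folder only when the environment variable is unset. The same fallback pattern in Python, with an assumed fallback value:

    import os

    output = '/path/to/build-output'  # hypothetical fallback
    # environ.get() uses the default only when the variable is absent,
    # mirroring C's getenv() returning NULL and the ternary above.
    tmpdir = os.environ.get('DXR_CXX_CLANG_TEMP_FOLDER', output)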
36 changes: 30 additions & 6 deletions dxr/plugins/clang/indexers.py
100644 → 100755
@@ -1,9 +1,10 @@
 from collections import defaultdict
+from hashlib import sha1
+from itertools import chain
+from operator import itemgetter
 import os
+from os import listdir
 import sys
-from operator import itemgetter
-from itertools import chain, izip, ifilter
 from functools import partial
 
 from funcy import (merge, imap, group_by, is_mapping, repeat,
                    constantly, icat, autocurry)
@@ -56,15 +57,16 @@
 class FileToIndex(FileToIndexBase):
     """C and C++ indexer using clang compiler plugin"""
 
-    def __init__(self, path, contents, plugin_name, tree, overrides, overriddens, parents, children, temp_folder):
+    def __init__(self, path, contents, plugin_name, tree, overrides, overriddens, parents, children, csv_names, temp_folder):
         super(FileToIndex, self).__init__(path, contents, plugin_name, tree)
         self.overrides = overrides
         self.overriddens = overriddens
         self.parents = parents
         self.children = children
         self.condensed = condense_file(temp_folder, path,
                                        overrides, overriddens,
-                                       parents, children)
+                                       parents, children,
+                                       csv_names)
 
     def needles_by_line(self):
         return all_needles(
@@ -218,7 +220,28 @@ def environment(self, vars_):
         return merge(vars_, env)
 
     def post_build(self):
-        self._overrides, self._overriddens, self._parents, self._children = condense_global(self._temp_folder)
+        def csv_map():
+            """Map input files to the output CSVs corresponding to them.
+
+            Return {path sha1: [file names (minus '.csv' extension)]}.
+
+            This saves a lot of globbing later, which can add up to hours over
+            the course of tens of thousands of files, depending on IO speed. An
+            alternative approach might be a radix tree of folders: less RAM,
+            more IO. Try that and bench it sometime.
+
+            """
+            ret = defaultdict(list)
+            for csv_name in listdir(self._temp_folder):
+                if csv_name.endswith('.csv'):
+                    path_hash, content_hash, ext = csv_name.split('.')
+                    # Removing ".csv" saves at least 2MB per worker on 700K files:
+                    ret[path_hash].append(csv_name[:-4])
+            return ret
+
+        self._csv_map = csv_map()
+        self._overrides, self._overriddens, self._parents, self._children = condense_global(self._temp_folder,
+                                                                                            chain.from_iterable(self._csv_map.itervalues()))
 
     def file_to_index(self, path, contents):
         return FileToIndex(path,
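To make the mapping concrete, here is a worked example of what csv_map() yields for an invented directory listing:

    from collections import defaultdict

    # Suppose listdir(temp_folder) returned these plugin outputs:
    listing = ['deadbeef.0123.csv', 'deadbeef.4567.csv',
               'cafef00d.89ab.csv', 'stray-notes.txt']

    ret = defaultdict(list)
    for csv_name in listing:
        if csv_name.endswith('.csv'):
            path_hash, content_hash, ext = csv_name.split('.')
            ret[path_hash].append(csv_name[:-4])  # drop '.csv'

    # ret == {'deadbeef': ['deadbeef.0123', 'deadbeef.4567'],
    #         'cafef00d': ['cafef00d.89ab']}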
@@ -229,4 +252,5 @@ def file_to_index(self, path, contents):
                            self._overriddens,
                            self._parents,
                            self._children,
+                           self._csv_map[sha1(path).hexdigest()],
                            self._temp_folder)
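Putting the phases together, each FileToIndex now receives only the CSV names for its own source path, found by hashing the path instead of globbing the temp folder. A sketch of that lookup (hash and map contents invented):

    from hashlib import sha1

    key = sha1('dom/foo.cpp'.encode('utf-8')).hexdigest()
    csv_map = {key: ['%s.0123' % key]}  # names are '<path sha1>.<content hash>'

    def csvs_for(path):
        # O(1) dict lookup in place of a glob over the whole temp folder.
        return csv_map.get(sha1(path.encode('utf-8')).hexdigest(), [])

    print(csvs_for('dom/foo.cpp'))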
