
Commit

Quit globbing to find the CSVs for clang. Close #517.
This takes 100 minutes off the moz-central index time (of about 4 hours). It also, counterintuitively, drops peak RAM use of a 0-worker run by 50MB.
erikrose committed Jun 7, 2016
2 parents 731327c + 131b9f6 commit 64b0f05
Showing 3 changed files with 46 additions and 26 deletions.
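In outline, the patch swaps a per-source-file glob over the plugin's temp folder for a single up-front directory listing. A minimal sketch of the two strategies, assuming hypothetical folder and file names (an illustration of the idea, not the patch itself):

    from collections import defaultdict
    from glob import glob
    from hashlib import sha1
    from os import listdir
    from os.path import join
    import tempfile

    folder = tempfile.mkdtemp()    # stands in for the clang plugin's temp folder
    source_path = 'dom/foo.cpp'    # hypothetical source file being indexed
    path_hash = sha1(source_path.encode('utf-8')).hexdigest()

    # Before: one glob per source file. Each glob scans the whole folder, and
    # it runs once for every one of tens of thousands of files.
    old_way = glob(join(folder, '%s.*.csv' % path_hash))

    # After: list the folder once, then answer each file's query with an O(1)
    # dict lookup keyed on the sha1 of the source path.
    csv_map = defaultdict(list)
    for name in listdir(folder):
        if name.endswith('.csv'):
            csv_map[name.split('.')[0]].append(name[:-4])  # strip '.csv'
    new_way = csv_map[path_hash]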
30 changes: 15 additions & 15 deletions dxr/plugins/clang/condense.py
100644 → 100755
@@ -7,8 +7,6 @@
 """
 import csv
 from functools import partial
-from glob import glob
-from hashlib import sha1
 from itertools import chain, izip
 from os.path import join
 
@@ -265,13 +263,13 @@ def condense(lines, dispatch_table, predicate=lambda kind, fields: True):
     return ret
 
 
-def lines_from_csvs(folder, file_glob):
-    """Return an iterable of lines from all CSVs matching a glob.
+def lines_from_csvs(folder, csv_names):
+    """Return an iterable of lines from the union of many CSV files.
 
     All lines are lists of strings.
 
     :arg folder: The folder in which to look for CSVs
-    :arg file_glob: A glob matching one or more CSVs in the folder
+    :arg csv_names: The names of the *.csv files within the folder
 
     """
     def lines_from_csv(path):
@@ -280,13 +278,12 @@ def lines_from_csv(path):
         for line in csv.reader(file):
             yield line
 
-    # This globbing is stupid but actually not that slow: a few tenths of a
-    # second on a dir of 97K files in VirtualBox. That said, it does add up.
-    paths = glob(join(folder, file_glob))
-    return chain.from_iterable(lines_from_csv(p) for p in paths)
+    return chain.from_iterable(lines_from_csv(join(folder, '%s.csv' % name))
+                               for name in csv_names)
 
 
-def condense_file(csv_folder, file_path, overrides, overriddens, parents, children):
+def condense_file(csv_folder, file_path, overrides, overriddens, parents,
+                  children, csv_names):
     """Return a dict representing an analysis of one source file.
 
     This is phase 2: the file-at-a-time phase.
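For reference, a self-contained sketch of how the reworked lines_from_csvs is meant to be driven, using a made-up folder and CSV name (the real names follow the pattern '<path sha1>.<content hash>'):

    import csv
    import tempfile
    from itertools import chain
    from os.path import join

    def lines_from_csvs(folder, csv_names):
        # Same shape as the patched function: explicit names, no globbing.
        def lines_from_csv(path):
            with open(path) as file:
                for line in csv.reader(file):
                    yield line
        return chain.from_iterable(lines_from_csv(join(folder, '%s.csv' % name))
                                   for name in csv_names)

    folder = tempfile.mkdtemp()
    with open(join(folder, 'deadbeef.0123.csv'), 'w') as f:
        f.write('ref,qualname,Foo::bar()\n')  # made-up row
    for row in lines_from_csvs(folder, ['deadbeef.0123']):
        print(row)  # ['ref', 'qualname', 'Foo::bar()']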
@@ -306,6 +303,8 @@ def condense_file(csv_folder, file_path, overrides, overriddens, parents, childr
         parents
     :arg children: A dict whose keys are class or struct qualnames that have
         children
+    :arg csv_names: An iterable of names of CSV files within ``csv_folder`` to
+        process, minus their ".csv" extensions
 
     """
     process_maybe_function_for_override = partial(process_maybe_function,
@@ -317,18 +316,19 @@ def condense_file(csv_folder, file_path, overrides, overriddens, parents, childr
                       'ref': process_maybe_function_for_override,
                       'decldef': process_maybe_function_for_override,
                       'type': partial(process_maybe_impl, parents, children)}
 
-    return condense(lines_from_csvs(csv_folder,
-                                    '{0}.*.csv'.format(sha1(file_path).hexdigest())),
+    return condense(lines_from_csvs(csv_folder, csv_names),
                     dispatch_table)
 
 
-def condense_global(csv_folder):
+def condense_global(csv_folder, csv_names):
     """Perform the whole-program data gathering necessary to emit "overridden"
     and subclass-related needles.
 
     This is phase 1: the whole-program phase.
 
+    :arg csv_names: An iterable of the names of CSV files (minus their
+        extensions) in ``csv_folder``
     """
     def listify_keys(d):
         """For a dict having values that are sets, turn those into lists."""
@@ -346,7 +346,7 @@ def listify_keys(d):
     # containing overriddenname}. Ignore the direct return value and collect
     # what we want via the partials.
     condense(
-        lines_from_csvs(csv_folder, '*.csv'),
+        lines_from_csvs(csv_folder, csv_names),
         {'impl': partial(process_impl, parents, children),
          'func_override': partial(process_override, overrides, overriddens)},
         predicate=lambda kind, fields: (kind == 'func_override' or
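condense_global still reads every CSV; only the discovery mechanism changed. A sketch of how the two phases might draw on one precomputed map (map contents invented):

    from itertools import chain

    # {path sha1: [CSV names minus extension]}, built once after the build.
    csv_map = {'deadbeef': ['deadbeef.0123', 'deadbeef.4567'],
               'cafef00d': ['cafef00d.89ab']}

    # Phase 1 (condense_global) wants every CSV in one stream; phase 2
    # (condense_file) wants only the names for a single source file.
    all_names = chain.from_iterable(csv_map.values())
    one_file_names = csv_map['deadbeef']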
6 changes: 1 addition & 5 deletions dxr/plugins/clang/dxr-index.cpp
@@ -1413,11 +1413,7 @@ class DXRIndexAction : public PluginASTAction {
 
     // The temp directory for this plugin's output.
     const char *tmp = getenv("DXR_CXX_CLANG_TEMP_FOLDER");
-    std::string tmpdir;
-    if (tmp)
-      tmpdir = tmp;
-    else
-      tmpdir = output;
+    std::string tmpdir = tmp ? tmp : output;
     char *abs_tmpdir = realpath(tmpdir.c_str(), nullptr);
     if (!abs_tmpdir) {
       DiagnosticsEngine &D = CI.getDiagnostics();
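The C++ hunk above is a pure cleanup: the if/else collapses into a ternary with identical behavior, falling back to the build's output folder only when the environment variable is unset. The same fallback pattern in Python, with an assumed fallback value:

    import os

    output = '/path/to/build-output'  # hypothetical fallback
    # environ.get() uses the default only when the variable is absent,
    # mirroring C's getenv() returning NULL and the ternary above.
    tmpdir = os.environ.get('DXR_CXX_CLANG_TEMP_FOLDER', output)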
36 changes: 30 additions & 6 deletions dxr/plugins/clang/indexers.py
100644 → 100755
@@ -1,9 +1,10 @@
 from collections import defaultdict
+from hashlib import sha1
+from itertools import chain
+from operator import itemgetter
 import os
+from os import listdir
 import sys
-from operator import itemgetter
-from itertools import chain, izip, ifilter
 from functools import partial
 
 from funcy import (merge, imap, group_by, is_mapping, repeat,
                    constantly, icat, autocurry)
@@ -56,15 +57,16 @@
 class FileToIndex(FileToIndexBase):
     """C and C++ indexer using clang compiler plugin"""
 
-    def __init__(self, path, contents, plugin_name, tree, overrides, overriddens, parents, children, temp_folder):
+    def __init__(self, path, contents, plugin_name, tree, overrides, overriddens, parents, children, csv_names, temp_folder):
         super(FileToIndex, self).__init__(path, contents, plugin_name, tree)
         self.overrides = overrides
         self.overriddens = overriddens
         self.parents = parents
         self.children = children
         self.condensed = condense_file(temp_folder, path,
                                        overrides, overriddens,
-                                       parents, children)
+                                       parents, children,
+                                       csv_names)
 
     def needles_by_line(self):
         return all_needles(
@@ -218,7 +220,28 @@ def environment(self, vars_):
         return merge(vars_, env)
 
     def post_build(self):
-        self._overrides, self._overriddens, self._parents, self._children = condense_global(self._temp_folder)
+        def csv_map():
+            """Map input files to the output CSVs corresponding to them.
+
+            Return {path sha1: [file names (minus '.csv' extension)]}.
+
+            This saves a lot of globbing later, which can add up to hours over
+            the course of tens of thousands of files, depending on IO speed. An
+            alternative approach might be a radix tree of folders: less RAM,
+            more IO. Try that and bench it sometime.
+
+            """
+            ret = defaultdict(list)
+            for csv_name in listdir(self._temp_folder):
+                if csv_name.endswith('.csv'):
+                    path_hash, content_hash, ext = csv_name.split('.')
+                    # Removing ".csv" saves at least 2MB per worker on 700K files:
+                    ret[path_hash].append(csv_name[:-4])
+            return ret
+
+        self._csv_map = csv_map()
+        self._overrides, self._overriddens, self._parents, self._children = condense_global(self._temp_folder,
+                                                                                            chain.from_iterable(self._csv_map.itervalues()))
 
     def file_to_index(self, path, contents):
         return FileToIndex(path,
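To make the mapping concrete, here is a worked example of what csv_map() yields for an invented directory listing:

    from collections import defaultdict

    # Suppose listdir(temp_folder) returned these plugin outputs:
    listing = ['deadbeef.0123.csv', 'deadbeef.4567.csv',
               'cafef00d.89ab.csv', 'stray-notes.txt']

    ret = defaultdict(list)
    for csv_name in listing:
        if csv_name.endswith('.csv'):
            path_hash, content_hash, ext = csv_name.split('.')
            ret[path_hash].append(csv_name[:-4])  # drop '.csv'

    # ret == {'deadbeef': ['deadbeef.0123', 'deadbeef.4567'],
    #         'cafef00d': ['cafef00d.89ab']}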
@@ -229,4 +252,5 @@ def file_to_index(self, path, contents):
                            self._overriddens,
                            self._parents,
                            self._children,
+                           self._csv_map[sha1(path).hexdigest()],
                            self._temp_folder)
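Putting the phases together, each FileToIndex now receives only the CSV names for its own source path, found by hashing the path instead of globbing the temp folder. A sketch of that lookup (hash and map contents invented):

    from hashlib import sha1

    key = sha1('dom/foo.cpp'.encode('utf-8')).hexdigest()
    csv_map = {key: ['%s.0123' % key]}  # names are '<path sha1>.<content hash>'

    def csvs_for(path):
        # O(1) dict lookup in place of a glob over the whole temp folder.
        return csv_map.get(sha1(path.encode('utf-8')).hexdigest(), [])

    print(csvs_for('dom/foo.cpp'))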
