Skip to content
This repository has been archived by the owner on Oct 13, 2021. It is now read-only.

Commit

Permalink
Optimize index-building for speed.
Browse files Browse the repository at this point in the history
moz-central before: 2:27 @ 12GB (with pygmentize and everything on)
moz-central after: 2:03 @ 14GB

Profiling revealed that Flask's url-building was the hot spot, so we move that to request time, where it's still fast enough to not be noticed. As a bonus, we stop storing redundant whole menus in ES, opening the possibility of making them internationalizable. I'm not sure why the index got bigger, but we should be able to drop it back down, because there's theoretically less information there now.
  • Loading branch information
erikrose committed Aug 31, 2015
2 parents 7c42a14 + 9ec94f8 commit 2d35fb4
Show file tree
Hide file tree
Showing 33 changed files with 1,267 additions and 850 deletions.
20 changes: 10 additions & 10 deletions dxr/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
es_alias_or_not_found)
from dxr.exceptions import BadTerm
from dxr.filters import FILE, LINE
from dxr.indexers import Ref, Region
from dxr.lines import html_line, tags_per_line, finished_tags
from dxr.lines import html_line, tags_per_line, finished_tags, Ref, Region
from dxr.mime import icon, is_image, is_text
from dxr.plugins import plugins_named
from dxr.query import Query, filter_menu_items
Expand Down Expand Up @@ -380,23 +379,24 @@ def sidebar_links(sections):
# stitching the lines together.
contents = ''.join(lines)
offsets = cumulative_sum(imap(len, lines))
tree_config = config.trees[tree]
# Construct skimmer objects for all enabled plugins that define a
# file_to_skim class.
skimmers = [plugin.file_to_skim(path,
contents,
plugin.name,
config.trees[tree],
tree_config,
file_doc,
line_docs)
for plugin in config.trees[tree].enabled_plugins
for plugin in tree_config.enabled_plugins
if plugin.file_to_skim]
skim_links, refses, regionses, annotationses = skim_file(skimmers, len(line_docs))
index_refs = imap(Ref.es_to_triple,
chain.from_iterable(doc.get('refs', [])
for doc in line_docs))
index_regions = imap(Region.es_to_triple,
chain.from_iterable(doc.get('regions', [])
for doc in line_docs))
index_refs = (Ref.es_to_triple(ref, tree_config) for ref in
chain.from_iterable(doc.get('refs', [])
for doc in line_docs))
index_regions = (Region.es_to_triple(region) for region in
chain.from_iterable(doc.get('regions', [])
for doc in line_docs))
tags = finished_tags(lines,
chain(chain.from_iterable(refses), index_refs),
chain(chain.from_iterable(regionses), index_regions))
Expand Down
2 changes: 1 addition & 1 deletion dxr/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def delete_index_quietly(es, index):

# refresh() times out in prod. Wait until it doesn't. That
# probably means things are ready to rock again.
with aligned_progressbar(repeat(None), label='Refeshing index') as bar:
with aligned_progressbar(repeat(None), label='Refreshing index') as bar:
for _ in bar:
try:
es.refresh(index=index)
Expand Down
2 changes: 1 addition & 1 deletion dxr/format
Original file line number Diff line number Diff line change
@@ -1 +1 @@
13
14
99 changes: 2 additions & 97 deletions dxr/indexers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import cgi
from collections import namedtuple
import json
from operator import itemgetter
from os.path import join, islink
from warnings import warn
Expand Down Expand Up @@ -237,7 +236,7 @@ def refs(self):
holding the contents of the file. (``refs()`` will not be called for
binary files.)
``ref`` is a :class:`~dxr.indexers.Ref`.
``ref`` is a :class:`~dxr.lines.Ref`.
"""
return []
Expand All @@ -247,7 +246,7 @@ def regions(self):
of code.
Yield an ordered list of extents and CSS classes (encapsulated in
:class:`~dxr.indexers.Region` instances)::
:class:`~dxr.lines.Region` instances)::
(start, end, Region)
Expand Down Expand Up @@ -413,100 +412,6 @@ def needles_by_line(self):
return []


class Ref(object):
    """A context menu, plus optional hover text and highlight hash, attached
    to a run of text"""

    sort_order = 1
    __slots__ = ['menu', 'hover', 'qualname_hash']

    def __init__(self, menu, hover=None, qualname=None, qualname_hash=None):
        """
        :arg menu: a list of mappings, each describing one item of the
            context menu::

                [{'html': 'description',
                  'title': 'longer description',
                  'href': 'URL',
                  'icon': 'extensionless name of a PNG from the icons folder'},
                 ...]

        :arg hover: the contents of the <a> tag's title attribute. (The first
            one wins.)
        :arg qualname: A unique identifier for the symbol surrounded by this
            ref, for highlighting. When truthy, it is hashed and takes
            precedence over ``qualname_hash``.
        :arg qualname_hash: The hashed version of ``qualname``, which you can
            pass instead of ``qualname`` if you have access to the
            already-hashed version

        """
        self.menu = menu
        self.hover = hover
        if qualname:
            self.qualname_hash = hash(qualname)
        else:
            self.qualname_hash = qualname_hash

    def es(self):
        """Return the dict stored in elasticsearch for this ref.

        ``hover`` is left out when falsy and ``qualname_hash`` when None.

        """
        serialized = {'menuitems': self.menu}
        if self.hover:
            serialized['hover'] = self.hover
        # Compare against None rather than testing truthiness: a hash of 0
        # is legitimate and must still be stored.
        if self.qualname_hash is not None:
            serialized['qualname_hash'] = self.qualname_hash
        return serialized

    @classmethod
    def es_to_triple(cls, es_ref):
        """Convert ES-dwelling ref representation to a (start, end,
        :class:`~dxr.indexers.Ref`) triple."""
        payload = es_ref['payload']
        start = es_ref['start']
        end = es_ref['end']
        ref = cls(payload['menuitems'],
                  hover=payload.get('hover'),
                  qualname_hash=payload.get('qualname_hash'))
        return start, end, ref

    def opener(self):
        """Return the opening <a> tag, with the menu JSON-encoded and
        HTML-escaped into its ``data-menu`` attribute."""
        menu = cgi.escape(json.dumps(self.menu), True)
        title = (' title="' + cgi.escape(self.hover, True) + '"'
                 if self.hover else '')
        cls = ('' if self.qualname_hash is None
               else ' class="tok%i"' % self.qualname_hash)
        return u'<a data-menu="%s"%s%s>' % (menu, title, cls)

    def closer(self):
        """Return the closing </a> tag."""
        return u'</a>'


class Region(object):
    """A <span> tag with a CSS class, wrapped around a run of text"""

    # Regions sort innermost, since splitting them does no harm.
    sort_order = 2
    __slots__ = ['css_class']

    def __init__(self, css_class):
        """
        :arg css_class: the value of the span's ``class`` attribute

        """
        self.css_class = css_class

    def es(self):
        """Return the elasticsearch representation: just the CSS class."""
        return self.css_class

    @classmethod
    def es_to_triple(cls, es_region):
        """Convert ES-dwelling region representation to a (start, end,
        :class:`~dxr.indexers.Region`) triple."""
        start = es_region['start']
        end = es_region['end']
        return start, end, cls(es_region['payload'])

    def opener(self):
        """Return the opening <span> tag, with the class HTML-escaped."""
        escaped_class = cgi.escape(self.css_class, True)
        return u'<span class="%s">' % escaped_class

    def closer(self):
        """Return the closing </span> tag."""
        return u'</span>'

    def __repr__(self):
        """Return a nice representation for debugging."""
        return 'Region("%s")' % self.css_class


# Conveniences:


Expand Down
171 changes: 170 additions & 1 deletion dxr/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@
from itertools import izip
def compress(data, selectors):
return (d for d, s in izip(data, selectors) if s)
import json
from warnings import warn

from jinja2 import Markup

from dxr.indexers import Ref, Region
from dxr.plugins import all_plugins
from dxr.utils import without_ending


class Line(object):
Expand All @@ -35,6 +37,173 @@ def __repr__(self):
LINE = Line()


class RefClassIdTagger(type):
    """Metaclass which stamps an ``id`` attr onto each class it creates, for
    use as a serializable class identifier

    The ID is whatever ``without_ending('Ref', name)`` returns for the class
    name (presumably the name minus a trailing "Ref" -- confirm against
    :func:`dxr.utils.without_ending`). Having a dedicated identifier allows
    Ref subclasses to move or change name without breaking index
    compatibility.

    NOTE(review): earlier documentation said a ``_plugin`` attr is expected as
    a prefix, but nothing here reads one.

    """
    def __new__(metaclass, name, bases, dict):
        # Put the ID into the namespace before the class is built so it ends
        # up as an ordinary class attribute. This overwrites any ``id`` the
        # class body itself declared.
        class_id = without_ending('Ref', name)
        dict['id'] = class_id
        return type.__new__(metaclass, name, bases, dict)


class Ref(object):
    """Abstract superclass for a cross-reference attached to a run of text

    Carries enough data to construct a context menu, highlight instances of
    the same symbol, and show something informative on hover.

    Subclasses must implement :meth:`menu_items`. :meth:`es` also reads
    ``self.id``, which :class:`RefClassIdTagger` sets on each class, and
    ``self.plugin``, which is not assigned anywhere in this class
    (presumably supplied when a plugin registers its refs -- TODO confirm).

    """
    sort_order = 1
    # ``tree`` has to be listed here: __init__ assigns it, and without a slot
    # that assignment raises AttributeError on any instance that has no
    # __dict__ (Ref itself, or a subclass declaring its own __slots__).
    __slots__ = ['tree', 'menu_data', 'hover', 'qualname_hash']
    __metaclass__ = RefClassIdTagger

    def __init__(self, tree, menu_data, hover=None, qualname=None, qualname_hash=None):
        """
        :arg tree: The tree this ref belongs to; :meth:`es_to_triple` passes
            its own ``tree`` argument (a :class:`~dxr.config.TreeConfig`)
            straight through
        :arg menu_data: Arbitrary JSON-serializable data from which we can
            construct a context menu
        :arg hover: The contents of the <a> tag's title attribute. (The first
            one wins.)
        :arg qualname: A hashable unique identifier for the symbol surrounded
            by this ref, for highlighting. When truthy, it is hashed and takes
            precedence over ``qualname_hash``.
        :arg qualname_hash: The hashed version of ``qualname``, which you can
            pass instead of ``qualname`` if you have access to the
            already-hashed version

        """
        self.tree = tree
        self.menu_data = menu_data
        self.hover = hover
        self.qualname_hash = hash(qualname) if qualname else qualname_hash

    def es(self):
        """Return a serialization of myself to store in elasticsearch.

        ``hover`` is omitted when falsy and ``qualname_hash`` when None.

        """
        ret = {'plugin': self.plugin,
               'id': self.id,
               # Smash the data into a string, because it will have a
               # different schema from subclass to subclass, and ES will freak
               # out:
               'menu_data': json.dumps(self.menu_data)}
        if self.hover:
            ret['hover'] = self.hover
        if self.qualname_hash is not None:  # could be 0
            ret['qualname_hash'] = self.qualname_hash
        return ret

    @staticmethod
    def es_to_triple(es_data, tree):
        """Convert ES-dwelling ref representation to a (start, end,
        :class:`~dxr.lines.Ref` subclass) triple.

        Return a subclass of Ref, chosen according to the ES data. Into its
        attributes "menu_data", "hover" and "qualname_hash", copy the ES
        properties of the same names, JSON-decoding "menu_data" first.

        :arg es_data: An item from the array under the 'refs' key of an ES LINE
            document
        :arg tree: The :class:`~dxr.config.TreeConfig` representing the tree
            from which the ``es_data`` was pulled

        """
        def ref_class(plugin, class_id):
            """Return the subclass of Ref identified by a combination of
            plugin and class ID, or None (after warning) if there is no such
            plugin or no such ref class in it."""
            plugins = all_plugins()
            try:
                return plugins[plugin].refs[class_id]
            except KeyError:
                warn('Ref subclass from plugin %s with ID %s was referenced '
                     'in the index but not found in the current '
                     'implementation. Ignored.' % (plugin, class_id))

        payload = es_data['payload']
        cls = ref_class(payload['plugin'], payload['id'])
        # NOTE(review): despite the "Ignored" in the warning above, a missing
        # class leaves ``cls`` as None, so the call below raises TypeError.
        # Skipping such refs for real would mean returning something the
        # caller can filter out, which changes this method's contract.
        return (es_data['start'],
                es_data['end'],
                cls(tree,
                    json.loads(payload['menu_data']),
                    hover=payload.get('hover'),
                    qualname_hash=payload.get('qualname_hash')))

    def menu_items(self):
        """Return an iterable of menu items to be attached to a ref.

        Return an iterable of dicts of this form::

            {
                html: the HTML to be used as the menu item itself
                href: the URL to visit when the menu item is chosen
                title: the tooltip text given on hovering over the menu item
                icon: the icon to show next to the menu item: the name of a PNG
                    from the ``icons`` folder, without the .png extension
            }

        Typically, this pulls data out of ``self.menu_data``.

        """
        raise NotImplementedError

    def opener(self):
        """Emit the opening anchor tag for a cross reference.

        Menu item text, links, and metadata are JSON-encoded and dumped into a
        data attr on the tag. JS finds them there and creates a menu on click.

        """
        if self.hover:
            title = ' title="' + cgi.escape(self.hover, True) + '"'
        else:
            title = ''
        if self.qualname_hash is not None:
            cls = ' class="tok%i"' % self.qualname_hash
        else:
            cls = ''

        # Materialize the items first: menu_items() may return any iterable,
        # and json.dumps() needs a concrete list.
        menu_items = list(self.menu_items())
        return u'<a data-menu="%s"%s%s>' % (
            cgi.escape(json.dumps(menu_items), True),
            title,
            cls)

    def closer(self):
        """Return the closing </a> tag."""
        return u'</a>'


class Region(object):
    """A <span> tag with a CSS class, wrapped around a run of text"""

    # Sort Regions innermost: it doesn't matter if they get split.
    sort_order = 2
    __slots__ = ['css_class']

    def __init__(self, css_class):
        """
        :arg css_class: the value of the span's ``class`` attribute

        """
        self.css_class = css_class

    def es(self):
        """Return what gets stored in elasticsearch: the bare CSS class."""
        return self.css_class

    @classmethod
    def es_to_triple(cls, es_region):
        """Convert ES-dwelling region representation to a (start, end,
        :class:`~dxr.lines.Region`) triple."""
        start, end = es_region['start'], es_region['end']
        region = cls(es_region['payload'])
        return start, end, region

    def opener(self):
        """Return the opening <span> tag, with the class HTML-escaped."""
        safe_class = cgi.escape(self.css_class, True)
        return u'<span class="%s">' % safe_class

    def closer(self):
        """Return the closing </span> tag."""
        return u'</span>'

    def __repr__(self):
        """Return a nice representation for debugging."""
        return 'Region("%s")' % self.css_class


def balanced_tags(tags):
"""Come up with a balanced series of tags which express the semantics of
the given sorted interleaved ones.
Expand Down

0 comments on commit 2d35fb4

Please sign in to comment.