
Rewrite reindex command to parallelize indexing tasks (bug 1025301)

robhudson committed Oct 3, 2014
1 parent 01051ad commit 721a8cf409be5c1e4abd1ded44d5cfaff8c27e53
Showing with 81 additions and 108 deletions.
  1. +79 −106 lib/es/management/commands/reindex.py
  2. +1 −1 mkt/webapps/indexers.py
  3. +1 −1 requirements/prod.txt
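
Before the diff itself, a minimal sketch (not part of the commit) of the chain-plus-chord shape the rewritten command builds, assuming celery 3.x and a configured result backend; the task names, index name and id lists below are placeholders standing in for pre_index, run_indexing and post_index:

# Sketch only: the pipeline is a chain whose middle step is a chord. The chord
# body runs once every header task has finished, which is what lets the
# indexing chunks run in parallel while setup and post-processing stay serial.
from celery import chain, chord, task

@task
def setup(index):
    pass  # placeholder: flag the DB, create the index and mapping

@task(ignore_result=False)  # results must be stored for the chord callback
def index_chunk(index, ids):
    pass  # placeholder: bulk-index one chunk of ids

@task
def finalize(index):
    pass  # placeholder: optimize, flip the alias, clean up

chain(
    setup.si('apps-new'),
    chord(header=[index_chunk.si('apps-new', ids)
                  for ids in ([1, 2], [3, 4], [5, 6])],
          body=finalize.si('apps-new')),
).apply_async()
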
@@ -4,13 +4,13 @@
Currently creates the indexes and re-indexes apps and feed elements.
"""
import logging
-import os
import sys
import time
+from math import ceil
from optparse import make_option
import elasticsearch
-from celery import task
+from celery import chain, chord, task
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
@@ -62,21 +62,19 @@
@task
-def delete_index(old_index):
- """Removes the index."""
- sys.stdout.write('Removing index %r\n' % old_index)
- ES.indices.delete(index=old_index)
-
-
-@task
-def create_index(new_index, alias, indexer, settings):
- """Creates a mapping for the new index.
-
- - new_index: new index name
- - alias: alias name
- - settings: a dictionary of settings
+def pre_index(new_index, old_index, alias, indexer, settings):
+ """
+ This sets up everything needed before indexing:
+ * Flags the database.
+ * Creates the new index.
"""
+ # Flag the database to indicate that the reindexing has started.
+ sys.stdout.write('Flagging the database to start the reindexation\n')
+ Reindexing.flag_reindexing(new_index=new_index, old_index=old_index,
+ alias=alias)
+ time.sleep(5) # Give celeryd some time to flag the DB.
+
sys.stdout.write(
'Create the mapping for index %r, alias: %r\n' % (new_index, alias))
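
For context, a hedged illustration (not part of the diff) of what the "creates the new index" step in pre_index amounts to with elasticsearch-py; the index name, document type and field are made up:

# Sketch only: create the timestamped index with indexing-friendly settings
# (no replicas, refresh disabled) before the parallel chunks start.
import elasticsearch

es = elasticsearch.Elasticsearch()
es.indices.create(index='apps-20141003', body={
    'settings': {'number_of_replicas': 0, 'refresh_interval': '-1'},
    'mappings': {'webapp': {'properties': {'id': {'type': 'long'}}}},
})
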
@@ -98,52 +96,16 @@ def create_index(new_index, alias, indexer, settings):
wait_for_relocating_shards=0)
-@task(time_limit=time_limits['hard'], soft_time_limit=time_limits['soft'])
-def run_indexing(index, indexer, chunk_size):
- """Index the objects.
-
- - index: name of the index
-
- Note: Our ES doc sizes are about 5k in size. Chunking by 100 sends ~500kb
- of data to ES at a time.
-
- TODO: Use celery chords here to parallelize these indexing chunks. This
- requires celery 3 (bug 825938).
-
- """
- sys.stdout.write('Indexing apps into index: %s\n' % index)
-
- qs = indexer.get_indexable().values_list('id', flat=True)
- for ids in chunked(list(qs), chunk_size):
- indexer.run_indexing(ids, ES, index=index)
-
-
-@task
-def flag_database(new_index, old_index, alias):
- """Flags the database to indicate that the reindexing has started."""
- sys.stdout.write('Flagging the database to start the reindexation\n')
- Reindexing.flag_reindexing(new_index=new_index, old_index=old_index,
- alias=alias)
- time.sleep(5) # Give celeryd some time to flag the DB.
-
-
@task
-def unflag_database():
- """Unflag the database to indicate that the reindexing is over."""
- sys.stdout.write('Unflagging the database\n')
- Reindexing.unflag_reindexing()
-
-
-@task
-def update_alias(new_index, old_index, alias, settings):
+def post_index(new_index, old_index, alias, indexer, settings):
"""
- Update the alias now that indexing is over.
-
- We do 3 things:
-
- 1. Optimize (which also does a refresh and a flush by default).
- 2. Update settings to reset number of replicas.
- 3. Point the alias to this new index.
+ Perform post-indexing tasks:
+ * Optimize (which also does a refresh and a flush by default).
+ * Update settings to reset number of replicas.
+ * Point the alias to this new index.
+ * Unflag the database.
+ * Remove the old index.
+ * Output the current alias configuration.
"""
sys.stdout.write('Optimizing, updating settings and aliases.\n')
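
For context, a hedged sketch (not part of the diff) of the elasticsearch-py calls behind the optimize, replica and alias steps listed in the post_index docstring; the index and alias names are placeholders:

# Sketch only: post-indexing housekeeping on the new index.
import elasticsearch

es = elasticsearch.Elasticsearch()
es.indices.optimize(index='apps-20141003')  # also refreshes and flushes
es.indices.put_settings(index='apps-20141003',
                        body={'number_of_replicas': 1,
                              'refresh_interval': '5s'})
es.indices.update_aliases(body={'actions': [
    {'remove': {'index': 'apps-20140901', 'alias': 'apps'}},
    {'add': {'index': 'apps-20141003', 'alias': 'apps'}},
]})
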
@@ -164,17 +126,42 @@ def update_alias(new_index, old_index, alias, settings):
)
ES.indices.update_aliases(body=dict(actions=actions))
+ sys.stdout.write('Unflagging the database\n')
+ Reindexing.unflag_reindexing()
+
+ sys.stdout.write('Removing index %r\n' % old_index)
+ try:
+ ES.indices.delete(index=old_index)
+ except elasticsearch.NotFoundError:
+ pass
-@task
-def output_summary():
alias_output = ''
for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:
alias_output += unicode(ES.indices.get_aliases(index=ALIAS)) + '\n'
sys.stdout.write(
- 'Reindexation done. Current Aliases configuration: %s\n' %
+ 'Reindexation done. Current aliases configuration: %s\n' %
alias_output)
+@task(ignore_result=False)
+def run_indexing(index, indexer, ids):
+ """Index the objects.
+
+ - index: name of the index
+
+ Note: `ignore_result=False` is required for the chord to work and trigger
+ the callback.
+
+ """
+ indexer.run_indexing(ids, ES, index=index)
+
+
+def chunk_indexing(indexer, chunk_size):
+ """Chunk the items to index."""
+ chunks = list(indexer.get_indexable().values_list('id', flat=True))
+ return chunked(chunks, chunk_size), len(chunks)
+
+
class Command(BaseCommand):
help = 'Reindex all ES indexes'
option_list = BaseCommand.option_list + (
@@ -210,18 +197,27 @@ def handle(self, *args, **kwargs):
raise CommandError('Indexation already occuring - use --force to '
'bypass')
elif force:
- unflag_database()
+ Reindexing.unflag_reindexing()
- chain = None
- old_indexes = []
for ALIAS, INDEXER, CHUNK_SIZE in INDEXES:
+
+ chunks, total = chunk_indexing(INDEXER, CHUNK_SIZE)
+ if not total:
+ self.stdout.write('\nNo tasks to queue for %s' % ALIAS)
+ continue
+ else:
+ total_chunks = int(ceil(total / float(CHUNK_SIZE)))
+ self.stdout.write(
+ '\nParallel indexing {total} items into {n} chunks of '
+ 'size {size}'.format(total=total, n=total_chunks,
+ size=CHUNK_SIZE))
+
# Get the old index if it exists.
try:
aliases = ES.indices.get_alias(name=ALIAS).keys()
except elasticsearch.NotFoundError:
aliases = []
old_index = aliases[0] if aliases else None
- old_indexes.append(old_index)
# Create a new index, using the index name with a timestamp.
new_index = timestamp_index(prefix + ALIAS)
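
To make the chunking arithmetic above concrete, a hedged sketch (not from the commit): chunked() is assumed to yield successive fixed-size slices, and the chunk count is simply the ceiling of the total item count over CHUNK_SIZE:

from math import ceil

def chunked(seq, n):
    """Assumed behaviour of the chunked() helper used above."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

ids = range(1, 1051)                        # e.g. 1050 indexable ids
chunks = list(chunked(ids, 100))            # 11 chunks: ten of 100, one of 50
total_chunks = int(ceil(len(ids) / 100.0))  # ceil(1050 / 100.0) == 11
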
@@ -240,45 +236,22 @@ def handle(self, *args, **kwargs):
num_shards = s.get('number_of_shards',
settings.ES_DEFAULT_NUM_SHARDS)
- # Flag the database to mark as currently indexing.
- if not chain:
- chain = flag_database.si(new_index, old_index, ALIAS)
- else:
- chain |= flag_database.si(new_index, old_index, ALIAS)
-
- # Create the indexes and mappings.
- # Note: We set num_replicas=0 here to lower load while re-indexing.
- # In later step we increase it which results in more efficient bulk
- # copy in ES. For ES < 0.90 we manually enable compression.
- chain |= create_index.si(new_index, ALIAS, INDEXER, {
- 'analysis': INDEXER.get_analysis(),
- 'number_of_replicas': 0, 'number_of_shards': num_shards,
- 'store.compress.tv': True, 'store.compress.stored': True,
- 'refresh_interval': '-1'})
-
- # Index all the things!
- chain |= run_indexing.si(new_index, INDEXER, CHUNK_SIZE)
-
- # After indexing we optimize the index, adjust settings, and point
- # alias to the new index.
- chain |= update_alias.si(new_index, old_index, ALIAS, {
- 'number_of_replicas': num_replicas, 'refresh_interval': '5s'})
-
- # Unflag the database to mark as done indexing.
- chain |= unflag_database.si()
-
- # Delete the old index, if any.
- for old_index in old_indexes:
- if old_index:
- chain |= delete_index.si(old_index)
-
- # All done!
- chain |= output_summary.si()
+ # Ship it.
+ chain(
+ pre_index.si(new_index, old_index, ALIAS, INDEXER, {
+ 'analysis': INDEXER.get_analysis(),
+ 'number_of_replicas': 0,
+ 'number_of_shards': num_shards,
+ 'store.compress.tv': True,
+ 'store.compress.stored': True,
+ 'refresh_interval': '-1'}),
+ chord(
+ header=[run_indexing.si(new_index, INDEXER, chunk)
+ for chunk in chunks],
+ body=post_index.si(new_index, old_index, ALIAS, INDEXER, {
+ 'number_of_replicas': num_replicas,
+ 'refresh_interval': '5s'})
+ )
+ ).apply_async()
- # Ship it.
self.stdout.write('\nNew index and indexing tasks all queued up.\n')
- os.environ['FORCE_INDEXING'] = '1'
- try:
- chain.apply_async()
- finally:
- del os.environ['FORCE_INDEXING']
@@ -437,7 +437,7 @@ def run_indexing(cls, ids, ES, index=None, **kw):
sys.stdout.write('Failed to index webapp {0}: {1}\n'.format(
obj.id, e))
- WebappIndexer.bulk_index(docs, es=ES, index=index or cls.get_index())
+ cls.bulk_index(docs, es=ES, index=index or cls.get_index())
@classmethod
def get_app_filter(cls, request, additional_data=None, sq=None,
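
A hedged illustration (not part of the diff) of why switching to cls.bulk_index matters: the classmethod now dispatches to whichever indexer class run_indexing was called on, instead of always hitting one hard-coded class; the class names and bodies below are made up:

class ParentIndexer(object):
    @classmethod
    def bulk_index(cls, docs, **kw):
        print 'parent bulk_index: %d docs' % len(docs)

    @classmethod
    def run_indexing(cls, docs, **kw):
        cls.bulk_index(docs, **kw)  # picks up subclass overrides, unlike
                                    # hard-coding ParentIndexer.bulk_index

class ChildIndexer(ParentIndexer):
    @classmethod
    def bulk_index(cls, docs, **kw):
        print 'child bulk_index: %d docs' % len(docs)

ChildIndexer.run_indexing(['a', 'b'])  # prints "child bulk_index: 2 docs"
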
@@ -7,7 +7,7 @@ billiard==2.7.3.34
bleach==1.4
boto==2.20.0
cef==0.5
-celery==3.0.24
+celery==3.0.25
celery-tasktree==0.3.2
certifi==0.0.8
chardet==2.1.1
