Permalink
Browse files

[bug 831005] Update to elasticutils master tip

* add document_id to index
* update to elasticutils master tip -- this picks up pyelasticsearch
  and ditches pyes
* nix get_indexing_es (it was dumb)
* remove bunch of now-unused settings (also dumb)
* tweak test scaffolding -- this should make tests run faster
  for kitsune in a VM and on slower machines
* clean up docs

Note: This commit doesn't remove pyes itself yet. To keep this change easier
to review, that will be done in a future commit.

Note: This adds a new field (document_id) to the mapping, but it's only used
in indexing and it's obviously a string. Thus it's ok for ES to infer its type,
and we don't have to do the "change the mapping" two-step dance.
  • Loading branch information...
1 parent 261a26f commit 9a2c32393094cc4ea275b3dafb8d2c760dc58fe7 @willkg willkg committed Mar 27, 2013
View
@@ -133,3 +133,9 @@
[submodule "vendor/src/django-statsd"]
path = vendor/src/django-statsd
url = git://github.com/andymckay/django-statsd.git
+[submodule "vendor/src/pyelasticsearch"]
+ path = vendor/src/pyelasticsearch
+ url = git://github.com/rhec/pyelasticsearch.git
+[submodule "vendor/src/requests"]
+ path = vendor/src/requests
+ url = git://github.com/kennethreitz/requests.git
@@ -192,6 +192,7 @@ def get_query_fields(cls):
def get_mapping(cls):
return {
'id': {'type': 'long'},
+ 'document_id': {'type': 'string', 'index': 'not_analyzed'},
'model': {'type': 'string', 'index': 'not_analyzed'},
'url': {'type': 'string', 'index': 'not_analyzed'},
'indexed_on': {'type': 'integer'},
@@ -210,12 +211,14 @@ def get_mapping(cls):
'post_replies': {'type': 'integer'}}
@classmethod
- def extract_document(cls, obj_id):
+ def extract_document(cls, obj_id, obj=None):
"""Extracts interesting thing from a Thread and its Posts"""
- obj = cls.objects.select_related('last_post').get(pk=obj_id)
+ if obj is None:
+ obj = cls.objects.select_related('last_post').get(pk=obj_id)
d = {}
d['id'] = obj.id
+ d['document_id'] = cls.get_document_id(obj.id)
d['model'] = cls.get_model_name()
d['url'] = obj.get_absolute_url()
d['indexed_on'] = int(time.time())
@@ -10,8 +10,7 @@
from questions.models import Question, QuestionVote
from questions.tasks import update_question_vote_chunk
-from search.es_utils import (ESTimeoutError, ESMaxRetryError, ESException,
- WRITE_INDEX, get_documents, get_indexing_es)
+from search.es_utils import ES_EXCEPTIONS, WRITE_INDEX, get_documents, get_es
from search.tasks import index_task
from sumo.utils import chunked
@@ -89,7 +88,7 @@ def auto_lock_old_questions():
if settings.ES_LIVE_INDEXING:
try:
- es = get_indexing_es()
+ es = get_es()
# So... the first time this runs, it'll handle 160K
# questions or so which stresses everything. Thus we
@@ -115,7 +114,7 @@ def auto_lock_old_questions():
es.flush_bulk(forced=True)
es.refresh(WRITE_INDEX, timesleep=0)
- except (ESTimeoutError, ESMaxRetryError, ESException):
+ except ES_EXCEPTIONS:
# Something happened with ES, so let's push index updating
# into an index_task which retries when it fails because
# of ES issues.
@@ -310,6 +310,7 @@ def get_query_fields(cls):
def get_mapping(cls):
return {
'id': {'type': 'long'},
+ 'document_id': {'type': 'string', 'index': 'not_analyzed'},
'model': {'type': 'string', 'index': 'not_analyzed'},
'url': {'type': 'string', 'index': 'not_analyzed'},
'indexed_on': {'type': 'integer'},
@@ -344,18 +345,27 @@ def get_mapping(cls):
}
@classmethod
- def extract_document(cls, obj_id):
+ def extract_document(cls, obj_id, obj=None):
"""Extracts indexable attributes from a Question and its answers."""
-
- # Note: Need to keep this in sync with
- # tasks.update_question_vote_chunk.
- obj = cls.uncached.values(
- 'id', 'title', 'content', 'num_answers', 'solution_id',
- 'is_locked', 'created', 'updated', 'num_votes_past_week',
- 'creator__username', 'locale').get(pk=obj_id)
+ fields = ['id', 'title', 'content', 'num_answers', 'solution_id',
+ 'is_locked', 'created', 'updated', 'num_votes_past_week',
+ 'locale']
+ composed_fields = ['creator__username']
+ all_fields = fields + composed_fields
+
+ if obj is None:
+ # Note: Need to keep this in sync with
+ # tasks.update_question_vote_chunk.
+ obj = cls.uncached.values(*all_fields).get(pk=obj_id)
+ else:
+ fixed_obj = dict([(field, getattr(obj, field))
+ for field in fields])
+ fixed_obj['creator__username'] = obj.creator.username
+ obj = fixed_obj
d = {}
d['id'] = obj['id']
+ d['document_id'] = cls.get_document_id(obj['id'])
d['model'] = cls.get_model_name()
# We do this because get_absolute_url is an instance method
@@ -10,7 +10,7 @@
from activity.models import Action
from questions import ANSWERS_PER_PAGE
from questions.karma_actions import AnswerAction, FirstAnswerAction
-from search.es_utils import ESTimeoutError, ESMaxRetryError, ESException
+from search.es_utils import ES_EXCEPTIONS
from search.tasks import index_task
@@ -90,7 +90,7 @@ def update_question_vote_chunk(data):
doc[u'question_num_votes_past_week'] = num
Question.index(doc)
- except (ESTimeoutError, ESMaxRetryError, ESException):
+ except ES_EXCEPTIONS:
# Something happened with ES, so let's push index updating
# into an index_task which retries when it fails because
# of ES issues.
@@ -25,6 +25,8 @@
import waffle
from mobility.decorators import mobile_template
from ratelimit.decorators import ratelimit
+from pyelasticsearch.exceptions import (
+ Timeout, ConnectionError, ElasticHttpError)
from session_csrf import anonymous_csrf
from statsd import statsd
from taggit.models import Tag
@@ -49,8 +51,7 @@
from questions.models import Question, Answer, QuestionVote, AnswerVote
from questions.question_config import products
from search.utils import locale_or_default, clean_excerpt
-from search.es_utils import (ESTimeoutError, ESMaxRetryError, ESException,
- Sphilastic, F)
+from search.es_utils import ES_EXCEPTIONS, Sphilastic, F
from sumo.helpers import urlparams
from sumo.urlresolvers import reverse
from sumo.utils import paginate, simple_paginate, build_paged_url, user_or_ip
@@ -1186,7 +1187,7 @@ def stats_topic_data(bucket_days, start, end):
search = search.facet_raw(**facets).values_dict()
try:
histograms_data = search.facet_counts()
- except (ESTimeoutError, ESMaxRetryError, ESException):
+ except ES_EXCEPTIONS:
return []
# The data looks like this right now:
@@ -1378,13 +1379,8 @@ def _search_suggestions(request, text, locale, product_slugs):
except Question.DoesNotExist:
pass
- except (ESTimeoutError, ESMaxRetryError, ESException) as exc:
- if isinstance(exc, ESTimeoutError):
- statsd.incr('questions.suggestions.timeouterror')
- elif isinstance(exc, ESMaxRetryError):
- statsd.incr('questions.suggestions.maxretryerror')
- elif isinstance(exc, ESException):
- statsd.incr('questions.suggestions.elasticsearchexception')
+ except (Timeout, ConnectionError, ElasticHttpError) as exc:
+ statsd.incr('questions.suggestions.eserror')
log.debug(exc)
return results
View
@@ -11,8 +11,7 @@
from search import es_utils
from search.es_utils import (get_doctype_stats, get_indexes, delete_index,
- ESTimeoutError, ESMaxRetryError,
- ESIndexMissingException, get_indexable,
+ ES_EXCEPTIONS, get_indexable,
SUMO_DOCTYPE, merge_mappings, CHUNK_SIZE,
recreate_index)
from search.models import Record, get_search_models
@@ -164,44 +163,30 @@ def search(request):
if delete_requested:
try:
return handle_delete(request)
- except DeleteError, e:
+ except DeleteError as e:
error_messages.append(u'Error: %s' % e.message)
- except ESMaxRetryError:
- error_messages.append('Error: Elastic Search is not set up on '
- 'this machine or is not responding. '
- '(MaxRetryError)')
- except ESIndexMissingException:
- error_messages.append('Error: Index is missing. Press the reindex '
- 'button below. (IndexMissingException)')
- except ESTimeoutError:
- error_messages.append('Error: Connection to Elastic Search timed '
- 'out. (TimeoutError)')
+ except ES_EXCEPTIONS as e:
+ error_messages.append('Error: {0}'.format(repr(e)))
stats = None
write_stats = None
indexes = []
+
+ try:
+ stats = get_doctype_stats(es_utils.READ_INDEX)
+ except ES_EXCEPTIONS:
+ stats = None
+
+ try:
+ write_stats = get_doctype_stats(es_utils.WRITE_INDEX)
+ except ES_EXCEPTIONS:
+ write_stats = None
+
try:
- # This gets index stats, but also tells us whether ES is in
- # a bad state.
- try:
- stats = get_doctype_stats(es_utils.READ_INDEX)
- except ESIndexMissingException:
- stats = None
- try:
- write_stats = get_doctype_stats(es_utils.WRITE_INDEX)
- except ESIndexMissingException:
- write_stats = None
indexes = get_indexes()
indexes.sort(key=lambda m: m[0])
- except ESMaxRetryError:
- error_messages.append('Error: Elastic Search is not set up on this '
- 'machine or is not responding. (MaxRetryError)')
- except ESIndexMissingException:
- error_messages.append('Error: Index is missing. Press the reindex '
- 'button below. (IndexMissingException)')
- except ESTimeoutError:
- error_messages.append('Error: Connection to Elastic Search timed out. '
- '(TimeoutError)')
+ except ES_EXCEPTIONS as e:
+ error_messages.append('Error: {0}'.format(repr(e)))
try:
client = redis_client('default')
Oops, something went wrong.

0 comments on commit 9a2c323

Please sign in to comment.