[Bug 894686] Step 1 - Index Documents by locale.

1 parent 03e571f commit 1212c97253b8470806d9f4ba806889a8cba3cd7a @mythmon committed Jul 23, 2013
Showing with 262 additions and 15 deletions.
  1. +79 −7 kitsune/search/es_utils.py
  2. +131 −0 kitsune/search/tests/test_es.py
  3. +43 −2 kitsune/settings.py
  4. +9 −6 kitsune/wiki/models.py
kitsune/search/es_utils.py
@@ -157,6 +157,60 @@ def get_documents(cls, ids):
return list(ret)
+def get_analysis():
+ """Generate all our custom analyzers, tokenizers, and filters
+
+ This is mostly variants of the Snowball analyzer for various
+ languages, and a custom analyzer for Burmese (my).
+ """
+ analyzers = {}
+ tokenizers = {}
+
+ snowball_langs = [
+ 'Armenian', 'Basque', 'Catalan', 'Danish', 'Dutch', 'English',
+ 'Finnish', 'French', 'German', 'Hungarian', 'Italian', 'Norwegian',
+ 'Portuguese', 'Romanian', 'Russian', 'Spanish', 'Swedish', 'Turkish',
+ ]
+
+ for lang in snowball_langs:
+ key = 'snowball-' + lang.lower()
+ analyzers[key] = {
+ 'type': 'snowball',
+ 'language': lang,
+ }
+
+ # Burmese (my) specific custom analyzer.
+ #
+ # Burmese uses spaces to divide phrases rather than words, so it
+ # isn't really possible to split out words the way it is in other
+ # languages. Instead, this takes an approach similar to the
+ # built-in CJK analyzer (which doesn't work reliably here):
+ # shingling, i.e. overlapping substrings.
+ #
+ # Given the string 'abc def', this generates the tokens
+ # ['ab', 'bc', 'c ', ' d', 'de', 'ef']. ES understands this kind
+ # of overlapping token, and hopefully this will result in an OK
+ # search experience.
+
+ analyzers['custom-burmese'] = {
+ 'type': 'custom',
+ 'tokenizer': 'custom-burmese',
+ 'char_filter': ['html_strip'],
+ }
+
+ tokenizers['custom-burmese'] = {
+ 'type': 'nGram',
+ 'min_gram': 2,
+ 'max_gram': 2,
+ }
+
+ # Done!
+ return {
+ 'analyzer': analyzers,
+ 'tokenizer': tokenizers,
+ }
+
+
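As a quick illustration of the shingling described in the comment above, the nGram tokenizer with min_gram and max_gram both set to 2 behaves like a plain character-bigram sweep over the input. A minimal sketch in plain Python (not part of the commit):

# Sketch: character bigrams, mimicking the nGram(2, 2) tokenizer above.
def char_bigrams(text):
    return [text[i:i + 2] for i in range(len(text) - 1)]

assert char_bigrams('abc def') == ['ab', 'bc', 'c ', ' d', 'de', 'ef']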
def recreate_index(es=None):
"""Deletes WRITE_INDEX if it's there and creates a new one"""
if es is None:
@@ -165,16 +219,19 @@ def recreate_index(es=None):
index = WRITE_INDEX
delete_index(index)
- mappings = get_mappings()
-
# There should be no mapping-conflict race here since the index doesn't
# exist. Live indexing should just fail.
- # Simultaneously create the index and the mappings, so live
- # indexing doesn't get a chance to index anything between the two
- # and infer a bogus mapping (which ES then freaks out over when we
- # try to lay in an incompatible explicit mapping).
- es.create_index(index, settings={'mappings': mappings})
+ # Simultaneously create the index, the mappings, the analyzers, and
+ # the tokenizers, so live indexing doesn't get a chance to index
+ # anything between and infer a bogus mapping (which ES then freaks
+ # out over when we try to lay in an incompatible explicit mapping).
+ es.create_index(index, settings={
+ 'mappings': get_mappings(),
+ 'settings': {
+ 'analysis': get_analysis(),
+ }
+ })
# Wait until the index is there.
es.health(wait_for_status='yellow')
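To confirm that the analysis settings actually landed on the new index, one option is to read them back from the index's _settings endpoint. A minimal sketch, assuming a local ES node on localhost:9200 and the requests library; the index name is the write index from this commit's settings, and the real name also gets ES_INDEX_PREFIX prepended:

import requests

# Hypothetical host and index name; adjust for your environment.
resp = requests.get('http://localhost:9200/sumo-20130723/_settings')
index_settings = resp.json()
# The custom analyzers (e.g. 'snowball-english', 'custom-burmese')
# should show up in the index's analysis settings.
print index_settings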
@@ -619,3 +676,18 @@ def verify_obj(mt_name, cls, mapping, obj_id):
format_time((time.time() - cls_time) * 1000 / count)))
log.info('Done! {0}'.format(format_time(time.time() - start_time)))
+
+
+def es_analyzer_for_locale(locale, fallback="standard"):
+ """Pick an appropriate analyzer for a given locale.
+
+ If no analyzer is defined for `locale`, return `fallback` instead,
+ which defaults to the ES analyzer named "standard".
+ """
+ analyzer = settings.ES_LOCALE_ANALYZERS.get(locale, fallback)
+
+ if (not settings.ES_USE_PLUGINS and
+ analyzer in settings.ES_PLUGIN_ANALYZERS):
+ analyzer = fallback
+
+ return analyzer
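Given the ES_LOCALE_ANALYZERS, ES_PLUGIN_ANALYZERS, and ES_USE_PLUGINS settings added later in this commit, the helper behaves roughly like this (an illustrative sketch, not part of the commit; assumes Django settings are configured):

from kitsune.search.es_utils import es_analyzer_for_locale

es_analyzer_for_locale('es')    # 'snowball-spanish'
es_analyzer_for_locale('my')    # 'custom-burmese'
# 'polish' is plugin-backed and ES_USE_PLUGINS is False, so fall back:
es_analyzer_for_locale('pl')    # 'standard'
es_analyzer_for_locale('he')    # 'standard' (no analyzer configured)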
kitsune/search/tests/test_es.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
import json
import unittest
from datetime import datetime, timedelta
@@ -24,6 +25,7 @@
from kitsune.sumo.tests import LocalizingClient
from kitsune.sumo.urlresolvers import reverse
from kitsune.users.tests import group, user
+from kitsune.wiki.models import DocumentMappingType
from kitsune.wiki.tests import document, revision, helpful_vote
@@ -1100,3 +1102,132 @@ def test_mappings(self):
merged_mapping[key][1].append(cls_name)
# If we get here, then we're fine.
+
+
+class TestAnalyzers(ElasticTestCase):
+
+ def setUp(self):
+ super(TestAnalyzers, self).setUp()
+
+ self.locale_data = {
+ 'en-US': {
+ 'analyzer': 'snowball-english',
+ 'content': 'I have a cat.',
+ },
+ 'es': {
+ 'analyzer': 'snowball-spanish',
+ 'content': 'Tieno un gato.',
+ },
+ 'ar': {
+ 'analyzer': 'arabic',
+ 'content': u'لدي اثنين من القطط',
+ },
+ 'my': {
+ 'analyzer': 'custom-burmese',
+ 'content': u'အနုပညာ',
+ },
+ 'he': {
+ 'analyzer': 'standard',
+ 'content': u'גאולוגיה היא אחד',
+ }
+ }
+
+ self.docs = {}
+ for locale, data in self.locale_data.items():
+ d = document(locale=locale, save=True)
+ revision(document=d, content=data['content'], is_approved=True, save=True)
+ self.docs[locale] = d
+
+ self.refresh()
+
+ def test_analyzer_choices(self):
+ """Check that the indexer picked the right analyzer."""
+
+ ids = [d.id for d in self.docs.values()]
+ docs = es_utils.get_documents(DocumentMappingType, ids)
+ for doc in docs:
+ locale = doc['locale']
+ eq_(doc['_analyzer'], self.locale_data[locale]['analyzer'])
+
+ def _check_locale_tokenization(self, locale, expected_tokens, p_tag=True):
+ """
+ Check that a given locale's document was tokenized correctly.
+
+ * `locale` - The locale to check.
+ * `expected_tokens` - An iterable of the tokens that should be
+ found. If any tokens from this list are missing, or if any
+ tokens not in this list are found, the check will fail.
+ * `p_tag` - Default True. If True, an extra "p" token will be
+ added to `expected_tokens`.
+
+ This is because our wiki parser wraps its content in <p>
+ tags, and many analyzers will tokenize a string like
+ '<p>Foo</p>' as ['p', 'foo'] (the HTML tag is included in
+ the tokenization), so a 'p' token shows up during this
+ test. Not all of the analyzers do this, which is why it
+ can be turned off.
+
+ Why can't we fix the analyzers to strip out that HTML and not
+ generate spurious tokens? That could probably be done, but it
+ isn't worthwhile because:
+
+ * ES weights common words lower, thanks to its TF-IDF
+ scoring, which judges words by how often they appear in the
+ entire corpus and in the document, so the p tokens will be
+ largely ignored.
+ * The pre-l10n search code did it this way, so keeping it
+ doesn't break search.
+ * When implementing l10n search, I wanted to minimize the
+ number of changes needed, and this seemed like an unneeded
+ change.
+ """
+
+ search = es_utils.Sphilastic(DocumentMappingType)
+ search = search.filter(document_locale=locale)
+ facet_filter = search._process_filters([('document_locale', locale)])
+ search = search.facet_raw(tokens={
+ 'terms': {'field': 'document_content'},
+ 'facet_filter': facet_filter,
+ })
+ facets = search.facet_counts()
+
+ expected = set(expected_tokens)
+ if p_tag:
+ # Since `expected` is a set, adding 'p' is harmless even if it
+ # is already in `expected_tokens`; duplicates are ignored.
+ expected.add(u'p')
+ actual = set(t['term'] for t in facets['tokens'])
+ eq_(actual, expected)
+
+ # These five languages were chosen for tokenization testing
+ # because they represent the five kinds of languages we have:
+ # English, Snowball-supported languages, ES-supported languages,
+ # languages with custom analyzers, and languages with no
+ # analyzer, which fall back to the standard analyzer.
+
+ def test_english_tokenization(self):
+ """Test that English stemming and stop words work."""
+ self._check_locale_tokenization('en-US', ['i', 'have', 'cat'])
+
+ def test_spanish_tokenization(self):
+ """Test that Spanish stemming and stop words work."""
+ self._check_locale_tokenization('es', ['tien', 'un', 'gat'])
+
+ def test_arabic_tokenization(self):
+ """Test that Arabic stemming works.
+
+ I don't read Arabic; this is just what ES gave me when I asked
+ it to analyze an Arabic text as Arabic. If someone who reads
+ Arabic can improve this test, go for it!
+ """
+ self._check_locale_tokenization('ar', [u'لد', u'اثن', u'قطط'])
+
+ def test_burmese_tokenization(self):
+ """Test that the shingle analyzer is active for Burmese."""
+ tokens = [u'အန', u'နု', u'ုပ', u'ပည', u'ညာ']
+ self._check_locale_tokenization('my', tokens, False)
+
+ def test_hebrew_tokenization(self):
+ """Test that Hebrew uses the standard analyzer."""
+ tokens = [u'גאולוגיה', u'היא', u'אחד']
+ self._check_locale_tokenization('he', tokens)
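Outside the test suite, an analyzer can also be spot-checked against the index's _analyze endpoint. A minimal sketch, assuming a local ES node, an index that already carries these analysis settings (hypothetical name shown), and the requests library:

import requests

# Hypothetical host and index name; adjust for your environment.
resp = requests.get(
    'http://localhost:9200/sumo-20130723/_analyze',
    params={'analyzer': 'snowball-spanish', 'text': 'Tieno un gato.'})
for token in resp.json()['tokens']:
    print token['token']   # expect something like: tien, un, gat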
kitsune/settings.py
@@ -209,11 +209,52 @@
'sv-SE': 'sv',
}
+ES_LOCALE_ANALYZERS = {
+ 'ar': 'arabic',
+ 'bg': 'bulgarian',
+ 'ca': 'snowball-catalan',
+ 'cs': 'czech',
+ 'da': 'snowball-danish',
+ 'de': 'snowball-german',
+ 'en-US': 'snowball-english',
+ 'es': 'snowball-spanish',
+ 'eu': 'snowball-basque',
+ 'fa': 'persian',
+ 'fi': 'snowball-finnish',
+ 'fr': 'snowball-french',
+ 'gl': 'galician',
+ 'hi-IN': 'hindi',
+ 'hu': 'snowball-hungarian',
+ 'hy-AM': 'snowball-armenian',
+ 'id': 'indonesian',
+ 'it': 'snowball-italian',
+ 'ja': 'cjk',
+ 'my': 'custom-burmese',
+ 'nb-NO': 'snowball-norwegian',
+ 'nl': 'snowball-dutch',
+ 'no': 'snowball-norwegian',
+ 'pl': 'polish',
+ 'pt-BR': 'snowball-portuguese',
+ 'pt-PT': 'snowball-portuguese',
+ 'ro': 'snowball-romanian',
+ 'ru': 'snowball-russian',
+ 'sv': 'snowball-swedish',
+ 'th': 'thai',
+ 'tr': 'snowball-turkish',
+ 'zh-CN': 'chinese',
+ 'zh-TW': 'chinese',
+}
+
+ES_PLUGIN_ANALYZERS = [
+ 'polish'
+]
+
+ES_USE_PLUGINS = False
+
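The plugin-backed analyzers (currently just 'polish') stay disabled by default. If the cluster has the relevant plugin installed (for 'polish', typically the Stempel analysis plugin), they can be turned on with a local override. A minimal sketch, assuming kitsune's usual settings_local.py override file:

# settings_local.py (hypothetical override)
# Requires the Polish analysis plugin on every node in the ES cluster.
ES_USE_PLUGINS = True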
TEXT_DOMAIN = 'messages'
SITE_ID = 1
-
# If you set this to False, Django will make some optimizations so as
# not to load the internationalization machinery.
USE_I18N = True
@@ -547,7 +588,7 @@ def JINJA_CONFIG():
ES_INDEXES = {'default': 'sumo-20130701'}
# Indexes for indexing--set this to ES_INDEXES if you want to read to
# and write to the same index.
-ES_WRITE_INDEXES = ES_INDEXES
+ES_WRITE_INDEXES = {'default': 'sumo-20130723'}
# This is prepended to index names to get the final read/write index
# names used by kitsune. This is so that you can have multiple
# environments pointed at the same ElasticSearch cluster and not have
kitsune/wiki/models.py
@@ -19,7 +19,8 @@
from kitsune.products.models import Product, Topic
from kitsune.questions.models import Question
-from kitsune.search.es_utils import UnindexMeBro, ES_EXCEPTIONS
+from kitsune.search.es_utils import (UnindexMeBro, ES_EXCEPTIONS,
+ es_analyzer_for_locale)
from kitsune.search.models import (
SearchMappingType, SearchMixin, register_for_indexing,
register_mapping_type)
@@ -686,18 +687,17 @@ def get_mapping(cls):
'product': {'type': 'string', 'index': 'not_analyzed'},
'topic': {'type': 'string', 'index': 'not_analyzed'},
- 'document_title': {'type': 'string', 'analyzer': 'snowball'},
+ 'document_title': {'type': 'string'},
'document_locale': {'type': 'string', 'index': 'not_analyzed'},
'document_current_id': {'type': 'integer'},
'document_parent_id': {'type': 'integer'},
- 'document_content': {'type': 'string', 'analyzer': 'snowball',
- 'store': 'yes',
+ 'document_content': {'type': 'string', 'store': 'yes',
'term_vector': 'with_positions_offsets'},
'document_category': {'type': 'integer'},
'document_slug': {'type': 'string', 'index': 'not_analyzed'},
'document_is_archived': {'type': 'boolean'},
- 'document_summary': {'type': 'string', 'analyzer': 'snowball'},
- 'document_keywords': {'type': 'string', 'analyzer': 'snowball'},
+ 'document_summary': {'type': 'string'},
+ 'document_keywords': {'type': 'string'},
'document_recent_helpful_votes': {'type': 'integer'}
}
}
@@ -758,6 +758,9 @@ def extract_document(cls, obj_id, obj=None):
else:
d['document_recent_helpful_votes'] = 0
+ # Select a locale-appropriate default analyzer for all strings.
+ d['_analyzer'] = es_analyzer_for_locale(obj.locale)
+
return d
@classmethod
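With the per-field 'snowball' analyzers removed from the mapping above, the per-document _analyzer field set here is what picks the index-time analyzer for the string fields. For illustration, a rough sketch of the relevant keys extracted for a Spanish document (values hypothetical, most fields omitted):

# Roughly what extract_document now produces for an 'es' document:
extracted = {
    'document_locale': 'es',
    'document_content': u'<p>Tieno un gato.</p>',
    '_analyzer': 'snowball-spanish',  # es_analyzer_for_locale('es')
}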
