
[Bug 894686] Step 1 - Index Documents by locale.

mythmon committed Jul 23, 2013
1 parent 03e571f commit 1212c97253b8470806d9f4ba806889a8cba3cd7a
Showing with 262 additions and 15 deletions.
  1. +79 −7 kitsune/search/es_utils.py
  2. +131 −0 kitsune/search/tests/test_es.py
  3. +43 −2 kitsune/settings.py
  4. +9 −6 kitsune/wiki/models.py
@@ -157,6 +157,60 @@ def get_documents(cls, ids):
    return list(ret)


def get_analysis():
    """Generate all our custom analyzers, tokenizers, and filters.

    These are mostly variants of the Snowball analyzer for various
    languages, plus a custom analyzer for Burmese (my).
    """
    analyzers = {}
    tokenizers = {}

    snowball_langs = [
        'Armenian', 'Basque', 'Catalan', 'Danish', 'Dutch', 'English',
        'Finnish', 'French', 'German', 'Hungarian', 'Italian', 'Norwegian',
        'Portuguese', 'Romanian', 'Russian', 'Spanish', 'Swedish', 'Turkish',
    ]

    for lang in snowball_langs:
        key = 'snowball-' + lang.lower()
        analyzers[key] = {
            'type': 'snowball',
            'language': lang,
        }

    # Burmese (my) specific custom analyzer.
    #
    # Burmese uses spaces to divide phrases instead of words, so it isn't
    # really possible to split out words the way we can for other
    # languages. This takes an approach similar to the built-in CJK
    # analyzer (which doesn't work reliably here): shingling, i.e.
    # overlapping substrings.
    #
    # Given the string 'abc def', this generates the tokens
    # ['ab', 'bc', 'c ', ' d', 'de', 'ef']. ES understands this kind of
    # overlapping token, and hopefully this results in a decent search
    # experience.
    analyzers['custom-burmese'] = {
        'type': 'custom',
        'tokenizer': 'custom-burmese',
        'char_filter': ['html_strip'],
    }

    tokenizers['custom-burmese'] = {
        'type': 'nGram',
        'min_gram': 2,
        'max_gram': 2,
    }

    # Done!
    return {
        'analyzer': analyzers,
        'tokenizer': tokenizers,
    }
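As a quick illustration (not part of the patch), the 2-gram shingling that the `nGram` tokenizer above performs server-side can be approximated in plain Python, which shows why the comment lists exactly those tokens for 'abc def':

# Illustrative only: a pure-Python approximation of the min_gram=2,
# max_gram=2 tokenizer defined above. ES does the real tokenization.
def bigram_shingles(text):
    """Return the overlapping 2-character substrings of `text`."""
    return [text[i:i + 2] for i in range(len(text) - 1)]

print(bigram_shingles('abc def'))
# ['ab', 'bc', 'c ', ' d', 'de', 'ef']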


def recreate_index(es=None):
    """Deletes WRITE_INDEX if it's there and creates a new one"""
    if es is None:
@@ -165,16 +219,19 @@ def recreate_index(es=None):
    index = WRITE_INDEX
    delete_index(index)

-    mappings = get_mappings()

    # There should be no mapping-conflict race here since the index doesn't
    # exist. Live indexing should just fail.

-    # Simultaneously create the index and the mappings, so live
-    # indexing doesn't get a chance to index anything between the two
-    # and infer a bogus mapping (which ES then freaks out over when we
-    # try to lay in an incompatible explicit mapping).
-    es.create_index(index, settings={'mappings': mappings})
+    # Simultaneously create the index, the mappings, the analyzers, and
+    # the tokenizers, so live indexing doesn't get a chance to index
+    # anything in between and infer a bogus mapping (which ES then freaks
+    # out over when we try to lay in an incompatible explicit mapping).
+    es.create_index(index, settings={
+        'mappings': get_mappings(),
+        'settings': {
+            'analysis': get_analysis(),
+        }
+    })

    # Wait until the index is there.
    es.health(wait_for_status='yellow')
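Taken together, the body handed to es.create_index ends up shaped roughly like this (a sketch only, with the analyzer list abbreviated and the mappings elided):

# Approximate shape of the index-creation settings (not literal output):
index_settings = {
    'mappings': {},  # the per-doctype mappings returned by get_mappings()
    'settings': {
        'analysis': {
            'analyzer': {
                'snowball-english': {'type': 'snowball', 'language': 'English'},
                # ... one 'snowball-<lang>' entry per supported language ...
                'custom-burmese': {'type': 'custom',
                                   'tokenizer': 'custom-burmese',
                                   'char_filter': ['html_strip']},
            },
            'tokenizer': {
                'custom-burmese': {'type': 'nGram',
                                   'min_gram': 2,
                                   'max_gram': 2},
            },
        },
    },
}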
@@ -619,3 +676,18 @@ def verify_obj(mt_name, cls, mapping, obj_id):
format_time((time.time() - cls_time) * 1000 / count)))
log.info('Done! {0}'.format(format_time(time.time() - start_time)))


def es_analyzer_for_locale(locale, fallback="standard"):
    """Pick an appropriate analyzer for a given locale.

    If no analyzer is defined for `locale`, return `fallback` instead,
    which defaults to the ES analyzer named "standard".
    """
    analyzer = settings.ES_LOCALE_ANALYZERS.get(locale, fallback)

    if (not settings.ES_USE_PLUGINS and
            analyzer in settings.ES_PLUGIN_ANALYZERS):
        analyzer = fallback

    return analyzer
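To make the fallback behaviour concrete, a few illustrative calls (not from the patch; they assume the ES_LOCALE_ANALYZERS and ES_PLUGIN_ANALYZERS settings added further down in kitsune/settings.py):

# Illustrative usage of the helper above:
es_analyzer_for_locale('fr')   # 'snowball-french'
es_analyzer_for_locale('my')   # 'custom-burmese'
es_analyzer_for_locale('xx')   # 'standard' -- no analyzer defined, fallback
es_analyzer_for_locale('pl')   # 'standard' while ES_USE_PLUGINS is False,
                               # because 'polish' needs an ES plugin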
@@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
import json
import unittest
from datetime import datetime, timedelta
@@ -24,6 +25,7 @@
from kitsune.sumo.tests import LocalizingClient
from kitsune.sumo.urlresolvers import reverse
from kitsune.users.tests import group, user
from kitsune.wiki.models import DocumentMappingType
from kitsune.wiki.tests import document, revision, helpful_vote
@@ -1100,3 +1102,132 @@ def test_mappings(self):
merged_mapping[key][1].append(cls_name)
# If we get here, then we're fine.


class TestAnalyzers(ElasticTestCase):

    def setUp(self):
        super(TestAnalyzers, self).setUp()

        self.locale_data = {
            'en-US': {
                'analyzer': 'snowball-english',
                'content': 'I have a cat.',
            },
            'es': {
                'analyzer': 'snowball-spanish',
                'content': 'Tieno un gato.',
            },
            'ar': {
                'analyzer': 'arabic',
                'content': u'لدي اثنين من القطط',
            },
            'my': {
                'analyzer': 'custom-burmese',
                'content': u'အနုပညာ',
            },
            'he': {
                'analyzer': 'standard',
                'content': u'גאולוגיה היא אחד',
            },
        }

        self.docs = {}
        for locale, data in self.locale_data.items():
            d = document(locale=locale, save=True)
            revision(document=d, content=data['content'], is_approved=True,
                     save=True)
            self.docs[locale] = d

        self.refresh()

    def test_analyzer_choices(self):
        """Check that the indexer picked the right analyzer."""
        ids = [d.id for d in self.docs.values()]
        docs = es_utils.get_documents(DocumentMappingType, ids)
        for doc in docs:
            locale = doc['locale']
            eq_(doc['_analyzer'], self.locale_data[locale]['analyzer'])

    def _check_locale_tokenization(self, locale, expected_tokens, p_tag=True):
        """Check that a given locale's document was tokenized correctly.

        * `locale` - The locale to check.
        * `expected_tokens` - An iterable of the tokens that should be
          found. If any tokens from this list are missing, or if any
          tokens not in this list are found, the check will fail.
        * `p_tag` - Default True. If True, an extra token, "p", will be
          added to `expected_tokens`.

          This is because our wiki parser wraps its content in <p> tags,
          and many analyzers tokenize a string like '<p>Foo</p>' as
          ['p', 'foo'] (the HTML tag is included in the tokenization), so
          the p token shows up during this test. Not all of the analyzers
          do this, which is why it can be turned off.

        Why can't we fix the analyzers to strip out that HTML and not
        generate spurious tokens? That could probably be done, but it
        probably isn't worthwhile because:

        * ES weights common words lower, thanks to its TF-IDF scoring,
          which judges words by how often they appear in the entire
          corpus and in the document, so the p tokens will be largely
          ignored.
        * The pre-l10n search code did it this way, so keeping it doesn't
          break search.
        * When implementing l10n search, I wanted to minimize the number
          of changes needed, and this seemed like an unneeded change.
        """
        search = es_utils.Sphilastic(DocumentMappingType)
        search = search.filter(document_locale=locale)
        facet_filter = search._process_filters([('document_locale', locale)])
        search = search.facet_raw(tokens={
            'terms': {'field': 'document_content'},
            'facet_filter': facet_filter,
        })
        facets = search.facet_counts()

        expected = set(expected_tokens)
        if p_tag:
            # `expected` is a set, so adding 'p' again is harmless;
            # duplicates are ignored.
            expected.add(u'p')

        actual = set(t['term'] for t in facets['tokens'])
        eq_(actual, expected)

    # These five languages were chosen for tokenization testing because
    # they represent the five kinds of language we have: English, other
    # Snowball-supported languages, languages ES supports natively,
    # languages with custom analyzers, and languages with no analyzer,
    # which fall back to the standard analyzer.

    def test_english_tokenization(self):
        """Test that English stemming and stop words work."""
        self._check_locale_tokenization('en-US', ['i', 'have', 'cat'])

    def test_spanish_tokenization(self):
        """Test that Spanish stemming and stop words work."""
        self._check_locale_tokenization('es', ['tien', 'un', 'gat'])

    def test_arabic_tokenization(self):
        """Test that Arabic stemming works.

        I don't read Arabic; this is just what ES gave me when I asked it
        to analyze an Arabic text as Arabic. If someone who reads Arabic
        can improve this test, go for it!
        """
        self._check_locale_tokenization('ar', [u'لد', u'اثن', u'قطط'])

    def test_burmese_tokenization(self):
        """Test that the shingle analyzer is active for Burmese."""
        tokens = [u'အန', u'နု', u'ုပ', u'ပည', u'ညာ']
        self._check_locale_tokenization('my', tokens, False)

    def test_hebrew_tokenization(self):
        """Test that Hebrew uses the standard analyzer."""
        tokens = [u'גאולוגיה', u'היא', u'אחד']
        self._check_locale_tokenization('he', tokens)
@@ -209,11 +209,52 @@
    'sv-SE': 'sv',
}

ES_LOCALE_ANALYZERS = {
    'ar': 'arabic',
    'bg': 'bulgarian',
    'ca': 'snowball-catalan',
    'cs': 'czech',
    'da': 'snowball-danish',
    'de': 'snowball-german',
    'en-US': 'snowball-english',
    'es': 'snowball-spanish',
    'eu': 'snowball-basque',
    'fa': 'persian',
    'fi': 'snowball-finnish',
    'fr': 'snowball-french',
    'gl': 'galician',
    'hi-IN': 'hindi',
    'hu': 'snowball-hungarian',
    'hy-AM': 'snowball-armenian',
    'id': 'indonesian',
    'it': 'snowball-italian',
    'ja': 'cjk',
    'my': 'custom-burmese',
    'nb-NO': 'snowball-norwegian',
    'nl': 'snowball-dutch',
    'no': 'snowball-norwegian',
    'pl': 'polish',
    'pt-BR': 'snowball-portuguese',
    'pt-PT': 'snowball-portuguese',
    'ro': 'snowball-romanian',
    'ru': 'snowball-russian',
    'sv': 'snowball-swedish',
    'th': 'thai',
    'tr': 'snowball-turkish',
    'zh-CN': 'chinese',
    'zh-TW': 'chinese',
}

ES_PLUGIN_ANALYZERS = [
    'polish',
]

ES_USE_PLUGINS = False
TEXT_DOMAIN = 'messages'
SITE_ID = 1
# If you set this to False, Django will make some optimizations so as
# not to load the internationalization machinery.
USE_I18N = True
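The ES_USE_PLUGINS flag added above is off by default, so locales whose analyzer is plugin-backed (currently only 'pl') silently fall back to the standard analyzer. A deployment whose Elasticsearch cluster actually has the extra analysis plugins installed would presumably opt in through local settings; a minimal sketch, with the file name assumed:

# settings_local.py (assumed file name): opt in to plugin-backed analyzers
# only on clusters that have the plugins installed. Otherwise
# es_analyzer_for_locale('pl') keeps returning 'standard'.
ES_USE_PLUGINS = True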
@@ -547,7 +588,7 @@ def JINJA_CONFIG():
ES_INDEXES = {'default': 'sumo-20130701'}

# Indexes for indexing--set this to ES_INDEXES if you want to read to
# and write to the same index.
-ES_WRITE_INDEXES = ES_INDEXES
+ES_WRITE_INDEXES = {'default': 'sumo-20130723'}
# This is prepended to index names to get the final read/write index
# names used by kitsune. This is so that you can have multiple
# environments pointed at the same ElasticSearch cluster and not have
@@ -19,7 +19,8 @@
from kitsune.products.models import Product, Topic
from kitsune.questions.models import Question
-from kitsune.search.es_utils import UnindexMeBro, ES_EXCEPTIONS
+from kitsune.search.es_utils import (UnindexMeBro, ES_EXCEPTIONS,
+                                     es_analyzer_for_locale)
from kitsune.search.models import (
    SearchMappingType, SearchMixin, register_for_indexing,
    register_mapping_type)
@@ -686,18 +687,17 @@ def get_mapping(cls):
                'product': {'type': 'string', 'index': 'not_analyzed'},
                'topic': {'type': 'string', 'index': 'not_analyzed'},

-                'document_title': {'type': 'string', 'analyzer': 'snowball'},
+                'document_title': {'type': 'string'},
                'document_locale': {'type': 'string', 'index': 'not_analyzed'},
                'document_current_id': {'type': 'integer'},
                'document_parent_id': {'type': 'integer'},
-                'document_content': {'type': 'string', 'analyzer': 'snowball',
-                                     'store': 'yes',
+                'document_content': {'type': 'string', 'store': 'yes',
                                     'term_vector': 'with_positions_offsets'},
                'document_category': {'type': 'integer'},
                'document_slug': {'type': 'string', 'index': 'not_analyzed'},
                'document_is_archived': {'type': 'boolean'},
-                'document_summary': {'type': 'string', 'analyzer': 'snowball'},
-                'document_keywords': {'type': 'string', 'analyzer': 'snowball'},
+                'document_summary': {'type': 'string'},
+                'document_keywords': {'type': 'string'},
                'document_recent_helpful_votes': {'type': 'integer'}
            }
        }
@@ -758,6 +758,9 @@ def extract_document(cls, obj_id, obj=None):
        else:
            d['document_recent_helpful_votes'] = 0

        # Select a locale-appropriate default analyzer for all strings.
        d['_analyzer'] = es_analyzer_for_locale(obj.locale)

        return d

    @classmethod
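For illustration only (field values invented), an extracted document for an 'es' article would now carry the analyzer name alongside the other fields, which is what lets ES analyze each document's strings with a locale-appropriate analyzer:

# Hypothetical extract_document() output, heavily abridged:
extracted = {
    'document_locale': 'es',
    'document_title': u'Como borrar la cache',
    'document_content': u'<p>...</p>',
    'document_recent_helpful_votes': 3,
    '_analyzer': 'snowball-spanish',  # from es_analyzer_for_locale('es')
}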
