Skip to content

Commit 1212c97

Browse files
committed
[Bug 894686] Step 1 - Index Documents by locale.
1 parent 03e571f commit 1212c97

4 files changed

Lines changed: 262 additions & 15 deletions

File tree

kitsune/search/es_utils.py

Lines changed: 79 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,60 @@ def get_documents(cls, ids):
157157
return list(ret)
158158

159159

160+
def get_analysis():
    """Build the custom ``analysis`` settings used when creating the index.

    Returns a dict with two keys:

    * ``'analyzer'`` - one ``snowball-<language>`` analyzer for each
      Snowball language listed below, plus ``custom-burmese``.
    * ``'tokenizer'`` - the ``custom-burmese`` tokenizer that the
      analyzer of the same name refers to.
    """
    snowball_languages = (
        'Armenian', 'Basque', 'Catalan', 'Danish', 'Dutch', 'English',
        'Finnish', 'French', 'German', 'Hungarian', 'Italian', 'Norwegian',
        'Portuguese', 'Romanian', 'Russian', 'Spanish', 'Swedish', 'Turkish',
    )

    # One Snowball analyzer per language, keyed by the lowercased
    # language name (e.g. 'snowball-english').
    analyzers = dict(
        ('snowball-' + name.lower(), {'type': 'snowball', 'language': name})
        for name in snowball_languages)

    # Burmese (my) gets its own analyzer.
    #
    # Burmese uses spaces to separate phrases rather than words, so it
    # can't really be split into words the way other languages are.
    # Instead, like the built-in CJK analyzer (which doesn't work
    # reliably here), this produces overlapping two-character
    # substrings.
    #
    # For the string 'abc def' the generated tokens are:
    # ['ab', 'bc', 'c ', ' d', 'de', 'ef']. ES understands this kind of
    # overlapping token, and hopefully that gives an ok search
    # experience.
    analyzers['custom-burmese'] = {
        'type': 'custom',
        'tokenizer': 'custom-burmese',
        'char_filter': ['html_strip'],
    }

    tokenizers = {
        'custom-burmese': {
            'type': 'nGram',
            'min_gram': 2,
            'max_gram': 2,
        },
    }

    return {'analyzer': analyzers, 'tokenizer': tokenizers}
212+
213+
160214
def recreate_index(es=None):
161215
"""Deletes WRITE_INDEX if it's there and creates a new one"""
162216
if es is None:
@@ -165,16 +219,19 @@ def recreate_index(es=None):
165219
index = WRITE_INDEX
166220
delete_index(index)
167221

168-
mappings = get_mappings()
169-
170222
# There should be no mapping-conflict race here since the index doesn't
171223
# exist. Live indexing should just fail.
172224

173-
# Simultaneously create the index and the mappings, so live
174-
# indexing doesn't get a chance to index anything between the two
175-
# and infer a bogus mapping (which ES then freaks out over when we
176-
# try to lay in an incompatible explicit mapping).
177-
es.create_index(index, settings={'mappings': mappings})
225+
# Simultaneously create the index, the mappings, the analyzers, and
226+
# the tokenizers, so live indexing doesn't get a chance to index
227+
# anything between and infer a bogus mapping (which ES then freaks
228+
# out over when we try to lay in an incompatible explicit mapping).
229+
es.create_index(index, settings={
230+
'mappings': get_mappings(),
231+
'settings': {
232+
'analysis': get_analysis(),
233+
}
234+
})
178235

179236
# Wait until the index is there.
180237
es.health(wait_for_status='yellow')
@@ -619,3 +676,18 @@ def verify_obj(mt_name, cls, mapping, obj_id):
619676
format_time((time.time() - cls_time) * 1000 / count)))
620677

621678
log.info('Done! {0}'.format(format_time(time.time() - start_time)))
679+
680+
681+
def es_analyzer_for_locale(locale, fallback="standard"):
    """Return the name of the ES analyzer to use for `locale`.

    The name is looked up in ``settings.ES_LOCALE_ANALYZERS``. `fallback`
    (by default the ES analyzer named "standard") is returned instead
    when:

    * no analyzer is configured for `locale`, or
    * the configured analyzer is listed in
      ``settings.ES_PLUGIN_ANALYZERS`` while ``settings.ES_USE_PLUGINS``
      is false.
    """
    chosen = settings.ES_LOCALE_ANALYZERS.get(locale, fallback)

    # With plugins enabled, every configured analyzer is usable as-is.
    if settings.ES_USE_PLUGINS:
        return chosen

    # Plugins are off: don't hand out an analyzer that needs one.
    if chosen in settings.ES_PLUGIN_ANALYZERS:
        return fallback

    return chosen

kitsune/search/tests/test_es.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
# -*- coding: utf-8 -*-
12
import json
23
import unittest
34
from datetime import datetime, timedelta
@@ -24,6 +25,7 @@
2425
from kitsune.sumo.tests import LocalizingClient
2526
from kitsune.sumo.urlresolvers import reverse
2627
from kitsune.users.tests import group, user
28+
from kitsune.wiki.models import DocumentMappingType
2729
from kitsune.wiki.tests import document, revision, helpful_vote
2830

2931

@@ -1100,3 +1102,132 @@ def test_mappings(self):
11001102
merged_mapping[key][1].append(cls_name)
11011103

11021104
# If we get here, then we're fine.
1105+
1106+
1107+
class TestAnalyzers(ElasticTestCase):
    """Check per-locale analyzer selection and tokenization of documents."""

    def setUp(self):
        """Create and index one approved document per test locale."""
        super(TestAnalyzers, self).setUp()

        # For each locale: the analyzer we expect the indexer to pick,
        # and the content of the document created for that locale.
        self.locale_data = {
            'en-US': {
                'analyzer': 'snowball-english',
                'content': 'I have a cat.',
            },
            'es': {
                'analyzer': 'snowball-spanish',
                'content': 'Tieno un gato.',
            },
            'ar': {
                'analyzer': 'arabic',
                'content': u'لدي اثنين من القطط',
            },
            'my': {
                'analyzer': 'custom-burmese',
                'content': u'အနုပညာ',
            },
            'he': {
                'analyzer': 'standard',
                'content': u'גאולוגיה היא אחד',
            }
        }

        self.docs = {}
        for locale, data in self.locale_data.items():
            d = document(locale=locale, save=True)
            revision(document=d, content=data['content'],
                     is_approved=True, save=True)
            self.locale_data[locale]['doc'] = d
            # Record the document here too: test_analyzer_choices reads
            # its ids from self.docs, which previously stayed empty and
            # made that test pass without checking anything.
            self.docs[locale] = d

        self.refresh()

    def test_analyzer_choices(self):
        """Check that the indexer picked the right analyzer."""

        ids = [d.id for d in self.docs.values()]
        docs = es_utils.get_documents(DocumentMappingType, ids)
        for doc in docs:
            locale = doc['locale']
            eq_(doc['_analyzer'], self.locale_data[locale]['analyzer'])

    def _check_locale_tokenization(self, locale, expected_tokens, p_tag=True):
        """
        Check that a given locale's document was tokenized correctly.

        * `locale` - The locale to check.
        * `expected_tokens` - An iterable of the tokens that should be
          found. If any tokens from this list are missing, or if any
          tokens not in this list are found, the check will fail.
        * `p_tag` - Default True. If True, an extra token will be added
          to `expected_tokens`: "p".

          This is because our wiki parser wraps its content in <p>
          tags and many analyzers will tokenize a string like
          '<p>Foo</p>' as ['p', 'foo'] (the HTML tag is included in
          the tokenization). So this will show up in the tokenization
          during this test. Not all the analyzers do this, which is
          why it can be turned off.

        Why can't we fix the analyzers to strip out that HTML, and not
        generate spurious tokens? That could probably be done, but it
        probably isn't worthwhile because:

        * ES will weight common words lower, thanks to its TF-IDF
          algorithm, which judges words based on how often they
          appear in the entire corpus and in the document, so the p
          tokens will be largely ignored.
        * The pre-l10n search code did it this way, so it doesn't
          break search.
        * When implementing l10n search, I wanted to minimize the
          number of changes needed, and this seemed like an unneeded
          change.
        """

        search = es_utils.Sphilastic(DocumentMappingType)
        search = search.filter(document_locale=locale)
        # Restrict the terms facet to this locale's documents as well,
        # so tokens from the other locales' documents are not counted.
        facet_filter = search._process_filters([('document_locale', locale)])
        search = search.facet_raw(tokens={
            'terms': {'field': 'document_content'},
            'facet_filter': facet_filter,
        })
        facets = search.facet_counts()

        expected = set(expected_tokens)
        if p_tag:
            # Since `expected` is a set, there is no problem adding this
            # twice, since duplicates will be ignored.
            expected.add(u'p')
        actual = set(t['term'] for t in facets['tokens'])
        eq_(actual, expected)

    # These 5 languages were chosen for tokenization testing because
    # they represent the 5 kinds of languages we have: English, Snowball
    # supported languages, ES supported languages, languages with custom
    # analyzers, and languages with no analyzer, which use the standard
    # analyzer.

    def test_english_tokenization(self):
        """Test that English stemming and stop words work."""
        self._check_locale_tokenization('en-US', ['i', 'have', 'cat'])

    def test_spanish_tokenization(self):
        """Test that Spanish stemming and stop words work."""
        self._check_locale_tokenization('es', ['tien', 'un', 'gat'])

    def test_arabic_tokenization(self):
        """Test that Arabic stemming works.

        I don't read Arabic, this is just what ES gave me when I asked
        it to analyze an Arabic text as Arabic. If someone who reads
        Arabic can improve this test, go for it!
        """
        self._check_locale_tokenization('ar', [u'لد', u'اثن', u'قطط'])

    def test_burmese_tokenization(self):
        """Test that the shingle analyzer is active for Burmese."""
        tokens = [u'အန', u'နု', u'ုပ', u'ပည', u'ညာ']
        self._check_locale_tokenization('my', tokens, False)

    def test_herbrew_tokenization(self):
        """Test that Hebrew uses the standard analyzer."""
        tokens = [u'גאולוגיה', u'היא', u'אחד']
        self._check_locale_tokenization('he', tokens)

kitsune/settings.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -209,11 +209,52 @@
209209
'sv-SE': 'sv',
210210
}
211211

212+
# Maps a locale code to the name of the ES analyzer used for documents
# in that locale. The 'snowball-*' and 'custom-burmese' names are the
# custom analyzers built by get_analysis() in kitsune/search/es_utils.py;
# the remaining names are presumably analyzers provided by ES itself or
# by a plugin (see ES_PLUGIN_ANALYZERS). Locales missing from this dict
# get the fallback analyzer chosen by es_analyzer_for_locale().
ES_LOCALE_ANALYZERS = {
    'ar': 'arabic',
    'bg': 'bulgarian',
    'ca': 'snowball-catalan',
    'cs': 'czech',
    'da': 'snowball-danish',
    'de': 'snowball-german',
    'en-US': 'snowball-english',
    'es': 'snowball-spanish',
    'eu': 'snowball-basque',
    'fa': 'persian',
    'fi': 'snowball-finnish',
    'fr': 'snowball-french',
    'gl': 'galician',
    'hi-IN': 'hindi',
    'hu': 'snowball-hungarian',
    'hy-AM': 'snowball-armenian',
    'id': 'indonesian',
    'it': 'snowball-italian',
    'ja': 'cjk',
    'my': 'custom-burmese',
    'nb-NO': 'snowball-norwegian',
    'nl': 'snowball-dutch',
    'no': 'snowball-norwegian',
    'pl': 'polish',
    'pt-BR': 'snowball-portuguese',
    'pt-PT': 'snowball-portuguese',
    'ro': 'snowball-romanian',
    'ru': 'snowball-russian',
    'sv': 'snowball-swedish',
    'th': 'thai',
    'tr': 'snowball-turkish',
    'zh-CN': 'chinese',
    'zh-TW': 'chinese',
}

# Analyzer names that es_analyzer_for_locale() replaces with its
# fallback whenever ES_USE_PLUGINS is False.
ES_PLUGIN_ANALYZERS = [
    'polish',
]

# Whether analyzers listed in ES_PLUGIN_ANALYZERS may be used.
ES_USE_PLUGINS = False
253+
212254
TEXT_DOMAIN = 'messages'
213255

214256
SITE_ID = 1
215257

216-
217258
# If you set this to False, Django will make some optimizations so as
218259
# not to load the internationalization machinery.
219260
USE_I18N = True
@@ -547,7 +588,7 @@ def JINJA_CONFIG():
547588
ES_INDEXES = {'default': 'sumo-20130701'}
548589
# Indexes for indexing--set this to ES_INDEXES if you want to read to
549590
# and write to the same index.
550-
ES_WRITE_INDEXES = ES_INDEXES
591+
ES_WRITE_INDEXES = {'default': 'sumo-20130723'}
551592
# This is prepended to index names to get the final read/write index
552593
# names used by kitsune. This is so that you can have multiple
553594
# environments pointed at the same ElasticSearch cluster and not have

kitsune/wiki/models.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919

2020
from kitsune.products.models import Product, Topic
2121
from kitsune.questions.models import Question
22-
from kitsune.search.es_utils import UnindexMeBro, ES_EXCEPTIONS
22+
from kitsune.search.es_utils import (UnindexMeBro, ES_EXCEPTIONS,
23+
es_analyzer_for_locale)
2324
from kitsune.search.models import (
2425
SearchMappingType, SearchMixin, register_for_indexing,
2526
register_mapping_type)
@@ -686,18 +687,17 @@ def get_mapping(cls):
686687
'product': {'type': 'string', 'index': 'not_analyzed'},
687688
'topic': {'type': 'string', 'index': 'not_analyzed'},
688689

689-
'document_title': {'type': 'string', 'analyzer': 'snowball'},
690+
'document_title': {'type': 'string'},
690691
'document_locale': {'type': 'string', 'index': 'not_analyzed'},
691692
'document_current_id': {'type': 'integer'},
692693
'document_parent_id': {'type': 'integer'},
693-
'document_content': {'type': 'string', 'analyzer': 'snowball',
694-
'store': 'yes',
694+
'document_content': {'type': 'string', 'store': 'yes',
695695
'term_vector': 'with_positions_offsets'},
696696
'document_category': {'type': 'integer'},
697697
'document_slug': {'type': 'string', 'index': 'not_analyzed'},
698698
'document_is_archived': {'type': 'boolean'},
699-
'document_summary': {'type': 'string', 'analyzer': 'snowball'},
700-
'document_keywords': {'type': 'string', 'analyzer': 'snowball'},
699+
'document_summary': {'type': 'string'},
700+
'document_keywords': {'type': 'string'},
701701
'document_recent_helpful_votes': {'type': 'integer'}
702702
}
703703
}
@@ -758,6 +758,9 @@ def extract_document(cls, obj_id, obj=None):
758758
else:
759759
d['document_recent_helpful_votes'] = 0
760760

761+
# Select a locale-appropriate default analyzer for all strings.
762+
d['_analyzer'] = es_analyzer_for_locale(obj.locale)
763+
761764
return d
762765

763766
@classmethod

0 commit comments

Comments
 (0)