From 1a9aeca1d8d6ecf5773cf54391ee32d07cd34cf4 Mon Sep 17 00:00:00 2001 From: Rob Hudson Date: Wed, 4 Sep 2013 07:28:02 -0700 Subject: [PATCH] Added search ICU support to mapping (bug 907843) ICU adds unicode folding and normalization helping our multi-lingual searches. --- apps/constants/search.py | 29 +++++++ docs/topics/install-zamboni/elasticsearch.rst | 34 +++++--- lib/es/management/commands/reindex_mkt.py | 1 + mkt/search/tests/test_api.py | 9 ++ mkt/webapps/models.py | 82 +++++++++++++++++-- 5 files changed, 137 insertions(+), 18 deletions(-) diff --git a/apps/constants/search.py b/apps/constants/search.py index 74625247053..0f427155b9c 100644 --- a/apps/constants/search.py +++ b/apps/constants/search.py @@ -64,3 +64,32 @@ SEARCH_ANALYZER_PLUGINS = [ 'polish', ] + + +# Which stemmer to use for each language. +# +# Note: We use the keys of this dict for supported stop words, also, which is +# specified as, e.g., '_english_'. +STEMMER_MAP = { + 'arabic': 'arabic', + 'basque': 'basque', + 'brazilian': 'brazilian', + 'bulgarian': 'bulgarian', + 'catalan': 'catalan', + 'czech': 'czech', + 'danish': 'danish', + 'dutch': 'dutch', + 'english': 'minimal_english', + 'finnish': 'light_finish', # Yes, this is misspelled in ES. + 'french': 'light_french', + 'german': 'light_german', + 'greek': 'greek', + 'hungarian': 'light_hungarian', + 'indonesian': 'indonesian', + 'italian': 'light_italian', + 'portuguese': 'light_portuguese', + 'romanian': 'romanian', + 'russian': 'russian', + 'spanish': 'light_spanish', + 'swedish': 'light_swedish', +} diff --git a/docs/topics/install-zamboni/elasticsearch.rst b/docs/topics/install-zamboni/elasticsearch.rst index 9f214761f84..6488361d863 100644 --- a/docs/topics/install-zamboni/elasticsearch.rst +++ b/docs/topics/install-zamboni/elasticsearch.rst @@ -1,11 +1,11 @@ .. _elasticsearch: ============= -elasticsearch +Elasticsearch ============= -elasticsearch is a search server.
Documents (key-values) get stored, -configurable queries come in, elasticsearch scores these documents, and returns +Elasticsearch is a search server. Documents (key-values) get stored, +configurable queries come in, Elasticsearch scores these documents, and returns the most relevant hits. Also check out `elasticsearch-head `_, @@ -15,14 +15,19 @@ elasticsearch over curl. Installation ------------ -elasticsearch comes with most package managers.:: +Elasticsearch comes with most package managers.:: brew install elasticsearch # or whatever your package manager is called. -If elasticsearch isn't packaged for your system, you can install it +If Elasticsearch isn't packaged for your system, you can install it manually, `here are some good instructions on how to do so `_. +For running Marketplace you must install the +`ICU Analysis Plugin `_. +See the `ICU Github Page `_ +for instructions on installing this plugin. + Settings -------- @@ -33,7 +38,7 @@ different from normal text. To get the same results as our servers, put this in your elasticsearch.yml (available at :src:`scripts/elasticsearch/elasticsearch.yml`) -Once installed, we can configure elasticsearch. Zamboni has a ```config.yml``` +Once installed, we can configure Elasticsearch. Zamboni has a ```config.yml``` in the ```scripts/elasticsearch/``` directory. If on OSX, copy that file into ```/usr/local/Cellar/elasticsearch/x.x.x/config/```. On Linux, the directory is ```/etc/elasticsearch/```. @@ -44,9 +49,9 @@ won't notice. Launching and Setting Up ------------------------ -Launch the elasticsearch service. If you used homebrew, `brew info +Launch the Elasticsearch service. If you used homebrew, `brew info elasticsearch` will show you the commands to launch. If you used aptitude, -elasticsearch will come with an start-stop daemon in /etc/init.d. +Elasticsearch will come with an start-stop daemon in /etc/init.d. Zamboni has commands that sets up mappings and indexes objects such as add-ons and apps for you. 
Setting up the mappings is analagous defining the structure @@ -83,7 +88,7 @@ maintained incrementally through post_save and post_delete hooks.:: ./manage.py weekly_downloads # Index weekly downloads. -Querying ElasticSearch in Django +Querying Elasticsearch in Django -------------------------------- We use `elasticutils `_, a Python @@ -98,12 +103,15 @@ manager. `.filter(**kwargs)` can be run on this search object.:: .values_dict('that_field')) On Marketplace, apps use ```mkt/webapps/models:WebappIndexer``` as its -interface to elasticsearch. +interface to Elasticsearch. Search is done a little differently using +this and results are a list of ``WebappIndexer`` objects:: + + query_results = S(WebappIndexer).filter(...) -Testing with elasticsearch +Testing with Elasticsearch -------------------------- -All test cases using ElasticSearch should inherit from `amo.tests.ESTestCase`. +All test cases using Elasticsearch should inherit from `amo.tests.ESTestCase`. All such tests will be skipped by the test runner unless:: RUN_ES_TESTS = True @@ -117,7 +125,7 @@ Troubleshooting *I got a CircularReference error on .search()* - check that a whole object is not being passed into the filters, but rather just a field's value. -*I indexed something into ElasticSearch, but my query returns nothing* - check +*I indexed something into Elasticsearch, but my query returns nothing* - check whether the query contains upper-case letters or hyphens. If so, try lowercasing your query filter. For hyphens, set the field's mapping to not be analyzed:: diff --git a/lib/es/management/commands/reindex_mkt.py b/lib/es/management/commands/reindex_mkt.py index 0a8cf5b3810..c1a3483339b 100644 --- a/lib/es/management/commands/reindex_mkt.py +++ b/lib/es/management/commands/reindex_mkt.py @@ -240,6 +240,7 @@ def handle(self, *args, **kwargs): # copy in Elasticsearch. # For ES < 0.90 we manually enable compression. 
chain |= create_index.si(new_index, ALIAS, { + 'analysis': WebappIndexer.get_analysis(), 'number_of_replicas': 0, 'number_of_shards': num_shards, 'store.compress.tv': True, 'store.compress.stored': True, 'refresh_interval': '-1'}) diff --git a/mkt/search/tests/test_api.py b/mkt/search/tests/test_api.py index f5e95462873..9515b76a39e 100644 --- a/mkt/search/tests/test_api.py +++ b/mkt/search/tests/test_api.py @@ -195,6 +195,15 @@ def test_q_is_tag(self): obj = res.json['objects'][0] eq_(obj['slug'], self.webapp.app_slug) + def test_icu_folding(self): + self.webapp.name = {'es': 'Páginas Amarillos'} + self.webapp.save() + self.refresh('webapp') + res = self.client.get(self.url + ({'q': 'paginas'},)) + eq_(res.status_code, 200) + obj = res.json['objects'][0] + eq_(obj['slug'], self.webapp.app_slug) + def test_name_localized(self): res = self.client.get(self.url + ({'q': 'something', 'lang': 'es'},)) diff --git a/mkt/webapps/models.py b/mkt/webapps/models.py index 51be9889c70..5bfc4dedc90 100644 --- a/mkt/webapps/models.py +++ b/mkt/webapps/models.py @@ -936,11 +936,80 @@ def get_index(cls): def get_model(cls): return Webapp + @classmethod + def get_settings(cls, settings_override=None): + """ + Returns settings to be passed to ES create_index. + + If `settings_override` is provided, this will use `settings_override` + to override the defaults defined here. + + """ + default_settings = { + 'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS, + 'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS, + 'refresh_interval': '5s', + 'store.compress.tv': True, + 'store.compress.stored': True, + 'analysis': cls.get_analysis(), + } + if settings_override: + default_settings.update(settings_override) + + return default_settings + + @classmethod + def get_analysis(cls): + """ + Returns the analysis dict to be used in settings for create_index. + + For languages that ES supports we define either the minimal or light + stemming, which isn't as aggressive as the snowball stemmer.
We also + define the stopwords for that language. + + For all languages we've customized we're using the ICU plugin. + + """ + analyzers = {} + filters = {} + + # The default is used for fields that need ICU but are composed of + # many languages. + analyzers['default_icu'] = { + 'type': 'custom', + 'tokenizer': 'icu_tokenizer', + 'filter': ['icu_folding', 'icu_normalizer'], + } + + for lang, stemmer in amo.STEMMER_MAP.items(): + filters['%s_stem_filter' % lang] = { + 'type': 'stemmer', + 'name': stemmer, + } + filters['%s_stop_filter' % lang] = { + 'type': 'stop', + 'stopwords': ['_%s_' % lang], + } + + for lang in amo.STEMMER_MAP: + analyzers['%s_analyzer' % lang] = { + 'type': 'custom', + 'tokenizer': 'icu_tokenizer', + 'filter': ['icu_folding', 'icu_normalizer', + '%s_stop_filter' % lang, '%s_stem_filter' % lang], + } + + return { + 'analyzer': analyzers, + 'filter': filters, + } + @classmethod def setup_mapping(cls): """Creates the ES index/mapping.""" cls.get_es().create_index(cls.get_index(), - {'mappings': cls.get_mapping()}) + {'mappings': cls.get_mapping(), + 'settings': cls.get_settings()}) @classmethod def get_mapping(cls): @@ -948,8 +1017,10 @@ def get_mapping(cls): doc_type = cls.get_mapping_type_name() def _locale_field_mapping(field, analyzer): - return {'%s_%s' % (field, analyzer): {'type': 'string', - 'analyzer': analyzer}} + get_analyzer = lambda a: ( + '%s_analyzer' % a if a in amo.STEMMER_MAP else a) + return {'%s_%s' % (field, analyzer): { + 'type': 'string', 'analyzer': get_analyzer(analyzer)}} mapping = { doc_type: { @@ -977,7 +1048,8 @@ def _locale_field_mapping(field, analyzer): 'index': 'not_analyzed'}, 'default_locale': {'type': 'string', 'index': 'not_analyzed'}, - 'description': {'type': 'string', 'analyzer': 'snowball'}, + 'description': {'type': 'string', + 'analyzer': 'default_icu'}, 'device': {'type': 'byte'}, 'features': { 'type': 'object', @@ -1009,7 +1081,7 @@ def _locale_field_mapping(field, analyzer): }, 'manifest_url': 
{'type': 'string', 'index': 'not_analyzed'}, - 'name': {'type': 'string', 'analyzer': 'snowball'}, + 'name': {'type': 'string', 'analyzer': 'default_icu'}, # Turn off analysis on name so we can sort by it. 'name_sort': {'type': 'string', 'index': 'not_analyzed'}, 'owners': {'type': 'long'},