Added search ICU support to mapping (bug 907843)

ICU adds unicode folding and normalization helping our multi-lingual searches.
mozilla · Sep 4, 2013 · 1a9aeca · 1a9aeca · chuckharmston · Sep 4, 2013
1 parent 1d2395f
commit 1a9aeca
Show file tree

Hide file tree

Showing 5 changed files with 137 additions and 18 deletions.
diff --git a/apps/constants/search.py b/apps/constants/search.py
@@ -64,3 +64,32 @@
 SEARCH_ANALYZER_PLUGINS = [
     'polish',
 ]
+
+
+# Which stemmer to use for each language.
+#
+# Note: The keys of this dict also name the supported stop word lists, which
+# are specified as, e.g., '_english_'.
+STEMMER_MAP = {
+    'arabic': 'arabic',
+    'basque': 'basque',
+    'brazilian': 'brazilian',
+    'bulgarian': 'bulgarian',
+    'catalan': 'catalan',
+    'czech': 'czech',
+    'danish': 'danish',
+    'dutch': 'dutch',
+    'english': 'minimal_english',
+    'finnish': 'light_finish',  # Yes, this is misspelled in ES.
+    'french': 'light_french',
+    'german': 'light_german',
+    'greek': 'greek',
+    'hungarian': 'light_hungarian',
+    'indonesian': 'indonesian',
+    'italian': 'light_italian',
+    'portuguese': 'light_portuguese',
+    'romanian': 'romanian',
+    'russian': 'russian',
+    'spanish': 'light_spanish',
+    'swedish': 'light_swedish',
+}
diff --git a/docs/topics/install-zamboni/elasticsearch.rst b/docs/topics/install-zamboni/elasticsearch.rst
@@ -1,11 +1,11 @@
 .. _elasticsearch:
 
 =============
-elasticsearch
+Elasticsearch
 =============
 
-elasticsearch is a search server. Documents (key-values) get stored,
-configurable queries come in, elasticsearch scores these documents, and returns
+Elasticsearch is a search server. Documents (key-values) get stored,
+configurable queries come in, Elasticsearch scores these documents, and returns
 the most relevant hits.
 
 Also check out `elasticsearch-head <http://mobz.github.io/elasticsearch-head/>`_,
@@ -15,14 +15,19 @@ elasticsearch over curl.
 Installation
 ------------
 
-elasticsearch comes with most package managers.::
+Elasticsearch comes with most package managers.::
 
     brew install elasticsearch  # or whatever your package manager is called.
 
-If elasticsearch isn't packaged for your system, you can install it
+If Elasticsearch isn't packaged for your system, you can install it
 manually, `here are some good instructions on how to do so
 <http://www.elasticsearch.org/tutorials/2010/07/01/setting-up-elasticsearch.html>`_.
 
+For running Marketplace you must install the
+`ICU Analysis Plugin <http://www.elasticsearch.org/guide/reference/index-modules/analysis/icu-plugin/>`_.
+See the `ICU GitHub page <https://github.com/elasticsearch/elasticsearch-analysis-icu>`_
+for instructions on installing this plugin.
+
 Settings
 --------
 
@@ -33,7 +38,7 @@ different from normal text. To get the same results as our servers, put this in
 your elasticsearch.yml (available at
 :src:`scripts/elasticsearch/elasticsearch.yml`)
 
-Once installed, we can configure elasticsearch. Zamboni has a ```config.yml```
+Once installed, we can configure Elasticsearch. Zamboni has a ```config.yml```
 in the ```scripts/elasticsearch/``` directory. If on OSX, copy that file into
 ```/usr/local/Cellar/elasticsearch/x.x.x/config/```. On Linux, the directory is
 ```/etc/elasticsearch/```.
@@ -44,9 +49,9 @@ won't notice.
 Launching and Setting Up
 ------------------------
 
-Launch the elasticsearch service. If you used homebrew, `brew info
+Launch the Elasticsearch service. If you used homebrew, `brew info
 elasticsearch` will show you the commands to launch. If you used aptitude,
-elasticsearch will come with an start-stop daemon in /etc/init.d.
+Elasticsearch will come with a start-stop daemon in /etc/init.d.
 
 Zamboni has commands that sets up mappings and indexes objects such as add-ons
 and apps for you. Setting up the mappings is analagous defining the structure
@@ -83,7 +88,7 @@ maintained incrementally through post_save and post_delete hooks.::
 
     ./manage.py weekly_downloads # Index weekly downloads.
 
-Querying ElasticSearch in Django
+Querying Elasticsearch in Django
 --------------------------------
 
 We use `elasticutils <http://github.com/mozilla/elasticutils>`_, a Python
@@ -98,12 +103,15 @@ manager. `.filter(**kwargs)` can be run on this search object.::
         .values_dict('that_field'))
 
 On Marketplace, apps use ```mkt/webapps/models:WebappIndexer``` as its
-interface to elasticsearch.
+interface to Elasticsearch. Search is done a little differently using
+this and results are a list of ``WebappIndexer`` objects::
+
+    query_results = S(WebappIndexer).filter(...)
 
-Testing with elasticsearch
+Testing with Elasticsearch
 --------------------------
 
-All test cases using ElasticSearch should inherit from `amo.tests.ESTestCase`.
+All test cases using Elasticsearch should inherit from `amo.tests.ESTestCase`.
 All such tests will be skipped by the test runner unless::
 
     RUN_ES_TESTS = True
@@ -117,7 +125,7 @@ Troubleshooting
 *I got a CircularReference error on .search()* - check that a whole object is
 not being passed into the filters, but rather just a field's value.
 
-*I indexed something into ElasticSearch, but my query returns nothing* - check
+*I indexed something into Elasticsearch, but my query returns nothing* - check
 whether the query contains upper-case letters or hyphens. If so, try
 lowercasing your query filter. For hyphens, set the field's mapping to not be
 analyzed::

diff --git a/lib/es/management/commands/reindex_mkt.py b/lib/es/management/commands/reindex_mkt.py
@@ -240,6 +240,7 @@ def handle(self, *args, **kwargs):
         # copy in Elasticsearch.
         # For ES < 0.90 we manually enable compression.
         chain |= create_index.si(new_index, ALIAS, {
+            'analysis': WebappIndexer.get_analysis(),
             'number_of_replicas': 0, 'number_of_shards': num_shards,
             'store.compress.tv': True, 'store.compress.stored': True,
             'refresh_interval': '-1'})

diff --git a/mkt/search/tests/test_api.py b/mkt/search/tests/test_api.py
@@ -195,6 +195,15 @@ def test_q_is_tag(self):
         obj = res.json['objects'][0]
         eq_(obj['slug'], self.webapp.app_slug)
 
+    def test_icu_folding(self):
+        self.webapp.name = {'es': 'Páginas Amarillos'}
+        self.webapp.save()
+        self.refresh('webapp')
+        res = self.client.get(self.url + ({'q': 'paginas'},))
+        eq_(res.status_code, 200)
+        obj = res.json['objects'][0]
+        eq_(obj['slug'], self.webapp.app_slug)
+
     def test_name_localized(self):
         res = self.client.get(self.url + ({'q': 'something',
                                            'lang': 'es'},))

diff --git a/mkt/webapps/models.py b/mkt/webapps/models.py
@@ -936,20 +936,91 @@ def get_index(cls):
     def get_model(cls):
         return Webapp
 
+    @classmethod
+    def get_settings(cls, settings_override=None):
+        """
+        Returns settings to be passed to ES create_index.
+
+        If `settings_override` is provided, this will use `settings_override`
+        to override the defaults defined here.
+
+        """
+        default_settings = {
+            'number_of_replicas': settings.ES_DEFAULT_NUM_REPLICAS,
+            'number_of_shards': settings.ES_DEFAULT_NUM_SHARDS,
+            'refresh_interval': '5s',
+            'store.compress.tv': True,
+            'store.compress.stored': True,
+            'analysis': cls.get_analysis(),
+        }
+        if settings_override:
+            default_settings.update(settings_override)
+
+        return default_settings
+
+    @classmethod
+    def get_analysis(cls):
+        """
+        Returns the analysis dict to be used in settings for create_index.
+
+        For languages that ES supports we define either the minimal or light
+        stemming, which isn't as aggressive as the snowball stemmer. We also
+        define the stopwords for that language.
+
+        For all languages we've customized we're using the ICU plugin.
+
+        """
+        analyzers = {}
+        filters = {}
+
+        # The default is used for fields that need ICU but are composed of
+        # many languages.
+        analyzers['default_icu'] = {
+            'type': 'custom',
+            'tokenizer': 'icu_tokenizer',
+            'filter': ['icu_folding', 'icu_normalizer'],
+        }
+
+        for lang, stemmer in amo.STEMMER_MAP.items():
+            filters['%s_stem_filter' % lang] = {
+                'type': 'stemmer',
+                'name': stemmer,
+            }
+            filters['%s_stop_filter' % lang] = {
+                'type': 'stop',
+                'stopwords': ['_%s_' % lang],
+            }
+
+        for lang in amo.STEMMER_MAP:
+            analyzers['%s_analyzer' % lang] = {
+                'type': 'custom',
+                'tokenizer': 'icu_tokenizer',
+                'filter': ['icu_folding', 'icu_normalizer',
+                           '%s_stop_filter' % lang, '%s_stem_filter' % lang],
+            }
+
+        return {
+            'analyzer': analyzers,
+            'filter': filters,
+        }
+
     @classmethod
     def setup_mapping(cls):
         """Creates the ES index/mapping."""
         cls.get_es().create_index(cls.get_index(),
-                                  {'mappings': cls.get_mapping()})
+                                  {'mappings': cls.get_mapping(),
+                                   'settings': cls.get_settings()})
 
     @classmethod
     def get_mapping(cls):
 
         doc_type = cls.get_mapping_type_name()
 
         def _locale_field_mapping(field, analyzer):
-            return {'%s_%s' % (field, analyzer): {'type': 'string',
-                                                  'analyzer': analyzer}}
+            get_analyzer = lambda a: (
+                '%s_analyzer' % a if a in amo.STEMMER_MAP else a)
+            return {'%s_%s' % (field, analyzer): {
+                'type': 'string', 'analyzer': get_analyzer(analyzer)}}
 
         mapping = {
             doc_type: {
@@ -977,7 +1048,8 @@ def _locale_field_mapping(field, analyzer):
                                         'index': 'not_analyzed'},
                     'default_locale': {'type': 'string',
                                        'index': 'not_analyzed'},
-                    'description': {'type': 'string', 'analyzer': 'snowball'},
+                    'description': {'type': 'string',
+                                    'analyzer': 'default_icu'},
                     'device': {'type': 'byte'},
                     'features': {
                         'type': 'object',
@@ -1009,7 +1081,7 @@ def _locale_field_mapping(field, analyzer):
                     },
                     'manifest_url': {'type': 'string',
                                      'index': 'not_analyzed'},
-                    'name': {'type': 'string', 'analyzer': 'snowball'},
+                    'name': {'type': 'string', 'analyzer': 'default_icu'},
                     # Turn off analysis on name so we can sort by it.
                     'name_sort': {'type': 'string', 'index': 'not_analyzed'},
                     'owners': {'type': 'long'},