This repository has been archived by the owner on Mar 15, 2018. It is now read-only.

Simple URL parsing for websites (bug 1147915)
robhudson committed Jun 8, 2015
1 parent 5691ce7 commit 849a09f
Showing 3 changed files with 55 additions and 5 deletions.
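
In short: websites gain a url_tokenized field built by stripping the scheme, common subdomains, and common TLDs from the site URL. Full-text search now matches against this field, while the raw url field becomes not-analyzed so it can serve exact matches. A sketch of the intended effect, per the strip_url() docstring added below:

    # 'https://m.domain.com/topic/'  ->  indexed as 'domain/topic/'
    # Elasticsearch's 'simple' analyzer then splits that into
    # ['domain', 'topic'], so queries for 'domain' or 'topic' match.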
mkt/search/filters.py (1 addition & 1 deletion)
@@ -51,7 +51,7 @@ def filter_queryset(self, request, queryset, view):
         # in every document type / indexes.
         for k, v in rules:
             for field in ('name', 'short_name', 'title', 'app_slug', 'author',
-                          'url'):
+                          'url_tokenized'):
                 should.append(k(**{field: v}))
 
         # Exact matches need to be queried against a non-analyzed field. Let's
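For context, a rough sketch of the kind of bool/should query the loop in the hunk above assembles once url_tokenized joins the field list (the exact shape depends on the rules list upstream; the query term here is illustrative):

    # Illustrative only: one should-clause per field, per rule.
    should = [
        {'match': {'name': 'domain topic'}},
        # ... same clause for short_name, title, app_slug, author ...
        {'match': {'url_tokenized': 'domain topic'}},
    ]
    es_query = {'query': {'bool': {'should': should}}}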
mkt/websites/indexers.py (37 additions & 3 deletions)
@@ -1,10 +1,18 @@
+import re
 from operator import attrgetter
+from urlparse import urlparse
 
 from mkt.search.indexers import BaseIndexer
 from mkt.tags.models import attach_tags
 from mkt.translations.models import attach_trans_dict
 
 
+URL_RE = re.compile(
+    r'^www\.|^m\.|^mobile\.|'  # Remove common subdomains.
+    r'\.com$|\.net$|\.org$|\.\w{2}$'  # Remove common TLDs incl. ccTLDs.
+)
+
+
 class WebsiteIndexer(BaseIndexer):
     translated_fields = ('description', 'name', 'short_name', 'title')
     fields_with_language_analyzers = ('description', 'name', 'short_name')
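Worth noting: URL_RE is applied in a single sub() pass, so at most one leading subdomain and one trailing TLD label are removed; the docstring below concedes this is approximate. A hypothetical multi-label ccTLD shows the limit:

    >>> URL_RE.sub('', 'www.example.co.uk')  # hypothetical input
    'example.co'
    # Only the final '.uk' matches the \.\w{2}$ branch; '.co' survives.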
@@ -85,9 +93,8 @@ def get_mapping(cls):
                         'analyzer': 'default_icu',
                         'position_offset_gap': 100,
                     },
-                    # FIXME: Add custom analyzer for url, that strips http,
-                    # https, maybe also www. and any .tld ?
-                    'url': {'type': 'string', 'analyzer': 'simple'},
+                    'url': cls.string_not_analyzed(),
+                    'url_tokenized': {'type': 'string', 'analyzer': 'simple'},
                 }
             }
         }
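The two fields now serve different queries: url stays not-analyzed so exact-match filtering compares the full string, while url_tokenized uses Elasticsearch's built-in 'simple' analyzer, which lowercases and splits on non-letter characters. A hypothetical _analyze call illustrating that behavior:

    # Hypothetical request, for illustration:
    # POST /_analyze  {"analyzer": "simple", "text": "domain/topic/"}
    # -> tokens: ['domain', 'topic']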
@@ -129,6 +136,7 @@ def extract_document(cls, pk=None, obj=None):
         doc['name_sort'] = unicode(obj.name).lower()
         doc['preferred_regions'] = obj.preferred_regions or []
         doc['tags'] = getattr(obj, 'keywords_list', [])
+        doc['url_tokenized'] = cls.strip_url(obj.url)
 
         # Add boost, popularity, trending values.
         doc.update(cls.extract_popularity_trending_boost(obj))
@@ -144,3 +152,29 @@ def extract_document(cls, pk=None, obj=None):
             doc.update(cls.extract_field_analyzed_translations(obj, field))
 
         return doc
+
+    @classmethod
+    def strip_url(cls, url):
+        """
+        Remove all unwanted sections of the URL and return a string that
+        will be passed to Elasticsearch.
+
+        E.g. 'https://m.domain.com/topic/' will become 'domain/topic/',
+        which will further get tokenized by Elasticsearch into 'domain'
+        and 'topic'. This will never be perfect, but it should keep the
+        majority of the cruft out of the index.
+        """
+        bits = urlparse(url)
+
+        # Keep just the netloc.
+        url = bits.netloc
+
+        # Strip common subdomains and TLDs.
+        url = URL_RE.sub('', url)
+
+        # Add back the path.
+        url += bits.path
+
+        return url
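A quick interactive sketch of strip_url() (inputs and expected outputs taken from the test_url() additions below):

    >>> from mkt.websites.indexers import WebsiteIndexer
    >>> WebsiteIndexer.strip_url('https://www.domain.com')
    'domain'
    >>> WebsiteIndexer.strip_url('http://domain.uk')  # ccTLD matches \.\w{2}$
    'domain'
    >>> # urlparse() keeps only netloc + path, so query and fragment drop:
    >>> WebsiteIndexer.strip_url('http://www.domain.com/path/?query#fragment')
    'domain/path/'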
mkt/websites/tests/test_indexers.py (17 additions & 1 deletion)
@@ -59,7 +59,9 @@ def test_extract(self):
         eq_(doc['default_locale'], self.obj.default_locale)
         eq_(doc['created'], self.obj.created)
         eq_(doc['modified'], self.obj.modified)
-        eq_(doc['url'], unicode(self.obj.url))
+        eq_(doc['url'], self.obj.url)
+        eq_(doc['url_tokenized'],
+            unicode(self.indexer.strip_url(self.obj.url)))
         eq_(doc['name'], [unicode(self.obj.name)])
         eq_(doc['name_translations'], [{
             'lang': u'en-US', 'string': unicode(self.obj.name)}])
@@ -138,6 +140,20 @@ def test_trending(self):
         # Adolescent regions trending value is not stored.
         ok_('trending_2' not in doc)
+
+    def test_url(self):
+        self.obj = website_factory()
+        expected = {
+            'http://domain.com': 'domain',
+            'https://www.domain.com': 'domain',
+            'http://m.domain.com': 'domain',
+            'http://mobile.domain.com': 'domain',
+            'http://domain.uk': 'domain',
+            'http://www.domain.com/path/': 'domain/path/',
+            'http://www.domain.com/path/?query#fragment': 'domain/path/',
+        }
+        for k, v in expected.items():
+            eq_(self.indexer.strip_url(k), v)
 
 
 class TestExcludedFields(ESTestCase):
     def setUp(self):
