This repository has been archived by the owner on Mar 15, 2018. It is now read-only.

Simple URL parsing for websites (bug 1147915)
robhudson committed Jun 8, 2015
1 parent 5691ce7 commit 849a09f
Showing 3 changed files with 55 additions and 5 deletions.
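
In short: websites gain a url_tokenized field built by stripping the scheme, common subdomains, and common TLDs from the site URL. Full-text search now matches against this field, while the raw url field becomes not-analyzed so it can serve exact matches. A sketch of the intended effect, per the strip_url() docstring added below:

    # 'https://m.domain.com/topic/'  ->  indexed as 'domain/topic/'
    # Elasticsearch's 'simple' analyzer then splits that into
    # ['domain', 'topic'], so queries for 'domain' or 'topic' match.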
mkt/search/filters.py (1 addition & 1 deletion)
@@ -51,7 +51,7 @@ def filter_queryset(self, request, queryset, view):
         # in every document type / indexes.
         for k, v in rules:
             for field in ('name', 'short_name', 'title', 'app_slug', 'author',
-                          'url'):
+                          'url_tokenized'):
                 should.append(k(**{field: v}))
 
         # Exact matches need to be queried against a non-analyzed field. Let's
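For context, a rough sketch of the kind of bool/should query the loop in the hunk above assembles once url_tokenized joins the field list (the exact shape depends on the rules list upstream; the query term here is illustrative):

    # Illustrative only: one should-clause per field, per rule.
    should = [
        {'match': {'name': 'domain topic'}},
        # ... same clause for short_name, title, app_slug, author ...
        {'match': {'url_tokenized': 'domain topic'}},
    ]
    es_query = {'query': {'bool': {'should': should}}}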
mkt/websites/indexers.py (37 additions & 3 deletions)
@@ -1,10 +1,18 @@
+import re
 from operator import attrgetter
+from urlparse import urlparse
 
 from mkt.search.indexers import BaseIndexer
 from mkt.tags.models import attach_tags
 from mkt.translations.models import attach_trans_dict
 
 
+URL_RE = re.compile(
+    r'^www\.|^m\.|^mobile\.|'  # Remove common subdomains.
+    r'\.com$|\.net$|\.org$|\.\w{2}$'  # Remove common TLDs incl. ccTLDs.
+)
+
+
 class WebsiteIndexer(BaseIndexer):
     translated_fields = ('description', 'name', 'short_name', 'title')
     fields_with_language_analyzers = ('description', 'name', 'short_name')
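Worth noting: URL_RE is applied in a single sub() pass, so at most one leading subdomain and one trailing TLD label are removed; the docstring below concedes this is approximate. A hypothetical multi-label ccTLD shows the limit:

    >>> URL_RE.sub('', 'www.example.co.uk')  # hypothetical input
    'example.co'
    # Only the final '.uk' matches the \.\w{2}$ branch; '.co' survives.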
@@ -85,9 +93,8 @@ def get_mapping(cls):
                         'analyzer': 'default_icu',
                         'position_offset_gap': 100,
                     },
-                    # FIXME: Add custom analyzer for url, that strips http,
-                    # https, maybe also www. and any .tld ?
-                    'url': {'type': 'string', 'analyzer': 'simple'},
+                    'url': cls.string_not_analyzed(),
+                    'url_tokenized': {'type': 'string', 'analyzer': 'simple'},
                 }
             }
         }
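The two fields now serve different queries: url stays not-analyzed so exact-match filtering compares the full string, while url_tokenized uses Elasticsearch's built-in 'simple' analyzer, which lowercases and splits on non-letter characters. A hypothetical _analyze call illustrating that behavior:

    # Hypothetical request, for illustration:
    # POST /_analyze  {"analyzer": "simple", "text": "domain/topic/"}
    # -> tokens: ['domain', 'topic']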
@@ -129,6 +136,7 @@ def extract_document(cls, pk=None, obj=None):
         doc['name_sort'] = unicode(obj.name).lower()
         doc['preferred_regions'] = obj.preferred_regions or []
         doc['tags'] = getattr(obj, 'keywords_list', [])
+        doc['url_tokenized'] = cls.strip_url(obj.url)
 
         # Add boost, popularity, trending values.
         doc.update(cls.extract_popularity_trending_boost(obj))
@@ -144,3 +152,29 @@ def extract_document(cls, pk=None, obj=None):
             doc.update(cls.extract_field_analyzed_translations(obj, field))
 
         return doc
+
+    @classmethod
+    def strip_url(cls, url):
+        """
+        Remove all unwanted sections of the URL and return a string that
+        will be passed to Elasticsearch.
+
+        E.g. 'https://m.domain.com/topic/' will become 'domain/topic/',
+        which will further get tokenized by Elasticsearch into 'domain'
+        and 'topic'. This will never be perfect, but it should keep the
+        majority of the cruft out of the index.
+        """
+        bits = urlparse(url)
+
+        # Keep just the netloc.
+        url = bits.netloc
+
+        # Strip common subdomains and TLDs.
+        url = URL_RE.sub('', url)
+
+        # Add back the path.
+        url += bits.path
+
+        return url
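A quick interactive sketch of strip_url() (inputs and expected outputs taken from the test_url() additions below):

    >>> from mkt.websites.indexers import WebsiteIndexer
    >>> WebsiteIndexer.strip_url('https://www.domain.com')
    'domain'
    >>> WebsiteIndexer.strip_url('http://domain.uk')  # ccTLD matches \.\w{2}$
    'domain'
    >>> # urlparse() keeps only netloc + path, so query and fragment drop:
    >>> WebsiteIndexer.strip_url('http://www.domain.com/path/?query#fragment')
    'domain/path/'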
mkt/websites/tests/test_indexers.py (17 additions & 1 deletion)
@@ -59,7 +59,9 @@ def test_extract(self):
         eq_(doc['default_locale'], self.obj.default_locale)
         eq_(doc['created'], self.obj.created)
         eq_(doc['modified'], self.obj.modified)
-        eq_(doc['url'], unicode(self.obj.url))
+        eq_(doc['url'], self.obj.url)
+        eq_(doc['url_tokenized'],
+            unicode(self.indexer.strip_url(self.obj.url)))
         eq_(doc['name'], [unicode(self.obj.name)])
         eq_(doc['name_translations'], [{
             'lang': u'en-US', 'string': unicode(self.obj.name)}])
@@ -138,6 +140,20 @@ def test_trending(self):
         # Adolescent regions trending value is not stored.
         ok_('trending_2' not in doc)
+
+    def test_url(self):
+        self.obj = website_factory()
+        expected = {
+            'http://domain.com': 'domain',
+            'https://www.domain.com': 'domain',
+            'http://m.domain.com': 'domain',
+            'http://mobile.domain.com': 'domain',
+            'http://domain.uk': 'domain',
+            'http://www.domain.com/path/': 'domain/path/',
+            'http://www.domain.com/path/?query#fragment': 'domain/path/',
+        }
+        for k, v in expected.items():
+            eq_(self.indexer.strip_url(k), v)
 
 
 class TestExcludedFields(ESTestCase):
     def setUp(self):
