From 51301d4b943f5eede2ae552ec2ebcc366706fb36 Mon Sep 17 00:00:00 2001
From: John Whitlock
Date: Wed, 5 Sep 2018 14:23:52 -0500
Subject: [PATCH 1/2] bug 1462475: Treat zone vanity URLs as errors

If a zone vanity URL, like /en-US/Firefox/Releases/22, is requested,
then treat as an errored Document, rather than raise an exception and
halt scraping.
---
 kuma/scrape/sources/base.py               |  5 ++++-
 kuma/scrape/sources/document.py           |  6 ++++--
 kuma/scrape/tests/test_source_document.py | 11 +++++++++++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/kuma/scrape/sources/base.py b/kuma/scrape/sources/base.py
index 1f43cd634ec..7e9e25bc952 100644
--- a/kuma/scrape/sources/base.py
+++ b/kuma/scrape/sources/base.py
@@ -223,7 +223,10 @@ def __init__(self, path, **options):
         super(DocumentBaseSource, self).__init__(path, **options)
         if path != unquote(path):
             raise ValueError('URL-encoded path "%s"' % path)
-        self.locale, self.slug = self.locale_and_slug(path)
+        try:
+            self.locale, self.slug = self.locale_and_slug(path)
+        except ValueError:
+            self.locale, self.slug = None, None
 
     def locale_and_slug(self, path):
         """Extract a document locale and slug from a path."""
diff --git a/kuma/scrape/sources/document.py b/kuma/scrape/sources/document.py
index 61f7141942d..511538798ce 100644
--- a/kuma/scrape/sources/document.py
+++ b/kuma/scrape/sources/document.py
@@ -17,11 +17,10 @@ class DocumentSource(DocumentBaseSource):
 
     def load_and_validate_existing(self, storage):
         """Load the document from storage in simple cases."""
-
         just_this_doc = (not self.translations and
                          self.depth == 0 and
                          self.revisions == 1)
-        if not self.force and just_this_doc:
+        if not self.force and just_this_doc and self.locale and self.slug:
             document = storage.get_document(self.locale, self.slug)
             if document:
                 return True, []
@@ -31,6 +30,9 @@ def load_prereqs(self, requester, storage):
         """Load the data needed for a document."""
         data = {'needs': []}
 
+        if self.locale is None and self.slug is None:
+            raise self.SourceError('Not a document path "%s"', self.path)
+
         # Load data, gathering further source needs
         self.load_prereq_parent_topic(storage, data)
         self.load_prereq_redirect_check(storage, data)
diff --git a/kuma/scrape/tests/test_source_document.py b/kuma/scrape/tests/test_source_document.py
index bfdf9be900c..424f2364ad4 100644
--- a/kuma/scrape/tests/test_source_document.py
+++ b/kuma/scrape/tests/test_source_document.py
@@ -126,6 +126,17 @@ def test_gather_standard_doc_empty_history_is_error():
     assert source.state == source.STATE_ERROR
 
 
+def test_gather_document_zone_url_is_error():
+    """Old vanity zone URLs are not loaded."""
+    doc_path = "/en-US/Firefox/Releases/22"
+    source = DocumentSource(doc_path)
+    storage = mock_storage(spec=[])  # Storage is skipped
+    resources = source.gather(None, storage)
+    assert resources == []
+    assert source.state == source.STATE_ERROR
+    assert source.freshness == source.FRESH_UNKNOWN
+
+
 def test_gather_standard_doc_all_prereqs():
     path = '/en-US/docs/Test'
     source = DocumentSource(path, force=True)

From fbfd5fd4b9e1d0f6c6efea466ff035630547768b Mon Sep 17 00:00:00 2001
From: John Whitlock
Date: Wed, 5 Sep 2018 14:37:59 -0500
Subject: [PATCH 2/2] fix bug 1488892: Skip tags in scrape_links

When scraping a page for wiki documents, skip tag links like
/en-US/docs/tag/Foo, which otherwise look like documents.
---
 kuma/scrape/sources/links.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kuma/scrape/sources/links.py b/kuma/scrape/sources/links.py
index b9036298485..d0a881e8573 100644
--- a/kuma/scrape/sources/links.py
+++ b/kuma/scrape/sources/links.py
@@ -34,6 +34,7 @@ class LinksSource(Source):
         'profiles',
         'search',
         'users/signin',
+        'docs/tag/',
     ))
 
     def __init__(self, path=None, **options):