From 51301d4b943f5eede2ae552ec2ebcc366706fb36 Mon Sep 17 00:00:00 2001
From: John Whitlock <jwhitlock@mozilla.com>
Date: Wed, 5 Sep 2018 14:23:52 -0500
Subject: [PATCH 1/2] bug 1462475: Treat zone vanity URLs as errors

If a zone vanity URL, like /en-US/Firefox/Releases/22, is requested,
treat it as an errored Document rather than raising an exception and
halting scraping.
---
 kuma/scrape/sources/base.py               |  5 ++++-
 kuma/scrape/sources/document.py           |  6 ++++--
 kuma/scrape/tests/test_source_document.py | 11 +++++++++++
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/kuma/scrape/sources/base.py b/kuma/scrape/sources/base.py
index 1f43cd634ec..7e9e25bc952 100644
--- a/kuma/scrape/sources/base.py
+++ b/kuma/scrape/sources/base.py
@@ -223,7 +223,10 @@ def __init__(self, path, **options):
         super(DocumentBaseSource, self).__init__(path, **options)
         if path != unquote(path):
             raise ValueError('URL-encoded path "%s"' % path)
-        self.locale, self.slug = self.locale_and_slug(path)
+        try:
+            self.locale, self.slug = self.locale_and_slug(path)
+        except ValueError:
+            self.locale, self.slug = None, None
 
     def locale_and_slug(self, path):
         """Extract a document locale and slug from a path."""
diff --git a/kuma/scrape/sources/document.py b/kuma/scrape/sources/document.py
index 61f7141942d..511538798ce 100644
--- a/kuma/scrape/sources/document.py
+++ b/kuma/scrape/sources/document.py
@@ -17,11 +17,10 @@ class DocumentSource(DocumentBaseSource):
 
     def load_and_validate_existing(self, storage):
         """Load the document from storage in simple cases."""
-
         just_this_doc = (not self.translations and
                          self.depth == 0 and
                          self.revisions == 1)
-        if not self.force and just_this_doc:
+        if not self.force and just_this_doc and self.locale and self.slug:
             document = storage.get_document(self.locale, self.slug)
             if document:
                 return True, []
@@ -31,6 +30,9 @@ def load_prereqs(self, requester, storage):
         """Load the data needed for a document."""
         data = {'needs': []}
 
+        if self.locale is None and self.slug is None:
+            raise self.SourceError('Not a document path "%s"', self.path)
+
         # Load data, gathering further source needs
         self.load_prereq_parent_topic(storage, data)
         self.load_prereq_redirect_check(storage, data)
diff --git a/kuma/scrape/tests/test_source_document.py b/kuma/scrape/tests/test_source_document.py
index bfdf9be900c..424f2364ad4 100644
--- a/kuma/scrape/tests/test_source_document.py
+++ b/kuma/scrape/tests/test_source_document.py
@@ -126,6 +126,17 @@ def test_gather_standard_doc_empty_history_is_error():
     assert source.state == source.STATE_ERROR
 
 
+def test_gather_document_zone_url_is_error():
+    """Old vanity zone URLs are not loaded."""
+    doc_path = "/en-US/Firefox/Releases/22"
+    source = DocumentSource(doc_path)
+    storage = mock_storage(spec=[])  # Storage is skipped
+    resources = source.gather(None, storage)
+    assert resources == []
+    assert source.state == source.STATE_ERROR
+    assert source.freshness == source.FRESH_UNKNOWN
+
+
 def test_gather_standard_doc_all_prereqs():
     path = '/en-US/docs/Test'
     source = DocumentSource(path, force=True)

From fbfd5fd4b9e1d0f6c6efea466ff035630547768b Mon Sep 17 00:00:00 2001
From: John Whitlock <jwhitlock@mozilla.com>
Date: Wed, 5 Sep 2018 14:37:59 -0500
Subject: [PATCH 2/2] fix bug 1488892: Skip tags in scrape_links

When scraping a page for wiki documents, skip tag links like
/en-US/docs/tag/Foo, which otherwise look like documents.
---
 kuma/scrape/sources/links.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kuma/scrape/sources/links.py b/kuma/scrape/sources/links.py
index b9036298485..d0a881e8573 100644
--- a/kuma/scrape/sources/links.py
+++ b/kuma/scrape/sources/links.py
@@ -34,6 +34,7 @@ class LinksSource(Source):
         'profiles',
         'search',
         'users/signin',
+        'docs/tag/',
     ))
 
     def __init__(self, path=None, **options):