Merge pull request #4964 from jwhitlock/scrape-links-1462475

bug 1462475: Improve ./manage.py scrape_links
mdn · Sep 5, 2018 · d8e4937
2 parents 9d38be9 + fbfd5fd
commit d8e4937
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 3 deletions.
diff --git a/kuma/scrape/sources/base.py b/kuma/scrape/sources/base.py
@@ -223,7 +223,10 @@ def __init__(self, path, **options):
         super(DocumentBaseSource, self).__init__(path, **options)
         if path != unquote(path):
             raise ValueError('URL-encoded path "%s"' % path)
-        self.locale, self.slug = self.locale_and_slug(path)
+        try:
+            self.locale, self.slug = self.locale_and_slug(path)
+        except ValueError:
+            self.locale, self.slug = None, None
 
     def locale_and_slug(self, path):
         """Extract a document locale and slug from a path."""

diff --git a/kuma/scrape/sources/document.py b/kuma/scrape/sources/document.py
@@ -17,11 +17,10 @@ class DocumentSource(DocumentBaseSource):
 
     def load_and_validate_existing(self, storage):
         """Load the document from storage in simple cases."""
-
         just_this_doc = (not self.translations and
                          self.depth == 0 and
                          self.revisions == 1)
-        if not self.force and just_this_doc:
+        if not self.force and just_this_doc and self.locale and self.slug:
             document = storage.get_document(self.locale, self.slug)
             if document:
                 return True, []
@@ -31,6 +30,9 @@ def load_prereqs(self, requester, storage):
         """Load the data needed for a document."""
         data = {'needs': []}
 
+        if self.locale is None and self.slug is None:
+            raise self.SourceError('Not a document path "%s"', self.path)
+
         # Load data, gathering further source needs
         self.load_prereq_parent_topic(storage, data)
         self.load_prereq_redirect_check(storage, data)

diff --git a/kuma/scrape/sources/links.py b/kuma/scrape/sources/links.py
@@ -34,6 +34,7 @@ class LinksSource(Source):
         'profiles',
         'search',
         'users/signin',
+        'docs/tag/',
     ))
 
     def __init__(self, path=None, **options):

diff --git a/kuma/scrape/tests/test_source_document.py b/kuma/scrape/tests/test_source_document.py
@@ -126,6 +126,17 @@ def test_gather_standard_doc_empty_history_is_error():
     assert source.state == source.STATE_ERROR
 
 
+def test_gather_document_zone_url_is_error():
+    """Old vanity zone URLs are not loaded."""
+    doc_path = "/en-US/Firefox/Releases/22"
+    source = DocumentSource(doc_path)
+    storage = mock_storage(spec=[])  # Storage is skipped
+    resources = source.gather(None, storage)
+    assert resources == []
+    assert source.state == source.STATE_ERROR
+    assert source.freshness == source.FRESH_UNKNOWN
+
+
 def test_gather_standard_doc_all_prereqs():
     path = '/en-US/docs/Test'
     source = DocumentSource(path, force=True)