Skip to content
This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Commit

Permalink
Merge pull request #4964 from jwhitlock/scrape-links-1462475
Browse files Browse the repository at this point in the history
bug 1462475: Improve ./manage.py scrape_links
  • Loading branch information
escattone committed Sep 5, 2018
2 parents 9d38be9 + fbfd5fd commit d8e4937
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 3 deletions.
5 changes: 4 additions & 1 deletion kuma/scrape/sources/base.py
Expand Up @@ -223,7 +223,10 @@ def __init__(self, path, **options):
super(DocumentBaseSource, self).__init__(path, **options)
if path != unquote(path):
raise ValueError('URL-encoded path "%s"' % path)
self.locale, self.slug = self.locale_and_slug(path)
try:
self.locale, self.slug = self.locale_and_slug(path)
except ValueError:
self.locale, self.slug = None, None

def locale_and_slug(self, path):
"""Extract a document locale and slug from a path."""
Expand Down
6 changes: 4 additions & 2 deletions kuma/scrape/sources/document.py
Expand Up @@ -17,11 +17,10 @@ class DocumentSource(DocumentBaseSource):

def load_and_validate_existing(self, storage):
"""Load the document from storage in simple cases."""

just_this_doc = (not self.translations and
self.depth == 0 and
self.revisions == 1)
if not self.force and just_this_doc:
if not self.force and just_this_doc and self.locale and self.slug:
document = storage.get_document(self.locale, self.slug)
if document:
return True, []
Expand All @@ -31,6 +30,9 @@ def load_prereqs(self, requester, storage):
"""Load the data needed for a document."""
data = {'needs': []}

if self.locale is None and self.slug is None:
raise self.SourceError('Not a document path "%s"', self.path)

# Load data, gathering further source needs
self.load_prereq_parent_topic(storage, data)
self.load_prereq_redirect_check(storage, data)
Expand Down
1 change: 1 addition & 0 deletions kuma/scrape/sources/links.py
Expand Up @@ -34,6 +34,7 @@ class LinksSource(Source):
'profiles',
'search',
'users/signin',
'docs/tag/',
))

def __init__(self, path=None, **options):
Expand Down
11 changes: 11 additions & 0 deletions kuma/scrape/tests/test_source_document.py
Expand Up @@ -126,6 +126,17 @@ def test_gather_standard_doc_empty_history_is_error():
assert source.state == source.STATE_ERROR


def test_gather_document_zone_url_is_error():
    """Gathering an old vanity zone URL ends in the error state.

    The path below has no "docs" segment; the source is expected to
    yield no further resources, finish in STATE_ERROR, and leave its
    freshness as FRESH_UNKNOWN.
    """
    zone_source = DocumentSource("/en-US/Firefox/Releases/22")
    # Empty spec: storage is expected to be skipped entirely
    # (presumably any attribute access on it would fail -- confirm
    # against the mock_storage helper).
    untouched_storage = mock_storage(spec=[])

    gathered = zone_source.gather(None, untouched_storage)

    assert gathered == []
    assert zone_source.state == zone_source.STATE_ERROR
    assert zone_source.freshness == zone_source.FRESH_UNKNOWN


def test_gather_standard_doc_all_prereqs():
path = '/en-US/docs/Test'
source = DocumentSource(path, force=True)
Expand Down

0 comments on commit d8e4937

Please sign in to comment.