Skip to content
This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Commit

Permalink
Merge pull request #4920 from jwhitlock/scrape-zones-no-more-1462475
Browse files Browse the repository at this point in the history
bug 1462475: Remove zone scraping
  • Loading branch information
escattone committed Aug 6, 2018
2 parents 74a6693 + 3dc15e7 commit 1f487e9
Show file tree
Hide file tree
Showing 14 changed files with 197 additions and 792 deletions.
14 changes: 7 additions & 7 deletions kuma/scrape/scraper.py
Expand Up @@ -9,8 +9,8 @@

from .sources import (
DocumentChildrenSource, DocumentCurrentSource, DocumentHistorySource,
DocumentMetaSource, DocumentRenderedSource, DocumentSource, LinksSource,
RevisionSource, Source, UserSource, ZoneRootSource)
DocumentMetaSource, DocumentRedirectSource, DocumentSource, LinksSource,
RevisionSource, Source, UserSource)
from .storage import Storage

logger = logging.getLogger('kuma.scraper')
Expand All @@ -35,9 +35,9 @@ def session(self):
self._session = requests.Session()
return self._session

def request(self, path, raise_for_status=True):
def request(self, path, raise_for_status=True, method='GET'):
url = self.base_url + path
logger.debug("GET %s", url)
logger.debug("%s %s", method, url)
attempts = 0
response = None
retry = True
Expand All @@ -46,8 +46,9 @@ def request(self, path, raise_for_status=True):
attempts += 1
err = None
retry = False
request_function = getattr(self.session, method.lower())
try:
response = self.session.get(url, timeout=timeout)
response = request_function(url, timeout=timeout)
except requests.exceptions.Timeout as err:
logger.warn("Timeout on request %d for %s", attempts, url)
time.sleep(timeout)
Expand Down Expand Up @@ -90,11 +91,10 @@ class Scraper(object):
'document_current': DocumentCurrentSource,
'document_history': DocumentHistorySource,
'document_meta': DocumentMetaSource,
'document_rendered': DocumentRenderedSource,
'document_redirect': DocumentRedirectSource,
'links': LinksSource,
'revision': RevisionSource,
'user': UserSource,
'zone_root': ZoneRootSource,
}

def __init__(self, host='developer.mozilla.org', ssl=True):
Expand Down
6 changes: 2 additions & 4 deletions kuma/scrape/sources/__init__.py
Expand Up @@ -7,23 +7,21 @@
from .document_current import DocumentCurrentSource
from .document_history import DocumentHistorySource
from .document_meta import DocumentMetaSource
from .document_rendered import DocumentRenderedSource
from .document_redirect import DocumentRedirectSource
from .links import LinksSource
from .revision import RevisionSource
from .user import UserSource
from .zone_root import ZoneRootSource

__all__ = [
DocumentBaseSource,
DocumentChildrenSource,
DocumentCurrentSource,
DocumentHistorySource,
DocumentMetaSource,
DocumentRenderedSource,
DocumentRedirectSource,
DocumentSource,
LinksSource,
RevisionSource,
Source,
UserSource,
ZoneRootSource,
]
7 changes: 1 addition & 6 deletions kuma/scrape/sources/base.py
Expand Up @@ -223,12 +223,7 @@ def __init__(self, path, **options):
super(DocumentBaseSource, self).__init__(path, **options)
if path != unquote(path):
raise ValueError('URL-encoded path "%s"' % path)
try:
self.locale, self.slug = self.locale_and_slug(path)
except ValueError:
self.locale, self.slug, self.normalized_path = None, None, None
else:
self.normalized_path = path
self.locale, self.slug = self.locale_and_slug(path)

def locale_and_slug(self, path):
"""Extract a document locale and slug from a path."""
Expand Down
116 changes: 32 additions & 84 deletions kuma/scrape/sources/document.py
Expand Up @@ -20,8 +20,7 @@ def load_and_validate_existing(self, storage):

just_this_doc = (not self.translations and
self.depth == 0 and
self.revisions == 1 and
self.normalized_path)
self.revisions == 1)
if not self.force and just_this_doc:
document = storage.get_document(self.locale, self.slug)
if document:
Expand All @@ -33,41 +32,20 @@ def load_prereqs(self, requester, storage):
data = {'needs': []}

# Load data, gathering further source needs
self.load_prereq_normalized_path(storage, data)
if self.normalized_path:
self.load_prereq_parent_topic(storage, data)
self.load_prereq_rendered(storage, data)
if data.get('has_rendered'):
self.load_prereq_redirect(storage, data)
if data.get('is_standard_page'):
self.load_prereq_metadata(storage, data)
self.load_prereq_english_parent(storage, data)
self.load_prereq_history(storage, data)
self.load_prereq_children(storage, data)
self.load_prereq_parent_topic(storage, data)
self.load_prereq_redirect_check(storage, data)
if data.get('has_redirect_check'):
self.load_prereq_redirect(storage, data)
if data.get('is_standard_page'):
self.load_prereq_metadata(storage, data)
self.load_prereq_english_parent(storage, data)
self.load_prereq_history(storage, data)
self.load_prereq_children(storage, data)

return not data['needs'], data

def load_prereq_normalized_path(self, storage, data):
"""Load zone data to normalize path, if needed."""
if self.normalized_path:
return # Already normalized, done

# Determine the standard path associated with the zone
zone_data = storage.get_zone_root(self.path)
if zone_data is None:
data['needs'].append(('zone_root', self.path, {}))
elif zone_data.get('errors'):
raise self.SourceError(
'Unable to load zone root for %s', self.path)
else:
self.normalized_path = self.path.replace(
zone_data['zone_path'], zone_data['doc_path'])
self.locale, self.slug = self.locale_and_slug(
self.normalized_path)

def load_prereq_parent_topic(self, storage, data):
"""Load the parent topic, if a child page."""
assert self.normalized_path
if not self.parent_slug:
return # No parent to load

Expand All @@ -77,63 +55,37 @@ def load_prereq_parent_topic(self, storage, data):
else:
data['parent_topic'] = parent_topic

def load_prereq_rendered(self, storage, data):
"""Load the rendered page, to detect redirects and zones."""
assert self.normalized_path
rendered = storage.get_document_rendered(self.locale, self.slug)
if rendered is None:
data['needs'].append(
('document_rendered', self.normalized_path, {}))
def load_prereq_redirect_check(self, storage, data):
"""Check the URL for redirects."""
redirect = storage.get_document_redirect(self.locale, self.slug)
if redirect is None:
data['needs'].append(('document_redirect', self.path, {}))
else:
data['has_rendered'] = True
data['redirect_to'] = rendered.get('redirect_to')
data['is_zone_root'] = rendered.get('is_zone_root', False)
data['zone_css_slug'] = rendered.get('zone_css_slug', '')
data['has_redirect_check'] = True
data['redirect_to'] = redirect.get('redirect_to')

def load_prereq_redirect(self, storage, data):
"""Load the zone or standard redirect."""
assert self.normalized_path
data['is_standard_page'] = data.get('has_rendered')
"""Load the destination of a redirect."""
data['is_standard_page'] = data.get('has_redirect_check')
redirect_to = data.get('redirect_to')
if not redirect_to:
return # Not a redirect, don't follow

# Is it a zoned URL or a moved page?
try:
rd_locale, rd_slug = self.locale_and_slug(redirect_to)
except ValueError:
# Zoned URL
zone_redirect = storage.get_zone_root(redirect_to)
if zone_redirect is None:
data['needs'].append(('zone_root', redirect_to, {}))
elif zone_redirect.get('errors'):
raise self.SourceError('Unable to get zone_root "%s"',
redirect_to)
else:
data['zone_redirect_path'] = zone_redirect['zone_path']
z_path = zone_redirect['doc_path']
if z_path != self.path:
z_locale, z_slug = self.locale_and_slug(z_path)
zone_root_doc = storage.get_document(z_locale, z_slug)
if zone_root_doc is None:
data['needs'].append(('document', z_path, {}))
else:
# Moved Page
redirect = storage.get_document(rd_locale, rd_slug)
data['is_standard_page'] = False
if redirect is None:
data['needs'].append(('document', redirect_to, {}))
# Load the destination page
rd_locale, rd_slug = self.locale_and_slug(redirect_to)
redirect = storage.get_document(rd_locale, rd_slug)
data['is_standard_page'] = False
if redirect is None:
data['needs'].append(('document', redirect_to, {}))

def load_prereq_metadata(self, storage, data):
"""Load the document metadata."""
assert self.normalized_path
meta = storage.get_document_metadata(self.locale, self.slug)
if meta is None:
data['needs'].append(('document_meta', self.normalized_path,
data['needs'].append(('document_meta', self.path,
self.current_options()))
elif 'error' in meta:
raise self.SourceError('Error getting metadata for %s',
self.normalized_path)
raise self.SourceError('Error getting metadata for %s', self.path)
elif meta:
data['id'] = meta['id']
data['locale'] = meta['locale']
Expand Down Expand Up @@ -178,11 +130,11 @@ def load_prereq_history(self, storage, data):
"""Load the revision history."""
history = storage.get_document_history(self.locale, self.slug)
if history is None:
data['needs'].append(('document_history', self.normalized_path,
data['needs'].append(('document_history', self.path,
{"revisions": self.revisions}))
elif len(history) == 0:
raise self.SourceError('Empty history for document "%s"',
self.normalized_path)
self.path)

def load_prereq_children(self, storage, data):
"""Load the document children."""
Expand All @@ -191,13 +143,12 @@ def load_prereq_children(self, storage, data):
children = storage.get_document_children(self.locale, self.slug)
if children is None:
options = self.current_options()
data['needs'].append(('document_children', self.normalized_path,
options))
data['needs'].append(('document_children', self.path, options))

def save_data(self, storage, data):
"""Save the document as a redirect or full document."""
redirect_to = data.get('redirect_to')
if redirect_to and not data.get('zone_redirect_path'):
if redirect_to:
# Prepare data for a redirect document
doc_data = {
'locale': self.locale,
Expand All @@ -208,7 +159,6 @@ def save_data(self, storage, data):
# Prepare data for a full document
keys = (
'id',
'is_zone_root',
'locale',
'modified',
'parent',
Expand All @@ -217,8 +167,6 @@ def save_data(self, storage, data):
'tags',
'title',
'uuid',
'zone_css_slug',
'zone_redirect_path',
)
doc_data = {}
for key in keys:
Expand All @@ -235,5 +183,5 @@ def save_data(self, storage, data):
doc_data['locale'], self.path)
doc_data['locale'] = self.locale
storage.save_document(doc_data)
return [('document_current', self.normalized_path,
return [('document_current', self.path,
{'revisions': self.revisions})]
36 changes: 36 additions & 0 deletions kuma/scrape/sources/document_redirect.py
@@ -0,0 +1,36 @@
"""DocumentRedirectSource checks if a MDN wiki document is a redirect."""
from __future__ import absolute_import, unicode_literals

from django.utils.six.moves.urllib.parse import urlparse

from .base import DocumentBaseSource


class DocumentRedirectSource(DocumentBaseSource):
    """Probe an MDN wiki document URL to find out if it is a redirect."""

    def source_path(self):
        """Return the wiki path for this document's locale and slug."""
        return '/%s/docs/%s' % (self.locale, self.slug)

    def load_prereqs(self, requester, storage):
        """Issue a HEAD request and record any redirect destination.

        Returns (True, data), where data may contain 'redirect_to' with
        the decoded destination path. Raises SourceError for any HTTP
        status other than 200, 301, or 302.
        """
        resp = requester.request(
            self.source_path(), raise_for_status=False, method='HEAD')
        if resp.status_code not in (200, 301, 302):
            raise self.SourceError('status_code %s', resp.status_code)

        data = {}
        history = resp.history
        # A non-empty history means the request was redirected at least
        # once before reaching its final URL.
        # NOTE(review): requests' Session.head() defaults allow_redirects
        # to False — confirm the requester enables redirect following,
        # otherwise history will always be empty here.
        if history:
            origin_path = urlparse(history[0].url).path
            final_path = urlparse(resp.url).path
            if final_path != origin_path:
                data['redirect_to'] = self.decode_href(final_path)

        return True, data

    def save_data(self, storage, data):
        """Persist the redirect-check result; no follow-up sources."""
        storage.save_document_redirect(self.locale, self.slug, data)
        return []
67 changes: 0 additions & 67 deletions kuma/scrape/sources/document_rendered.py

This file was deleted.

0 comments on commit 1f487e9

Please sign in to comment.