diff --git a/kuma/scrape/scraper.py b/kuma/scrape/scraper.py index d2b260659aa..bae055144b3 100644 --- a/kuma/scrape/scraper.py +++ b/kuma/scrape/scraper.py @@ -35,9 +35,9 @@ def session(self): self._session = requests.Session() return self._session - def request(self, path, raise_for_status=True): + def request(self, path, raise_for_status=True, method='GET'): url = self.base_url + path - logger.debug("GET %s", url) + logger.debug("%s %s", method, url) attempts = 0 response = None retry = True @@ -46,8 +46,9 @@ def request(self, path, raise_for_status=True): attempts += 1 err = None retry = False + request_function = getattr(self.session, method.lower()) try: - response = self.session.get(url, timeout=timeout) + response = request_function(url, timeout=timeout) except requests.exceptions.Timeout as err: logger.warn("Timeout on request %d for %s", attempts, url) time.sleep(timeout) diff --git a/kuma/scrape/sources/document_rendered.py b/kuma/scrape/sources/document_rendered.py index 8ed5c897181..cbe4657e204 100644 --- a/kuma/scrape/sources/document_rendered.py +++ b/kuma/scrape/sources/document_rendered.py @@ -1,29 +1,13 @@ """DocumentRenderedSource requests MDN wiki documents.""" from __future__ import absolute_import, unicode_literals -import re - from django.utils.six.moves.urllib.parse import urlparse -from pyquery import PyQuery as pq from .base import DocumentBaseSource class DocumentRenderedSource(DocumentBaseSource): - """ - Request the rendered document. - - This is used to detect zones and redirects. - """ - - # Regular expression for custom zone CSS, like zone-firefox.css - re_custom_href = re.compile("""(?x) # Verbose RE mode - .* # Match anything - \/zone- # Match '/zone-' - (?P[^.]*) # Capture the slug - (\.[0-9a-fA-F]+)? # There may be a hash in the filename - \.css # Ends in .css - """) + """Request the rendered document, to detect redirects.""" def source_path(self): return '/%s/docs/%s' % (self.locale, self.slug) @@ -31,7 +15,8 @@ def source_path(self): def load_prereqs(self, requester, storage): """Request the document, and process the redirects and response.""" response = requester.request(self.source_path(), - raise_for_status=False) + raise_for_status=False, + method='HEAD') if response.status_code not in (200, 301, 302): raise self.SourceError('status_code %s', response.status_code) data = {} @@ -43,22 +28,6 @@ def load_prereqs(self, requester, storage): if redirect_to != redirect_from: data['redirect_to'] = self.decode_href(redirect_to) - # Is this a zone root? - parsed = pq(response.content) - body = parsed('body') - if body.has_class('zone-landing'): - data['is_zone_root'] = True - - # Find the zone stylesheet - links = parsed('head link') - for link in links: - rel = link.attrib.get('rel') - href = self.decode_href(link.attrib.get('href')) - if rel == 'stylesheet' and href: - match = self.re_custom_href.match(href) - if match: - data['zone_css_slug'] = match.group('slug') - return True, data def save_data(self, storage, data): diff --git a/kuma/scrape/tests/test_source_document_rendered.py b/kuma/scrape/tests/test_source_document_rendered.py index 94518ad3984..9dccc03ba66 100644 --- a/kuma/scrape/tests/test_source_document_rendered.py +++ b/kuma/scrape/tests/test_source_document_rendered.py @@ -1,62 +1,16 @@ # -*- coding: utf-8 -*- -"""Tests for the DocumentRenderedSource class (GET document).""" +"""Tests for the DocumentRenderedSource class (HEAD document).""" from __future__ import unicode_literals -from datetime import datetime - -import pytest - -from kuma.wiki.models import Document, DocumentZone, Revision - from . import mock_requester, mock_storage from ..sources import DocumentRenderedSource -@pytest.fixture -def zone_root_doc(root_doc, settings): - """A Document record with a DocumentZone with style and a redirect.""" - settings.PIPELINE_CSS['zone-special'] = { - 'output_filename': 'build/styles/zone-special.css'} - doc = Document.objects.create( - locale='en-US', - slug=root_doc.slug + '/Zone', - parent_topic=root_doc) - DocumentZone.objects.create( - document=doc, - url_root='Zone', - css_slug='special') - revision = Revision.objects.create( - document=doc, - creator=root_doc.current_revision.creator, - content='

This is the Zone.

', - created=datetime(2016, 12, 14)) - assert doc.current_revision == revision - doc.rendered_html = doc.current_revision.content - doc.save() - return doc - - -@pytest.fixture -def zone_child_doc(zone_root_doc): - """A Document record that is below the zone root.""" - doc = Document.objects.create( - locale='en-US', - slug=zone_root_doc.slug + '/Child', - parent_topic=zone_root_doc) - creator = zone_root_doc.current_revision.creator - Revision.objects.create( - content='

A zone subpage.

', - creator=creator, - document=doc) - return doc - - def test_root_doc(root_doc, client): """Test a page without redirects.""" url = root_doc.get_absolute_url() - html = client.get(url).content source = DocumentRenderedSource(url) - requester = mock_requester(content=html) + requester = mock_requester() storage = mock_storage(spec=['save_document_rendered']) resources = source.gather(requester, storage) assert resources == [] @@ -65,21 +19,18 @@ def test_root_doc(root_doc, client): 'en-US', 'Root', {}) -def test_non_zone_redirect(root_doc, client): +def test_redirect_no_path_change(root_doc, client): """ - Test a page with non-zone redirects. + Test a page with a redirect that doesn't change the path. - For example, a page might redirect from http:// to https:// without - changing the path. + For example, a page might redirect from http:// to https://. """ url = root_doc.get_absolute_url() - html = client.get(url).content source = DocumentRenderedSource(url) requester = mock_requester( response_spec=['content', 'history', 'status_code', 'url'], history=[(301, url)], - final_path=url, - content=html) + final_path=url) storage = mock_storage(spec=['save_document_rendered']) resources = source.gather(requester, storage) assert resources == [] @@ -88,42 +39,21 @@ def test_non_zone_redirect(root_doc, client): 'en-US', 'Root', {}) -def test_zone_root_doc(zone_root_doc, client): - """The zone_css_slug is extracted from zone roots.""" - url = zone_root_doc.get_absolute_url() - html = client.get(url, follow=True).content - source = DocumentRenderedSource(url) - requester = mock_requester( - response_spec=['content', 'history', 'status_code', 'url'], - history=[(302, url)], - final_path=zone_root_doc.zone.url_root, - content=html) - storage = mock_storage(spec=['save_document_rendered']) - resources = source.gather(requester, storage) - assert resources == [] - assert source.state == source.STATE_DONE - context = {'redirect_to': 'Zone'} - storage.save_document_rendered.assert_called_once_with( - 'en-US', 'Root/Zone', context) - - -def test_zone_child_doc(zone_root_doc, zone_child_doc, client): - """The zone_css_slug is not extracted from zone children.""" - url = zone_child_doc.get_absolute_url() - html = client.get(url, follow=True).content +def test_redirect(root_doc, client): + """Test a page with a redirect.""" + final_path = root_doc.get_absolute_url() + url = final_path.replace(root_doc.slug, 'Redirect') source = DocumentRenderedSource(url) requester = mock_requester( response_spec=['content', 'history', 'status_code', 'url'], - history=[(302, url)], - final_path=zone_root_doc.zone.url_root, - content=html) + history=[(301, url)], + final_path=final_path) storage = mock_storage(spec=['save_document_rendered']) resources = source.gather(requester, storage) assert resources == [] assert source.state == source.STATE_DONE - context = {'redirect_to': 'Zone'} storage.save_document_rendered.assert_called_once_with( - 'en-US', 'Root/Zone/Child', context) + 'en-US', 'Redirect', {'redirect_to': final_path}) def test_missing_doc(client):