Skip to content
This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Commit

Permalink
bug 1462475: Only look for redirects, not zones
Browse files Browse the repository at this point in the history
Use HEAD to detect if a document path is a zone, rather than GET to
check if there is a zone style or redirect in the content as well.
  • Loading branch information
jwhitlock committed Aug 3, 2018
1 parent 6d339de commit 1599c83
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 120 deletions.
7 changes: 4 additions & 3 deletions kuma/scrape/scraper.py
Expand Up @@ -35,9 +35,9 @@ def session(self):
self._session = requests.Session()
return self._session

def request(self, path, raise_for_status=True):
def request(self, path, raise_for_status=True, method='GET'):
url = self.base_url + path
logger.debug("GET %s", url)
logger.debug("%s %s", method, url)
attempts = 0
response = None
retry = True
Expand All @@ -46,8 +46,9 @@ def request(self, path, raise_for_status=True):
attempts += 1
err = None
retry = False
request_function = getattr(self.session, method.lower())
try:
response = self.session.get(url, timeout=timeout)
response = request_function(url, timeout=timeout)
except requests.exceptions.Timeout as err:
logger.warn("Timeout on request %d for %s", attempts, url)
time.sleep(timeout)
Expand Down
37 changes: 3 additions & 34 deletions kuma/scrape/sources/document_rendered.py
@@ -1,37 +1,22 @@
"""DocumentRenderedSource requests MDN wiki documents."""
from __future__ import absolute_import, unicode_literals

import re

from django.utils.six.moves.urllib.parse import urlparse
from pyquery import PyQuery as pq

from .base import DocumentBaseSource


class DocumentRenderedSource(DocumentBaseSource):
"""
Request the rendered document.
This is used to detect zones and redirects.
"""

# Regular expression for custom zone CSS, like zone-firefox.css
re_custom_href = re.compile("""(?x) # Verbose RE mode
.* # Match anything
\/zone- # Match '/zone-'
(?P<slug>[^.]*) # Capture the slug
(\.[0-9a-fA-F]+)? # There may be a hash in the filename
\.css # Ends in .css
""")
"""Request the rendered document, to detect redirects."""

def source_path(self):
return '/%s/docs/%s' % (self.locale, self.slug)

def load_prereqs(self, requester, storage):
"""Request the document, and process the redirects and response."""
response = requester.request(self.source_path(),
raise_for_status=False)
raise_for_status=False,
method='HEAD')
if response.status_code not in (200, 301, 302):
raise self.SourceError('status_code %s', response.status_code)
data = {}
Expand All @@ -43,22 +28,6 @@ def load_prereqs(self, requester, storage):
if redirect_to != redirect_from:
data['redirect_to'] = self.decode_href(redirect_to)

# Is this a zone root?
parsed = pq(response.content)
body = parsed('body')
if body.has_class('zone-landing'):
data['is_zone_root'] = True

# Find the zone stylesheet
links = parsed('head link')
for link in links:
rel = link.attrib.get('rel')
href = self.decode_href(link.attrib.get('href'))
if rel == 'stylesheet' and href:
match = self.re_custom_href.match(href)
if match:
data['zone_css_slug'] = match.group('slug')

return True, data

def save_data(self, storage, data):
Expand Down
96 changes: 13 additions & 83 deletions kuma/scrape/tests/test_source_document_rendered.py
@@ -1,62 +1,16 @@
# -*- coding: utf-8 -*-
"""Tests for the DocumentRenderedSource class (GET document)."""
"""Tests for the DocumentRenderedSource class (HEAD document)."""
from __future__ import unicode_literals

from datetime import datetime

import pytest

from kuma.wiki.models import Document, DocumentZone, Revision

from . import mock_requester, mock_storage
from ..sources import DocumentRenderedSource


@pytest.fixture
def zone_root_doc(root_doc, settings):
"""A Document record with a DocumentZone with style and a redirect."""
settings.PIPELINE_CSS['zone-special'] = {
'output_filename': 'build/styles/zone-special.css'}
doc = Document.objects.create(
locale='en-US',
slug=root_doc.slug + '/Zone',
parent_topic=root_doc)
DocumentZone.objects.create(
document=doc,
url_root='Zone',
css_slug='special')
revision = Revision.objects.create(
document=doc,
creator=root_doc.current_revision.creator,
content='<p>This is the Zone.</p>',
created=datetime(2016, 12, 14))
assert doc.current_revision == revision
doc.rendered_html = doc.current_revision.content
doc.save()
return doc


@pytest.fixture
def zone_child_doc(zone_root_doc):
"""A Document record that is below the zone root."""
doc = Document.objects.create(
locale='en-US',
slug=zone_root_doc.slug + '/Child',
parent_topic=zone_root_doc)
creator = zone_root_doc.current_revision.creator
Revision.objects.create(
content='<p>A zone subpage.</p>',
creator=creator,
document=doc)
return doc


def test_root_doc(root_doc, client):
"""Test a page without redirects."""
url = root_doc.get_absolute_url()
html = client.get(url).content
source = DocumentRenderedSource(url)
requester = mock_requester(content=html)
requester = mock_requester()
storage = mock_storage(spec=['save_document_rendered'])
resources = source.gather(requester, storage)
assert resources == []
Expand All @@ -65,21 +19,18 @@ def test_root_doc(root_doc, client):
'en-US', 'Root', {})


def test_non_zone_redirect(root_doc, client):
def test_redirect_no_path_change(root_doc, client):
"""
Test a page with non-zone redirects.
Test a page with a redirect that doesn't change the path.
For example, a page might redirect from http:// to https:// without
changing the path.
For example, a page might redirect from http:// to https://.
"""
url = root_doc.get_absolute_url()
html = client.get(url).content
source = DocumentRenderedSource(url)
requester = mock_requester(
response_spec=['content', 'history', 'status_code', 'url'],
history=[(301, url)],
final_path=url,
content=html)
final_path=url)
storage = mock_storage(spec=['save_document_rendered'])
resources = source.gather(requester, storage)
assert resources == []
Expand All @@ -88,42 +39,21 @@ def test_non_zone_redirect(root_doc, client):
'en-US', 'Root', {})


def test_zone_root_doc(zone_root_doc, client):
"""The zone_css_slug is extracted from zone roots."""
url = zone_root_doc.get_absolute_url()
html = client.get(url, follow=True).content
source = DocumentRenderedSource(url)
requester = mock_requester(
response_spec=['content', 'history', 'status_code', 'url'],
history=[(302, url)],
final_path=zone_root_doc.zone.url_root,
content=html)
storage = mock_storage(spec=['save_document_rendered'])
resources = source.gather(requester, storage)
assert resources == []
assert source.state == source.STATE_DONE
context = {'redirect_to': 'Zone'}
storage.save_document_rendered.assert_called_once_with(
'en-US', 'Root/Zone', context)


def test_zone_child_doc(zone_root_doc, zone_child_doc, client):
"""The zone_css_slug is not extracted from zone children."""
url = zone_child_doc.get_absolute_url()
html = client.get(url, follow=True).content
def test_redirect(root_doc, client):
"""Test a page with a redirect."""
final_path = root_doc.get_absolute_url()
url = final_path.replace(root_doc.slug, 'Redirect')
source = DocumentRenderedSource(url)
requester = mock_requester(
response_spec=['content', 'history', 'status_code', 'url'],
history=[(302, url)],
final_path=zone_root_doc.zone.url_root,
content=html)
history=[(301, url)],
final_path=final_path)
storage = mock_storage(spec=['save_document_rendered'])
resources = source.gather(requester, storage)
assert resources == []
assert source.state == source.STATE_DONE
context = {'redirect_to': 'Zone'}
storage.save_document_rendered.assert_called_once_with(
'en-US', 'Root/Zone/Child', context)
'en-US', 'Redirect', {'redirect_to': final_path})


def test_missing_doc(client):
Expand Down

0 comments on commit 1599c83

Please sign in to comment.