
bug 1316610: Switch ALLOW_ROBOTS to lists
Instead of a yes/no setting, use lists of hostnames that allow robots,
and deny robots for other hostnames.

For the main website, robots.txt allows robots, but forbids some paths
with dynamic or user-specific content.

For the untrusted attachments and samples domain, allow robots.
Previously, they were not allowed in AWS, but they seem to be an
important part of the content.
jwhitlock committed Feb 2, 2018
1 parent 4ba37df commit d723177
Showing 4 changed files with 55 additions and 27 deletions.
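
For orientation before the diffs: the view now picks one of three robots.txt bodies per hostname. The ROBOTS_ALLOWED_TXT and ROBOTS_GO_AWAY_TXT constants live in kuma/landing/views.py and are not shown in this commit, so the sketch below is only inferred from the new test assertions (a Sitemap line plus a Disallow: /admin/ rule for the main website, a blanket Disallow: / for unlisted hosts, and an empty file for attachment hosts); the sitemap URL is an assumed placeholder, not the project's literal constant.

# Sketch only, inferred from kuma/landing/tests/test_views.py below; not the literal constants.

# Main website hosts (ALLOW_ROBOTS_WEB_DOMAINS): crawling allowed, some paths blocked.
# The sitemap URL here is a placeholder.
ROBOTS_ALLOWED_TXT = """\
User-agent: *
Sitemap: https://developer.mozilla.org/sitemap.xml
Disallow: /admin/
"""

# Any other hostname: deny all crawling.
ROBOTS_GO_AWAY_TXT = """\
User-agent: *
Disallow: /
"""

# Attachment and sample hosts (ALLOW_ROBOTS_DOMAINS): the view returns an empty
# body, i.e. no rules at all, so everything may be crawled.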
kuma/landing/tests/test_views.py (30 additions, 20 deletions)
@@ -1,5 +1,3 @@
 import pytest
 
 from kuma.core.urlresolvers import reverse

@@ -19,27 +17,39 @@ def test_promote_buttons(client, db):
     assert response.status_code == 200
 
 
-@pytest.mark.parametrize('allowed', [True, False])
-@pytest.mark.parametrize(
-    'host', [None, 'ATTACHMENT_HOST', 'ATTACHMENT_ORIGIN'])
-def test_robots(client, db, settings, host, allowed):
-    settings.ALLOW_ROBOTS = allowed
-    settings.ATTACHMENT_HOST = 'demos'
-    settings.ATTACHMENT_ORIGIN = 'demos-origin'
-    settings.ENABLE_RESTRICTIONS_BY_HOST = True
-    headers = {'HTTP_HOST': getattr(settings, host)} if host else {}
-    response = client.get(reverse('robots_txt'), **headers)
+def test_robots_not_allowed(client):
+    """By default, robots.txt shows that robots are not allowed."""
+    response = client.get(reverse('robots_txt'))
     assert response.status_code == 200
     assert response['Content-Type'] == 'text/plain'
     content = response.content
+    assert 'Sitemap: ' not in content
+    assert 'Disallow: /\n' in content
+    assert 'Disallow: /admin/\n' not in content
+
+
+def test_robots_allowed_main_website(client, settings):
+    """On the main website, allow robots with restrictions."""
+    host = 'main.mdn.moz.works'
+    settings.ALLOW_ROBOTS_WEB_DOMAINS = [host]
+    response = client.get(reverse('robots_txt'), HTTP_HOST=host)
+    assert response.status_code == 200
+    assert response['Content-Type'] == 'text/plain'
+    content = response.content
+    assert 'Sitemap: ' in content
+    assert 'Disallow: /\n' not in content
+    assert 'Disallow: /admin/\n' in content
+
+
+def test_robots_allowed_main_attachment_host(client, settings):
+    """On the main attachment host, allow robots without restrictions."""
+    host = 'samples.mdn.moz.works'
+    settings.ALLOW_ROBOTS_DOMAINS = [host]
+    response = client.get(reverse('robots_txt'), HTTP_HOST=host)
+    assert response.status_code == 200
+    assert response['Content-Type'] == 'text/plain'
+    content = response.content
-    if host or not allowed:
-        assert 'Sitemap: ' not in content
-        assert 'Disallow: /\n' in content
-        assert 'Disallow: /admin/\n' not in content
-    else:
-        assert 'Sitemap: ' in content
-        assert 'Disallow: /\n' not in content
-        assert 'Disallow: /admin/\n' in content
+    assert content == ''
 
 
 def test_favicon_ico(client):
kuma/landing/views.py (4 additions, 4 deletions)
@@ -8,7 +8,6 @@
 
 from kuma.core.sections import SECTION_USAGE
 from kuma.core.cache import memcache
-from kuma.core.utils import is_untrusted
 from kuma.feeder.models import Bundle
 from kuma.search.models import Filter
 

@@ -120,9 +119,10 @@ def robots_txt(request):
     TODO: After AWS move, try different strategy (WhiteNoise, template)
     """
-    if settings.ENABLE_RESTRICTIONS_BY_HOST and is_untrusted(request):
-        robots = ROBOTS_GO_AWAY_TXT
-    elif settings.ALLOW_ROBOTS:
+    host = request.get_host()
+    if host in settings.ALLOW_ROBOTS_DOMAINS:
+        robots = ""
+    elif host in settings.ALLOW_ROBOTS_WEB_DOMAINS:
         robots = ROBOTS_ALLOWED_TXT
     else:
         robots = ROBOTS_GO_AWAY_TXT
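
The new branch above is a plain lookup of request.get_host() against the two lists. A small runnable sketch of the same decision, using the default domains added to kuma/settings/common.py below (the third hostname is an invented example of an unlisted host):

# Defaults copied from kuma/settings/common.py (next file in this commit).
ALLOW_ROBOTS_DOMAINS = {'mdn.mozillademos.org', 'mdn-demos-origin.moz.works'}
ALLOW_ROBOTS_WEB_DOMAINS = {'developer.mozilla.org'}


def robots_policy(host):
    """Describe which robots.txt body a given Host header would receive."""
    if host in ALLOW_ROBOTS_DOMAINS:
        return 'empty file: crawl everything'
    elif host in ALLOW_ROBOTS_WEB_DOMAINS:
        return 'ROBOTS_ALLOWED_TXT: crawl, a few paths disallowed'
    return 'ROBOTS_GO_AWAY_TXT: deny all crawling'


for host in ('mdn.mozillademos.org', 'developer.mozilla.org', 'untrusted.example.com'):
    print('{0} -> {1}'.format(host, robots_policy(host)))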
kuma/settings/common.py (14 additions, 2 deletions)
@@ -53,9 +53,7 @@ def __call__(self, value):
     default='https://interactive-examples.mdn.mozilla.net')
 
 MAINTENANCE_MODE = config('MAINTENANCE_MODE', default=False, cast=bool)
-ALLOW_ROBOTS = config('ALLOW_ROBOTS', default=False, cast=bool)
 REVISION_HASH = config('REVISION_HASH', default='undefined')
-
 MANAGERS = ADMINS
 

@@ -1226,6 +1224,20 @@ def pipeline_one_scss(slug, **kwargs):
     cast=bool
 )
 
+# Allow robots, but restrict some paths
+# If the domain is a CDN, the CDN origin should be included.
+ALLOW_ROBOTS_WEB_DOMAINS = set(
+    config('ALLOW_ROBOTS_WEB_DOMAINS',
+           default='developer.mozilla.org',
+           cast=Csv()))
+
+# Allow robots, no path restrictions
+# If the domain is a CDN, the CDN origin should be included.
+ALLOW_ROBOTS_DOMAINS = set(
+    config('ALLOW_ROBOTS_DOMAINS',
+           default='mdn.mozillademos.org,mdn-demos-origin.moz.works',
+           cast=Csv()))
+
 # Video settings, hard coded here for now.
 # TODO: figure out a way that doesn't need these values
 WIKI_VIDEO_WIDTH = 640
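
Both settings are read through config() with a Csv() cast (presumably python-decouple, which this settings module uses elsewhere), so a deployment overrides them with a comma-separated environment variable and the result is wrapped in set() for membership checks. A sketch of how such an override parses; the second hostname is taken from the headless test below, not from the defaults:

from decouple import Csv  # assumption: the same Csv cast passed to config() above

# e.g. environment: ALLOW_ROBOTS_WEB_DOMAINS=developer.mozilla.org,cdn.mdn.mozilla.net
raw_value = 'developer.mozilla.org, cdn.mdn.mozilla.net'

# Csv() splits on commas and strips whitespace, yielding a list of hostnames;
# common.py wraps that list in set() so robots_txt() does fast membership tests.
ALLOW_ROBOTS_WEB_DOMAINS = set(Csv()(raw_value))
assert ALLOW_ROBOTS_WEB_DOMAINS == {'developer.mozilla.org', 'cdn.mdn.mozilla.net'}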
tests/headless/test_robots.py (7 additions, 1 deletion)
@@ -3,6 +3,10 @@
 import pytest
 import requests
 
+INDEXED_ATTACHMENT_DOMAINS = set((
+    'mdn.mozillademos.org',  # Main attachments domain
+    'mdn-demos-origin.moz.works',  # Attachments origin
+))
 INDEXED_WEB_DOMAINS = set((
     'developer.mozilla.org',  # Main website, CDN origin
     'cdn.mdn.mozilla.net',  # Assets CDN

@@ -19,7 +23,9 @@ def test_robots(any_host_url):
 
     urlbits = urlsplit(any_host_url)
     hostname = urlbits.netloc
-    if hostname in INDEXED_WEB_DOMAINS:
+    if hostname in INDEXED_ATTACHMENT_DOMAINS:
+        assert response.content.strip() == ''
+    elif hostname in INDEXED_WEB_DOMAINS:
         assert 'Sitemap: ' in response.content
         assert 'Disallow: /admin/\n' in response.content
     else:
