
bug 1316610: Switch ALLOW_ROBOTS to lists
Instead of a yes/no setting, use lists of hostnames that allow robots,
and deny robots for other hostnames.

For the main website, robots.txt allows robots, but forbids some paths
with dynamic or user-specific content.

For the untrusted attachments and samples domain, allow robots.
Previously, they were not allowed in AWS, but they seem to be an
important part of the content.
jwhitlock committed Feb 2, 2018
1 parent 4ba37df commit d723177
Showing 4 changed files with 55 additions and 27 deletions.
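
For orientation before the diffs: the view now picks one of three robots.txt bodies per hostname. The ROBOTS_ALLOWED_TXT and ROBOTS_GO_AWAY_TXT constants live in kuma/landing/views.py and are not shown in this commit, so the sketch below is only inferred from the new test assertions (a Sitemap line plus a Disallow: /admin/ rule for the main website, a blanket Disallow: / for unlisted hosts, and an empty file for attachment hosts); the sitemap URL is an assumed placeholder, not the project's literal constant.

# Sketch only, inferred from kuma/landing/tests/test_views.py below; not the literal constants.

# Main website hosts (ALLOW_ROBOTS_WEB_DOMAINS): crawling allowed, some paths blocked.
# The sitemap URL here is a placeholder.
ROBOTS_ALLOWED_TXT = """\
User-agent: *
Sitemap: https://developer.mozilla.org/sitemap.xml
Disallow: /admin/
"""

# Any other hostname: deny all crawling.
ROBOTS_GO_AWAY_TXT = """\
User-agent: *
Disallow: /
"""

# Attachment and sample hosts (ALLOW_ROBOTS_DOMAINS): the view returns an empty
# body, i.e. no rules at all, so everything may be crawled.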
kuma/landing/tests/test_views.py (30 additions, 20 deletions)
@@ -1,5 +1,3 @@
 import pytest
 
 from kuma.core.urlresolvers import reverse

@@ -19,27 +17,39 @@ def test_promote_buttons(client, db):
     assert response.status_code == 200
 
 
-@pytest.mark.parametrize('allowed', [True, False])
-@pytest.mark.parametrize(
-    'host', [None, 'ATTACHMENT_HOST', 'ATTACHMENT_ORIGIN'])
-def test_robots(client, db, settings, host, allowed):
-    settings.ALLOW_ROBOTS = allowed
-    settings.ATTACHMENT_HOST = 'demos'
-    settings.ATTACHMENT_ORIGIN = 'demos-origin'
-    settings.ENABLE_RESTRICTIONS_BY_HOST = True
-    headers = {'HTTP_HOST': getattr(settings, host)} if host else {}
-    response = client.get(reverse('robots_txt'), **headers)
+def test_robots_not_allowed(client):
+    """By default, robots.txt shows that robots are not allowed."""
+    response = client.get(reverse('robots_txt'))
     assert response.status_code == 200
     assert response['Content-Type'] == 'text/plain'
     content = response.content
+    assert 'Sitemap: ' not in content
+    assert 'Disallow: /\n' in content
+    assert 'Disallow: /admin/\n' not in content
+
+
+def test_robots_allowed_main_website(client, settings):
+    """On the main website, allow robots with restrictions."""
+    host = 'main.mdn.moz.works'
+    settings.ALLOW_ROBOTS_WEB_DOMAINS = [host]
+    response = client.get(reverse('robots_txt'), HTTP_HOST=host)
+    assert response.status_code == 200
+    assert response['Content-Type'] == 'text/plain'
+    content = response.content
+    assert 'Sitemap: ' in content
+    assert 'Disallow: /\n' not in content
+    assert 'Disallow: /admin/\n' in content
+
+
+def test_robots_allowed_main_attachment_host(client, settings):
+    """On the main attachment host, allow robots without restrictions."""
+    host = 'samples.mdn.moz.works'
+    settings.ALLOW_ROBOTS_DOMAINS = [host]
+    response = client.get(reverse('robots_txt'), HTTP_HOST=host)
+    assert response.status_code == 200
+    assert response['Content-Type'] == 'text/plain'
+    content = response.content
-    if host or not allowed:
-        assert 'Sitemap: ' not in content
-        assert 'Disallow: /\n' in content
-        assert 'Disallow: /admin/\n' not in content
-    else:
-        assert 'Sitemap: ' in content
-        assert 'Disallow: /\n' not in content
-        assert 'Disallow: /admin/\n' in content
+    assert content == ''
 
 
 def test_favicon_ico(client):
kuma/landing/views.py (4 additions, 4 deletions)
@@ -8,7 +8,6 @@
 
 from kuma.core.sections import SECTION_USAGE
 from kuma.core.cache import memcache
-from kuma.core.utils import is_untrusted
 from kuma.feeder.models import Bundle
 from kuma.search.models import Filter
 

@@ -120,9 +119,10 @@ def robots_txt(request):
     TODO: After AWS move, try different strategy (WhiteNoise, template)
     """
-    if settings.ENABLE_RESTRICTIONS_BY_HOST and is_untrusted(request):
-        robots = ROBOTS_GO_AWAY_TXT
-    elif settings.ALLOW_ROBOTS:
+    host = request.get_host()
+    if host in settings.ALLOW_ROBOTS_DOMAINS:
+        robots = ""
+    elif host in settings.ALLOW_ROBOTS_WEB_DOMAINS:
         robots = ROBOTS_ALLOWED_TXT
     else:
         robots = ROBOTS_GO_AWAY_TXT
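
The new branch above is a plain lookup of request.get_host() against the two lists. A small runnable sketch of the same decision, using the default domains added to kuma/settings/common.py below (the third hostname is an invented example of an unlisted host):

# Defaults copied from kuma/settings/common.py (next file in this commit).
ALLOW_ROBOTS_DOMAINS = {'mdn.mozillademos.org', 'mdn-demos-origin.moz.works'}
ALLOW_ROBOTS_WEB_DOMAINS = {'developer.mozilla.org'}


def robots_policy(host):
    """Describe which robots.txt body a given Host header would receive."""
    if host in ALLOW_ROBOTS_DOMAINS:
        return 'empty file: crawl everything'
    elif host in ALLOW_ROBOTS_WEB_DOMAINS:
        return 'ROBOTS_ALLOWED_TXT: crawl, a few paths disallowed'
    return 'ROBOTS_GO_AWAY_TXT: deny all crawling'


for host in ('mdn.mozillademos.org', 'developer.mozilla.org', 'untrusted.example.com'):
    print('{0} -> {1}'.format(host, robots_policy(host)))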
kuma/settings/common.py (14 additions, 2 deletions)
@@ -53,9 +53,7 @@ def __call__(self, value):
     default='https://interactive-examples.mdn.mozilla.net')
 
 MAINTENANCE_MODE = config('MAINTENANCE_MODE', default=False, cast=bool)
-ALLOW_ROBOTS = config('ALLOW_ROBOTS', default=False, cast=bool)
 REVISION_HASH = config('REVISION_HASH', default='undefined')
-
 MANAGERS = ADMINS
 

@@ -1226,6 +1224,20 @@ def pipeline_one_scss(slug, **kwargs):
     cast=bool
 )
 
+# Allow robots, but restrict some paths
+# If the domain is a CDN, the CDN origin should be included.
+ALLOW_ROBOTS_WEB_DOMAINS = set(
+    config('ALLOW_ROBOTS_WEB_DOMAINS',
+           default='developer.mozilla.org',
+           cast=Csv()))
+
+# Allow robots, no path restrictions
+# If the domain is a CDN, the CDN origin should be included.
+ALLOW_ROBOTS_DOMAINS = set(
+    config('ALLOW_ROBOTS_DOMAINS',
+           default='mdn.mozillademos.org,mdn-demos-origin.moz.works',
+           cast=Csv()))
+
 # Video settings, hard coded here for now.
 # TODO: figure out a way that doesn't need these values
 WIKI_VIDEO_WIDTH = 640
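
Both settings are read through config() with a Csv() cast (presumably python-decouple, which this settings module uses elsewhere), so a deployment overrides them with a comma-separated environment variable and the result is wrapped in set() for membership checks. A sketch of how such an override parses; the second hostname is taken from the headless test below, not from the defaults:

from decouple import Csv  # assumption: the same Csv cast passed to config() above

# e.g. environment: ALLOW_ROBOTS_WEB_DOMAINS=developer.mozilla.org,cdn.mdn.mozilla.net
raw_value = 'developer.mozilla.org, cdn.mdn.mozilla.net'

# Csv() splits on commas and strips whitespace, yielding a list of hostnames;
# common.py wraps that list in set() so robots_txt() does fast membership tests.
ALLOW_ROBOTS_WEB_DOMAINS = set(Csv()(raw_value))
assert ALLOW_ROBOTS_WEB_DOMAINS == {'developer.mozilla.org', 'cdn.mdn.mozilla.net'}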
tests/headless/test_robots.py (7 additions, 1 deletion)
@@ -3,6 +3,10 @@
 import pytest
 import requests
 
+INDEXED_ATTACHMENT_DOMAINS = set((
+    'mdn.mozillademos.org',  # Main attachments domain
+    'mdn-demos-origin.moz.works',  # Attachments origin
+))
 INDEXED_WEB_DOMAINS = set((
     'developer.mozilla.org',  # Main website, CDN origin
     'cdn.mdn.mozilla.net',  # Assets CDN

@@ -19,7 +23,9 @@ def test_robots(any_host_url):
 
     urlbits = urlsplit(any_host_url)
     hostname = urlbits.netloc
-    if hostname in INDEXED_WEB_DOMAINS:
+    if hostname in INDEXED_ATTACHMENT_DOMAINS:
+        assert response.content.strip() == ''
+    elif hostname in INDEXED_WEB_DOMAINS:
         assert 'Sitemap: ' in response.content
         assert 'Disallow: /admin/\n' in response.content
     else:
