diff --git a/.gitignore b/.gitignore
index 8f49214e4af..27c222b56e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,7 @@ build.py
 **-min.js
 media/uploads
 media/attachments
+media/sitemap*
 locale
 xfers/*
 puppet/cache/*
diff --git a/apps/wiki/cron.py b/apps/wiki/cron.py
index 7b5ee4bdce8..00d199cf478 100644
--- a/apps/wiki/cron.py
+++ b/apps/wiki/cron.py
@@ -1,8 +1,18 @@
+import os
+import time
+from xml.dom.minidom import parseString
+
 from django.db import connection, transaction
+from django.conf import settings
+from django.contrib.sites.models import Site
+from django.contrib.sitemaps import GenericSitemap
+from django.template import loader
+from django.utils.encoding import smart_str
 
 import cronjobs
 
 from wiki import tasks
+from wiki.models import Document
 
 
 @cronjobs.register
@@ -60,3 +70,63 @@ def calculate_related_documents():
 @cronjobs.register
 def rebuild_kb():
     tasks.rebuild_kb()
+
+
+@cronjobs.register
+def build_sitemaps():
+    """Write a static sitemap.xml per locale, plus a sitemap index.
+
+    For each locale in settings.MDN_LANGUAGES with at least one matching
+    document, the rendered sitemap is written to
+    MEDIA_ROOT/sitemaps/<locale>/sitemap.xml. An index that lists every
+    sitemap written is then saved as MEDIA_ROOT/sitemap.xml.
+    """
+    sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
+    index_open = ('<sitemapindex '
+                  'xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
+    index_close = "</sitemapindex>"
+    domain = Site.objects.get_current().domain
+    # W3C Datetime needs a zone designator once a time is given, so emit UTC.
+    lastmod = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+    entries = []
+
+    for locale in settings.MDN_LANGUAGES:
+        queryset = (Document.objects
+                        .filter(is_template=False, locale=locale)
+                        .exclude(title__startswith='User:')
+                        .exclude(title__iregex=r'Redirect [0-9]+$')
+                        .exclude(html__iregex=r'^(<p>)?(#)?REDIRECT')
+                        .exclude(slug__icontains='Talk:')
+                    )
+        # exists() asks the database for one row instead of loading every
+        # document just to test for emptiness.
+        if not queryset.exists():
+            continue
+
+        info = {'queryset': queryset, 'date_field': 'modified'}
+        sitemap = GenericSitemap(info, priority=0.5)
+        # NOTE(review): only the first page is rendered; confirm no locale
+        # exceeds the per-page URL limit of the sitemap class.
+        urls = sitemap.get_urls(page=1)
+        xml = smart_str(loader.render_to_string('sitemap.xml',
+                                                {'urlset': urls}))
+        # Force https on page URLs, then restore the sitemap schema
+        # namespace, which must stay http:// to remain valid.
+        xml = xml.replace('http://', 'https://')
+        xml = xml.replace('https://www.sitemaps.org/schemas/',
+                          'http://www.sitemaps.org/schemas/')
+
+        directory = os.path.join(settings.MEDIA_ROOT, 'sitemaps', locale)
+        if not os.path.exists(directory):
+            os.makedirs(directory)
+        # "with" closes the file even when the write fails.
+        with open(os.path.join(directory, 'sitemap.xml'), 'w') as f:
+            f.write(xml)
+
+        sitemap_url = "https://%s/sitemaps/%s/sitemap.xml" % (domain, locale)
+        entries.append(sitemap_element % (sitemap_url, lastmod))
+
+    sitemap_index = index_open + ''.join(entries) + index_close
+    index_path = os.path.join(settings.MEDIA_ROOT, 'sitemap.xml')
+    with open(index_path, 'w') as index_file:
+        index_file.write(parseString(sitemap_index).toxml())
diff --git a/apps/wiki/sitemap.py b/apps/wiki/sitemap.py
deleted file mode 100644
index a53a10c6c60..00000000000
--- a/apps/wiki/sitemap.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from django.contrib.sitemaps import Sitemap
-from wiki.models import (Document, Revision)
-
-class DocumentSitemap(Sitemap):
-    changefreq = 'weekly'
-    priority = 0.5
-
-    def items(self):
-        docs = Document.objects.filter(is_template=False)
-        return docs
-
-    def lastmod(self, doc):
-        return doc.current_revision.created
-
-    def location(self, doc):
-        return doc.get_absolute_url()
\ No newline at end of file
diff --git a/configs/htaccess-without-mindtouch b/configs/htaccess-without-mindtouch
index 6546c2df4f5..e86178b04ba 100644
--- a/configs/htaccess-without-mindtouch
+++ b/configs/htaccess-without-mindtouch
@@ -14,6 +14,11 @@ RewriteRule ^devnews(.*) data/www/devnews$1 [L]
 RewriteRule ^web-tech(.*) data/www/web-tech$1 [L]
 RewriteRule ^css(.*) data/www/css$1 [L]
 
+# Rewrites to robots & sitemaps
+RewriteRule ^robots.txt$ media/robots.txt [L]
+RewriteRule ^sitemap.xml$ 
media/sitemap.xml [L] +RewriteRule ^sitemaps/([\w\-]*)/sitemap.xml$ media/sitemaps/$1/sitemap.xml [L] + # Some blanket section moves / renames RewriteRule ^En/JavaScript/Reference/Objects/Array$ en-US/docs/JavaScript/Reference/Global_Objects/Array [R=301,L,NC] RewriteRule ^En/JavaScript/Reference/Objects$ en-US/docs/JavaScript/Reference/Global_Objects/Object [R=301,L,NC] diff --git a/media/robots.txt b/media/robots.txt new file mode 100644 index 00000000000..ab7c680d482 --- /dev/null +++ b/media/robots.txt @@ -0,0 +1,10 @@ +User-Agent: * +Crawl-delay: 5 +Sitemap: sitemap.xml +Request-rate: 1/5 + +Disallow: /*feed=rss +Disallow: /*type=feed +Disallow: /skins +Disallow: /template: +Disallow: /media diff --git a/puppet/files/etc/httpd/conf.d/mozilla-kuma-apache.conf b/puppet/files/etc/httpd/conf.d/mozilla-kuma-apache.conf index 5cc3af158bd..3f702e80ae3 100644 --- a/puppet/files/etc/httpd/conf.d/mozilla-kuma-apache.conf +++ b/puppet/files/etc/httpd/conf.d/mozilla-kuma-apache.conf @@ -21,19 +21,12 @@ WSGISocketPrefix /var/run/wsgi ServerAlias developer-mdndev.mozilla.org ServerAlias developer-dev.mozilla.org - DirectoryIndex index.php index.html + DirectoryIndex index.html Options -Indexes RewriteEngine On - DocumentRoot /var/www/dekiwiki - - - Options +FollowSymLinks - AllowOverride all - Order allow,deny - Allow from all - + DocumentRoot /vagrant/webroot Options All @@ -42,7 +35,6 @@ WSGISocketPrefix /var/run/wsgi Allow from all - Alias /forums /var/www/forums Alias /media/ "/vagrant/media/" Alias /uploads/ "/home/vagrant/uploads/" Alias /admin-media/ "/vagrant/vendor/src/django/django/contrib/admin/media/" @@ -60,32 +52,6 @@ WSGISocketPrefix /var/run/wsgi ProxyPass /mwsgi http://localhost:8000 retry=1 ProxyPassReverse /mwsgi http://localhost:8000 - # deki-api uses encoded slashes in query parameters so AllowEncodedSlashes must be On - AllowEncodedSlashes On - # FIXME: - # Some php flags we need. 
These are only needed until all - # the short php open tags are changed to long in the source code. - php_flag short_open_tag on - - # Setting php memory parameters - php_value memory_limit "128M" - php_value post_max_size "64M" - php_value upload_max_filesize "64M" - - # mod_proxy rules - ProxyPass /@api http://localhost:8081 retry=1 - ProxyPassReverse /@api http://localhost:8081 - - SetEnv force-proxy-request-1.0 1 - SetEnv proxy-nokeepalive 1 - - - AddDefaultCharset off - Order deny,allow - Deny from all - Allow from all - - # Proxy any requests for non-existent resources under /media/uploads to the # production site. That way, we don't need an export of files from prod for # references in a DB export from prod. diff --git a/settings.py b/settings.py index 42947117133..842aea21b3b 100644 --- a/settings.py +++ b/settings.py @@ -267,7 +267,7 @@ def lazy_language_deki_map(): ADMIN_MEDIA_PREFIX = '/admin-media/' # Paths that don't require a locale prefix. -SUPPORTED_NONLOCALES = ('sitemap.xml', 'media', 'admin', 'robots.txt', 'services', +SUPPORTED_NONLOCALES = ('media', 'admin', 'robots.txt', 'services', '1', 'files', '@api', ) # Make this unique, and don't share it with anybody. diff --git a/urls.py b/urls.py index af5fb459422..a5260d5e8c0 100644 --- a/urls.py +++ b/urls.py @@ -4,7 +4,6 @@ from django.shortcuts import redirect from django.views.i18n import javascript_catalog from django.views.decorators.cache import cache_page -from wiki.sitemap import DocumentSitemap import authority import jingo @@ -13,10 +12,6 @@ admin.autodiscover() authority.autodiscover() -sitemaps = { - 'documents': DocumentSitemap, -} - urlpatterns = patterns('', # Home / landing pages: ('', include('landing.urls')), @@ -74,9 +69,6 @@ # Users ('', include('users.urls')), - #Sitemap - (r'^sitemap\.xml$', 'django.contrib.sitemaps.views.sitemap', { 'sitemaps': sitemaps }), - # Services and sundry. #(r'', include('sumo.urls')), (r'^humans.txt$', 'django.views.static.serve',