Skip to content
This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Commit

Permalink
fix bug 792947 - use cron to create static sitemap.xml
Browse files Browse the repository at this point in the history
per-locale sitemaps excluding noisy docs

use a sitemap index file

change DocumentRoot to django webroot

unused sitemap module; Site for domain; https locs
  • Loading branch information
groovecoder committed Oct 3, 2012
1 parent db8ed07 commit 8bab924
Show file tree
Hide file tree
Showing 8 changed files with 66 additions and 61 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -18,6 +18,7 @@ build.py
**-min.js
media/uploads
media/attachments
media/sitemap*
locale
xfers/*
puppet/cache/*
Expand Down
47 changes: 47 additions & 0 deletions apps/wiki/cron.py
@@ -1,8 +1,18 @@
import os
import time
from xml.dom.minidom import parseString

from django.db import connection, transaction
from django.conf import settings
from django.contrib.sites.models import Site
from django.contrib.sitemaps import GenericSitemap
from django.template import loader
from django.utils.encoding import smart_str

import cronjobs

from wiki import tasks
from wiki.models import Document


@cronjobs.register
Expand Down Expand Up @@ -60,3 +70,40 @@ def calculate_related_documents():
@cronjobs.register
def rebuild_kb():
    """Cron entry point that delegates to ``tasks.rebuild_kb()``."""
    tasks.rebuild_kb()


@cronjobs.register
def build_sitemaps():
    """Write static per-locale sitemaps plus a sitemap index to MEDIA_ROOT.

    For every locale in ``settings.MDN_LANGUAGES`` that has at least one
    eligible document, ``<MEDIA_ROOT>/sitemaps/<locale>/sitemap.xml`` is
    rendered from the ``sitemap.xml`` template. Templates, ``User:`` pages,
    ``Talk:`` pages and redirects are excluded. Finally
    ``<MEDIA_ROOT>/sitemap.xml`` is written as a sitemap index pointing at
    each per-locale file over https.
    """
    sitemap_element = "<sitemap><loc>%s</loc><lastmod>%s</lastmod></sitemap>"
    index_parts = [
        "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
    ]

    # Looked up once; the domain does not change between locales.
    domain = Site.objects.get_current().domain

    # The sitemap protocol requires W3C datetimes; when a time is given it
    # must carry a timezone designator, so emit UTC with a trailing 'Z'.
    generated_at = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    for locale in settings.MDN_LANGUAGES:
        queryset = (Document.objects
                    .filter(is_template=False, locale=locale)
                    .exclude(title__startswith='User:')
                    .exclude(title__iregex=r'Redirect [0-9]+$')
                    .exclude(html__iregex=r'^(<p>)?(#)?REDIRECT')
                    .exclude(slug__icontains='Talk:'))

        # exists() asks the database for a single row instead of loading
        # every matching document just to test for emptiness.
        if not queryset.exists():
            continue

        info = {'queryset': queryset, 'date_field': 'modified'}
        sitemap = GenericSitemap(info, priority=0.5)
        # NOTE(review): only the first page is rendered; presumably no
        # locale exceeds the sitemap page limit -- confirm.
        urls = sitemap.get_urls(page=1)

        # Force https on the document locations only. A blanket
        # str.replace over the rendered XML would also rewrite any
        # http:// namespace URI declared by the template.
        for url in urls:
            location = url['location']
            if location.startswith('http://'):
                url['location'] = 'https://' + location[len('http://'):]

        xml = smart_str(loader.render_to_string('sitemap.xml',
                                                {'urlset': urls}))

        directory = os.path.join(settings.MEDIA_ROOT, 'sitemaps', locale)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # 'with' guarantees the handle is closed even if the write fails.
        with open(os.path.join(directory, 'sitemap.xml'), 'w') as f:
            f.write(xml)

        sitemap_url = ("https://%s/sitemaps/%s/sitemap.xml" % (
            domain, locale))
        index_parts.append(sitemap_element % (sitemap_url, generated_at))

    index_parts.append("</sitemapindex>")
    sitemap_index = "".join(index_parts)

    # Round-trip through minidom so a malformed index raises here rather
    # than being published.
    index_path = os.path.join(settings.MEDIA_ROOT, 'sitemap.xml')
    with open(index_path, 'w') as index_file:
        index_file.write(parseString(sitemap_index).toxml())
16 changes: 0 additions & 16 deletions apps/wiki/sitemap.py

This file was deleted.

5 changes: 5 additions & 0 deletions configs/htaccess-without-mindtouch
Expand Up @@ -14,6 +14,11 @@ RewriteRule ^devnews(.*) data/www/devnews$1 [L]
RewriteRule ^web-tech(.*) data/www/web-tech$1 [L]
RewriteRule ^css(.*) data/www/css$1 [L]

# Rewrites to robots & sitemaps
RewriteRule ^robots.txt$ media/robots.txt [L]
RewriteRule ^sitemap.xml$ media/sitemap.xml [L]
RewriteRule ^sitemaps/([\w\-]*)/sitemap.xml$ media/sitemaps/$1/sitemap.xml [L]

# Some blanket section moves / renames
RewriteRule ^En/JavaScript/Reference/Objects/Array$ en-US/docs/JavaScript/Reference/Global_Objects/Array [R=301,L,NC]
RewriteRule ^En/JavaScript/Reference/Objects$ en-US/docs/JavaScript/Reference/Global_Objects/Object [R=301,L,NC]
Expand Down
10 changes: 10 additions & 0 deletions media/robots.txt
@@ -0,0 +1,10 @@
User-Agent: *
Crawl-delay: 5
Sitemap: sitemap.xml
Request-rate: 1/5

Disallow: /*feed=rss
Disallow: /*type=feed
Disallow: /skins
Disallow: /template:
Disallow: /media
38 changes: 2 additions & 36 deletions puppet/files/etc/httpd/conf.d/mozilla-kuma-apache.conf
Expand Up @@ -21,19 +21,12 @@ WSGISocketPrefix /var/run/wsgi
ServerAlias developer-mdndev.mozilla.org
ServerAlias developer-dev.mozilla.org

DirectoryIndex index.php index.html
DirectoryIndex index.html
Options -Indexes

RewriteEngine On

DocumentRoot /var/www/dekiwiki

<Directory /var/www/dekiwiki>
Options +FollowSymLinks
AllowOverride all
Order allow,deny
Allow from all
</Directory>
DocumentRoot /vagrant/webroot

<Directory "/vagrant/webroot">
Options All
Expand All @@ -42,7 +35,6 @@ WSGISocketPrefix /var/run/wsgi
Allow from all
</Directory>

Alias /forums /var/www/forums
Alias /media/ "/vagrant/media/"
Alias /uploads/ "/home/vagrant/uploads/"
Alias /admin-media/ "/vagrant/vendor/src/django/django/contrib/admin/media/"
Expand All @@ -60,32 +52,6 @@ WSGISocketPrefix /var/run/wsgi
ProxyPass /mwsgi http://localhost:8000 retry=1
ProxyPassReverse /mwsgi http://localhost:8000

# deki-api uses encoded slashes in query parameters so AllowEncodedSlashes must be On
AllowEncodedSlashes On
# FIXME:
# Some php flags we need. These are only needed until all
# the short php open tags are changed to long in the source code.
php_flag short_open_tag on

# Setting php memory parameters
php_value memory_limit "128M"
php_value post_max_size "64M"
php_value upload_max_filesize "64M"

# mod_proxy rules
ProxyPass /@api http://localhost:8081 retry=1
ProxyPassReverse /@api http://localhost:8081

SetEnv force-proxy-request-1.0 1
SetEnv proxy-nokeepalive 1

<Proxy *>
AddDefaultCharset off
Order deny,allow
Deny from all
Allow from all
</Proxy>

# Proxy any requests for non-existent resources under /media/uploads to the
# production site. That way, we don't need an export of files from prod for
# references in a DB export from prod.
Expand Down
2 changes: 1 addition & 1 deletion settings.py
Expand Up @@ -267,7 +267,7 @@ def lazy_language_deki_map():
ADMIN_MEDIA_PREFIX = '/admin-media/'

# Paths that don't require a locale prefix.
SUPPORTED_NONLOCALES = ('sitemap.xml', 'media', 'admin', 'robots.txt', 'services',
SUPPORTED_NONLOCALES = ('media', 'admin', 'robots.txt', 'services',
'1', 'files', '@api', )

# Make this unique, and don't share it with anybody.
Expand Down
8 changes: 0 additions & 8 deletions urls.py
Expand Up @@ -4,7 +4,6 @@
from django.shortcuts import redirect
from django.views.i18n import javascript_catalog
from django.views.decorators.cache import cache_page
from wiki.sitemap import DocumentSitemap

import authority
import jingo
Expand All @@ -13,10 +12,6 @@
admin.autodiscover()
authority.autodiscover()

sitemaps = {
'documents': DocumentSitemap,
}

urlpatterns = patterns('',
# Home / landing pages:
('', include('landing.urls')),
Expand Down Expand Up @@ -74,9 +69,6 @@
# Users
('', include('users.urls')),

#Sitemap
(r'^sitemap\.xml$', 'django.contrib.sitemaps.views.sitemap', { 'sitemaps': sitemaps }),

# Services and sundry.
#(r'', include('sumo.urls')),
(r'^humans.txt$', 'django.views.static.serve',
Expand Down

0 comments on commit 8bab924

Please sign in to comment.