Commit 99d2476
Stop doing this fancy-schmancy parsing of the page text in order to get the sections dynamically. Instead, pull the cached HTML and read that. This cuts the run time from 40 minutes to about 6 minutes. \o/
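
In rough terms, the change boils down to this: fetch the article's cached, rendered HTML once and scan it for id="..." attributes, instead of asking the API to parse the wikitext on every run. Below is a minimal Python 2 sketch of that idea, for illustration only; the real code in the diff builds the URL from settings.apiurl, sets a user-agent, and runs each id through unescape_id().

    import re
    import urllib

    def sketch_get_anchors(article):
        # Illustrative only: fetch the cached, rendered article HTML directly.
        html = urllib.urlopen('http://en.wikipedia.org/wiki/%s' % article).read()
        # Headings and explicit anchors carry id="..." attributes.
        return re.findall(r'id="(.+?)"', html)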

Also, exclude redirects in the form of "#REDIRECT [[Foo|Bar]]". These are technically valid, but wildly silly and should be fixed in one or two sweeps. This could be fixed in the database population script as well, but these cases are rare enough that it probably isn't worth it.
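
For illustration (not part of this commit's code): a piped redirect is wikitext like "#REDIRECT [[Foo|Bar]]". The target still works, but the pipe and display text serve no purpose on a redirect page. As the diff below shows, the query change simply skips rows whose stored redirect target still contains a "|". A hypothetical Python 2 check for the piped form:

    import re

    # Hypothetical helper, not used by the report itself.
    PIPED_REDIRECT_RE = re.compile(r'#REDIRECT\s*\[\[[^\]|]+\|[^\]]+\]\]', re.I)

    def is_piped_redirect(wikitext):
        # True for "#REDIRECT [[Foo|Bar]]", False for "#REDIRECT [[Foo]]".
        return bool(PIPED_REDIRECT_RE.match(wikitext.strip()))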
mzmcbride committed May 4, 2011
1 parent ce9ece4 commit 99d2476
Showing 1 changed file with 12 additions and 22 deletions.
34 changes: 12 additions & 22 deletions brokensectionanchors.py
@@ -5,39 +5,28 @@
 import re
 import MySQLdb
 import wikitools
-from urllib import unquote
+import urllib
 import settings
 
 def get_article_section_anchors(article):
     # Returns a list of section anchors from a specified article
     article_sections = []
-    anchor_re = re.compile(r'<span id="(.+?)"></span>', re.I)
-    params = {'action' : 'parse',
-              'prop' : 'sections|text',
-              'page' : '%s' % article,
-              'format' : 'json'}
-    request = wikitools.APIRequest(wiki, params)
-    response = request.query(querycontinue=False)
-    page_text = response['parse']['text']['*']
-    for match in anchor_re.finditer(page_text):
+    # Set a user-agent :-)
+    class urlopener(urllib.FancyURLopener):
+        version = 'http://en.wikipedia.org/wiki/Wikipedia_talk:Database_reports'
+    id_re = re.compile(r'id="(.+?)"')
+    target_url = settings.apiurl.replace('w/api.php','wiki/%s' % article)
+    urlopener = urlopener()
+    page = urlopener.open(target_url)
+    page_text = page.read()
+    for match in id_re.finditer(page_text):
         article_sections.append(unescape_id(match.group(1).encode('utf-8')))
-    for entry in response['parse']['sections']:
-        article_sections.append(unescape_id(entry[u'anchor'].encode('utf-8')))
-    if len(article_sections) == 0:
-        params = {'action' : 'parse',
-                  'prop' : 'sections',
-                  'text' : '__FORCETOC__{{:%s}}' % article,
-                  'format' : 'json'}
-        request = wikitools.APIRequest(wiki, params)
-        response = request.query(querycontinue=False)
-        for entry in response['parse']['sections']:
-            article_sections.append(unescape_id(entry[u'anchor'].encode('utf-8')))
     return article_sections
 
 def unescape_id(fragment):
     fragment = fragment.replace('%', 'UNIQUE MARKER')
     fragment = fragment.replace('.', '%')
-    fragment = unquote(fragment)
+    fragment = urllib.unquote(fragment)
     fragment = fragment.replace('%', '.')
     fragment = fragment.replace('UNIQUE MARKER', '%')
     return fragment
@@ -94,6 +83,7 @@ def get_top_edit_timestamp(cursor, page_id):
 ON rd.rd_from = page_id
 WHERE page_namespace = 0
 AND rd_fragment IS NOT NULL
+AND rd.rd_title NOT LIKE '%|%'
 GROUP BY rd.rd_title
 LIMIT 2500;
 ''')
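
A side note on the unescape_id() helper that survives this change: MediaWiki section anchors of this era are dot-escaped rather than percent-escaped (".C3.A9" instead of "%C3%A9"), so the function temporarily swaps "." and "%" in order to reuse urllib.unquote, then restores any literal dots. A quick illustrative round trip, assuming that anchor encoding:

    import urllib

    def unescape_id(fragment):
        # Same helper as in brokensectionanchors.py above.
        fragment = fragment.replace('%', 'UNIQUE MARKER')
        fragment = fragment.replace('.', '%')
        fragment = urllib.unquote(fragment)
        fragment = fragment.replace('%', '.')
        fragment = fragment.replace('UNIQUE MARKER', '%')
        return fragment

    print unescape_id('Caf.C3.A9') == 'Caf\xc3\xa9'  # True: dot-escapes decode to raw UTF-8 bytes
    print unescape_id('Version_2.0')                 # 'Version_2.0': a bare "." that is not an escape is preserved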
