Commit 99d2476
Stop doing this fancy-schmancy parsing of the page text in order to get the sections dynamically. Instead, pull the cached HTML and read that. This cuts the run time from 40 minutes to about 6 minutes. \o/
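
In rough terms, the change boils down to this: fetch the article's cached, rendered HTML once and scan it for id="..." attributes, instead of asking the API to parse the wikitext on every run. Below is a minimal Python 2 sketch of that idea, for illustration only; the real code in the diff builds the URL from settings.apiurl, sets a user-agent, and runs each id through unescape_id().

    import re
    import urllib

    def sketch_get_anchors(article):
        # Illustrative only: fetch the cached, rendered article HTML directly.
        html = urllib.urlopen('http://en.wikipedia.org/wiki/%s' % article).read()
        # Headings and explicit anchors carry id="..." attributes.
        return re.findall(r'id="(.+?)"', html)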

Also, exclude redirects in the form of "#REDIRECT [[Foo|Bar]]". These are technically valid, but wildly silly and should be fixed in one or two sweeps. This could be fixed in the database population script as well, but these cases are rare enough that it probably isn't worth it.
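
For illustration (not part of this commit's code): a piped redirect is wikitext like "#REDIRECT [[Foo|Bar]]". The target still works, but the pipe and display text serve no purpose on a redirect page. As the diff below shows, the query change simply skips rows whose stored redirect target still contains a "|". A hypothetical Python 2 check for the piped form:

    import re

    # Hypothetical helper, not used by the report itself.
    PIPED_REDIRECT_RE = re.compile(r'#REDIRECT\s*\[\[[^\]|]+\|[^\]]+\]\]', re.I)

    def is_piped_redirect(wikitext):
        # True for "#REDIRECT [[Foo|Bar]]", False for "#REDIRECT [[Foo]]".
        return bool(PIPED_REDIRECT_RE.match(wikitext.strip()))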
mzmcbride committed May 4, 2011
1 parent ce9ece4 commit 99d2476
Showing 1 changed file with 12 additions and 22 deletions.
34 changes: 12 additions & 22 deletions brokensectionanchors.py
@@ -5,39 +5,28 @@
 import re
 import MySQLdb
 import wikitools
-from urllib import unquote
+import urllib
 import settings
 
 def get_article_section_anchors(article):
     # Returns a list of section anchors from a specified article
     article_sections = []
-    anchor_re = re.compile(r'<span id="(.+?)"></span>', re.I)
-    params = {'action' : 'parse',
-              'prop' : 'sections|text',
-              'page' : '%s' % article,
-              'format' : 'json'}
-    request = wikitools.APIRequest(wiki, params)
-    response = request.query(querycontinue=False)
-    page_text = response['parse']['text']['*']
-    for match in anchor_re.finditer(page_text):
+    # Set a user-agent :-)
+    class urlopener(urllib.FancyURLopener):
+        version = 'http://en.wikipedia.org/wiki/Wikipedia_talk:Database_reports'
+    id_re = re.compile(r'id="(.+?)"')
+    target_url = settings.apiurl.replace('w/api.php','wiki/%s' % article)
+    urlopener = urlopener()
+    page = urlopener.open(target_url)
+    page_text = page.read()
+    for match in id_re.finditer(page_text):
         article_sections.append(unescape_id(match.group(1).encode('utf-8')))
-    for entry in response['parse']['sections']:
-        article_sections.append(unescape_id(entry[u'anchor'].encode('utf-8')))
-    if len(article_sections) == 0:
-        params = {'action' : 'parse',
-                  'prop' : 'sections',
-                  'text' : '__FORCETOC__{{:%s}}' % article,
-                  'format' : 'json'}
-        request = wikitools.APIRequest(wiki, params)
-        response = request.query(querycontinue=False)
-        for entry in response['parse']['sections']:
-            article_sections.append(unescape_id(entry[u'anchor'].encode('utf-8')))
     return article_sections
 
 def unescape_id(fragment):
     fragment = fragment.replace('%', 'UNIQUE MARKER')
     fragment = fragment.replace('.', '%')
-    fragment = unquote(fragment)
+    fragment = urllib.unquote(fragment)
     fragment = fragment.replace('%', '.')
     fragment = fragment.replace('UNIQUE MARKER', '%')
     return fragment
@@ -94,6 +83,7 @@ def get_top_edit_timestamp(cursor, page_id):
 ON rd.rd_from = page_id
 WHERE page_namespace = 0
 AND rd_fragment IS NOT NULL
+AND rd.rd_title NOT LIKE '%|%'
 GROUP BY rd.rd_title
 LIMIT 2500;
 ''')
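
A side note on the unescape_id() helper that survives this change: MediaWiki section anchors of this era are dot-escaped rather than percent-escaped (".C3.A9" instead of "%C3%A9"), so the function temporarily swaps "." and "%" in order to reuse urllib.unquote, then restores any literal dots. A quick illustrative round trip, assuming that anchor encoding:

    import urllib

    def unescape_id(fragment):
        # Same helper as in brokensectionanchors.py above.
        fragment = fragment.replace('%', 'UNIQUE MARKER')
        fragment = fragment.replace('.', '%')
        fragment = urllib.unquote(fragment)
        fragment = fragment.replace('%', '.')
        fragment = fragment.replace('UNIQUE MARKER', '%')
        return fragment

    print unescape_id('Caf.C3.A9') == 'Caf\xc3\xa9'  # True: dot-escapes decode to raw UTF-8 bytes
    print unescape_id('Version_2.0')                 # 'Version_2.0': a bare "." that is not an escape is preserved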
