Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Loading…

bug 750880: Error and perf tweaks for locale migration #190

Merged
merged 1 commit into from

2 participants

@lmorchard

These are changes I made to trap errors, quiet some log messages, and speed a few things up for a full migration run that includes all locales.

@groovecoder groovecoder merged commit d7cf675 into from
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
Showing with 58 additions and 24 deletions.
  1. +58 −24 apps/dekicompat/management/commands/migrate_to_kuma_wiki.py
View
82 apps/dekicompat/management/commands/migrate_to_kuma_wiki.py
@@ -26,6 +26,7 @@
CommandError)
import django.db
from django.db import connections, transaction
+from django.db.utils import DatabaseError
from django.utils import encoding, hashcompat
import commonware.log
@@ -67,10 +68,10 @@
)
MT_NS_NAME_TO_ID = dict(MT_NAMESPACES)
MT_NS_ID_TO_NAME = dict((x[1], x[0]) for x in MT_NAMESPACES)
-MT_MIGRATED_NS_IDS = (MT_NS_NAME_TO_ID[x] for x in (
+MT_MIGRATED_NS_IDS = [MT_NS_NAME_TO_ID[x] for x in (
'', 'Talk:', 'User:', 'User_talk:', 'Project:', 'Project_talk:',
'Template:', 'Template_talk:',
-))
+)]
# NOTE: These are MD5 hashes of garbage User page content. The criterion is that
# the content was found to repeat more than 3 times, and was hand-reviewed by
@@ -227,7 +228,7 @@ def handle_migration(self, rows):
log.info("Found %s docs already migrated" %
len(self.docs_migrated.values()))
- start_ts = ts_now = time.time()
+ start_ts = ts_now = ts_last_status = time.time()
self.rev_ct = 0
ct, skip_ct, error_ct = 0, 0, 0
@@ -266,7 +267,9 @@ def handle_migration(self, rows):
ts_now = time.time()
duration = ts_now - start_ts
total_ct = ct + skip_ct + error_ct
- if (total_ct % 10) == 0:
+ # Emit status every 5 seconds
+ if ((ts_now - ts_last_status) > 5.0):
+ ts_last_status = time.time()
log.info("Rate: %s docs/sec, %s secs/doc, "
"%s total in %s seconds" %
((total_ct + 1) / (duration + 1),
@@ -335,6 +338,7 @@ def handle_syntax_metrics(self, rows):
@transaction.commit_manually(using='default')
def make_languages_relationships(self, rows):
"""Set the parent_id of Kuma pages using wiki.languages params"""
+ log.info("Building parent/child locale tree...")
wl_pat = re.compile(r"""^wiki.languages\((.+)\)""")
# language_tree is {page_id: [child_id, child_id, ...], ...}
language_tree = {}
@@ -345,6 +349,8 @@ def make_languages_relationships(self, rows):
if not r['page_title'].lower().startswith('en/'):
# Page is not an english page, skip it
continue
+ # Build the page slug from namespace + title or display name
+ locale, slug = self.get_kuma_locale_and_slug_for_page(r)
parent_id = r['page_id']
doc = pq(r['page_text'])
spans = doc.find('span.script')
@@ -360,17 +366,29 @@ def make_languages_relationships(self, rows):
try:
page_languages = json.loads(page_languages_json)
except ValueError:
- log.info("Error parsing wiki.languages JSON")
+ log.error("\t%s/%s (%s) error parsing wiki.languages JSON" %
+ (locale, slug, r['page_display_name']))
+ continue
+ vals = page_languages.values()
+ if not vals:
+ continue
wc = self.wikidb.cursor()
sql = """
SELECT page_id
FROM pages
WHERE page_title IN ('%s')
AND page_namespace = 0
- """ % "','".join(page_languages.values())
- wc.execute(sql)
+ """ % "','".join(vals)
+ try:
+ wc.execute(sql)
+ except Exception, e:
+ log.error("\t%s/%s (%s) error %s" %
+ (locale, slug, r['page_display_name'], e))
+ continue
for row in wc:
language_tree[parent_id].append(row[0])
+
+ log.info("Building translation relationships...")
kc = self.kumadb.cursor()
for parent_id, children in language_tree.items():
# Now that we have our tree of docs and children, migrate them
@@ -381,22 +399,38 @@ def make_languages_relationships(self, rows):
rows = self._query("SELECT * FROM pages WHERE page_id = %s" %
parent_id)
self.handle_migration(rows)
- parent_doc = Document.objects.get(mindtouch_page_id=parent_id)
- sql = "SELECT * FROM pages WHERE page_id in (%s)" % (
- ",".join([str(x) for x in children]))
- rows = self._query(sql)
- self.handle_migration(rows)
+ try:
+ parent_doc = Document.objects.get(mindtouch_page_id=parent_id)
+ except Document.DoesNotExist:
+ # Ugh, even after migration we didn't end up with the
+ # parent doc
+ continue
+
+ # Migrate any child documents that haven't already been migrated
+ existing = [str(x['mindtouch_page_id']) for x in
+ Document.objects.filter(mindtouch_page_id__in=children)
+ .values('mindtouch_page_id')]
+ need_migrate_ids = [str(x) for x in children if str(x) not in existing]
+ if need_migrate_ids:
+ sql = "SELECT * FROM pages WHERE page_id in (%s)" % (
+ ",".join(need_migrate_ids))
+ rows = self._query(sql)
+ self.handle_migration(rows)
+
# All parents and children migrated, now set parent_id
# TODO: refactor this to source_id when we change to
# source/translation relationship model
- sql = """
- UPDATE wiki_document
- SET parent_id = %s
- WHERE mindtouch_page_id IN (%s)
- """ % (parent_doc.id, ",".join([str(x) for x in children]))
- kc.execute(sql)
- transaction.commit()
- log.info("Updated %s documents with parent ID." % kc.rowcount)
+ child_ids = [str(x) for x in children]
+ if child_ids:
+ log.info(u"\t%s (%s)" % (parent_doc.full_path, parent_doc.title))
+ sql = """
+ UPDATE wiki_document
+ SET parent_id = %s
+ WHERE mindtouch_page_id IN (%s)
+ """ % (parent_doc.id, ",".join(child_ids))
+ kc.execute(sql)
+ log.info(u"\t\tUpdated %s documents with parent ID." % kc.rowcount)
+ transaction.commit()
@transaction.commit_on_success
def wipe_documents(self):
@@ -602,8 +636,8 @@ def update_document(self, r):
last_mod = self.docs_migrated.get(r['page_id'], (None, None))[1]
if (not self.options['update_documents'] and last_mod is not None
and last_mod >= page_ts):
- log.debug("\t%s/%s (%s) up to date" %
- (locale, slug, r['page_display_name']))
+ # log.debug("\t%s/%s (%s) up to date" %
+ # (locale, slug, r['page_display_name']))
return False
# Check to see if this doc's content hash falls in the list of User:
@@ -612,8 +646,8 @@ def update_document(self, r):
content_hash = (hashlib.md5(r['page_text'].encode('utf-8'))
.hexdigest())
if content_hash in USER_NS_EXCLUDED_CONTENT_HASHES:
- log.debug("\t%s/%s (%s) matched User: content exclusion list" %
- (locale, slug, r['page_display_name']))
+ # log.debug("\t%s/%s (%s) matched User: content exclusion list" %
+ # (locale, slug, r['page_display_name']))
return False
# Skip migrating Template:MindTouch/* templates
Something went wrong with that request. Please try again.