Bug 747403 -- refine section ID generation

This is a first step, namely matching MindTouch behavior for sections whose names contain non-ASCII characters. We now generate IDs in a similar way: any section name which contains only ASCII content merely has spaces replaced with underscores. A name which contains non-ASCII characters additionally has each such character replaced by uppercase hexadecimal digits representing the byte(s) of its UTF-8 encoding, with each pair of digits preceded by a dot. The test cases are a sampling of non-ASCII and mixed-character-set section names and the slugs MindTouch generates for them. This does not guarantee absolute parity with MindTouch, but probably gets us close enough. It also does not deal with the problem of a document in which not all section names are unique, but so far as I can tell this does not introduce any new problems; it merely perpetuates an old one, assuming any such documents exist.
mdn · Jun 18, 2012 · f3594ea · f3594ea
1 parent 1ee73d6
commit f3594ea
Show file tree

Hide file tree

Showing 2 changed files with 49 additions and 2 deletions.
diff --git a/apps/wiki/content.py b/apps/wiki/content.py
@@ -118,7 +118,28 @@ def gen_id(self):
 
     def slugify(self, text):
         """Turn the text content of a header into a slug for use in an ID"""
-        return (text.replace(' ', '_'))
+        # Code points above 127 (i.e. from U+0080 up) are non-ASCII; each
+        # distinct one is replaced by its dot-prefixed UTF-8 hex form.
+        for c in set(ch for ch in text if ord(ch) > 127):
+            text = text.replace(c, self.encode_non_ascii(c))
+        # Spaces become underscores, exactly as for pure-ASCII names.
+        return text.replace(' ', '_')
+
+    def encode_non_ascii(self, c):
+        """Return c as dot-prefixed uppercase hex of its UTF-8 bytes."""
+        # MindTouch replaces each non-ASCII character with the bytes of
+        # its UTF-8 encoding, each written as two uppercase hexadecimal
+        # digits preceded by a dot; e.g. u'\xe0' becomes '.C3.A0'.
+        #
+        # Format the encoded bytes directly instead of rewriting the
+        # repr() of the encoded string: repr() output depends on the
+        # Python version (a b'' prefix on Python 3) and leaves printable
+        # bytes unescaped. Iterating a bytearray yields integers on both
+        # Python 2 and Python 3.
+        #
+        # c: a unicode string, normally a single non-ASCII character.
+        return ''.join('.%02X' % b for b in bytearray(c.encode('utf-8')))
+
 
     def __iter__(self):
         input = html5lib_Filter.__iter__(self)

diff --git a/apps/wiki/tests/test_content.py b/apps/wiki/tests/test_content.py
@@ -10,7 +10,7 @@
 from sumo.tests import TestCase
 import wiki.content
 from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
-                          SectionTOCFilter, SECTION_TAGS)
+                          SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
 from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
 from wiki.tests import normalize_html
 
@@ -338,6 +338,32 @@ def test_code_syntax_conversion(self):
                   .filter(CodeSyntaxFilter).serialize())
         eq_(normalize_html(expected), normalize_html(result))
 
+    def test_non_ascii_section_headers(self):
+        # Each pair is a section name and the ID MindTouch generates for it.
+        headers = [
+            (u'Documentation à propos de HTML',
+             'Documentation_.C3.A0_propos_de_HTML'),
+            (u'Outils facilitant le développement HTML',
+             'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
+            (u'例:\u00a0スキューと平行移動',
+             '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
+            (u'例:\u00a0回転',
+             '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
+            (u'Documentação',
+             'Documenta.C3.A7.C3.A3o'),
+            (u'Lektury uzupełniające',
+             'Lektury_uzupe.C5.82niaj.C4.85ce'),
+            (u'Атрибуты',
+             '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
+            (u'HTML5 엘리먼트',
+             'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
+        ]
+        section_filter = SectionIDFilter('')
+        for original, slugified in headers:
+            # eq_ reports both values on failure, unlike ok_(a == b).
+            eq_(slugified, section_filter.slugify(original))
+
+
     @attr('toc')
     def test_generate_toc(self):
         doc_src = """