Skip to content
Permalink
Browse files

Bug 776703: encode some additional characters in section IDs.

In addition to non-ASCII, MindTouch encodes non-URL-safe and some
additional characters. This brings us to parity and, based on some
testing with a MindTouch install, encodes everything MindTouch
encodes, with the exception of the two angle-bracket characters; so
far as I can tell, those don't need special handling since they end up
HTML-escaped in both MindTouch and kuma.
  • Loading branch information
ubernostrum committed Jul 24, 2012
1 parent 90fa492 commit be10b92234bda15a86f98a893b38fc1dce56e1a9
Showing with 18 additions and 5 deletions.
  1. +12 −1 apps/wiki/content.py
  2. +6 −4 apps/wiki/tests/test_content.py
@@ -119,14 +119,25 @@ def gen_id(self):
self.known_ids.add(id)
return id

# Characters MindTouch encodes in section IDs; we must encode the
# same set so generated IDs match.
non_url_safe = list('"#$%&+,/:;=?@[\\]^`{|}~')

def slugify(self, text):
    """Turn the text content of a header into a slug for use in an ID.

    Each character of ``text`` is handled as follows:

    * characters listed in ``self.non_url_safe`` become ``.XX``, where
      ``XX`` is the character's code point as two uppercase hex digits;
    * non-ASCII characters (code point above 127) are replaced by
      whatever ``self.encode_non_ascii`` returns for them;
    * everything else is kept as-is.

    Finally, spaces are replaced with underscores.

    Returns the resulting slug string.
    """
    pieces = []
    for c in text:
        if c in self.non_url_safe:
            # Same output as hex(ord(c)).replace('0x', '.').upper() for
            # the two-hex-digit characters in non_url_safe.
            pieces.append('.%02X' % ord(c))
        elif ord(c) > 127:
            # ASCII ends at 127; a "> 128" test would let U+0080 slip
            # through unencoded.
            pieces.append(self.encode_non_ascii(c))
        else:
            pieces.append(c)
    # Replace spaces last so any space in the assembled text is covered.
    return ''.join(pieces).replace(' ', '_')

def encode_non_ascii(self, c):
# This is slightly gnarly.
@@ -340,14 +340,14 @@ def test_code_syntax_conversion(self):

def test_non_ascii_section_headers(self):
headers = [
(u'Documentation à propos de HTML',
'Documentation_.C3.A0_propos_de_HTML'),
(u'Documentation à propos de HTML',
'Documentation_.C3.A0_propos_de_HTML'),
(u'Outils facilitant le développement HTML',
'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
(u'例:\u00a0スキューと平行移動',
'.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
'.E4.BE.8B.3A.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
(u'例:\u00a0回転',
'.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
'.E4.BE.8B.3A.C2.A0.E5.9B.9E.E8.BB.A2'),
(u'Documentação',
'Documenta.C3.A7.C3.A3o'),
(u'Lektury uzupełniające',
@@ -356,6 +356,8 @@ def test_non_ascii_section_headers(self):
'.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
(u'HTML5 엘리먼트',
'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
(u'Non safe title "#$%&+,/:;=?@[\\]^`{|}~',
u'Non_safe_title_.22.23.24.25.26.2B.2C.2F.3A.3B.3D.3F.40.5B.5C.5D.5E.60.7B.7C.7D.7E'),
]

section_filter = SectionIDFilter('')

0 comments on commit be10b92

Please sign in to comment.
You can’t perform that action at this time.