Skip to content
This repository has been archived by the owner on Aug 26, 2022. It is now read-only.

Commit

Permalink
Bug 747403 -- refine section ID generation
Browse files Browse the repository at this point in the history
This is a first step, namely matching MindTouch behavior for sections
whose names contain non-ASCII characters. We now generate IDs in a
similar way: any section name which contains only ASCII content merely
has spaces replaced with underscores. A name which contains non-ASCII
characters has each such character replaced by hexadecimal digits
representing the appropriate UTF-8 codepoint(s), with each set of
digits preceded by a dot.

The test cases are a sampling of non-ASCII and mixed-character-set
section names and the slugs MindTouch generates for them.

This does not guarantee absolute parity with MindTouch, but probably
gets us close enough. It also does not deal with the problem of a
document in which not all section names are unique, but so far as I
can tell this does not introduce any new problems, merely perpetuates
an old one, assuming any such documents exist.
  • Loading branch information
ubernostrum committed Jun 18, 2012
1 parent 1ee73d6 commit f3594ea
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 2 deletions.
23 changes: 22 additions & 1 deletion apps/wiki/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,28 @@ def gen_id(self):

def slugify(self, text):
    """Turn the text content of a header into a slug for use in an ID.

    Every non-ASCII character in ``text`` is replaced by the string
    returned from ``self.encode_non_ascii`` for that character, and
    every space is then replaced by an underscore. Text containing
    only ASCII characters therefore just has its spaces replaced.

    Returns the slugified text.
    """
    # ASCII ends at code point 127, so anything above that (including
    # U+0080 itself) needs encoding. De-duplicate first: one replace()
    # call already rewrites every occurrence of a given character.
    for c in set(ch for ch in text if ord(ch) > 127):
        text = text.replace(c, self.encode_non_ascii(c))
    return text.replace(' ', '_')

def encode_non_ascii(self, c):
    """Return the MindTouch-style encoding of a single character.

    What MindTouch does is basically turn any non-ASCII character
    into the bytes of its UTF-8 encoding, each written as two
    uppercase hexadecimal digits preceded by a dot; for example,
    u'\\u00e0' becomes '.C3.A0'.
    """
    # Format the UTF-8 bytes directly instead of parsing repr() output:
    # the repr of an encoded string differs between Python versions
    # (a bytes repr carries a b'' prefix), while iterating a bytearray
    # always yields plain integers.
    return ''.join('.%02X' % byte for byte in bytearray(c.encode('utf-8')))


def __iter__(self):
input = html5lib_Filter.__iter__(self)
Expand Down
28 changes: 27 additions & 1 deletion apps/wiki/tests/test_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from sumo.tests import TestCase
import wiki.content
from wiki.content import (CodeSyntaxFilter, DekiscriptMacroFilter,
SectionTOCFilter, SECTION_TAGS)
SectionTOCFilter, SectionIDFilter, SECTION_TAGS)
from wiki.models import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
from wiki.tests import normalize_html

Expand Down Expand Up @@ -338,6 +338,32 @@ def test_code_syntax_conversion(self):
.filter(CodeSyntaxFilter).serialize())
eq_(normalize_html(expected), normalize_html(result))

def test_non_ascii_section_headers(self):
    """Section names with non-ASCII characters slugify as expected.

    Each pair is a header text and the slug expected for it; the
    samples cover non-ASCII and mixed-character-set section names.
    """
    headers = [
        (u'Documentation à propos de HTML',
         'Documentation_.C3.A0_propos_de_HTML'),
        (u'Outils facilitant le développement HTML',
         'Outils_facilitant_le_d.C3.A9veloppement_HTML'),
        (u'例:\u00a0スキューと平行移動',
         '.E4.BE.8B:.C2.A0.E3.82.B9.E3.82.AD.E3.83.A5.E3.83.BC.E3.81.A8.E5.B9.B3.E8.A1.8C.E7.A7.BB.E5.8B.95'),
        (u'例:\u00a0回転',
         '.E4.BE.8B:.C2.A0.E5.9B.9E.E8.BB.A2'),
        (u'Documentação',
         'Documenta.C3.A7.C3.A3o'),
        (u'Lektury uzupełniające',
         'Lektury_uzupe.C5.82niaj.C4.85ce'),
        (u'Атрибуты',
         '.D0.90.D1.82.D1.80.D0.B8.D0.B1.D1.83.D1.82.D1.8B'),
        (u'HTML5 엘리먼트',
         'HTML5_.EC.97.98.EB.A6.AC.EB.A8.BC.ED.8A.B8'),
    ]

    section_filter = SectionIDFilter('')

    for original, slugified in headers:
        # eq_ reports both values on failure, unlike ok_(a == b); the
        # message identifies which header produced the mismatch.
        eq_(slugified, section_filter.slugify(original),
            'unexpected slug for %r' % original)


@attr('toc')
def test_generate_toc(self):
doc_src = """
Expand Down

0 comments on commit f3594ea

Please sign in to comment.