Bug 730707 kumascript #164
Changes from all commits
262ee17
7f2f0d2
24f464d
716dd4a
d37b4c6
395efcc
dfc3188
f3befb6
cf79314
90968e3
e63f3a9
d2504ab
a567d83
04ef5eb
53809c2
cb9588a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,12 @@ | ||
import logging | ||
import re | ||
from urllib import urlencode | ||
|
||
from xml.sax.saxutils import quoteattr | ||
|
||
import html5lib | ||
from html5lib.filters._base import Filter as html5lib_Filter | ||
from pyquery import PyQuery as pq | ||
|
||
from tower import ugettext as _ | ||
|
||
|
@@ -27,6 +31,16 @@ def parse(src): | |
return ContentSectionTool(src) | ||
|
||
|
||
def filter_out_noinclude(src): | ||
"""Quick and dirty filter to remove <div class="noinclude"> blocks""" | ||
# NOTE: This started as an html5lib filter, but it started getting really | ||
# complex. Seems like pyquery works well enough without corrupting | ||
# character encoding. | ||
doc = pq(src) | ||
doc.remove('*[class=noinclude]') | ||
return doc.html() | ||
|
||
|
||
class ContentSectionTool(object): | ||
|
||
def __init__(self, src=None): | ||
|
@@ -58,7 +72,7 @@ def parse(self, src): | |
def serialize(self, stream=None): | ||
if stream is None: | ||
stream = self.stream | ||
return "".join(self.serializer.serialize(stream)) | ||
return u"".join(self.serializer.serialize(stream)) | ||
|
||
def __unicode__(self): | ||
return self.serialize() | ||
|
@@ -102,6 +116,10 @@ def gen_id(self): | |
self.known_ids.add(id) | ||
return id | ||
|
||
def slugify(self, text): | ||
"""Turn the text content of a header into a slug for use in an ID""" | ||
return (text.replace(' ', '_')) | ||
|
||
def __iter__(self): | ||
input = html5lib_Filter.__iter__(self) | ||
|
||
|
@@ -113,17 +131,63 @@ def __iter__(self): | |
attrs = dict(token['data']) | ||
if 'id' in attrs: | ||
self.known_ids.add(attrs['id']) | ||
if 'name' in attrs: | ||
self.known_ids.add(attrs['name']) | ||
|
||
# Pass 2: Sprinkle in IDs where they're missing | ||
for token in buffer: | ||
if ('StartTag' == token['type'] and | ||
# Pass 2: Sprinkle in IDs where they're needed | ||
while len(buffer): | ||
token = buffer.pop(0) | ||
|
||
if not ('StartTag' == token['type'] and | ||
token['name'] in SECTION_TAGS): | ||
yield token | ||
else: | ||
attrs = dict(token['data']) | ||
id = attrs.get('id', None) | ||
if not id: | ||
|
||
# Treat a name attribute as a human-specified ID override | ||
name = attrs.get('name', None) | ||
if name: | ||
attrs['id'] = name | ||
token['data'] = attrs.items() | ||
yield token | ||
continue | ||
|
||
# If this is not a header, then generate a section ID. | ||
if token['name'] not in HEAD_TAGS: | ||
attrs['id'] = self.gen_id() | ||
token['data'] = attrs.items() | ||
yield token | ||
yield token | ||
continue | ||
|
||
# If this is a header, then scoop up the rest of the header and | ||
# gather the text it contains. | ||
start, text, tmp = token, [], [] | ||
while len(buffer): | ||
token = buffer.pop(0) | ||
tmp.append(token) | ||
if token['type'] in ('Characters', 'SpaceCharacters'): | ||
text.append(token['data']) | ||
elif ('EndTag' == token['type'] and | ||
start['name'] == token['name']): | ||
# Note: This is naive, and doesn't track other | ||
# start/end tags nested in the header. Odd things might | ||
# happen in a case like <h1><h1></h1></h1>. But, that's | ||
# invalid markup and the worst case should be a | ||
# truncated ID because all the text wasn't accumulated. | ||
break | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This may be a silly question, but the comment here made me think of it: is there any mechanism enforcing uniqueness of IDs within the document? What happens if IDs end up colliding? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I kind of punted on that... There is a mechanism for uniqueness, but only for auto-generated IDs (e.g. sect1, sect2, etc). For IDs based on element text or the name attribute [sentence truncated in extraction]. This is really a half-baked feature, ugh. :/ There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FWIW, I just filed bug 747403 to remember to put more work into this feature |
||
|
||
# Slugify the text we found inside the header, generate an ID | ||
# as a last resort. | ||
slug = self.slugify(u''.join(text)) | ||
if not slug: | ||
slug = self.gen_id() | ||
attrs['id'] = slug | ||
start['data'] = attrs.items() | ||
|
||
# Finally, emit the tokens we scooped up for the header. | ||
yield start | ||
for t in tmp: | ||
yield t | ||
|
||
|
||
class SectionEditLinkFilter(html5lib_Filter): | ||
|
@@ -152,17 +216,18 @@ def __iter__(self): | |
'title': _('Edit section'), | ||
'class': 'edit-section', | ||
'data-section-id': id, | ||
'data-section-src-url': '%s?%s' % ( | ||
'data-section-src-url': u'%s?%s' % ( | ||
reverse('wiki.document', | ||
args=[self.full_path], | ||
locale=self.locale), | ||
urlencode({'section': id, 'raw': 'true'}) | ||
urlencode({'section': id.encode('utf-8'), | ||
'raw': 'true'}) | ||
), | ||
'href': '%s?%s' % ( | ||
'href': u'%s?%s' % ( | ||
reverse('wiki.edit_document', | ||
args=[self.full_path], | ||
locale=self.locale), | ||
urlencode({'section': id, | ||
urlencode({'section': id.encode('utf-8'), | ||
'edit_links': 'true'}) | ||
) | ||
}}, | ||
|
@@ -385,12 +450,26 @@ def __iter__(self): | |
continue | ||
|
||
ds_call = [] | ||
while len(buffer) and 'EndTag' != token['type']: | ||
while len(buffer): | ||
token = buffer.pop(0) | ||
if 'Characters' == token['type']: | ||
if token['type'] in ('Characters', 'SpaceCharacters'): | ||
ds_call.append(token['data']) | ||
|
||
ds_call = ''.join(ds_call).strip() | ||
elif 'StartTag' == token['type']: | ||
attrs = token['data'] | ||
if attrs: | ||
a_out = (u' %s' % u' '.join( | ||
(u'%s=%s' % | ||
(name, quoteattr(val)) | ||
for name, val in attrs))) | ||
else: | ||
a_out = u'' | ||
ds_call.append(u'<%s%s>' % (token['name'], a_out)) | ||
elif 'EndTag' == token['type']: | ||
if 'span' == token['name']: | ||
break | ||
ds_call.append('</%s>' % token['name']) | ||
|
||
ds_call = u''.join(ds_call).strip() | ||
|
||
# Snip off any "template." prefixes | ||
strip_prefixes = ('template.', 'wiki.') | ||
|
@@ -417,7 +496,11 @@ def __iter__(self): | |
if m: | ||
ds_call = '%s()' % (m.group(1)) | ||
|
||
yield dict( | ||
type="Characters", | ||
data='{{ %s }}' % ds_call | ||
) | ||
# HACK: This is dirty, but seems like the easiest way to | ||
# reconstitute the token stream, including what gets parsed as | ||
# markup in the middle of macro parameters. | ||
# | ||
# eg. {{ Note("This is <strong>strongly</strong> discouraged") }} | ||
parsed = parse('{{ %s }}' % ds_call) | ||
for token in parsed.stream: | ||
yield token |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any chance these IDs ever end up as part of a URL (not just a fragment identifier)? Looks like they do further down and if so, that's a potential Unicode issue -- we might want to do something like Django's own built-in slugify template filter, which has a little Unicode-normalization song-and-dance to produce a readable but URL-safe result.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, these will probably end up in section editing URLs. :/ Need to look at this some more, because I want to make sure it matches up with existing anchor links from MindTouch. I don't think it quite does that all the way, either.