Permalink
Browse files

Add rendering for allowed and disallowed HTML tags

In the HTML postprocessor, allowed tags are cleaned (only the allowed
attributes are kept) and output as HTML, while disallowed tags are output as escaped text.

In the text postprocessor, allowed tags are rendered by tag-specific
functions and disallowed tags are output as text.
  • Loading branch information...
1 parent 19d8ad3 commit bcfad0095cbf069186f2c19376427a6f12815e54 @peter17 committed Jul 21, 2011
Showing with 265 additions and 15 deletions.
  1. +48 −1 html.py
  2. +4 −4 mediawiki.pijnu
  3. +16 −1 raw.py
  4. +25 −0 tests/__init__.py
  5. +57 −0 tests/test_html_postprocessor.py
  6. +56 −0 tests/test_text_postprocessor.py
  7. +51 −1 text.py
  8. +8 −8 wikitextParser.py
View
49 html.py
@@ -1,5 +1,8 @@
from constants import html_entities
+allowed_tags = ['p', 'span', 'b', 'br', 'hr']
+allowed_parameters = ['class', 'style', 'name', 'id']
@erikrose

erikrose Jul 21, 2011

Of course, we'll eventually want to be able to pass in lists of allowed tags and params. This should be straightforward by passing in toolset functions.

+
def render_title2(node):
node.value = '<h2>%s</h2>\n' % node.leaf()
@@ -29,13 +32,57 @@ def render_lt(node):
def render_gt(node):
node.value = '&gt;'
def process_attributes(node, allowed_tag):
    """Serialize the attributes of a tag node as HTML attribute text.

    node -- tag node whose ``value`` holds either one child (the tag name)
        or two children (the tag name, then a node whose ``value`` is the
        list of attribute nodes); each attribute node holds a name node
        and a value node
    allowed_tag -- if truthy, keep only the attributes whose name is in
        ``allowed_parameters``; if falsy, keep every attribute

    Returns a string such as `` class="x" id="y"`` (each attribute is
    preceded by a space), or ``''`` when the tag has no attributes.

    Raises Exception if ``node.value`` has neither one nor two children.

    """
    children = node.value
    if len(children) == 1:
        # Tag name only: nothing to serialize.
        return ''
    if len(children) != 2:
        # The original spelled this `raise exception, ...`, which failed
        # with a NameError instead of reporting the malformed tree.
        raise Exception("Bad AST shape!")

    # NOTE(review): attribute values are interpolated as-is; confirm that
    # the grammar guarantees they cannot contain a double quote or markup.
    rendered = []
    for attribute in children[1].value:
        attribute_name = attribute.value[0].value
        attribute_value = attribute.value[1].value
        if not allowed_tag or attribute_name in allowed_parameters:
            rendered.append(' %s="%s"' % (attribute_name, attribute_value))
    return ''.join(rendered)
+
def render_tag_open(node):
    """Replace an opening-tag node's value with its HTML rendering.

    A tag listed in ``allowed_tags`` is emitted as real markup with its
    attributes filtered; any other tag is emitted with ``&lt;``/``&gt;``
    delimiters and all of its attributes kept.

    """
    tag_name = node.value[0].value
    is_allowed = tag_name in allowed_tags
    template = '<%s%s>' if is_allowed else '&lt;%s%s&gt;'
    node.value = template % (tag_name, process_attributes(node, is_allowed))
+
def render_tag_close(node):
    """Replace a closing-tag node's value with its HTML rendering.

    A tag listed in ``allowed_tags`` becomes ``</name>``; any other tag
    becomes the escaped text ``&lt;/name&gt;``.

    """
    tag_name = node.value[0].value
    template = "</%s>" if tag_name in allowed_tags else "&lt;/%s&gt;"
    node.value = template % tag_name
+
def render_tag_autoclose(node):
    """Replace a self-closing-tag node's value with its HTML rendering.

    A tag listed in ``allowed_tags`` is emitted as ``<name ... />`` with
    its attributes filtered; any other tag is emitted with
    ``&lt;``/``&gt;`` delimiters and all of its attributes kept.

    """
    tag_name = node.value[0].value
    is_allowed = tag_name in allowed_tags
    template = '<%s%s />' if is_allowed else '&lt;%s%s /&gt;'
    node.value = template % (tag_name, process_attributes(node, is_allowed))
+
toolset = {'render_raw_text': render_raw_text,
'render_paragraph': render_paragraph,
'render_title2': render_title2,
+ 'render_title6': render_title6,
'render_body': render_body,
'render_entity': render_entity,
'render_lt': render_lt,
- 'render_gt': render_gt}
+ 'render_gt': render_gt,
+ 'render_tag_open': render_tag_open,
+ 'render_tag_close': render_tag_close,
+ 'render_tag_autoclose': render_tag_autoclose}
from mediawiki_parser import wikitextParser
View
@@ -91,9 +91,9 @@ def replace_by_space(node):
optional_attributes : optional_attribute*
tag_lt : LT : drop
tag_gt : GT : drop
- tag_open : tag_lt tag_name optional_attributes SPACETABEOL* tag_gt
- tag_close : tag_lt SLASH tag_name tag_gt
- tag_autoclose : tag_lt tag_name optional_attributes SPACETABEOL* SLASH tag_gt
+ tag_open : tag_lt tag_name optional_attributes SPACETABEOL* tag_gt : render_tag_open
+ tag_close : tag_lt SLASH tag_name tag_gt : render_tag_close
+ tag_autoclose : tag_lt tag_name optional_attributes SPACETABEOL* SLASH tag_gt : render_tag_autoclose
tag : tag_autoclose / tag_open / tag_close
# HTML entities
@@ -154,7 +154,7 @@ def replace_by_space(node):
# Titles
- title6 : TITLE6_BEGIN inline TITLE6_END : liftValue
+ title6 : TITLE6_BEGIN inline TITLE6_END : liftValue render_title6
title5 : TITLE5_BEGIN inline TITLE5_END : liftValue
title4 : TITLE4_BEGIN inline TITLE4_END : liftValue
title3 : TITLE3_BEGIN inline TITLE3_END : liftValue
View
17 raw.py
@@ -1,5 +1,7 @@
from constants import html_entities
+allowed_tags = {}
+
def render_title2(node):
pass
@@ -29,13 +31,26 @@ def render_lt(node):
def render_gt(node):
pass
def render_tag_open(node):
    """Leave an opening-tag node untouched (no rendering in raw.py)."""
+
def render_tag_close(node):
    """Leave a closing-tag node untouched (no rendering in raw.py)."""
+
def render_tag_autoclose(node):
    """Leave a self-closing-tag node untouched (no rendering in raw.py)."""
+
toolset = {'render_raw_text': render_raw_text,
'render_paragraph': render_paragraph,
'render_title2': render_title2,
+ 'render_title6': render_title6,
'render_body': render_body,
'render_entity': render_entity,
'render_lt': render_lt,
- 'render_gt': render_gt}
+ 'render_gt': render_gt,
+ 'render_tag_open': render_tag_open,
+ 'render_tag_close': render_tag_close,
+ 'render_tag_autoclose': render_tag_autoclose}
from mediawiki_parser import wikitextParser
View
@@ -47,3 +47,28 @@ def parsed_equal_string(self, source, result, method_name, templates={}):
def parsed_equal_tree(self, source, result, method_name, templates={}):
preprocessed = self._preprocessor(templates).parseTest(source).value
self.assertEquals(self._grammar(method_name).parseTest(preprocessed).treeView(), result)
+
+
class PostprocessorTestCase(TestCase):
    """Base class for tests that run wikitext through a postprocessor backend."""

    def _preprocessor(self, templates):
        """Return a preprocessor parser built for the given templates."""
        from mediawiki_parser import preprocessor
        return preprocessor.make_parser(templates)

    def _grammar(self, method_name, postprocessor_name):
        """Return a full or partial grammar.

        method_name -- If truthy, the attribute of the full grammar to return
        postprocessor_name -- 'html' or 'text' selects that backend; any
            other value falls back to the raw backend

        """
        if postprocessor_name == 'html':
            from mediawiki_parser import html as postprocessor
        elif postprocessor_name == 'text':
            from mediawiki_parser import text as postprocessor
        else:
            from mediawiki_parser import raw as postprocessor
        parser = postprocessor.make_parser()
        return getattr(parser, method_name) if method_name else parser

    def parsed_equal_string(self, source, result, method_name, templates=None, postprocessor='raw'):
        """Assert that parsing `source` yields leaves whose unicode form equals `result`.

        source -- wikitext to run through the preprocessor, then the grammar
        result -- expected unicode rendering of the parsed leaves
        method_name -- grammar rule to parse with (see `_grammar`)
        templates -- templates handed to the preprocessor; defaults to an
            empty dict created per call rather than a shared mutable default
        postprocessor -- backend name passed to `_grammar`

        """
        if templates is None:
            templates = {}
        preprocessed = self._preprocessor(templates).parseTest(source).value
        parsed = self._grammar(method_name, postprocessor).parseTest(preprocessed)
        # assertEqual is the supported spelling; assertEquals is a deprecated alias.
        self.assertEqual(unicode(parsed.leaves()), result)
@@ -0,0 +1,57 @@
+# -*- coding: utf8 -*-
+
+from mediawiki_parser.tests import PostprocessorTestCase
+
+
class HTMLBackendTests(PostprocessorTestCase):
    """Rendering of titles and of allowed/disallowed tags by the HTML backend."""

    def _check_html(self, rule, source, expected):
        """Parse `source` with grammar rule `rule` via the HTML backend and compare with `expected`."""
        self.parsed_equal_string(source, expected, rule, {}, 'html')

    def test_simple_title2(self):
        self._check_html('wikitext', '== A title ==\n', "<h2> A title </h2>\n")

    def test_simple_title6(self):
        self._check_html('wikitext', '====== Test! ======\n', "<h6> Test! </h6>\n")

    def test_simple_allowed_open_tag(self):
        self._check_html('inline', 'a<span>test', 'a<span>test')

    def test_complex_allowed_open_tag(self):
        """Disallowed attributes are stripped from an allowed tag."""
        self._check_html(
            'inline',
            '<span class="wikitext" style="color:red" onclick="javascript:alert()">',
            '<span class="wikitext" style="color:red">')

    def test_simple_disallowed_open_tag(self):
        self._check_html('inline', 'another <a> test', 'another &lt;a&gt; test')

    def test_complex_disallowed_open_tag(self):
        """A disallowed tag keeps all of its attributes but is output as text."""
        self._check_html(
            'inline',
            '<a href="test" class="test" style="color:red" anything="anything">',
            '&lt;a href="test" class="test" style="color:red" anything="anything"&gt;')

    def test_simple_allowed_autoclose_tag(self):
        self._check_html('inline', 'a<br />test', 'a<br />test')

    def test_complex_allowed_autoclose_tag(self):
        self._check_html(
            'inline',
            'one more <br name="test" /> test',
            'one more <br name="test" /> test')

    def test_simple_disallowed_autoclose_tag(self):
        self._check_html('inline', 'a<test />test', 'a&lt;test /&gt;test')

    def test_complex_disallowed_autoclose_tag(self):
        self._check_html(
            'inline',
            '<img src="file.png" />',
            '&lt;img src="file.png" /&gt;')
@@ -0,0 +1,56 @@
+# -*- coding: utf8 -*-
+
+from mediawiki_parser.tests import PostprocessorTestCase
+
+
class TextBackendTests(PostprocessorTestCase):
    """Rendering of titles and of allowed/disallowed tags by the text backend."""

    def _check_text(self, rule, source, expected):
        """Parse `source` with grammar rule `rule` via the text backend and compare with `expected`."""
        self.parsed_equal_string(source, expected, rule, {}, 'text')

    def test_simple_title2(self):
        self._check_text('wikitext', '== A title ==\n', ' A title \n')

    def test_simple_title6(self):
        self._check_text('wikitext', '====== Test! ======\n', ' Test! \n')

    def test_simple_allowed_open_tag(self):
        self._check_text('inline', 'a<p>test', 'a\ntest')

    def test_complex_allowed_open_tag(self):
        """Attributes of an allowed tag do not affect the text output."""
        self._check_text(
            'inline',
            'a<p class="wikitext" style="color:red" onclick="javascript:alert()">test',
            'a\ntest')

    def test_simple_disallowed_open_tag(self):
        self._check_text('inline', '<a>', '<a>')

    def test_complex_disallowed_open_tag(self):
        self._check_text(
            'inline',
            '<a href="test" class="test" style="color:red" anything="anything">',
            '<a href="test" class="test" style="color:red" anything="anything">')

    def test_simple_allowed_autoclose_tag(self):
        self._check_text('inline', 'a<br />test', 'a\ntest')

    def test_complex_allowed_autoclose_tag(self):
        self._check_text(
            'inline',
            'one more <br name="test" /> test',
            'one more \n test')

    def test_simple_disallowed_autoclose_tag(self):
        self._check_text('inline', '<test />', '<test />')

    def test_complex_disallowed_autoclose_tag(self):
        self._check_text(
            'inline',
            '<img src="file.png" />',
            '<img src="file.png" />')
View
52 text.py
@@ -1,5 +1,14 @@
from constants import html_entities
def render_tag_p(attributes):
    """Return the text rendering of a ``<p>`` tag: a newline.

    attributes -- serialized attribute text; not used

    """
    newline = '\n'
    return newline
+
def render_tag_br(attributes):
    """Return the text rendering of a ``<br>`` tag: a newline.

    attributes -- serialized attribute text; not used

    """
    newline = '\n'
    return newline
+
+allowed_tags = {'p': render_tag_p,
+ 'br': render_tag_br}
+
def render_title2(node):
node.value += '\n'
@@ -28,13 +37,54 @@ def render_lt(node):
def render_gt(node):
pass
def process_attributes(node, allowed_tag):
    """Serialize every attribute of a tag node as attribute text.

    node -- tag node whose ``value`` holds either one child (the tag name)
        or two children (the tag name, then a node whose ``value`` is the
        list of attribute nodes); each attribute node holds a name node
        and a value node
    allowed_tag -- not used here: no attribute filtering is applied; the
        parameter is kept so existing callers keep working

    Returns a string such as `` class="x" id="y"`` (each attribute is
    preceded by a space), or ``''`` when the tag has no attributes.

    Raises Exception if ``node.value`` has neither one nor two children.

    """
    children = node.value
    if len(children) == 1:
        # Tag name only: nothing to serialize.
        return ''
    if len(children) != 2:
        # The original spelled this `raise exception, ...`, which failed
        # with a NameError instead of reporting the malformed tree.
        raise Exception("Bad AST shape!")

    return ''.join(
        ' %s="%s"' % (attribute.value[0].value, attribute.value[1].value)
        for attribute in children[1].value)
+
def render_tag_open(node):
    """Replace an opening-tag node's value with its text rendering.

    A tag listed in ``allowed_tags`` is rendered by the function registered
    for it there, which receives the serialized attributes; any other tag
    is echoed back as ``<name attributes>``.

    """
    tag_name = node.value[0].value
    if tag_name not in allowed_tags:
        node.value = '<%s%s>' % (tag_name, process_attributes(node, False))
        return
    attributes = process_attributes(node, True)
    node.value = allowed_tags[tag_name](attributes)
+
def render_tag_close(node):
    """Replace a closing-tag node's value with the empty string.

    NOTE(review): every closing tag is dropped, including tags that are
    not in ``allowed_tags``, whereas a disallowed opening tag is echoed
    as text -- confirm that this asymmetry is intended.

    """
    empty = ''
    node.value = empty
+
def render_tag_autoclose(node):
    """Replace a self-closing-tag node's value with its text rendering.

    A tag listed in ``allowed_tags`` is rendered by the function registered
    for it there, which receives the serialized attributes; any other tag
    is echoed back as ``<name attributes />``.

    """
    tag_name = node.value[0].value
    if tag_name not in allowed_tags:
        node.value = '<%s%s />' % (tag_name, process_attributes(node, False))
        return
    attributes = process_attributes(node, True)
    node.value = allowed_tags[tag_name](attributes)
+
toolset = {'render_raw_text': render_raw_text,
'render_paragraph': render_paragraph,
'render_title2': render_title2,
+ 'render_title6': render_title6,
'render_body': render_body,
'render_entity': render_entity,
'render_lt': render_lt,
- 'render_gt': render_gt}
+ 'render_gt': render_gt,
+ 'render_tag_open': render_tag_open,
+ 'render_tag_close': render_tag_close,
+ 'render_tag_autoclose': render_tag_autoclose}
from mediawiki_parser import wikitextParser
View
@@ -87,9 +87,9 @@
optional_attributes : optional_attribute*
tag_lt : LT : drop
tag_gt : GT : drop
- tag_open : tag_lt tag_name optional_attributes SPACETABEOL* tag_gt
- tag_close : tag_lt SLASH tag_name tag_gt
- tag_autoclose : tag_lt tag_name optional_attributes SPACETABEOL* SLASH tag_gt
+ tag_open : tag_lt tag_name optional_attributes SPACETABEOL* tag_gt : render_tag_open
+ tag_close : tag_lt SLASH tag_name tag_gt : render_tag_close
+ tag_autoclose : tag_lt tag_name optional_attributes SPACETABEOL* SLASH tag_gt : render_tag_autoclose
tag : tag_autoclose / tag_open / tag_close
# HTML entities
@@ -150,7 +150,7 @@
# Titles
- title6 : TITLE6_BEGIN inline TITLE6_END : liftValue
+ title6 : TITLE6_BEGIN inline TITLE6_END : liftValue render_title6
title5 : TITLE5_BEGIN inline TITLE5_END : liftValue
title4 : TITLE4_BEGIN inline TITLE4_END : liftValue
title3 : TITLE3_BEGIN inline TITLE3_END : liftValue
@@ -362,9 +362,9 @@ def replace_by_space(node):
optional_attributes = Repetition(optional_attribute, numMin=False, numMax=False, expression='optional_attribute*', name='optional_attributes')
tag_lt = Clone(LT, expression='LT', name='tag_lt')(toolset['drop'])
tag_gt = Clone(GT, expression='GT', name='tag_gt')(toolset['drop'])
- tag_open = Sequence([tag_lt, tag_name, optional_attributes, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), tag_gt], expression='tag_lt tag_name optional_attributes SPACETABEOL* tag_gt', name='tag_open')
- tag_close = Sequence([tag_lt, SLASH, tag_name, tag_gt], expression='tag_lt SLASH tag_name tag_gt', name='tag_close')
- tag_autoclose = Sequence([tag_lt, tag_name, optional_attributes, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), SLASH, tag_gt], expression='tag_lt tag_name optional_attributes SPACETABEOL* SLASH tag_gt', name='tag_autoclose')
+ tag_open = Sequence([tag_lt, tag_name, optional_attributes, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), tag_gt], expression='tag_lt tag_name optional_attributes SPACETABEOL* tag_gt', name='tag_open')(toolset['render_tag_open'])
+ tag_close = Sequence([tag_lt, SLASH, tag_name, tag_gt], expression='tag_lt SLASH tag_name tag_gt', name='tag_close')(toolset['render_tag_close'])
+ tag_autoclose = Sequence([tag_lt, tag_name, optional_attributes, Repetition(SPACETABEOL, numMin=False, numMax=False, expression='SPACETABEOL*'), SLASH, tag_gt], expression='tag_lt tag_name optional_attributes SPACETABEOL* SLASH tag_gt', name='tag_autoclose')(toolset['render_tag_autoclose'])
tag = Choice([tag_autoclose, tag_open, tag_close], expression='tag_autoclose / tag_open / tag_close', name='tag')
# HTML entities
@@ -425,7 +425,7 @@ def replace_by_space(node):
# Titles
- title6 = Sequence([TITLE6_BEGIN, inline, TITLE6_END], expression='TITLE6_BEGIN inline TITLE6_END', name='title6')(toolset['liftValue'])
+ title6 = Sequence([TITLE6_BEGIN, inline, TITLE6_END], expression='TITLE6_BEGIN inline TITLE6_END', name='title6')(toolset['liftValue'], toolset['render_title6'])
title5 = Sequence([TITLE5_BEGIN, inline, TITLE5_END], expression='TITLE5_BEGIN inline TITLE5_END', name='title5')(toolset['liftValue'])
title4 = Sequence([TITLE4_BEGIN, inline, TITLE4_END], expression='TITLE4_BEGIN inline TITLE4_END', name='title4')(toolset['liftValue'])
title3 = Sequence([TITLE3_BEGIN, inline, TITLE3_END], expression='TITLE3_BEGIN inline TITLE3_END', name='title3')(toolset['liftValue'])

0 comments on commit bcfad00

Please sign in to comment.