Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
406 lines (341 sloc) 15.8 KB
import re
import html as _html
from html.entities import codepoint2name as _entities
from io import StringIO
import textwrap
class Element:
__slots__ = ['name', 'attrs', 'style', 'content']
def __init__(self, name, attrs, style, content):
assert isinstance(name, str) = name
assert attrs is None or isinstance(attrs, dict)
self.attrs = attrs or {}
assert style is None or isinstance(style, dict) = style
assert not isinstance(content, str)
self.content = list(content)
assert all(isinstance(item, (str, Element)) for item in self.content)
def is_block(self):
""" True if this is a block element.
Block elements have the property that extra whitespace before or after
either the start tag or the end tag is ignored. (Some elements, like
<section>, are expected to contain only block elements and no text; but
of course there are also block elements, like <p>, that contain inline
elements and text.)"""
return _tag_data[][0] == 'B'
def to_html(self):
f = StringIO()
write_html(f, self, strict=False)
return f.getvalue()
__repr__ = to_html
def kids(self, name=None):
""" Return an iterator over (index, child) pairs where each child is a
child element of self. The iterator is robust against mutating
self.content to the right of i. """
for i, kid in enumerate(self.content):
if isinstance(kid, Element) and (name is None or == name):
yield i, kid
def with_content(self, replaced_content):
""" Return a copy of self with different content. """
return Element(, self.attrs,, replaced_content)
def with_content_slice(self, start, stop, replaced_content):
copy = self.content[:]
copy[start:stop] = replaced_content
return self.with_content(copy)
def with_(self, name=None, attrs=None, style=None, content=None):
if name is None: name =
if attrs is None: attrs = self.attrs
if style is None: style =
if content is None: content = self.content
return Element(name, attrs, style, content)
def find(self, matcher):
match_result = matcher(self)
assert match_result is True or match_result is False or match_result is None
if match_result is None:
for kid in self.content:
if isinstance(kid, Element):
for x in kid.find(matcher):
yield x
if match_result:
yield self
def find_replace(self, matcher, replacement):
""" A sort of map() on htmodel content.
self - An Element to transform.
matcher - A function mapping an Element to True, False, or None.
replacement - A function mapping an Element to a content list.
Walk the entire tree under the Element self; for each element e such
that matcher(e) is True, call replacement(e); return a tree with each
such Element replaced by the content in replacement(e).
If matcher(e) is False or None, replacement(e) is not called, and the
element is left in the result tree; the difference is that if
matcher(e) is False, e's descendants are visited; if it is None, the
descendants are skipped entirely and left unchanged in the result tree.
self is left unmodified, but the result is not a deep copy: it may be
self or an Element whose tree shares some parts of self.
def map_element(e):
match_result = matcher(e)
assert match_result is True or match_result is False or match_result is None
if match_result is None:
return [e]
replaced_content = map_content(e.content)
if replaced_content is e.content:
e2 = e
e2 = e.with_content(replaced_content)
if match_result:
return list(replacement(e2))
return [e2]
def map_content(source):
changed = False
result = []
for child in source:
if isinstance(child, str):
seq = map_element(child)
changed = changed or seq != [child]
result += seq
if changed:
return result
return source
result_content = map_element(self)
if len(result_content) != 1:
raise ValueError("replaced root element with {} pieces of content".format(len(result_content)))
result_elt = result_content[0]
if not isinstance(result_elt, Element):
raise ValueError("replaced root element with non-element content")
return result_elt
def replace(self, name, replacement):
""" A sort of map() on htmodel content.
self - An Element to transform.
name - A string.
replacement - A function taking a single Element and returning a content list
(that is, a list of Elements and/or strings).
Walk the entire tree under the Element self; for each Element e with the given
name, call replacement(e); return a tree with each such Element replaced by
the content in replacement(e).
If == name and list(replacement(self)) is not a list consisting of
exactly one Element, raise a ValueError.
self is left unmodified, but the result is not a deep copy: it may be
self or an Element whose tree shares some parts of self.
assert isinstance(name, str) # detect common bug
return self.find_replace(lambda e: == name, replacement)
def escape(s, quote=False):
""" Escape the string s for HTML output.
This escapes characters that are special in HTML (& < >) and all non-ASCII characters.
If 'quote' is true, escape quotes (' ") as well.
Why use character entity references for non-ASCII characters? The program
encodes the output as UTF-8, so we should be fine without escaping. We
escape only for maximum robustness against broken tools.
def replace(m):
c = ord(
if c in _entities:
return '&' + _entities[c] + ';'
return '&#x{:x};'.format(c)
# The stdlib takes care of & > < ' " for us.
s = _html.escape(s, quote)
# Now we only need to worry about non-ascii characters.
return re.sub('[^\n -~]', replace, s)
empty_tags = {'meta', 'br', 'hr', 'link'}
non_indenting_tags = {'html', 'body'}
def save_html(filename, ht, strict=True):
assert == 'html'
with open(filename, 'w', encoding='utf-8') as f:
f.write("<!doctype html>\n")
write_html(f, ht, strict=strict)
# When wrap_hack is true, space characters in attribute values are replaced
# with this character (so that a big chunk of HTML code can be word-wrapped
# without fear of breaking inside an attribute). The character is in the
# Private Use area.
NOT_A_SPACE = '\uF123'
def write_html(f, ht, indent='', strict=True, wrap_hack=False):
WIDTH = 130
def htmlify(name):
""" Convert a pythonified tag name or attribute name back to HTML. """
if name.endswith('_'):
name = name[:-1]
name = name.replace('_', '-')
return name
def escape_attr_value(v, wrap_hack):
ev = escape(v, True)
if wrap_hack:
ev = ev.replace(" ", NOT_A_SPACE)
return ev
def start_tag(ht, wrap_hack):
attrs = ''.join(' {0}="{1}"'.format(htmlify(k), escape_attr_value(v, wrap_hack))
for k, v in sorted(ht.attrs.items()))
assert 'style' not in ht.attrs
style = '; '.join(name + ": " + value for name, value in sorted(
attrs += ' style="{0}"'.format(style)
return '<{0}{1}>'.format(, attrs)
def is_ht_inline(ht):
return isinstance(ht, str) or _tag_data[][0] == 'I'
def write_inline_content(f, content, indent, allow_last_block, strict, strict_blame, wrap_hack):
if (allow_last_block
and isinstance(content[-1], Element)
and content[-1].name in ('ol', 'ul', 'table', 'figure')):
last = content[-1]
content = content[:-1]
last = None
for kid in content:
if isinstance(kid, str):
if strict and not is_ht_inline(kid):
raise ValueError("block element <{}> can't appear in inline content:\n".format(
+ repr(strict_blame))
write_html(f, kid, indent, strict, wrap_hack)
if last is not None:
write_html(f, last, indent, strict, wrap_hack)
if isinstance(ht, str):
assert not strict
info = _tag_data[]
content = ht.content
assert info[1] != '0' or len(content) == 0 # empty tag is empty
if ( in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
or ( == 'li' and ht.content and is_ht_inline(ht.content[0]))):
# Word-wrap the inline content in this block element.
# First, find out if this is an li element with a trailing list.
if == 'li' and isinstance(ht.content[-1], Element) and ht.content[-1].name in ('ol', 'ul', 'table'):
content_to_wrap = content[:-1]
last_block = content[-1]
content_to_wrap = content
last_block = None
# Dump content_to_wrap to a temporary buffer.
# We are going to word-wrap this later, so use wrap_hack=True
# which makes sure no spaces appear in attributes.
tmpf = StringIO()
tmpf.write(start_tag(ht, True))
write_inline_content(tmpf, content_to_wrap, indent + ' ', == 'li', strict=strict, strict_blame=ht,
if last_block is None:
text = tmpf.getvalue()
# Write the output to f.
if '\n' in text:
# This is unexpected; don't word-wrap. Write it verbatim, with newlines.
text = text.replace(NOT_A_SPACE, ' ')
f.write(indent + text + "\n")
# The usual case. Word-wrap and write.
subsequent_indent = indent
if != 'p':
subsequent_indent += ' '
text = textwrap.fill(text, WIDTH, expand_tabs=False, replace_whitespace=False,
initial_indent=indent, subsequent_indent=subsequent_indent,
break_long_words=False, break_on_hyphens=False)
# Now that word wrapping is done, we can change the hacked spaces
# back to real spaces.
text = text.replace(NOT_A_SPACE, ' ')
f.write(text + "\n")
# If we had a trailing block, dump it now (and the end tag we skipped before).
if last_block:
write_html(f, last_block, indent + ' ', strict=strict,
f.write(indent + "</{}>\n".format(
elif info[0] == 'B':
# Block.
f.write(indent + start_tag(ht, wrap_hack))
if info != 'B0':
if content:
if is_ht_inline(content[0]):
if strict and info[1] not in 'i?s':
if isinstance(content[0], str):
raise ValueError("<{}> element may only contain tags, not text".format(
raise ValueError("<{}> element may not contain inline element <{}>".format(, content[0].name))
write_inline_content(f, content, indent + ' ', == 'li', strict, strict_blame=ht, wrap_hack=wrap_hack)
if strict and info[1] not in 'b?':
raise ValueError("<{}> element may not contain block element <{}>".format(, content[0].name))
inner_indent = indent
if not in non_indenting_tags:
inner_indent += ' '
prev_needs_space = False
first = True
for kid in content:
if strict and is_ht_inline(kid):
if isinstance(kid, str):
raise ValueError("<{}> element may contain either text or block content, not both".format(
raise ValueError("<{}> element may contain either blocks (like <{}>) "
"or inline content (like <{}>), not both".format(, content[0].name,
needs_space = ((strict or isinstance(kid, Element))
and ( in {'p', 'figure', 'section'}
or ( in {'div', 'li', 'td'}
and kid.content and not is_ht_inline(kid.content[0]))))
if not first and (prev_needs_space or needs_space):
write_html(f, kid, inner_indent, strict,
prev_needs_space = needs_space
first = False
# Inline. Content must be inline too.
assert info in ('Ii', 'I0')
f.write(start_tag(ht, wrap_hack))
if info != 'I0':
write_inline_content(f, content, indent + ' ', False, strict, ht, wrap_hack)
_tag_data = {}
def _init(v):
# Every tag is one of:
# - like section, table, tr, ol, ul: block containing only blocks
# - like p, h1: block containing only inline content
# - like span, em, strong, i, b: inline containing only inline content
# - like li or td: block containing either block or inline content (or in the
# unique case of li, inline followed by a list)
# - like hr, img: block containing nothing
# - like br, wbr: inline containing nothing
_tag_raw_data = '''\
html head body section hgroup table tbody thead tfoot tr ol\
ul blockquote ol ul dl figure: Bb
title p h1 h2 h3 h4 h5 h6 address figcaption pre: Bi
a em strong small s cite q dfn abbr data time code var samp\
kbd sub sup i b u mark bdi bdo span: Ii
br wbr: I0
div li td th noscript object dt dd: B?
meta link hr img: B0
style script: Bs'''
def element_constructor(name):
def construct(*content, **attrs):
if 'class_' in attrs:
attrs['class'] = attrs['class_']
del attrs['class_']
return Element(name, attrs, None, list(content))
construct.__name__ = name
return construct
for line in _tag_raw_data.splitlines():
names, colon, info = line.partition(':')
assert colon
info = info.strip()
for name in names.split():
v[name] = element_constructor(name)
_tag_data[name] = info
__all__ = ['Element'] + list(_tag_data.keys())