[svn r3900] r4637@delle: sbehnel | 2008-07-16 08:55:48 +0200
 html5lib parser module provided by Armin Ronacher

--HG--
branch : trunk
scoder committed Jul 16, 2008
1 parent 745f4d3 commit 8fa6dc2
Showing 4 changed files with 342 additions and 1 deletion.
3 changes: 2 additions & 1 deletion doc/docstructure.py
@@ -6,7 +6,8 @@
         'api.txt', 'parsing.txt',
         'validation.txt', 'xpathxslt.txt',
         'objectify.txt', 'lxmlhtml.txt',
-        'cssselect.txt', 'elementsoup.txt')),
+        'cssselect.txt', 'elementsoup.txt',
+        'html5parser.txt')),
     ('Extending lxml', ('resolvers.txt', 'extensions.txt',
                         'element_classes.txt', 'sax.txt', 'capi.txt')),
     ('Developing lxml', ('build.txt', 'lxml-source-howto.txt',
80 changes: 80 additions & 0 deletions doc/html5parser.txt
@@ -0,0 +1,80 @@
===============
html5lib Parser
===============

`html5lib`_ is a Python package that implements the HTML5 parsing algorithm,
which is heavily influenced by current browsers and based on the `WHATWG
HTML5 specification`_.

.. _html5lib: http://code.google.com/p/html5lib/
.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
.. _WHATWG HTML5 specification: http://www.whatwg.org/specs/web-apps/current-work/

lxml can benefit from the parsing capabilities of `html5lib` through
the ``lxml.html.html5parser`` module.  It offers an interface similar
to that of the ``lxml.html`` module, with ``fromstring()``,
``parse()``, ``document_fromstring()``, ``fragment_fromstring()`` and
``fragments_fromstring()`` functions that work like the regular HTML
parsing functions.


Differences from regular HTML parsing
=====================================

The returned tree differs in a few ways from the one built by the
regular HTML parsing functions in ``lxml.html``.  html5lib normalizes
some elements and element structures to a common format.  For example,
even if a table does not have a ``tbody``, html5lib will inject one
automatically:

.. sourcecode:: pycon

    >>> from lxml.html import tostring, html5parser
    >>> tostring(html5parser.fromstring("<table><td>foo"))
    '<table><tbody><tr><td>foo</td></tr></tbody></table>'

The parameters accepted by the functions also differ from their
``lxml.html`` counterparts.


Function Reference
==================

``parse(filename_url_or_file)``:
    Parses the named file or URL, or, if the object has a ``.read()``
    method, parses from that.

``document_fromstring(html, guess_charset=True)``:
    Parses a document from the given string.  This always creates a
    correct HTML document, which means the parent node is ``<html>``,
    and there is a body and possibly a head.

    If a bytestring is passed and ``guess_charset`` is true, the chardet
    library (if installed) will guess the charset if ambiguities exist.

``fragment_fromstring(string, create_parent=False, guess_charset=False)``:
    Returns an HTML fragment from a string.  The fragment must contain
    just a single element, unless ``create_parent`` is given;
    e.g., ``fragment_fromstring(string, create_parent='div')`` will
    wrap the element in a ``<div>``.  If ``create_parent`` is simply
    true, the default parent tag (``div``) is used.

    If a bytestring is passed and ``guess_charset`` is true, the chardet
    library (if installed) will guess the charset if ambiguities exist.
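The ``create_parent`` handling boils down to a small string-wrapping step.  A standalone sketch in plain Python (``wrap_fragment`` is an illustrative name, not part of the module):

```python
def wrap_fragment(html, create_parent):
    # A string value names the parent tag; any other true value falls
    # back to the default 'div' container.
    if not create_parent:
        return html
    tag = create_parent if isinstance(create_parent, str) else 'div'
    return '<%s>%s</%s>' % (tag, html, tag)
```

The wrapped string can then be parsed as a single-element fragment.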

``fragments_fromstring(string, no_leading_text=False, guess_charset=False, parser=None)``:
    Returns a list of the elements found in the fragment.  The first
    item in the list may be a string.  If ``no_leading_text`` is true,
    then it will be an error if there is leading text, and the result
    will always be a list of only elements.

    If a bytestring is passed and ``guess_charset`` is true, the chardet
    library (if installed) will guess the charset if ambiguities exist.
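The ``no_leading_text`` behaviour amounts to a simple check on the first item of the parsed child list.  A minimal sketch under the assumption that leading text arrives as a plain string (``drop_leading_text`` is a hypothetical helper name):

```python
def drop_leading_text(children, no_leading_text=False):
    # The first parsed item may be a bare string of leading text;
    # subsequent items are element objects.
    children = list(children)
    if children and isinstance(children[0], str):
        if no_leading_text:
            if children[0].strip():
                raise ValueError('There is leading text: %r' % children[0])
            # whitespace-only leading text is silently dropped
            del children[0]
    return children
```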

``fromstring(string)``:
    Calls ``document_fromstring()`` or ``fragment_fromstring()``, based
    on whether the string looks like a full document or just a
    fragment.
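The document-versus-fragment decision starts with a prefix check on the input string.  A sketch of that first step (``looks_like_full_document`` is an illustrative name; the real function falls back to inspecting the parsed ``<head>`` and ``<body>`` when the prefix check fails):

```python
def looks_like_full_document(html):
    # Only the first 50 characters are inspected, mirroring fromstring().
    start = html[:50].lstrip().lower()
    return start.startswith('<html') or start.startswith('<!doctype')
```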

Additionally, all parsing functions accept a ``parser`` keyword argument
that can be set to a custom parser instance.  To create custom parsers,
you can subclass ``HTMLParser`` and ``XHTMLParser`` from the same
module.  These are subclasses of the corresponding parser classes
provided by html5lib.
96 changes: 96 additions & 0 deletions src/lxml/html/_html5builder.py
@@ -0,0 +1,96 @@
"""
This module implements a tree builder for html5lib that generates lxml
html element trees. This module uses camelCase as it follows the
html5lib style guide.
"""

from html5lib.treebuilders import _base, etree as etree_builders
from lxml import html, etree


class DocumentType(object):

def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId

class Document(object):

def __init__(self):
self._elementTree = None
self.childNodes = []

def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)


class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document

def __init__(self):
html_builder = etree_builders.getETreeModule(html, fullTree=False)
etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
self.elementClass = html_builder.Element
self.commentClass = etree_builder.Comment
_base.TreeBuilder.__init__(self)

def reset(self):
_base.TreeBuilder.reset(self)
self.rootInserted = False
self.initialComments = []
self.doctype = None

def getDocument(self):
return self.document._elementTree

def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment

def insertDoctype(self, name, publicId, systemId):
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype

def insertComment(self, data, parent=None):
if not self.rootInserted:
self.initialComments.append(data)
else:
_base.TreeBuilder.insertComment(self, data, parent)

def insertRoot(self, name):
buf = []
if self.doctype and self.doctype.name:
buf.append('<!DOCTYPE %s' % self.doctype.name)
if self.doctype.publicId is not None or self.doctype.systemId is not None:
buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
self.doctype.systemId))
buf.append('>')
buf.append('<html></html>')
root = html.fromstring(u''.join(buf))

# Append the initial comments:
for comment in self.initialComments:
root.addprevious(etree.Comment(comment))

# Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()

# Add the root element to the internal child/open data structures
root_element = self.elementClass(name)
root_element._element = root
self.document.childNodes.append(root_element)
self.openElements.append(root_element)

self.rootInserted = True
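The doctype string that ``insertRoot()`` builds can be illustrated in isolation (``build_doctype`` is a hypothetical helper, not part of the module; it mirrors the buffer logic above):

```python
def build_doctype(name, publicId=None, systemId=None):
    # Start the declaration with the doctype name, add a PUBLIC
    # clause only when an identifier was supplied, then close it.
    buf = ['<!DOCTYPE %s' % name]
    if publicId is not None or systemId is not None:
        buf.append(' PUBLIC "%s" "%s"' % (publicId, systemId))
    buf.append('>')
    return ''.join(buf)
```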
164 changes: 164 additions & 0 deletions src/lxml/html/html5parser.py
Original file line number Original file line Diff line number Diff line change
@@ -0,0 +1,164 @@
"""
An interface to html5lib.
"""

import urllib
from html5lib import HTMLParser as _HTMLParser, XHTMLParser as _XHTMLParser
from lxml import etree
from lxml.html import _contains_block_level_tag, XHTML_NAMESPACE
from lxml.html._html5builder import TreeBuilder

# python3 compatibility
try:
_strings = basestring
except NameError:
_strings = (bytes, str)


class HTMLParser(_HTMLParser):
"""An html5lib HTML parser with lxml as tree."""

def __init__(self, strict=False):
_HTMLParser.__init__(self, strict=strict, tree=TreeBuilder)


class XHTMLParser(_XHTMLParser):
"""An html5lib XHTML Parser with lxml as tree."""

def __init__(self, strict=False):
_XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder)


def _find_tag(tree, tag):
elem = tree.find(tag)
if elem is not None:
return elem
return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))


def document_fromstring(html, guess_charset=True, parser=None):
"""Parse a whole document into a string."""
if not isinstance(html, _strings):
raise TypeError('string required')

if parser is None:
parser = html_parser

return parser.parse(html, useChardet=guess_charset).getroot()


def fragments_fromstring(html, no_leading_text=False,
guess_charset=False, parser=None):
"""Parses several HTML elements, returning a list of elements.
The first item in the list may be a string. If no_leading_text is true,
then it will be an error if there is leading text, and it will always be
a list of only elements.
If `guess_charset` is `True` and the text was not unicode but a
bytestring, the `chardet` library will perform charset guessing on the
string.
"""
if not isinstance(html, _strings):
raise TypeError('string required')

if parser is None:
parser = html_parser

children = parser.parseFragment(html, 'div', useChardet=guess_charset)
if children and isinstance(children[0], _strings):
if no_leading_text:
if children[0].strip():
raise etree.ParserError('There is leading text: %r' %
children[0])
del children[0]
return children


def fragment_fromstring(html, create_parent=False,
guess_charset=False, parser=None):
"""Parses a single HTML element; it is an error if there is more than
one element, or if anything but whitespace precedes or follows the
element.
If create_parent is true (or is a tag name) then a parent node
will be created to encapsulate the HTML in a single element.
"""
if not isinstance(html, _strings):
raise TypeError('string required')

if create_parent:
container = create_parent or 'div'
html = '<%s>%s</%s>' % (container, html, container)

children = fragments_fromstring(html, True, guess_charset, parser)
if not children:
raise etree.ParserError('No elements found')
if len(children) > 1:
raise etree.ParserError('Multiple elements found')

result = children[0]
if result.tail and result.tail.strip():
raise etree.ParserError('Element followed by text: %r' % el.tail)
result.tail = None
return result


def fromstring(html, guess_charset=True, parser=None):
"""Parse the html, returning a single element/document.
This tries to minimally parse the chunk of text, without knowing if it
is a fragment or a document.
base_url will set the document's base_url attribute (and the tree's docinfo.URL)
"""
if not isinstance(html, _strings):
raise TypeError('string required')
doc = document_fromstring(html, parser=parser,
guess_charset=guess_charset)

# document starts with doctype or <html>, full document!
start = html[:50].lstrip().lower()
if start.startswith('<html') or start.startswith('<!doctype'):
return doc

head = _find_tag(doc, 'head')

# if the head is not empty we have a full document
if len(head):
return doc

body = _find_tag(doc, 'body')

# The body has just one element, so it was probably a single
# element passed in
if (len(body) == 1 and (not body.text or not body.text.strip())
and (not body[-1].tail or not body[-1].tail.strip())):
return body[0]

# Now we have a body which represents a bunch of tags which have the
# content that was passed in. We will create a fake container, which
# is the body tag, except <body> implies too much structure.
if _contains_block_level_tag(body):
body.tag = 'div'
else:
body.tag = 'span'
return body


def parse(filename_url_or_file, guess_charset=True, parser=None):
"""Parse a filename, URL, or file-like object into an HTML document
tree. Note: this returns a tree, not an element. Use
``parse(...).getroot()`` to get the document root.
"""
if parser is None:
parser = html_parser
if isinstance(filename_url_or_file, basestring):
fp = urllib.urlopen(filename_url_or_file)
else:
fp = filename_url_or_file
return parser.parse(html, useChardet=guess_charset)


html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
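The source dispatch in ``parse()`` treats strings as filenames or URLs to open and passes anything else through as a file-like object.  A standalone sketch (``resolve_input`` and the ``opener`` parameter are illustrative; the real code opens strings with ``urllib.urlopen``):

```python
def resolve_input(source, opener=open):
    # Strings name a file or URL and are opened via `opener`;
    # file-like objects are returned unchanged.
    if isinstance(source, str):
        return opener(source)
    return source
```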
