Merge pull request #170 from mozbugbox/port-beautifulsoup-v4

Port beautifulsoup v4
lxml · Jun 5, 2015 · 447f49f
2 parents 01288f6 + 9d14dd8
commit 447f49f
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 19 deletions.
diff --git a/doc/elementsoup.txt b/doc/elementsoup.txt
@@ -47,7 +47,7 @@ Here is a document full of tag soup, similar to, but not quite like, HTML:
 
 .. sourcecode:: pycon
 
-    >>> tag_soup = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>'
+    >>> tag_soup = '<meta/><head><title>Hello</head><body onload=crash()>Hi all<p>'
 
 all you need to do is pass it to the ``fromstring()`` function:
 

diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py
@@ -8,11 +8,12 @@
 
 try:
     from bs4 import (
-        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration)
+        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration, CData, Doctype)
 except ImportError:
     from BeautifulSoup import (
-        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration)
-
+        BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration, CData)
+    class Doctype:
+        pass
 
 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
     """Parse a string of HTML data into an Element tree using the
@@ -65,8 +66,12 @@ def convert_tree(beautiful_soup_tree, makeelement=None):
 def _parse(source, beautifulsoup, makeelement, **bsargs):
     if beautifulsoup is None:
         beautifulsoup = BeautifulSoup
-    if 'convertEntities' not in bsargs:
-        bsargs['convertEntities'] = 'html'
+    if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
+        if 'convertEntities' not in bsargs:
+            bsargs['convertEntities'] = 'html'
+    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
+        if 'features' not in bsargs:
+            bsargs['features'] = ['html.parser'] # force bs html parser
     tree = beautifulsoup(source, **bsargs)
     root = _convert_tree(tree, makeelement)
     # from ET: wrap the document in a html root element, if necessary
@@ -113,7 +118,7 @@ def _convert_tree(beautiful_soup_tree, makeelement):
             last_element_idx = i
             if html_root is None and e.name and e.name.lower() == 'html':
                 html_root = e
-        elif declaration is None and isinstance(e, Declaration):
+        elif declaration is None and isinstance(e, (Declaration, Doctype)):
             declaration = e
 
     # For a nice, well-formatted document, the variable roots below is
@@ -152,7 +157,12 @@ def _convert_tree(beautiful_soup_tree, makeelement):
             prev = converted
 
     if declaration is not None:
-        match = _parse_doctype_declaration(declaration.string)
+        if hasattr(declaration, "output_ready"):
+            # bs4, got full Doctype string
+            doctype_string = declaration.output_ready().strip().strip("<!>")
+        else:
+            doctype_string = declaration.string
+        match = _parse_doctype_declaration(doctype_string)
         if not match:
             # Something is wrong if we end up in here. Since soupparser should
             # tolerate errors, do not raise Exception, just let it pass.
@@ -170,7 +180,14 @@ def _convert_tree(beautiful_soup_tree, makeelement):
 def _convert_node(bs_node, parent=None, makeelement=None):
     res = None
     if isinstance(bs_node, (Tag, _PseudoTag)):
-        attribs = dict((k, unescape(v)) for k, v in bs_node.attrs)
+        if isinstance(bs_node.attrs, dict): # bs4
+            attribs = {}
+            for k, v in bs_node.attrs.items():
+                if isinstance(v, list):
+                    v = " ".join(v)
+                attribs[k] = unescape(v)
+        else:
+            attribs = dict((k, unescape(v)) for k, v in bs_node.attrs)
         if parent is not None:
             res = etree.SubElement(parent, bs_node.name, attrib=attribs)
         else:
@@ -195,7 +212,7 @@ def _convert_node(bs_node, parent=None, makeelement=None):
             parent.append(res)
     elif isinstance(bs_node, Declaration):
         pass
-    else:  # CData
+    elif isinstance(bs_node, CData):
         _append_text(parent, unescape(bs_node))
     return res
 

diff --git a/src/lxml/html/tests/test_elementsoup.py b/src/lxml/html/tests/test_elementsoup.py
@@ -1,9 +1,9 @@
 import unittest, sys
 from lxml.tests.common_imports import make_doctest, HelperTestCase
 
+BS_INSTALLED = True
 try:
-    import BeautifulSoup
-    BS_INSTALLED = True
+    import lxml.html.soupparser
 except ImportError:
     BS_INSTALLED = False
 
@@ -24,21 +24,21 @@ def test_broken_attribute(self):
 
         def test_body(self):
             html = '''<body><p>test</p></body>'''
-            res = '''<html><body><p>test</p></body></html>'''
+            res = b'''<html><body><p>test</p></body></html>'''
             tree = self.soupparser.fromstring(html)
             self.assertEqual(tostring(tree), res)
 
         def test_head_body(self):
             # HTML tag missing, parser should fix that
             html = '<head><title>test</title></head><body><p>test</p></body>'
-            res = '<html><head><title>test</title></head><body><p>test</p></body></html>'
+            res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
             tree = self.soupparser.fromstring(html)
             self.assertEqual(tostring(tree), res)
 
         def test_wrap_html(self):
             # <head> outside <html>, parser should fix that
             html = '<head><title>title</test></head><html><body/></html>'
-            res = '<html><head><title>title</title></head><body></body></html>'
+            res = b'<html><head><title>title</title></head><body></body></html>'
             tree = self.soupparser.fromstring(html)
             self.assertEqual(tostring(tree), res)
 
@@ -47,7 +47,7 @@ def test_comment_pi(self):
 <?test asdf?>
 <head><title>test</title></head><body><p>test</p></body>
 <!-- another comment -->'''
-            res = '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+            res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
 <!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
             tree = self.soupparser.fromstring(html).getroottree()
             self.assertEqual(tostring(tree, method='html'), res)
@@ -60,7 +60,7 @@ def test_doctype1(self):
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''
 
             res = \
-'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
 
             tree = self.soupparser.fromstring(html).getroottree()
@@ -75,7 +75,7 @@ def test_doctype2(self):
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
 
             res = \
-'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
+b'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
 
             tree = self.soupparser.fromstring(html).getroottree()
@@ -84,18 +84,30 @@ def test_doctype2(self):
 
         def test_doctype_html5(self):
             # html 5 doctype declaration
-            html = '<!DOCTYPE html>\n<html lang="en"></html>'
+            html = b'<!DOCTYPE html>\n<html lang="en"></html>'
 
             tree = self.soupparser.fromstring(html).getroottree()
             self.assertTrue(tree.docinfo.public_id is None)
             self.assertEqual(tostring(tree), html)
 
+else:
+    class SoupNotInstalledTestCase(HelperTestCase):
+
+        def test_beautifulsoup_not_installed(self):
+            # If BS_INSTALLED failed, beautifulsoup should not exist
+            with self.assertRaises(ImportError):
+                import bs4
+            with self.assertRaises(ImportError):
+                import BeautifulSoup
+
 def test_suite():
     suite = unittest.TestSuite()
     if BS_INSTALLED:
         suite.addTests([unittest.makeSuite(SoupParserTestCase)])
         if sys.version_info[0] < 3:
             suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
+    else:
+        suite.addTests([unittest.makeSuite(SoupNotInstalledTestCase)])
     return suite
 
 if __name__ == '__main__':