Skip to content

Commit

Permalink
Merge pull request #170 from mozbugbox/port-beautifulsoup-v4
Browse files: browse the repository at this point in the history
Port beautifulsoup v4
  • Loading branch information
scoder committed Jun 5, 2015
2 parents 01288f6 + 9d14dd8 commit 447f49f
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 19 deletions.
2 changes: 1 addition & 1 deletion doc/elementsoup.txt
Expand Up @@ -47,7 +47,7 @@ Here is a document full of tag soup, similar to, but not quite like, HTML:

.. sourcecode:: pycon

>>> tag_soup = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>'
>>> tag_soup = '<meta/><head><title>Hello</head><body onload=crash()>Hi all<p>'

all you need to do is pass it to the ``fromstring()`` function:

Expand Down
35 changes: 26 additions & 9 deletions src/lxml/html/soupparser.py
Expand Up @@ -8,11 +8,12 @@

try:
from bs4 import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration)
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration, CData, Doctype)
except ImportError:
from BeautifulSoup import (
BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration)

BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, Declaration, CData)
class Doctype:
pass

def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
"""Parse a string of HTML data into an Element tree using the
Expand Down Expand Up @@ -65,8 +66,12 @@ def convert_tree(beautiful_soup_tree, makeelement=None):
def _parse(source, beautifulsoup, makeelement, **bsargs):
if beautifulsoup is None:
beautifulsoup = BeautifulSoup
if 'convertEntities' not in bsargs:
bsargs['convertEntities'] = 'html'
if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
if 'convertEntities' not in bsargs:
bsargs['convertEntities'] = 'html'
if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
if 'features' not in bsargs:
bsargs['features'] = ['html.parser'] # force bs html parser
tree = beautifulsoup(source, **bsargs)
root = _convert_tree(tree, makeelement)
# from ET: wrap the document in a html root element, if necessary
Expand Down Expand Up @@ -113,7 +118,7 @@ def _convert_tree(beautiful_soup_tree, makeelement):
last_element_idx = i
if html_root is None and e.name and e.name.lower() == 'html':
html_root = e
elif declaration is None and isinstance(e, Declaration):
elif declaration is None and isinstance(e, (Declaration, Doctype)):
declaration = e

# For a nice, well-formatted document, the variable roots below is
Expand Down Expand Up @@ -152,7 +157,12 @@ def _convert_tree(beautiful_soup_tree, makeelement):
prev = converted

if declaration is not None:
match = _parse_doctype_declaration(declaration.string)
if hasattr(declaration, "output_ready"):
# bs4, got full Doctype string
doctype_string = declaration.output_ready().strip().strip("<!>")
else:
doctype_string = declaration.string
match = _parse_doctype_declaration(doctype_string)
if not match:
# Something is wrong if we end up in here. Since soupparser should
# tolerate errors, do not raise Exception, just let it pass.
Expand All @@ -170,7 +180,14 @@ def _convert_tree(beautiful_soup_tree, makeelement):
def _convert_node(bs_node, parent=None, makeelement=None):
res = None
if isinstance(bs_node, (Tag, _PseudoTag)):
attribs = dict((k, unescape(v)) for k, v in bs_node.attrs)
if isinstance(bs_node.attrs, dict): # bs4
attribs = {}
for k, v in bs_node.attrs.items():
if isinstance(v, list):
v = " ".join(v)
attribs[k] = unescape(v)
else:
attribs = dict((k, unescape(v)) for k, v in bs_node.attrs)
if parent is not None:
res = etree.SubElement(parent, bs_node.name, attrib=attribs)
else:
Expand All @@ -195,7 +212,7 @@ def _convert_node(bs_node, parent=None, makeelement=None):
parent.append(res)
elif isinstance(bs_node, Declaration):
pass
else: # CData
elif isinstance(bs_node, CData):
_append_text(parent, unescape(bs_node))
return res

Expand Down
30 changes: 21 additions & 9 deletions src/lxml/html/tests/test_elementsoup.py
@@ -1,9 +1,9 @@
import unittest, sys
from lxml.tests.common_imports import make_doctest, HelperTestCase

BS_INSTALLED = True
try:
import BeautifulSoup
BS_INSTALLED = True
import lxml.html.soupparser
except ImportError:
BS_INSTALLED = False

Expand All @@ -24,21 +24,21 @@ def test_broken_attribute(self):

def test_body(self):
html = '''<body><p>test</p></body>'''
res = '''<html><body><p>test</p></body></html>'''
res = b'''<html><body><p>test</p></body></html>'''
tree = self.soupparser.fromstring(html)
self.assertEqual(tostring(tree), res)

def test_head_body(self):
# HTML tag missing, parser should fix that
html = '<head><title>test</title></head><body><p>test</p></body>'
res = '<html><head><title>test</title></head><body><p>test</p></body></html>'
res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
tree = self.soupparser.fromstring(html)
self.assertEqual(tostring(tree), res)

def test_wrap_html(self):
# <head> outside <html>, parser should fix that
html = '<head><title>title</test></head><html><body/></html>'
res = '<html><head><title>title</title></head><body></body></html>'
res = b'<html><head><title>title</title></head><body></body></html>'
tree = self.soupparser.fromstring(html)
self.assertEqual(tostring(tree), res)

Expand All @@ -47,7 +47,7 @@ def test_comment_pi(self):
<?test asdf?>
<head><title>test</title></head><body><p>test</p></body>
<!-- another comment -->'''
res = '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
tree = self.soupparser.fromstring(html).getroottree()
self.assertEqual(tostring(tree, method='html'), res)
Expand All @@ -60,7 +60,7 @@ def test_doctype1(self):
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''

res = \
'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

tree = self.soupparser.fromstring(html).getroottree()
Expand All @@ -75,7 +75,7 @@ def test_doctype2(self):
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

res = \
'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
b'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''

tree = self.soupparser.fromstring(html).getroottree()
Expand All @@ -84,18 +84,30 @@ def test_doctype2(self):

def test_doctype_html5(self):
# html 5 doctype declaration
html = '<!DOCTYPE html>\n<html lang="en"></html>'
html = b'<!DOCTYPE html>\n<html lang="en"></html>'

tree = self.soupparser.fromstring(html).getroottree()
self.assertTrue(tree.docinfo.public_id is None)
self.assertEqual(tostring(tree), html)

else:
class SoupNotInstalledTestCase(HelperTestCase):
    """Sanity check used when the soupparser import failed.

    ``test_suite()`` only schedules this case when ``BS_INSTALLED`` is false.
    """

    def test_beautifulsoup_not_installed(self):
        """Neither ``bs4`` nor ``BeautifulSoup`` may be importable."""
        # BS_INSTALLED being false should mean that no BeautifulSoup flavour
        # is available, so importing either package has to fail.
        for module_name in ('bs4', 'BeautifulSoup'):
            with self.assertRaises(ImportError):
                __import__(module_name)

def test_suite():
    """Assemble the unittest suite for the soupparser tests.

    When ``BS_INSTALLED`` is true, the suite contains ``SoupParserTestCase``
    and, on Python 2 only, the ``elementsoup.txt`` doctest.  Otherwise it
    contains only ``SoupNotInstalledTestCase``.

    Returns:
        unittest.TestSuite: the populated suite.
    """
    # unittest.makeSuite() is deprecated since Python 3.11 and removed in
    # 3.13; the default loader collects the same ``test*`` methods.
    load_case = unittest.defaultTestLoader.loadTestsFromTestCase

    suite = unittest.TestSuite()
    if BS_INSTALLED:
        suite.addTests([load_case(SoupParserTestCase)])
        if sys.version_info[0] < 3:
            suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
    else:
        suite.addTests([load_case(SoupNotInstalledTestCase)])
    return suite

if __name__ == '__main__':
Expand Down

0 comments on commit 447f49f

Please sign in to comment.