From 59bf873511e9432caae45c3ba502f69e0c516d84 Mon Sep 17 00:00:00 2001 From: mozbugbox <mozbugbox@yahoo.com.au> Date: Fri, 5 Jun 2015 00:16:53 +0800 Subject: [PATCH 1/5] Port soupparser to bs4 --- src/lxml/html/soupparser.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py index 78fbc12f3..d28b9d04b 100644 --- a/src/lxml/html/soupparser.py +++ b/src/lxml/html/soupparser.py @@ -5,9 +5,15 @@ import re from lxml import etree, html -from BeautifulSoup import \ - BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \ - Declaration +try: + from BeautifulSoup import \ + BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \ + Declaration, CData +except ImportError: + from bs4 import \ + BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \ + Declaration, CData + def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs): @@ -59,8 +65,9 @@ def convert_tree(beautiful_soup_tree, makeelement=None): def _parse(source, beautifulsoup, makeelement, **bsargs): if beautifulsoup is None: beautifulsoup = BeautifulSoup - if 'convertEntities' not in bsargs: - bsargs['convertEntities'] = 'html' + if hasattr(beautifulsoup, "HTML_ENTITIES"): # not bs4 + if 'convertEntities' not in bsargs: + bsargs['convertEntities'] = 'html' tree = beautifulsoup(source, **bsargs) root = _convert_tree(tree, makeelement) # from ET: wrap the document in a html root element, if necessary @@ -164,7 +171,14 @@ def _convert_tree(beautiful_soup_tree, makeelement): def _convert_node(bs_node, parent=None, makeelement=None): res = None if isinstance(bs_node, (Tag, _PseudoTag)): - attribs = dict((k, unescape(v)) for k, v in bs_node.attrs) + if isinstance(bs_node.attrs, dict): # bs4 + attribs = {} + for k, v in bs_node.attrs.items(): + if isinstance(v, list): + v = " ".join(v) + attribs[k] = unescape(v) + else: + attribs = dict((k, unescape(v)) for k, v in bs_node.attrs) if parent 
is not None: res = etree.SubElement(parent, bs_node.name, attrib=attribs) else: @@ -189,7 +203,7 @@ def _convert_node(bs_node, parent=None, makeelement=None): parent.append(res) elif isinstance(bs_node, Declaration): pass - else: # CData + elif isinstance(bs_node, CData): _append_text(parent, unescape(bs_node)) return res From cf15d10de1d8a6bff76d23e588eea4d7bc24e3e2 Mon Sep 17 00:00:00 2001 From: mozbugbox <mozbugbox@yahoo.com.au> Date: Fri, 5 Jun 2015 19:27:48 +0800 Subject: [PATCH 2/5] BeautifulSoup 4: handle Doctype and Declaration bs4 can use lxml or html5lib to parse HTML content. Force the bs4 built-in HTML parser when parsing HTML with soupparser. --- src/lxml/html/soupparser.py | 18 ++++++++++++++---- src/lxml/html/tests/test_elementsoup.py | 21 ++++++++++++--------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py index d28b9d04b..fbbc63d50 100644 --- a/src/lxml/html/soupparser.py +++ b/src/lxml/html/soupparser.py @@ -9,10 +9,12 @@ from BeautifulSoup import \ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \ Declaration, CData + class Doctype: + pass except ImportError: from bs4 import \ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \ - Declaration, CData + Declaration, CData, Doctype @@ -65,9 +67,12 @@ def convert_tree(beautiful_soup_tree, makeelement=None): def _parse(source, beautifulsoup, makeelement, **bsargs): if beautifulsoup is None: beautifulsoup = BeautifulSoup - if hasattr(beautifulsoup, "HTML_ENTITIES"): # not bs4 + if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3 if 'convertEntities' not in bsargs: bsargs['convertEntities'] = 'html' + if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4 + if 'features' not in bsargs: + bsargs['features'] = ['html.parser'] # force bs html parser tree = beautifulsoup(source, **bsargs) root = _convert_tree(tree, makeelement) # from ET: wrap the document in a html root element, if necessary @@ -114,7 +119,7 @@ def 
_convert_tree(beautiful_soup_tree, makeelement): last_element_idx = i if html_root is None and e.name and e.name.lower() == 'html': html_root = e - elif declaration is None and isinstance(e, Declaration): + elif declaration is None and isinstance(e, (Declaration, Doctype)): declaration = e # For a nice, well-formatted document, the variable roots below is @@ -153,7 +158,12 @@ def _convert_tree(beautiful_soup_tree, makeelement): prev = converted if declaration is not None: - match = _parse_doctype_declaration(declaration.string) + try: + # bs4, got full Doctype string + doctype_string = declaration.output_ready().strip().strip("<!>") + except AttributeError: + doctype_string = declaration.string + match = _parse_doctype_declaration(doctype_string) if not match: # Something is wrong if we end up in here. Since soupparser should # tolerate errors, do not raise Exception, just let it pass. diff --git a/src/lxml/html/tests/test_elementsoup.py b/src/lxml/html/tests/test_elementsoup.py index 2b19965d4..d16a702e1 100644 --- a/src/lxml/html/tests/test_elementsoup.py +++ b/src/lxml/html/tests/test_elementsoup.py @@ -1,11 +1,14 @@ import unittest, sys from lxml.tests.common_imports import make_doctest, HelperTestCase +BS_INSTALLED = True try: import BeautifulSoup - BS_INSTALLED = True except ImportError: - BS_INSTALLED = False + try: + import bs4 + except ImportError: + BS_INSTALLED = False from lxml.html import tostring @@ -24,21 +27,21 @@ def test_broken_attribute(self): def test_body(self): html = '''

test

''' - res = '''

test

''' + res = b'''

test

''' tree = self.soupparser.fromstring(html) self.assertEqual(tostring(tree), res) def test_head_body(self): # HTML tag missing, parser should fix that html = 'test

test

' - res = 'test

test

' + res = b'test

test

' tree = self.soupparser.fromstring(html) self.assertEqual(tostring(tree), res) def test_wrap_html(self): # outside , parser should fix that html = 'title</test></head><html><body/></html>' - res = '<html><head><title>title' + res = b'title' tree = self.soupparser.fromstring(html) self.assertEqual(tostring(tree), res) @@ -47,7 +50,7 @@ def test_comment_pi(self): test

test

''' - res = ''' + res = b''' test

test

''' tree = self.soupparser.fromstring(html).getroottree() self.assertEqual(tostring(tree, method='html'), res) @@ -60,7 +63,7 @@ def test_doctype1(self): My first HTML document

Hello world!

''' res = \ -''' +b''' My first HTML document

Hello world!

''' tree = self.soupparser.fromstring(html).getroottree() @@ -75,7 +78,7 @@ def test_doctype2(self): My first HTML document

Hello world!

''' res = \ -''' +b''' My first HTML document

Hello world!

''' tree = self.soupparser.fromstring(html).getroottree() @@ -84,7 +87,7 @@ def test_doctype2(self): def test_doctype_html5(self): # html 5 doctype declaration - html = '\n' + html = b'\n' tree = self.soupparser.fromstring(html).getroottree() self.assertTrue(tree.docinfo.public_id is None) From 309d3f300234c1daecee9910a5dd3a0d86743977 Mon Sep 17 00:00:00 2001 From: mozbugbox <mozbugbox@yahoo.com.au> Date: Fri, 5 Jun 2015 20:16:37 +0800 Subject: [PATCH 3/5] change code in elementsoup.txt for bs4 behavior This will make the test pass. --- doc/elementsoup.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/elementsoup.txt b/doc/elementsoup.txt index 417ab8492..bc19c5d63 100644 --- a/doc/elementsoup.txt +++ b/doc/elementsoup.txt @@ -47,7 +47,7 @@ Here is a document full of tag soup, similar to, but not quite like, HTML: .. sourcecode:: pycon - >>> tag_soup = 'Hello</head><body onload=crash()>Hi all<p>' + >>> tag_soup = '<meta/><head><title>Hello</head><body onload=crash()>Hi all<p>' all you need to do is pass it to the ``fromstring()`` function: From afa17103213b0fab66f381d7eae01e3433ac2d01 Mon Sep 17 00:00:00 2001 From: mozbugbox <mozbugbox@yahoo.com.au> Date: Fri, 5 Jun 2015 23:28:51 +0800 Subject: [PATCH 4/5] bs4: replace exception with hasattr on output_ready --- src/lxml/html/soupparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py index 94391bfa2..5951745cc 100644 --- a/src/lxml/html/soupparser.py +++ b/src/lxml/html/soupparser.py @@ -157,10 +157,10 @@ def _convert_tree(beautiful_soup_tree, makeelement): prev = converted if declaration is not None: - try: + if hasattr(declaration, "output_ready"): # bs4, got full Doctype string doctype_string = declaration.output_ready().strip().strip("<!>") - except AttributeError: + else: doctype_string = declaration.string match = _parse_doctype_declaration(doctype_string) if not match: From 9d14dd83b95ffdba04f3ed62b34e5ce862c5cc6f Mon Sep 17 00:00:00 
2001 From: mozbugbox <mozbugbox@yahoo.com.au> Date: Sat, 6 Jun 2015 00:04:19 +0800 Subject: [PATCH 5/5] unittest check beautifulsoup/bs4 import properly --- src/lxml/html/tests/test_elementsoup.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/lxml/html/tests/test_elementsoup.py b/src/lxml/html/tests/test_elementsoup.py index d16a702e1..4eaed7eca 100644 --- a/src/lxml/html/tests/test_elementsoup.py +++ b/src/lxml/html/tests/test_elementsoup.py @@ -3,12 +3,9 @@ BS_INSTALLED = True try: - import BeautifulSoup + import lxml.html.soupparser except ImportError: - try: - import bs4 - except ImportError: - BS_INSTALLED = False + BS_INSTALLED = False from lxml.html import tostring @@ -93,12 +90,24 @@ def test_doctype_html5(self): self.assertTrue(tree.docinfo.public_id is None) self.assertEqual(tostring(tree), html) +else: + class SoupNotInstalledTestCase(HelperTestCase): + + def test_beautifulsoup_not_installed(self): + # If BS_INSTALLED failed, beautifulsoup should not exist + with self.assertRaises(ImportError): + import bs4 + with self.assertRaises(ImportError): + import BeautifulSoup + def test_suite(): suite = unittest.TestSuite() if BS_INSTALLED: suite.addTests([unittest.makeSuite(SoupParserTestCase)]) if sys.version_info[0] < 3: suite.addTests([make_doctest('../../../../doc/elementsoup.txt')]) + else: + suite.addTests([unittest.makeSuite(SoupNotInstalledTestCase)]) return suite if __name__ == '__main__':