From 59bf873511e9432caae45c3ba502f69e0c516d84 Mon Sep 17 00:00:00 2001
From: mozbugbox <mozbugbox@yahoo.com.au>
Date: Fri, 5 Jun 2015 00:16:53 +0800
Subject: [PATCH 1/5] Port soupparser to bs4

---
 src/lxml/html/soupparser.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py
index 78fbc12f3..d28b9d04b 100644
--- a/src/lxml/html/soupparser.py
+++ b/src/lxml/html/soupparser.py
@@ -5,9 +5,15 @@
 
 import re
 from lxml import etree, html
-from BeautifulSoup import \
-     BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \
-     Declaration
+try:
+    from BeautifulSoup import \
+         BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \
+         Declaration, CData
+except ImportError:
+    from bs4 import \
+         BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \
+         Declaration, CData
+
 
 
 def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
@@ -59,8 +65,9 @@ def convert_tree(beautiful_soup_tree, makeelement=None):
 def _parse(source, beautifulsoup, makeelement, **bsargs):
     if beautifulsoup is None:
         beautifulsoup = BeautifulSoup
-    if 'convertEntities' not in bsargs:
-        bsargs['convertEntities'] = 'html'
+    if hasattr(beautifulsoup, "HTML_ENTITIES"): # not bs4
+        if 'convertEntities' not in bsargs:
+            bsargs['convertEntities'] = 'html'
     tree = beautifulsoup(source, **bsargs)
     root = _convert_tree(tree, makeelement)
     # from ET: wrap the document in a html root element, if necessary
@@ -164,7 +171,14 @@ def _convert_tree(beautiful_soup_tree, makeelement):
 def _convert_node(bs_node, parent=None, makeelement=None):
     res = None
     if isinstance(bs_node, (Tag, _PseudoTag)):
-        attribs = dict((k, unescape(v)) for k, v in bs_node.attrs)
+        if isinstance(bs_node.attrs, dict): # bs4
+            attribs = {}
+            for k, v in bs_node.attrs.items():
+                if isinstance(v, list):
+                    v = " ".join(v)
+                attribs[k] = unescape(v)
+        else:
+            attribs = dict((k, unescape(v)) for k, v in bs_node.attrs)
         if parent is not None:
             res = etree.SubElement(parent, bs_node.name, attrib=attribs)
         else:
@@ -189,7 +203,7 @@ def _convert_node(bs_node, parent=None, makeelement=None):
             parent.append(res)
     elif isinstance(bs_node, Declaration):
         pass
-    else:  # CData
+    elif isinstance(bs_node, CData):
         _append_text(parent, unescape(bs_node))
     return res
 

From cf15d10de1d8a6bff76d23e588eea4d7bc24e3e2 Mon Sep 17 00:00:00 2001
From: mozbugbox <mozbugbox@yahoo.com.au>
Date: Fri, 5 Jun 2015 19:27:48 +0800
Subject: [PATCH 2/5] BeautifulSoup 4: handle Doctype and Declaration

bs4 can use lxml or html5lib to parse HTML content. Force the bs4 builtin
HTML parser when parsing HTML with soupparser.
---
 src/lxml/html/soupparser.py             | 18 ++++++++++++++----
 src/lxml/html/tests/test_elementsoup.py | 21 ++++++++++++---------
 2 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py
index d28b9d04b..fbbc63d50 100644
--- a/src/lxml/html/soupparser.py
+++ b/src/lxml/html/soupparser.py
@@ -9,10 +9,12 @@
     from BeautifulSoup import \
          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \
          Declaration, CData
+    class Doctype:
+        pass
 except ImportError:
     from bs4 import \
          BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString, \
-         Declaration, CData
+         Declaration, CData, Doctype
 
 
 
@@ -65,9 +67,12 @@ def convert_tree(beautiful_soup_tree, makeelement=None):
 def _parse(source, beautifulsoup, makeelement, **bsargs):
     if beautifulsoup is None:
         beautifulsoup = BeautifulSoup
-    if hasattr(beautifulsoup, "HTML_ENTITIES"): # not bs4
+    if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
         if 'convertEntities' not in bsargs:
             bsargs['convertEntities'] = 'html'
+    if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
+        if 'features' not in bsargs:
+            bsargs['features'] = ['html.parser'] # force bs html parser
     tree = beautifulsoup(source, **bsargs)
     root = _convert_tree(tree, makeelement)
     # from ET: wrap the document in a html root element, if necessary
@@ -114,7 +119,7 @@ def _convert_tree(beautiful_soup_tree, makeelement):
             last_element_idx = i
             if html_root is None and e.name and e.name.lower() == 'html':
                 html_root = e
-        elif declaration is None and isinstance(e, Declaration):
+        elif declaration is None and isinstance(e, (Declaration, Doctype)):
             declaration = e
 
     # For a nice, well-formatted document, the variable roots below is
@@ -153,7 +158,12 @@ def _convert_tree(beautiful_soup_tree, makeelement):
             prev = converted
 
     if declaration is not None:
-        match = _parse_doctype_declaration(declaration.string)
+        try:
+            # bs4, got full Doctype string
+            doctype_string = declaration.output_ready().strip().strip("<!>")
+        except AttributeError:
+            doctype_string = declaration.string
+        match = _parse_doctype_declaration(doctype_string)
         if not match:
             # Something is wrong if we end up in here. Since soupparser should
             # tolerate errors, do not raise Exception, just let it pass.
diff --git a/src/lxml/html/tests/test_elementsoup.py b/src/lxml/html/tests/test_elementsoup.py
index 2b19965d4..d16a702e1 100644
--- a/src/lxml/html/tests/test_elementsoup.py
+++ b/src/lxml/html/tests/test_elementsoup.py
@@ -1,11 +1,14 @@
 import unittest, sys
 from lxml.tests.common_imports import make_doctest, HelperTestCase
 
+BS_INSTALLED = True
 try:
     import BeautifulSoup
-    BS_INSTALLED = True
 except ImportError:
-    BS_INSTALLED = False
+    try:
+        import bs4
+    except ImportError:
+        BS_INSTALLED = False
 
 from lxml.html import tostring
 
@@ -24,21 +27,21 @@ def test_broken_attribute(self):
 
         def test_body(self):
             html = '''<body><p>test</p></body>'''
-            res = '''<html><body><p>test</p></body></html>'''
+            res = b'''<html><body><p>test</p></body></html>'''
             tree = self.soupparser.fromstring(html)
             self.assertEqual(tostring(tree), res)
 
         def test_head_body(self):
             # HTML tag missing, parser should fix that
             html = '<head><title>test</title></head><body><p>test</p></body>'
-            res = '<html><head><title>test</title></head><body><p>test</p></body></html>'
+            res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
             tree = self.soupparser.fromstring(html)
             self.assertEqual(tostring(tree), res)
 
         def test_wrap_html(self):
             # <head> outside <html>, parser should fix that
             html = '<head><title>title</test></head><html><body/></html>'
-            res = '<html><head><title>title</title></head><body></body></html>'
+            res = b'<html><head><title>title</title></head><body></body></html>'
             tree = self.soupparser.fromstring(html)
             self.assertEqual(tostring(tree), res)
 
@@ -47,7 +50,7 @@ def test_comment_pi(self):
 <?test asdf?>
 <head><title>test</title></head><body><p>test</p></body>
 <!-- another comment -->'''
-            res = '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+            res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
 <!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
             tree = self.soupparser.fromstring(html).getroottree()
             self.assertEqual(tostring(tree, method='html'), res)
@@ -60,7 +63,7 @@ def test_doctype1(self):
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''
 
             res = \
-'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
 
             tree = self.soupparser.fromstring(html).getroottree()
@@ -75,7 +78,7 @@ def test_doctype2(self):
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
 
             res = \
-'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
+b'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
 <!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
 
             tree = self.soupparser.fromstring(html).getroottree()
@@ -84,7 +87,7 @@ def test_doctype2(self):
 
         def test_doctype_html5(self):
             # html 5 doctype declaration
-            html = '<!DOCTYPE html>\n<html lang="en"></html>'
+            html = b'<!DOCTYPE html>\n<html lang="en"></html>'
 
             tree = self.soupparser.fromstring(html).getroottree()
             self.assertTrue(tree.docinfo.public_id is None)

From 309d3f300234c1daecee9910a5dd3a0d86743977 Mon Sep 17 00:00:00 2001
From: mozbugbox <mozbugbox@yahoo.com.au>
Date: Fri, 5 Jun 2015 20:16:37 +0800
Subject: [PATCH 3/5] change code in elementsoup.txt for bs4 behavior

This makes the tests pass.
---
 doc/elementsoup.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/elementsoup.txt b/doc/elementsoup.txt
index 417ab8492..bc19c5d63 100644
--- a/doc/elementsoup.txt
+++ b/doc/elementsoup.txt
@@ -47,7 +47,7 @@ Here is a document full of tag soup, similar to, but not quite like, HTML:
 
 .. sourcecode:: pycon
 
-    >>> tag_soup = '<meta><head><title>Hello</head><body onload=crash()>Hi all<p>'
+    >>> tag_soup = '<meta/><head><title>Hello</head><body onload=crash()>Hi all<p>'
 
 all you need to do is pass it to the ``fromstring()`` function:
 

From afa17103213b0fab66f381d7eae01e3433ac2d01 Mon Sep 17 00:00:00 2001
From: mozbugbox <mozbugbox@yahoo.com.au>
Date: Fri, 5 Jun 2015 23:28:51 +0800
Subject: [PATCH 4/5] bs4: replace exception with hasattr on output_ready

---
 src/lxml/html/soupparser.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py
index 94391bfa2..5951745cc 100644
--- a/src/lxml/html/soupparser.py
+++ b/src/lxml/html/soupparser.py
@@ -157,10 +157,10 @@ def _convert_tree(beautiful_soup_tree, makeelement):
             prev = converted
 
     if declaration is not None:
-        try:
+        if hasattr(declaration, "output_ready"):
             # bs4, got full Doctype string
             doctype_string = declaration.output_ready().strip().strip("<!>")
-        except AttributeError:
+        else:
             doctype_string = declaration.string
         match = _parse_doctype_declaration(doctype_string)
         if not match:

From 9d14dd83b95ffdba04f3ed62b34e5ce862c5cc6f Mon Sep 17 00:00:00 2001
From: mozbugbox <mozbugbox@yahoo.com.au>
Date: Sat, 6 Jun 2015 00:04:19 +0800
Subject: [PATCH 5/5] unittest: check beautifulsoup/bs4 import properly

---
 src/lxml/html/tests/test_elementsoup.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/lxml/html/tests/test_elementsoup.py b/src/lxml/html/tests/test_elementsoup.py
index d16a702e1..4eaed7eca 100644
--- a/src/lxml/html/tests/test_elementsoup.py
+++ b/src/lxml/html/tests/test_elementsoup.py
@@ -3,12 +3,9 @@
 
 BS_INSTALLED = True
 try:
-    import BeautifulSoup
+    import lxml.html.soupparser
 except ImportError:
-    try:
-        import bs4
-    except ImportError:
-        BS_INSTALLED = False
+    BS_INSTALLED = False
 
 from lxml.html import tostring
 
@@ -93,12 +90,24 @@ def test_doctype_html5(self):
             self.assertTrue(tree.docinfo.public_id is None)
             self.assertEqual(tostring(tree), html)
 
+else:
+    class SoupNotInstalledTestCase(HelperTestCase):
+
+        def test_beautifulsoup_not_installed(self):
+            # If BS_INSTALLED failed, beautifulsoup should not exist
+            with self.assertRaises(ImportError):
+                import bs4
+            with self.assertRaises(ImportError):
+                import BeautifulSoup
+
 def test_suite():
     suite = unittest.TestSuite()
     if BS_INSTALLED:
         suite.addTests([unittest.makeSuite(SoupParserTestCase)])
         if sys.version_info[0] < 3:
             suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
+    else:
+        suite.addTests([unittest.makeSuite(SoupNotInstalledTestCase)])
     return suite
 
 if __name__ == '__main__':