In [1]:
import re
import json

import requests
from bs4 import BeautifulSoup
from IPython.display import HTML

In [2]:
%%html
<style>
table {float:left}
</style>

---
# HTML Parser

* Preserve original content structure
* Does not drop ```<HTML>```, ```</HTML>``` tags.

In [69]:
content = """<DOCUMENT>
    <TYPE>10-Q
    <SEQUENCE>1
    <FILENAME>d740164d10q.htm
    <DESCRIPTION>10-Q
    <TEXT>
        <HTML>
            <HEAD><TITLE>${DOCUMEHT_TYPE}</TITLE></HEAD>
            <BODY BGCOLOR="WHITE">
                <table>
                  <tr>
                    <th>Company</th>
                    <th>Contact</th>
                    <th>Country</th>
                  </tr>
                  <tr>
                    <td>Alfreds Futterkiste</td>
                    <td>Maria Anders</td>
                    <td>Germany</td>
                  </tr>
                  <tr>
                    <td>Centro comercial Moctezuma</td>
                    <td>Francisco Chang</td>
                    <td>Mexico</td>
                  </tr>
                </table>
            </BODY>
        </HTML>
    </TEXT>
</DOCUMENT>
"""

In [80]:
soup = BeautifulSoup(content, 'html.parser')
soup

<document>
<type>10-Q
    <sequence>1
    <filename>d740164d10q.htm
    <description>10-Q
    <text>
<html>
<head><title>${DOCUMEHT_TYPE}</title></head>
<body bgcolor="WHITE">
<table>
<tr>
<th>Company</th>
<th>Contact</th>
<th>Country</th>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td>Maria Anders</td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td>Mexico</td>
</tr>
</table>
</body>
</html>
</text>
</description></filename></sequence></type></document>

In [81]:
soup.find('document').find('type').find(text=True, recursive=False).strip()

'10-Q'

In [83]:
html = soup.find('html')
print(html)
HTML(data=html.prettify())

<html>
<head><title>${DOCUMEHT_TYPE}</title></head>
<body bgcolor="WHITE">
<table>
<tr>
<th>Company</th>
<th>Contact</th>
<th>Country</th>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td>Maria Anders</td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td>Mexico</td>
</tr>
</table>
</body>
</html>


Company,Contact,Country
Alfreds Futterkiste,Maria Anders,Germany
Centro comercial Moctezuma,Francisco Chang,Mexico


---
# LXML Parser

* Change the original content structure
* Drops ```<HTML>```, ```</HTML>``` tags.


In [84]:
soup = BeautifulSoup(content, 'lxml')
soup

<html><body><document>
<type>10-Q
    <sequence>1
    <filename>d740164d10q.htm
    <description>10-Q
    <text>
<title>${DOCUMEHT_TYPE}</title>
<table>
<tr>
<th>Company</th>
<th>Contact</th>
<th>Country</th>
</tr>
<tr>
<td>Alfreds Futterkiste</td>
<td>Maria Anders</td>
<td>Germany</td>
</tr>
<tr>
<td>Centro comercial Moctezuma</td>
<td>Francisco Chang</td>
<td>Mexico</td>
</tr>
</table>
</text>
</description></filename></sequence></type></document>
</body></html>

In [86]:
soup.find('document').find('type').find(text=True, recursive=False).strip()

'10-Q'

In [59]:
HTML(data=html.prettify())

Company,Contact,Country
Alfreds Futterkiste,Maria Anders,Germany
Centro comercial Moctezuma,Francisco Chang,Mexico


HTML parser cannot handle non-closing tag