# Parsing HTML with BeautifulSoup

In this example, we want to look at a website and get a list of all the available downloadable files from that website.

https://catalog.data.gov/dataset?res_format=CSV&tags=hospital

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Fetch the search-results page. requests never times out by default, so pass an
# explicit timeout (seconds) to keep this cell from hanging on an unresponsive server.
r = requests.get('https://www.healthdata.gov/browse?q=new%20york%20city&sortBy=relevance', timeout=30)

In [None]:
r.status_code

In [None]:
# Parse the response body. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and keeps the resulting tree
# the same no matter which optional parsers happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Map each result link's text to its href, keeping only the first 11 matches
# (the same items an enumerate index of 0 through 10 would cover).
matches = soup.find_all('a', {'class': 'browse2-result-name-link'})
result_list = {link.text: link.attrs.get('href') for link in matches[:11]}

result_list

In [None]:
# Fetch the API specification page. requests never times out by default, so pass an
# explicit timeout (seconds) to keep this cell from hanging on an unresponsive server.
r = requests.get('https://fhir.epic.com/Specifications?api=981', timeout=30)

In [None]:
r.status_code

In [None]:
# Parse the response body. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and keeps the resulting tree
# the same no matter which optional parsers happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Build `output` as a mapping of error code -> [severity, description, example], e.g.
# {
#   '4100': ['Fatal', 'The resource request contained an invalid parameter', 'example']
# }

import json

expected_headers = ['Error Code', 'Severity', 'Description', 'Example']

output = {}

for table in soup.find_all('table', {'class': 'api-documentation-table'}):
    headers = [header.text for header in table.find_all('th')]

    # Only tables whose header row matches exactly are error-code tables;
    # every other documentation table is skipped.
    if headers != expected_headers:
        continue

    for row in table.find_all('tr'):
        data = [cell.text for cell in row.find_all('td')]
        # Rows without <td> cells (such as a header row made of <th> cells) and
        # rows with fewer than four cells are skipped instead of raising IndexError.
        if len(data) < 4:
            continue
        code, severity, description, example = data[:4]
        output[code] = [severity, description, example]

print(json.dumps(output, indent=4))

In [None]:
import pandas as pd

tables = pd.read_html('https://fhir.epic.com/Specifications?api=981')

In [None]:
len(tables)

In [None]:
tables[6]

In [None]:
# Fetch the dataset catalog listing. requests never times out by default, so pass an
# explicit timeout (seconds) to keep this cell from hanging on an unresponsive server.
r = requests.get('https://catalog.data.gov/dataset?res_format=CSV&tags=hospital', timeout=30)

In [None]:
r.status_code

In [None]:
print(r.text[0:1000])

In [None]:
# Parse the response body. Naming the parser explicitly avoids bs4's
# "no parser was explicitly specified" warning and keeps the resulting tree
# the same no matter which optional parsers happen to be installed.
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Print the link text of every <h3> heading on the page.
# `link.a` is None when a heading has no <a> descendant, so those headings are
# skipped rather than raising AttributeError on `.text`.
for link in soup.find_all('h3'):
    if link.a is not None:
        print(link.a.text)

In [None]:
# For every <li class="dataset-item">, take the heading text as the dataset title,
# then report the link of each entry in its first <ul> whose label is exactly 'CSV'.
for dataset in soup.find_all('li', 'dataset-item'):
    title = dataset.h3.text.strip()
    for entry in dataset.ul.find_all('li'):
        label = entry.text.strip()
        if label != 'CSV':
            continue
        href = entry.a.attrs['href']
        print("Download information about '{}' from {}".format(title, href))
            
    


# Getting Table Data


In this example, we're going to find an HTML table and extract the data from that table.

https://open.epic.com/Clinical/Allergy - Error Codes

In [None]:
import requests
from bs4 import BeautifulSoup
import json

In [None]:
# Download the page, parse it, and pick out the table whose id is "errors".
url = 'https://open.epic.com/Clinical/Allergy'
# requests never times out by default; an explicit timeout (seconds) keeps the
# cell from hanging on an unresponsive server.
r = requests.get(url, timeout=30)
data = r.text

# Name the parser explicitly so bs4 does not have to guess one (and warn about it).
soup = BeautifulSoup(data, 'html.parser')

# find() returns None when no matching table exists, in which case this prints None.
table = soup.find('table', id='errors')
print(table)

In [None]:
# The <thead> section of an HTML table usually holds the column headers.
# Collect the text of every <th> in its first row into the headers list.
headers = [th.text for th in table.thead.tr.find_all('th')]

headers

In [None]:
# In HTML tables, the data rows live in a <tbody> section.
# Build errors as {error code: {column header: cell text}}: the first cell of each
# row is the error code and the remaining cells are paired with headers[1:].
errors = {}
for row in table.tbody.find_all('tr'):
    cells = [cell.text for cell in row.find_all('td')]
    if not cells:
        continue  # no <td> cells in this row, so there is nothing to record
    error_cd, *values = cells
    # zip() stops at the shorter sequence, so a row with more cells than headers
    # no longer raises IndexError.
    errors.setdefault(error_cd, {}).update(zip(headers[1:], values))

In [None]:
print(json.dumps(errors, indent=4))

In [None]:
errors.get('4119')

In [None]:
errors.get('4119')['Severity']

## Reading HTML Tables with Pandas

Pandas has the ability to read HTML, too.  In ideal circumstances, it will scour whatever page you give it and find all of the tables there.  The result from `read_html()` will be a list of dataframes.

https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.read_html.html

In [None]:
import pandas as pd

In [None]:
dfs = pd.read_html('https://open.epic.com/Clinical/Allergy')

In [None]:
dfs

In [None]:
dfs[4]