### HTML Table Parser & DataFrame to HTML Converter (BeautifulSoup)

<br>

### Development Environment

In [1]:
import re
import html
import requests
import pandas as pd
from glob import glob
from bs4 import BeautifulSoup
import pandas.io.formats.style

### Page Information

In [2]:
def make_page_soup(link, encoding='EUC-KR', timeout=30):
    """Download a web page and parse it with BeautifulSoup.

    Args:
        link: URL of the page to fetch.
        encoding: Character encoding used to decode the response body.
            Defaults to ``'EUC-KR'`` to keep the original behaviour; pass
            ``None`` to keep whatever encoding ``requests`` determined.
        timeout: Seconds to wait for the server before giving up, so that
            an unresponsive host cannot block the caller forever.

    Returns:
        A ``(page, soup)`` tuple: the decoded HTML text and the
        ``BeautifulSoup`` tree built from it with ``'html.parser'``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
    if encoding is not None:
        response.encoding = encoding
    # Use the decoded text: str(response.content) would yield the "b'...'"
    # repr of the raw bytes rather than the HTML itself.
    page = response.text
    soup = BeautifulSoup(page, 'html.parser')
    return page, soup

In [3]:
# URL of the page to scrape.
link = "http://web.stanford.edu/class/cs224n/"
# make_page_soup returns the page content as a string plus the parsed
# BeautifulSoup tree.
page, soup = make_page_soup(link)
# Collect every <table> element whose class attribute is "table".
tables = soup.find_all('table', attrs={'class':'table'})
# Work on the second match (index 1); this raises IndexError when fewer than
# two tables match.
# NOTE(review): presumably this is the course schedule table — confirm
# against the live page.
table = tables[1]

### HTML Table Parser

In [65]:
# Captures the value of every href attribute (quoted or unquoted).
_HREF_RE = re.compile(r'href=[\'"]?([^\'" >]+)')
# Matches one HTML tag, tolerating '>' inside quoted attribute values.
_TAG_RE = re.compile("<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>")


def parse_by_pattern(text,
                     base_url="http://web.stanford.edu/class/cs224n/",
                     course_dirs=("slides/", "project/", "readings/",
                                  "assignments/")):
    """Reduce an HTML fragment to its text followed by ``link`` anchors.

    Every ``href`` in ``text`` is collected. Links containing one of
    ``course_dirs`` are treated as course material and resolved against
    ``base_url``; if at least one such link exists, only those are kept,
    otherwise every collected link is kept unchanged. All tags are then
    removed, newlines become spaces, and one
    ``<br><a href="...">link</a>`` is appended per kept link.

    Args:
        text: HTML fragment, e.g. ``str()`` of a ``<td>`` element.
        base_url: URL that relative course links are resolved against.
            It must end with ``/`` so the last path segment is preserved.
        course_dirs: Substrings that mark a link as course material.

    Returns:
        The stripped text plus the link anchors, always ending in ``<br>``.
    """
    from urllib.parse import urljoin

    links = _HREF_RE.findall(text)
    # urljoin leaves absolute URLs untouched and resolves root-relative ones,
    # whereas plain concatenation produced broken URLs for both.
    course_links = [
        urljoin(base_url, link)
        for link in links
        if any(marker in link for marker in course_dirs)
    ]

    plain = _TAG_RE.sub("", text).replace("\n", " ")
    anchors = "".join(
        '<br><a href="' + link + '">link</a>'
        for link in (course_links or links)
    )
    return (plain + anchors).strip() + "<br>"

In [66]:
def _parse_cell(cells, index):
    """Return the parsed HTML of ``cells[index]``, or None if the row has no such cell."""
    try:
        cell = cells[index]
    except IndexError:
        # Rows with fewer columns simply have no value for this column.
        return None
    return parse_by_pattern(str(cell))


# One entry per table row; the four lists stay aligned by position.
date_list = []
slides_tutorial_list = []
readings_list = []
assignment_list = []

table_tr = table('tr')
# The first row is skipped (presumably the header row — confirm on the page).
for row in table_tr[1:]:
    cells = row('td')  # query the row's cells once instead of once per column
    date_list.append(_parse_cell(cells, 0))
    slides_tutorial_list.append(_parse_cell(cells, 1))
    readings_list.append(_parse_cell(cells, 2))
    assignment_list.append(_parse_cell(cells, 3))

In [67]:
# Build one two-column frame (date, content) per schedule column; every frame
# shares the same date column.
slides_tutorial_df, readings_df, assignment_df = (
    pd.DataFrame({'date': date_list, 'content': column_values})
    for column_values in (slides_tutorial_list, readings_list, assignment_list)
)

### Save HTML Dataframe

In [68]:
def write_to_html_file(df, title='', filename='out.html'):
    """Write a DataFrame (or a pandas Styler) to a standalone, styled HTML file.

    Args:
        df: A ``pandas.DataFrame`` or a Styler (e.g. ``DataFrame.style``).
            A DataFrame is rendered with the CSS class ``wide`` and without
            HTML escaping, so markup stored in the cells is kept as markup.
            Any other object is treated as a Styler and rendered as-is.
        title: Text placed in the ``<h2>`` heading above the table. It is
            inserted verbatim (not HTML-escaped).
        filename: Path of the output file; an existing file is overwritten.

    Returns:
        None. The page is written to ``filename`` encoded as UTF-8.
    """
    page_head = '''
<html>
<head>
<meta charset="utf-8">
<style>

    h2 {
        text-align: center;
        font-family: Helvetica, Arial, sans-serif;
    }
    table {
        margin-left: auto;
        margin-right: auto;
    }
    table, th, td {
        border: 1px solid black;
        border-collapse: collapse;
    }
    th, td {
        word-break: break-all;
        padding: 5px;
        text-align: center;
        font-family: Helvetica, Arial, sans-serif;
        font-size: 90%;
    }
    table tbody tr:hover {
        background-color: #dddddd;
    }
    .wide {
        width: 90%;
    }

</style>
</head>
<body>
    '''
    page_tail = '''
</body>
</html>
'''
    heading = '<h2> {} </h2>\n'.format(title)

    if isinstance(df, pd.DataFrame):
        table_html = df.to_html(classes='wide', escape=False)
    else:
        # Styler.render() was removed in pandas 2.0; prefer to_html() and
        # fall back to render() only on versions that lack it.
        render = getattr(df, 'to_html', None) or df.render
        table_html = render()

    # Explicit UTF-8 (matching the <meta charset>) so non-ASCII cell text
    # does not depend on the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(page_head + heading + table_html + page_tail)

In [72]:
# Export each frame to its own untitled HTML page in the working directory.
for frame, out_name in (
    (slides_tutorial_df, 'slides_tutorial.html'),
    (readings_df, 'readings.html'),
    (assignment_df, 'assignment.html'),
):
    write_to_html_file(frame, title='', filename=out_name)

<br>

### Reference

<br><b>MOOC</b>
<br>[CS224N: Natural Language Processing with Deep Learning, Stanford University](http://web.stanford.edu/class/cs224n/)

<br><b>Stack Overflow</b>
<br>[Applying styling to Pandas dataframe saved to HTML file](https://stackoverflow.com/questions/47704441/applying-styling-to-pandas-dataframe-saved-to-html-file)