### HTML Table to Dataframe Converter (Beautifulsoup)

<br>

### Development Environment

In [2]:
import re
import requests
import pandas as pd
from glob import glob
from bs4 import BeautifulSoup

### Page Information

In [3]:
def make_page_soup(link):
    target = link
    response = requests.get(target)
    response.raise_for_status()
    response.encoding='EUC-KR'
    page = str(response.content)
    soup = BeautifulSoup(response.text, 'html.parser') 
    return page, soup

In [4]:
link = "http://web.stanford.edu/class/cs224n/"
page, soup = make_page_soup(link)
tables = soup.find_all('table', attrs={'class':'table'})
table = tables[1]

### HTML Table to Dataframe Converter

In [33]:
def parse_by_pattern(text):
    link_list = []
    final_link_list = []
    href_pattern = re.findall(r'href=[\'"]?([^\'" >]+)', text)
    if href_pattern != None:
        number_of_link = len(href_pattern)
        for i in range(number_of_link):
            link = href_pattern[i]
            link_list.append(link)
        for link in link_list:
            if "slides/" in link or "project/" in link or \
                "readings/" in link or 'assignments/' in link:
                link = "http://web.stanford.edu/class/cs224n/" + link
                final_link_list.append(link)
    html_pattern = "<(?:\"[^\"]*\"['\"]*|'[^']*'['\"]*|[^'\">])+>"
    text = re.sub(html_pattern, ' ', text) 
    try:
        if len(final_link_list) > 0:
            for link in final_link_list:
                text +=  " " + link
        else:
            for link in link_list:
                text +=  " " + link
    except:
        pass  
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text

In [34]:
date_list = []
slides_tutorial_list = []
readings_list = []
assignment_list = []

table_tr = table('tr')
for i in range(len(table_tr)):
    if i > 0:
        try:
            date = str(table_tr[i]('td')[0])            
            date = parse_by_pattern(date)
        except:
            date = None        
        try:
            slides_tutorial = str(table_tr[i]('td')[1])
            slides_tutorial = parse_by_pattern(slides_tutorial)
        except:
            slides_tutorial = None
        try:    
            readings = str(table_tr[i]('td')[2])
            readings = parse_by_pattern(readings)
        except:
            readings = None
        try:   
            assignment = str(table_tr[i]('td')[3])
            assignment = parse_by_pattern(assignment)
        except:
            assignment = None

        date_list.append(date)
        slides_tutorial_list.append(slides_tutorial)
        readings_list.append(readings)
        assignment_list.append(assignment)  

In [35]:
slides_tutorial_list = [v for v in slides_tutorial_list if v]
readings_list = [v for v in readings_list if v]
assignment_list = [v for v in assignment_list if v]

In [36]:
slides_tutorial_df = pd.DataFrame({'content':slides_tutorial_list})
readings_df = pd.DataFrame({'content':readings_list})
assignment_df = pd.DataFrame({'content':assignment_list})

In [37]:
slides_tutorial_df.to_excel("slides_tutorial.xlsx")
readings_df.to_excel("readings.xlsx")
assignment_df.to_excel("assignment.xlsx")

In [38]:
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', 180)

In [39]:
slides_tutorial_df

Unnamed: 0,content
0,Word Vectors (by John Hewitt) [ slides ] video ]--> [ notes ] Gensim word vectors example: [ code ] [ preview ] http://web.stanford.edu/class/cs224n/slides/cs224n-2023-lecture0...
1,"Word Vectors, Word Window Classification, Language Models [ slides ] video ]--> [ notes ] http://web.stanford.edu/class/cs224n/slides/cs224n-2023-lecture02-wordvecs2.pdf http:/..."
...,...
25,Final Project Emergency Assistance (no lecture) [ slides ]--> http://web.stanford.edu/class/cs224n/slides/cs224n-2021-lecture18-future.pdf
26,Poster Session [ code ] [ preview ]--> http://web.stanford.edu/class/cs224n/readings/cs224n-python-review-code-updated.zip http://web.stanford.edu/class/cs224n/readings/cs224n-...


In [40]:
readings_df

Unnamed: 0,content
0,Suggested Readings: Efficient Estimation of Word Representations in Vector Space (original word2vec paper) Distributed Representations of Words and Phrases and their Compositio...
1,Suggested Readings: GloVe: Global Vectors for Word Representation (original GloVe paper) Improving Distributional Similarity with Lessons Learned from Word Embeddings Evaluatio...
...,...
21,"Extra project office hours available during usual lecture time, see Ed."
22,5pm-9pm [ More details ] Location: Tressider Oak Lounge project.html


In [41]:
assignment_df

Unnamed: 0,content
0,Assignment 1 out [ code ] [ preview ] http://web.stanford.edu/class/cs224n/assignments/a1.zip http://web.stanford.edu/class/cs224n/assignments/a1_preview/exploring_word_vectors...
1,Assignment 2 out (Not yet released) --> [ code ] [ handout ] [ latex template ] http://web.stanford.edu/class/cs224n/assignments/a2.zip http://web.stanford.edu/class/cs224n/ass...
...,...
6,Colab https://colab.research.google.com/drive/1pxc-ehTtnVM72-NViET_D2ZqOlpOi2LH?usp=sharing
7,Project Milestone out [ Instructions ] http://web.stanford.edu/class/cs224n/project/CS224N_Final_Project_Milestone_Instructions.pdf


<br>

### Reference

<br><b>MOOC<b>
<br>[CS224N: Natural Language Processing with Deep Learning, Stanford University](http://web.stanford.edu/class/cs224n/)