## Web Scraping

1. Query the web page
1. Parse the DOM tree
1. Get the data we want from the HTML code

In [None]:
from bs4 import BeautifulSoup
import requests


# Download the government news front page and parse its HTML.
res = requests.get("https://news.gov.mo/home/zh-hant")
soup = BeautifulSoup(res.text, "html.parser")

# Collect the stripped text of every <h5> element (presumably the
# headlines), then print them one per line.
heading_texts = [tag.get_text().strip() for tag in soup.select("h5")]
for heading_text in heading_texts:
    print(heading_text)


## Extra: Fetching with try-except

In [None]:
from bs4 import BeautifulSoup
import requests

# Fetch the page and report a connection failure instead of letting the
# exception propagate.
try:
    res = requests.get("https://news.gov.mo/home/zh-hant")
except requests.exceptions.ConnectionError:
    # Do not call exit() here: in a Jupyter kernel it requests a kernel
    # shutdown instead of merely ending this cell.
    print("Error: Invalid URL")
else:
    # Only reached when the request succeeded, so `res` is the new response.
    soup = BeautifulSoup(res.text, "html.parser")

    # Print the stripped text of every <h5> element.
    for h5 in soup.select("h5"):
        print(h5.getText().strip())


## When is the next holiday?

In [None]:
# `datetime` is needed for the current year and has not been imported by any
# earlier cell, so import it here to keep the cell runnable on a fresh kernel.
import datetime

# Build the URL of the current year's public-holiday page and parse it.
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{datetime.date.today().year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Print the text of the first "#public-holidays" element on a single line.
print(soup.select("#public-holidays")[0].text.replace('\n',''))

In [None]:
def _first_text(selector):
    """Return the text of the first element of `soup` matching `selector`."""
    return soup.select(selector)[0].text


# Pull the individual pieces of the next-holiday box out of the parsed page.
month = _first_text("#public-holidays .month")
day = _first_text("#public-holidays .day")
weekday = _first_text("#public-holidays .weekday")
description = _first_text("#next-holiday-description strong")

print(f"接下來的公眾假期：{description}, {month}{day}日{weekday}")

## A list of holidays in Macao

In [None]:
import requests
from bs4 import BeautifulSoup

# Download and parse the 2020 public-holiday page.
response = requests.get("https://www.gov.mo/zh-hant/public-holidays/year-2020/")
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Walk the rows of the first table; rows without any <td> cell are skipped.
for row in tables[0].select("tr"):
    cells = row.select("td")
    if not cells:
        continue
    # Column 1 holds the date text and column 3 the holiday name.
    date = cells[1].text
    name = cells[3].text
    print(f"{date}: {name}")
  

Only listing obligatory holidays

In [None]:
import requests
from bs4 import BeautifulSoup

# Download and parse the 2020 public-holiday page.
response = requests.get("https://www.gov.mo/zh-hant/public-holidays/year-2020/")
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Walk the rows of the first table; rows without any <td> cell are skipped.
for row in tables[0].select("tr"):
    cells = row.select("td")
    if not cells:
        continue
    # A "*" in the first column marks the row as obligatory.
    is_obligatory = (cells[0].text == "*")
    if not is_obligatory:
        continue
    date = cells[1].text
    name = cells[3].text
    print(f"{date}: {name}")
  

## Is today a government holiday?

In [None]:
import requests
from bs4 import BeautifulSoup
import datetime

# Get today's year, month and day
today = datetime.date.today()
year = today.year
month = today.month
day = today.day
# date.weekday() numbers the days Monday=0 ... Saturday=5, Sunday=6.
today_weekday = today.weekday()
# Key in the same "<month>月<day>日" format used as dictionary keys below.
today_date = f"{month}月{day}日"


# Fetch gov.mo
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Map each holiday's date text to its name, across every table on the page.
holidays = {}

for table in tables:
    for row in table.select("tr"):
        cells = row.select("td")
        # Skip rows without <td> cells, and rows too short to hold the
        # date (column 1) and name (column 3).
        if len(cells) < 4:
            continue
        date = cells[1].text
        name = cells[3].text
        holidays[date] = name


# Query holidays
print(today_date)
if today_date in holidays:
    holiday = holidays[today_date]
    print(f"今天是公眾假期：{holiday}")
elif today_weekday == 6:
    # 6 is Sunday in date.weekday() numbering.
    print("今天是星期日，但不是公眾假期。")
elif today_weekday == 5:
    # 5 is Saturday in date.weekday() numbering.
    print("今天是星期六，但不是公眾假期。")
else:
    print("今天不是公眾假期。")

In [None]:
def is_macao_holiday(query_date):
    """Print whether `query_date` is a public holiday listed on gov.mo.

    Downloads the public-holiday page for ``query_date.year``, collects the
    date and name columns of every row in every ``.table`` element, and
    prints one line describing the queried day.

    Args:
        query_date: Object exposing ``year``, ``month``, ``day`` and
            ``weekday()``, such as ``datetime.date`` or ``datetime.datetime``.

    Returns:
        None. The result is printed rather than returned.
    """
    # Fetch gov.mo
    url = f"https://www.gov.mo/zh-hant/public-holidays/year-{query_date.year}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # Map each holiday's date text to its name.
    holidays = {}
    for table in soup.select(".table"):
        for row in table.select("tr"):
            cells = row.select("td")
            # Skip rows without <td> cells, and rows too short to hold the
            # date (column 1) and name (column 3).
            if len(cells) < 4:
                continue
            holidays[cells[1].text] = cells[3].text

    # Query holidays
    date_key = f"{query_date.month}月{query_date.day}日"
    # weekday() numbers the days Monday=0 ... Saturday=5, Sunday=6.
    day_of_week = query_date.weekday()

    if date_key in holidays:
        holiday = holidays[date_key]
        print(f"{date_key}是公眾假期：{holiday}")
    elif day_of_week == 6:
        print(f"{date_key}是星期日，但不是公眾假期。")
    elif day_of_week == 5:
        print(f"{date_key}是星期六，但不是公眾假期。")
    else:
        print(f"{date_key}不是公眾假期。")

In [None]:
# Run the holiday check for the current date.
query_date = datetime.date.today()
is_macao_holiday(query_date)

### Picking a date other than today

In [None]:
# Import the parser submodule explicitly: a bare `import dateutil` does not
# guarantee that `dateutil.parser` has been loaded.
import dateutil.parser

# Parse an ISO-style date string and run the holiday check on it.
date = dateutil.parser.parse("2020-01-01")
is_macao_holiday(date)

In [None]:
# Import the parser submodule explicitly: a bare `import dateutil` does not
# guarantee that `dateutil.parser` has been loaded.
import dateutil.parser

# Parse an ISO-style date string and run the holiday check on it.
date = dateutil.parser.parse("2020-10-26")
is_macao_holiday(date)

Furthermore, we can store the result in a dictionary for further querying.

In [1]:
import requests
from bs4 import BeautifulSoup
import datetime

# Get today's year, month and day
today = datetime.date.today()
year = today.year
month = today.month
day = today.day
# date.weekday() numbers the days Monday=0 ... Saturday=5, Sunday=6.
today_weekday = today.weekday()
# Key in the same "<month>月<day>日" format used as dictionary keys below.
today_date = f"{month}月{day}日"


# Fetch gov.mo
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Map each holiday's date text to a record holding all four table columns.
holidays = {}

for table in tables:
    for row in table.select("tr"):
        cells = row.select("td")
        # Skip rows without <td> cells, and rows too short to hold all
        # four columns read below.
        if len(cells) < 4:
            continue
        # A "*" in the first column marks the holiday as obligatory.
        is_obligatory = (cells[0].text == "*")
        date = cells[1].text
        weekday = cells[2].text
        name = cells[3].text
        holidays[date] = {
            'date': date,
            'weekday': weekday,
            'name': name,
            'is_obligatory': is_obligatory,
        }

# Query holidays
print(today_date)
if today_date in holidays:
    holiday = holidays[today_date]
    if holiday['is_obligatory']:
        print(f"今天是強制公眾假期：{holiday['name']}")
    else:
        print(f"今天是公眾假期：{holiday['name']}")
elif today_weekday == 6:
    # 6 is Sunday in date.weekday() numbering.
    print("今天是星期日，但不是公眾假期。")
elif today_weekday == 5:
    # 5 is Saturday in date.weekday() numbering.
    print("今天是星期六，但不是公眾假期。")
else:
    print("今天不是公眾假期。")

6月18日
今天不是公眾假期。


In [2]:
# Display the `holidays` dictionary (date text -> holiday record).
holidays

{'1月1日': {'date': '1月1日',
  'weekday': '星期三',
  'name': '元旦',
  'is_obligatory': True},
 '1月25日': {'date': '1月25日',
  'weekday': '星期六',
  'name': '農曆正月初一',
  'is_obligatory': True},
 '1月26日': {'date': '1月26日',
  'weekday': '星期日',
  'name': '農曆正月初二',
  'is_obligatory': True},
 '1月27日': {'date': '1月27日',
  'weekday': '星期一',
  'name': '農曆正月初三',
  'is_obligatory': True},
 '4月4日': {'date': '4月4日',
  'weekday': '星期六',
  'name': '清明節',
  'is_obligatory': True},
 '4月10日': {'date': '4月10日',
  'weekday': '星期五',
  'name': '耶穌受難日',
  'is_obligatory': False},
 '4月11日': {'date': '4月11日',
  'weekday': '星期六',
  'name': '復活節前日',
  'is_obligatory': False},
 '4月30日': {'date': '4月30日',
  'weekday': '星期四',
  'name': '佛誕節',
  'is_obligatory': False},
 '5月1日': {'date': '5月1日',
  'weekday': '星期五',
  'name': '勞動節',
  'is_obligatory': True},
 '6月25日': {'date': '6月25日',
  'weekday': '星期四',
  'name': '端午節',
  'is_obligatory': False},
 '10月1日': {'date': '10月1日',
  'weekday': '星期四',
  'name': '中華人民共和國國慶日',
  'is_ob