# 空氣污染監測網 網路爬蟲實作練習


* 能夠利用 selenium + BeautifulSoup 撰寫爬蟲，並存放到合適的資料結構


## 作業目標

根據範例，完成以下問題：

* ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料
* ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料





### ① 取出 台北市士林區 2018/01 – 2018/08 的 SO2 資料

In [9]:
# 打開瀏覽器
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from bs4 import BeautifulSoup

# Start Chrome and open the monthly-average query page.
# The executable_path keyword is deprecated in Selenium 4 (and removed in later
# releases); 'chromedriver' was already its default value, so omitting it keeps
# the same behaviour: the driver binary is looked up on PATH.
browser = webdriver.Chrome()
browser.get("http://taqm.epa.gov.tw/taqm/tw/MonthlyAverage.aspx")

In [10]:
# Simulate the user's actions: pick a station and a year, then submit the query.
# find_element(By.ID, ...) is used because the find_element_by_id helpers were
# removed in Selenium 4; this form works on both Selenium 3 and 4.
from selenium.webdriver.common.by import By

# Station drop-down; presumably value '11' is the Shilin station named in the
# section heading - verify against the page's option list.
selectSite = Select(browser.find_element(By.ID, "ctl05_ddlSite"))
selectSite.select_by_value('11')

# NOTE(review): the section headings ask for 2018, but 2019 is queried here and
# the recorded outputs below are for 2019 - confirm which year is intended.
selectYear = Select(browser.find_element(By.ID, "ctl05_ddlYear"))
selectYear.select_by_value('2019')

# Submit the form so the result table is rendered.
browser.find_element(By.ID, 'ctl05_btnQuery').click()

In [67]:
# Grab the rendered page and hand it to BeautifulSoup for parsing.
html_source = browser.page_source
soup = BeautifulSoup(html_source, 'html.parser')
measure_item = soup.find('table', class_='TABLE_G').find_all('tr')

# data maps the item name to a {date: value} dict; dates and values are kept
# exactly as the text found in the table cells.
data = {}
# Row 0 is skipped (presumably the header row) and rows after the 8th are
# ignored, so only rows 1-8 contribute.
for row_no, row in enumerate(measure_item[1:9], start=1):
    texts = [cell.text for cell in row.find_all('td')]
    if row_no == 1:
        # Row 1 is read as: item name, (ignored cell), date, value.
        if len(texts) > 0:
            item = texts[0]
            data[item] = {}
        if len(texts) > 2:
            date = texts[2]
        if len(texts) > 3:
            data[item][date] = texts[3]
    else:
        # Rows 2-8 are read as: date, value - stored under the item from row 1.
        if len(texts) > 0:
            date = texts[0]
        if len(texts) > 1:
            data[item][date] = texts[1]
data

{'SO2': {'2019/01': '1.80',
  '2019/02': '1.60',
  '2019/03': '1.90',
  '2019/04': '2.20',
  '2019/05': '1.70',
  '2019/06': '1.90',
  '2019/07': '2.10',
  '2019/08': '2'}}

In [33]:
import pandas as pd

# Build a frame with one column per item and one row per date.
# The scraped cell values are strings, so convert every column to a numeric
# dtype; cells that cannot be parsed as numbers become NaN instead of raising.
df = pd.DataFrame(data).apply(pd.to_numeric, errors='coerce')
df

Unnamed: 0,SO2
2019/01,1.8
2019/02,1.6
2019/03,1.9
2019/04,2.2
2019/05,1.7
2019/06,1.9
2019/07,2.1
2019/08,2.0


### ② 取出 台北市士林區 2018/01 – 2018/08 的 SO2、CO 資料

In [73]:
# Grab the rendered page and hand it to BeautifulSoup for parsing.
html_source = browser.page_source
soup = BeautifulSoup(html_source, 'html.parser')
measure_item = soup.find('table', class_='TABLE_G').find_all('tr')

# data maps each item name to a {date: value} dict; dates and values are kept
# exactly as the text found in the table cells.
data = {}
# flag records that the scan stopped because the 'THC' item was reached.
flag = False
for row_no, row in enumerate(measure_item):
    if row_no == 0:
        # Row 0 is skipped (presumably the header row).
        continue
    # Position of this row inside its group of 11 rows.
    # NOTE(review): the modulus 11 presumably equals the number of rows the
    # page lists per item - confirm against the live table.
    num = row_no % 11
    if num == 0 or num > 8:
        # Only positions 1-8 of every group are collected.
        continue
    texts = [cell.text for cell in row.find_all('td')]
    if num == 1:
        # First row of a group is read as: item name, (ignored cell), date, value.
        if len(texts) > 0:
            item = texts[0]
            if item == 'THC':
                # Stop collecting once the THC item is reached.
                flag = True
                break
            data[item] = {}
        if len(texts) > 2:
            date = texts[2]
        if len(texts) > 3:
            data[item][date] = texts[3]
    else:
        # Positions 2-8 are read as: date, value - stored under the current item.
        if len(texts) > 0:
            date = texts[0]
        if len(texts) > 1:
            data[item][date] = texts[1]
data

{'SO2': {'2019/01': '1.80',
  '2019/02': '1.60',
  '2019/03': '1.90',
  '2019/04': '2.20',
  '2019/05': '1.70',
  '2019/06': '1.90',
  '2019/07': '2.10',
  '2019/08': '2'},
 'CO': {'2019/01': '0.40',
  '2019/02': '0.43',
  '2019/03': '0.40',
  '2019/04': '0.41',
  '2019/05': '0.32',
  '2019/06': '0.35',
  '2019/07': '0.28',
  '2019/08': '0.21'},
 'O3': {'2019/01': '35',
  '2019/02': '33.40',
  '2019/03': '45.50',
  '2019/04': '41.70',
  '2019/05': '42.40',
  '2019/06': '27.30',
  '2019/07': '23.80',
  '2019/08': '24.70'},
 'PM10': {'2019/01': '28',
  '2019/02': '29',
  '2019/03': '37',
  '2019/04': '36',
  '2019/05': '32',
  '2019/06': '25',
  '2019/07': '22',
  '2019/08': '17'},
 'NOx': {'2019/01': '14.95',
  '2019/02': '15.39',
  '2019/03': '15.43',
  '2019/04': '16.25',
  '2019/05': '13.75',
  '2019/06': '17.71',
  '2019/07': '13.93',
  '2019/08': '10.05'},
 'NO': {'2019/01': '3.54',
  '2019/02': '2.76',
  '2019/03': '2.74',
  '2019/04': '2.82',
  '2019/05': '2.45',
  '2019/06': '3.

In [74]:
# Build a frame with one column per item and one row per date.
# The scraped cell values are strings, so convert every column to a numeric
# dtype; cells that cannot be parsed as numbers become NaN instead of raising.
df = pd.DataFrame(data).apply(pd.to_numeric, errors='coerce')
df

Unnamed: 0,SO2,CO,O3,PM10,NOx,NO,NO2
2019/01,1.8,0.4,35.0,28,14.95,3.54,11.42
2019/02,1.6,0.43,33.4,29,15.39,2.76,12.63
2019/03,1.9,0.4,45.5,37,15.43,2.74,12.69
2019/04,2.2,0.41,41.7,36,16.25,2.82,13.43
2019/05,1.7,0.32,42.4,32,13.75,2.45,11.3
2019/06,1.9,0.35,27.3,25,17.71,3.91,13.81
2019/07,2.1,0.28,23.8,22,13.93,2.87,11.06
2019/08,2.0,0.21,24.7,17,10.05,2.33,7.73


In [76]:
# Show only the SO2 and CO columns, keeping every row.
df.loc[:, ['SO2', 'CO']]

Unnamed: 0,SO2,CO
2019/01,1.8,0.4
2019/02,1.6,0.43
2019/03,1.9,0.4
2019/04,2.2,0.41
2019/05,1.7,0.32
2019/06,1.9,0.35
2019/07,2.1,0.28
2019/08,2.0,0.21
