#### 確認 libraries

In [None]:
from bs4 import BeautifulSoup
import requests
import os
import re
import json

#### 爬蟲範例#1 (政府電子採購網)

##### 1. 分析網址
- 採購網:
  - https://web.pcc.gov.tw/prkms/tender/common/basic/readTenderBasic?pageSize=50&firstSearch=true&searchType=basic&isBinding=N&isLogIn=N&level_1=on&orgName=%E4%B8%AD%E5%A4%AE%E7%A0%94%E7%A9%B6%E9%99%A2&orgId=2.1&tenderName=&tenderId=&tenderType=TENDER_DECLARATION&tenderWay=TENDER_WAY_ALL_DECLARATION&dateType=isDate&tenderStartDate=2024%2F03%2F21&tenderEndDate=2024%2F03%2F27&radProctrgCate=&policyAdvocacy=
- url pretty:
  - https://urlprettyprint.com/
- Example(等標期內標案):
  - ![](img/2.url_ex.png)

In [None]:
# Search window for the tender query; "/" is percent-encoded as %2F
start_date = "2024%2F03%2F01"  # 2024/03/01
end_date = "2024%2F03%2F27"    # 2024/03/27

# Assemble the search URL from the endpoint and its query parameters
endpoint = "https://web.pcc.gov.tw/prkms/tender/common/basic/readTenderBasic"
query_parts = [
    "pageSize=50",
    "firstSearch=true",
    "searchType=basic",
    "isBinding=N",
    "isLogIn=N",
    "level_1=on",
    "orgName=%E4%B8%AD%E5%A4%AE%E7%A0%94%E7%A9%B6%E9%99%A2",  # percent-encoded organisation name
    "orgId=2.1",
    "tenderName=",
    "tenderId=",
    "tenderType=TENDER_DECLARATION",
    "tenderWay=TENDER_WAY_ALL_DECLARATION",
    "dateType=isDate",
    f"tenderStartDate={start_date}",
    f"tenderEndDate={end_date}",
    "radProctrgCate=",
    "policyAdvocacy=",
]
url = endpoint + "?" + "&".join(query_parts)
print(url)

##### 2. 取得 Response
- 使用 requests 取得 response
- 剖析內容
- 將結果存到 .html 檔，用以分析

In [None]:
# Fetch the search-result page. The timeout keeps the cell from hanging
# indefinitely when the server does not answer.
response2 = requests.get(url, timeout=30)
# Stop on HTTP errors instead of parsing and saving an error page
response2.raise_for_status()

# Parse the returned HTML
soup2 = BeautifulSoup(response2.text, "html.parser")

# Save the prettified page to "output/pcc.html" so it can be inspected offline
os.makedirs("output", exist_ok=True)
with open("output/pcc.html", "w", encoding="utf-8") as file:
    file.write(soup2.prettify())


##### 3. 取得需要的html原始碼
- 分析`html`原始碼
- 我們所需要的資料在**第二個**`tbody`中

In [None]:
# Pick the second <tbody> of the parsed page
all_tbodies = soup2.find_all("tbody")
tbody = all_tbodies[1]

# Render the fragment once, then show it
tbody_html = tbody.prettify()
print(tbody_html)

# Keep a copy of the fragment in "output/pcc_tbody.html"
with open("output/pcc_tbody.html", "w", encoding="utf-8") as file:
    file.write(tbody_html)

##### 5. 分析剩下的html原始碼

- 需要的欄位
    ![](img/2.data.png)
- 從 `pcc_tbody`中，我們發現需要的欄位都在`tbody`之下的`tr`裡
- 每個`tr`都是一個row，所以使用**迴圈**取得每個`tr`的`td`

###### 先以第一筆資料為範例分析每個欄位

In [None]:
# Use the first <tr> of the table body as the sample row
tr = tbody.find("tr")

# Show its pretty-printed HTML
sample_row_html = tr.prettify()
print(sample_row_html)

- 取得「項次」

In [None]:
# 1. Serial number: stripped text of the row's first <td>
first_cell = tr.find("td")
number = first_cell.get_text().strip()
print(number)


- 取得「標案案號」和「標案名稱」
  - 因為`html`原始碼中，內容被分開了，所以要自己分析和拼湊

In [None]:
# 2. Build the title from the third <td> of the row
title_cell = tr.find_all("td")[2]

# Tender number: the stripped text of the cell
title_num = title_cell.get_text().strip()

# Tender name: taken from the text of the cell's <script> element
script_text = title_cell.find("script").get_text().strip()

# The name is the first double-quoted string in that script text
pattern = r'"(.*?)"'
first_quoted = re.search(pattern, script_text)
title_name = first_quoted.group(1) if first_quoted else ""

# Join number and name with a single space
title = f"{title_num} {title_name}"

print(title)


- 取得「招標方式」、「公告日期」、「截止日期」和「預算金額」

In [None]:
# The remaining text fields are read from fixed <td> positions of the row
cells = tr.find_all("td")

# 3. Tendering method (cell 4)
method = cells[4].get_text().strip()
print(method)

# 4. Announcement date (cell 6)
# NOTE: this rebinds start_date, which earlier held the URL query value
start_date = cells[6].get_text().strip()
print(start_date)

# 5. Closing date (cell 7); likewise rebinds end_date
end_date = cells[7].get_text().strip()
print(end_date)

# 6. Budget amount (cell 8)
money = cells[8].get_text().strip()
print(money)

- 取得標案「鏈接」
  - 因為`html`原始碼只有「部分」鏈接，所以要自己拼湊鏈接上去

In [None]:
# 7. Build the tender detail URL (the page only provides a partial link,
#    so the key has to be attached to the detail endpoint manually)
# href of the <a> inside the tenth <td>
href = tr.find_all("td")[9].find("a")["href"]

# The key is everything after the first "=" in the href.
# Searching for a closing '"' does not work here: the attribute value holds
# no quote, so str.find() returns -1 and slicing with it silently drops the
# last character of the key.
# NOTE(review): assumes the key is the last thing in the href — confirm
# against the saved output/pcc_tbody.html.
case_url = href.partition("=")[2]

# Attach the key to the detail endpoint; any trailing "=" padding is already
# part of the key and must not be added a second time
real_case_url = f"https://web.pcc.gov.tw/tps/QueryTender/query/searchTenderDetail?pkPmsMain={case_url}"

print(real_case_url)

##### 6. 使用迴圈取得所有資料

In [None]:
# Collected result: one dict per tender row
data = []

# Compiled once outside the loop: captures the first double-quoted string
quoted_pattern = re.compile(r'"(.*?)"')
# Detail endpoint that each tender key is attached to
detail_url_prefix = "https://web.pcc.gov.tw/tps/QueryTender/query/searchTenderDetail?pkPmsMain="

# Walk every row of the table body
for tr in tbody.find_all("tr"):
    # Look the cells up once per row instead of once per field
    cells = tr.find_all("td")

    # Skip rows that do not carry all ten columns read below
    if len(cells) < 10:
        continue

    # 1. Serial number
    num = cells[0].text.strip()

    # 2. Title = tender number + tender name
    # The number is the cell text; the name is the first quoted string in
    # the cell's <script>, or "" when there is no script or no quoted string
    title_num = cells[2].text.strip()
    script = cells[2].find("script")
    name_match = quoted_pattern.search(script.text) if script else None
    title_name = name_match.group(1) if name_match else ""
    title = f"{title_num} {title_name}"

    # 3. Tendering method
    method = cells[4].text.strip()

    # 4. Announcement date
    start_date = cells[6].text.strip()

    # 5. Closing date
    end_date = cells[7].text.strip()

    # 6. Budget amount
    money = cells[8].text.strip()

    # 7. Detail URL (the page only provides a partial link).
    # The key is everything after the first "=" of the href. Slicing up to
    # find('"') is wrong here: with no quote in the href, find() returns -1
    # and the slice drops the last character of the key.
    # NOTE(review): assumes the key is the last thing in the href — confirm.
    href = cells[9].find("a")["href"]
    case_url = href.partition("=")[2]
    real_case_url = f"{detail_url_prefix}{case_url}"

    # Store the row
    data.append({
        "num": num,
        "title": title,
        "method": method,
        "start_date": start_date,
        "end_date": end_date,
        "money": money,
        "case_url": real_case_url,
    })

# Show every collected row
for d in data:
    print(d)


##### 7. 將結果存到 .json

In [None]:
# Serialize the rows as indented JSON, keeping non-ASCII characters readable
json_text = json.dumps(data, ensure_ascii=False, indent=2)

# Write the result to "output/pcc_data.json"
with open("output/pcc_data.json", "w", encoding="utf-8") as file:
    file.write(json_text)