## 安裝套件

In [None]:
# Install third-party packages for this notebook.
# requests and beautifulsoup4 are imported by the cells below.
# NOTE(review): pandas is imported by the crawler cell but is not installed
# here -- presumably it is expected to be preinstalled; confirm.
pip install requests beautifulsoup4
# NOTE(review): PyPDF2 is not used by any cell visible here -- presumably
# needed by a later part of the notebook; confirm.
pip install PyPDF2
# NOTE(review): readability-lxml is likewise not used by any visible cell.
pip install readability-lxml


## 測試網站能否爬取以及資料型態

In [None]:
import requests
from bs4 import BeautifulSoup

# Smoke test: fetch one listing page and print the raw HTML to check that
# the site can be scraped and to inspect what the markup looks like.
url = "https://www.hch.gov.tw/?aid=626&pid=8&page_name=sub_list&type=0&pageNo=1"
headers = {"User-Agent": "Mozilla/5.0"}

# A timeout keeps the cell from hanging indefinitely if the server never
# answers; 15 seconds matches the default used by the crawler's
# safe_request helper.
res = requests.get(url, headers=headers, timeout=15)
print(res.text)

## 爬蟲主程式

In [None]:
import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Site root and the first page of the article listing to crawl.
base_url = "https://www.bh.ntuh.gov.tw/"
list_url = (
    "https://www.bh.ntuh.gov.tw/"
    "?aid=509&pid=6&page_name=sub_list&pageNo=1"
)

# Request headers passed along with the HTTP requests made below.
headers = {
    "User-Agent": "Mozilla/5.0",
}

# Make sure the output folder for the per-article CSV files exists.
os.makedirs(name="articles_csv", exist_ok=True)

def safe_request(url, headers, max_retries=3, timeout=15):
    """GET ``url`` with ``requests``, trying again when a request fails.

    Args:
        url: Address to fetch.
        headers: HTTP headers passed to ``requests.get``.
        max_retries: Total number of attempts made before giving up.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The response of the first attempt that completes without an error
        status, or ``None`` when every attempt raised ``RequestException``.
    """
    for attempt in range(1, max_retries + 1):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx answers into exceptions so they are retried too.
            res.raise_for_status()
            return res
        except RequestException as e:
            if attempt >= max_retries:
                # Out of attempts: say so and skip the wait, since no
                # further request will follow it.
                print(f"請求失敗（{e}），已嘗試 {max_retries} 次，放棄。")
                break
            print(f"請求失敗（{e}），第 {attempt} 次重試...")
            time.sleep(3)
    return None

# Fetch the listing page; without it there is nothing to crawl.
res = safe_request(list_url, headers)
if res is None:
    print("無法取得列表頁，程式結束。")
    # Raise SystemExit directly instead of calling exit(): exit() is a helper
    # meant for the interactive shell and is not guaranteed to be available
    # or to stop execution in every environment. A non-zero code marks the
    # run as failed.
    raise SystemExit(1)

soup = BeautifulSoup(res.text, "html.parser")

# Every table row holding a titled link is treated as one article. The
# index counts all <tr> elements, including rows without such a link, so
# the numbers used in the file names may have gaps.
for index, row in enumerate(soup.select("tr"), start=1):
    a_tag = row.select_one("a[title]")
    if a_tag is None:
        continue

    title = a_tag.get_text(strip=True)
    relative_link = a_tag.get("href")
    if not relative_link:
        # A titled anchor without a target cannot be followed.
        continue

    # Resolve the href against the listing URL. This handles absolute
    # URLs, root-relative paths ("/x", without producing a double slash)
    # and query-only or document-relative links ("?aid=...").
    full_link = urljoin(list_url, relative_link)

    # Download the article page (with retries); skip it on failure.
    article_res = safe_request(full_link, headers)
    if article_res is None:
        print(f"無法取得文章：{full_link}")
        continue

    # The article text is taken to be the text of all <p> elements.
    article_soup = BeautifulSoup(article_res.text, "html.parser")
    p_tags = article_soup.select("p")
    content = "\n".join(p.get_text(strip=True) for p in p_tags) if p_tags else "找不到內容"

    # Build the file name: drop characters that are illegal in file names.
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)
    filename = f"articles_csv/{index}_{safe_title}.csv"

    # Write the article as a one-row CSV; utf-8-sig adds a BOM to the file.
    df = pd.DataFrame([{
        "title": title,
        "url": full_link,
        "content": content
    }])
    df.to_csv(filename, encoding="utf-8-sig", index=False)

    print(f"已儲存：{filename}")
    # Pause between articles to avoid hammering the server.
    time.sleep(5)
