## 安裝套件

In [None]:
pip install requests beautifulsoup4
pip install PyPDF2
pip install readability-lxml


## 測試網站能否爬取以及資料型態

In [1]:
import requests
from bs4 import BeautifulSoup

url = "https://www.hch.gov.tw/?aid=626&pid=8&page_name=sub_list&type=0&pageNo=1"
headers = {"User-Agent": "Mozilla/5.0"}

# A timeout keeps the cell from hanging indefinitely if the server never
# answers; raise_for_status turns an HTTP error status into a visible failure
# instead of silently printing an error page.
res = requests.get(url, headers=headers, timeout=15)
res.raise_for_status()

# Only a preview is needed to see whether the page is reachable and what the
# markup looks like; printing the whole document floods the notebook output.
print(res.text[:1000])

<!DOCTYPE html>
<html lang="zh-TW">
<head>
	<title>衛教資訊| 腎臟科| 醫療單位-內科部次專科| 國立臺灣大學醫學院附設醫院新竹臺大分院</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="Description" content="國立臺灣大學醫學院附設醫院新竹臺大分院">
<meta name="Keywords" content="臺大醫院,新竹,臺大新竹分院,Hsin-Chu Branch">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
<meta property="og:image" content="/images/favicon_hch.png">
<link rel="shortcut icon" href="/images/favicon_hch.png" type="image/x-icon">

<base href="/images/templates/mold_01/" >
<link href="/assets/plugins/jquery-ui/jquery-ui.css" rel="stylesheet">
<link href="/assets/plugins/bootstrap/bootstrap.css" rel="stylesheet">
<link href="/assets/plugins/cookieconsent/cookieconsent.css" rel="stylesheet">
<link href="/assets/plugins/fontawesome/all.css" rel="stylesheet">
<link href="/assets/plugins/owl.carousel/owl.theme.default.css" re

## 爬蟲主程式

In [3]:
import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

# Crawl settings: the site root (used as the prefix for relative article
# links), the listing page to scan, and the headers sent with every request.
headers = dict([("User-Agent", "Mozilla/5.0")])
base_url = "https://www.bh.ntuh.gov.tw/"
list_url = "https://www.bh.ntuh.gov.tw/?aid=509&pid=6&page_name=sub_list&pageNo=1"

# Make sure the folder that receives the per-article CSV files exists;
# exist_ok=True makes this a no-op when it is already there.
os.makedirs("articles_csv", exist_ok=True)

def safe_request(url, headers, max_retries=3, timeout=15, retry_delay=3):
    """Fetch ``url`` with ``requests.get``, retrying when the request fails.

    Args:
        url: Address to fetch.
        headers: HTTP headers passed through to ``requests.get``.
        max_retries: Total number of attempts made before giving up.
        timeout: Per-attempt timeout (seconds) passed to ``requests.get``.
        retry_delay: Seconds to wait between a failed attempt and the next one.

    Returns:
        The response of the first attempt that completes without
        ``raise_for_status`` raising, or ``None`` when every attempt fails
        (or when ``max_retries`` is not positive, in which case no request
        is made).
    """
    for attempt in range(1, max_retries + 1):
        try:
            res = requests.get(url, headers=headers, timeout=timeout)
            # Treat HTTP error statuses the same way as transport failures.
            res.raise_for_status()
            return res
        except RequestException as e:
            if attempt == max_retries:
                # Out of attempts: say so and return right away instead of
                # announcing a retry that never happens and sleeping for it.
                print(f"請求失敗（{e}），已嘗試 {max_retries} 次，放棄。")
                break
            print(f"請求失敗（{e}），第 {attempt} 次重試...")
            time.sleep(retry_delay)
    return None

# Fetch the listing page; without it there is nothing to crawl.
res = safe_request(list_url, headers)
if res is None:
    # Raise instead of calling exit(): inside a Jupyter kernel exit() only
    # requests a kernel shutdown and does not stop the cell at this line, so
    # the code below would still run with res being None.
    raise SystemExit("無法取得列表頁，程式結束。")

soup = BeautifulSoup(res.text, "html.parser")

# Every table row holding a titled link is treated as one article entry.
# The index counts all <tr> rows (including rows without a link), so the
# number in each filename reflects the row position on the page.
for index, row in enumerate(soup.select("tr"), start=1):
    # Require an href as well, so a titled anchor without a target is skipped
    # instead of raising KeyError.
    a_tag = row.select_one("a[title][href]")
    if a_tag is None:
        continue

    title = a_tag.get_text(strip=True)
    # urljoin resolves root-relative and other relative hrefs against the site
    # root without producing a double slash, and leaves absolute URLs as-is.
    full_link = urljoin(base_url, a_tag["href"])

    # Fetch the article page (with retries).
    article_res = safe_request(full_link, headers)
    if article_res is None:
        print(f"無法取得文章：{full_link}")
        continue

    # The article text is taken as the text of every <p> element on the page,
    # one paragraph per line; a placeholder is stored when there are none.
    article_soup = BeautifulSoup(article_res.text, "html.parser")
    p_tags = article_soup.select("p")
    content = "\n".join(p.get_text(strip=True) for p in p_tags) if p_tags else "找不到內容"

    # Build a filesystem-safe filename: drop characters that are illegal in
    # filenames (plus control characters such as embedded newlines) and cap
    # the length so long titles cannot exceed filesystem name limits.
    safe_title = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", title)[:100]
    filename = f"articles_csv/{index}_{safe_title}.csv"

    # Store the article as its own single-row CSV; utf-8-sig adds a BOM to
    # the output file.
    df = pd.DataFrame([{
        "title": title,
        "url": full_link,
        "content": content
    }])
    df.to_csv(filename, encoding="utf-8-sig", index=False)

    print(f"已儲存：{filename}")
    # Pause between articles to avoid hammering the server.
    time.sleep(5)


請求失敗（("Connection broken: InvalidChunkLength(got length b'\\xbf\\xa3>\\x9c/_\\xbb\\x18\\x8d4\\x98\\x1b5hY\\xf9\\xee\\x97\\xca\\xd8\\x15(\\x02\\xf9~{y\\xa99\\x80\\xebs\\xf9\\x8d${3\\xd9d\\x94\\xe9\\xef\\xf5\\xf9\\xfb\\xfc\\x81\\x08\\xeb\\x8e\\xb0\\xf4,\\xff\\xe5\\x17\\xe5\\x9f\\x1e\\xd6\\xbc4\\xe0\\x96\\xc7~.\\xfdr\\x8dd\\x81\\xc6Te\\xb2p\\xde\\x8ar0fTs\\xb9i|\\xa2\\xac', 0 bytes read)", InvalidChunkLength(got length b'\xbf\xa3>\x9c/_\xbb\x18\x8d4\x98\x1b5hY\xf9\xee\x97\xca\xd8\x15(\x02\xf9~{y\xa99\x80\xebs\xf9\x8d${3\xd9d\x94\xe9\xef\xf5\xf9\xfb\xfc\x81\x08\xeb\x8e\xb0\xf4,\xff\xe5\x17\xe5\x9f\x1e\xd6\xbc4\xe0\x96\xc7~.\xfdr\x8dd\x81\xc6Te\xb2p\xde\x8ar0fTs\xb9i|\xa2\xac', 0 bytes read))），第 1 次重試...
請求失敗（("Connection broken: InvalidChunkLength(got length b'\\xbf\\xa3>\\x9c/_\\xbb\\x18\\x8d4\\x98\\x1b5hY\\xf9\\xee\\x97\\xca\\xd8\\x15(\\x02\\xf9~{y\\xa99\\x80\\xebs\\xf9\\x8d${3\\xd9d\\x94\\xe9\\xef\\xf5\\xf9\\xfb\\xfc\\x81\\x08\\xeb\\x8e\\xb0\\xf4,\\xff\\xe5\\x17\\xe5\\x9f\\x1e\\xd6\\xbc