In [7]:
# 多平台保險討論爬蟲系統（PTT, Mobile01, my83, finfo）
# ---- 安裝必要套件 ----
# pip install requests beautifulsoup4 tqdm

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from time import sleep
from tqdm import tqdm
import csv
import random

# ---- 共用設定 ----
# Browser-like User-Agent header passed to every requests.get call in this cell.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
}

# Search phrases; each one is submitted to every site crawler.
KEYWORDS = [
    "國泰 保險",
    "國泰人壽",
    "保單",
    "保險理賠",
    "保險詐騙",
    "業務推薦",
]

# Upper bound on the number of posts fetched per keyword.
MAX_PER_KEYWORD = 300

# ---- 儲存資料工具 ----
def save_to_csv(filename, rows):
    """Write ``rows`` to ``filename`` as CSV beneath a fixed four-column header.

    The target file is opened in write mode (existing content is replaced)
    and encoded as ``utf-8-sig``, i.e. UTF-8 with a leading byte-order mark.

    Args:
        filename: Destination path of the CSV file.
        rows: Iterable of row sequences, written in order after the header.
    """
    header = ["來源", "標題", "連結", "內容"]
    with open(filename, mode="w", encoding="utf-8-sig", newline="") as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(header)
        for record in rows:
            csv_out.writerow(record)

# ---- PTT 爬蟲 ----
def crawl_ptt(keyword):
    """Search PTT's Insurance board for ``keyword`` and download each result.

    Only the first page of search results is requested; the links found there
    are further capped at ``MAX_PER_KEYWORD``.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``["PTT", title, url, content]`` rows, one for each post
        that downloaded successfully and contained a ``#main-content``
        element. The list is empty when the search request itself fails.
    """
    base_url = "https://www.ptt.cc"
    search_url = f"https://www.ptt.cc/bbs/Insurance/search?q={quote(keyword)}"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []

    # A failing search should cost this keyword only, not the whole run.
    try:
        res = requests.get(search_url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"[PTT] search failed for {keyword!r}: {exc}")
        return rows

    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(".title a")

    for a in tqdm(links[:MAX_PER_KEYWORD], desc=f"PTT: {keyword}"):
        try:
            url = base_url + a['href']
            title = a.text.strip()
            post = requests.get(url, headers=HEADERS, timeout=timeout)
            # Reject HTTP error responses instead of parsing them as posts.
            post.raise_for_status()
            post_soup = BeautifulSoup(post.text, "html.parser")
            main_content = post_soup.select_one("#main-content")
            if main_content is None:
                tqdm.write(f"[PTT] no #main-content, skipped: {url}")
                continue
            rows.append(["PTT", title, url, main_content.text])
        except (requests.RequestException, KeyError) as exc:
            # Best effort per post. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt stop the crawl.
            tqdm.write(f"[PTT] skipped {a.get('href')}: {exc}")
        finally:
            # Pause after failures too, so errors never become a tight loop.
            sleep(random.uniform(0.5, 1.2))
    return rows

# ---- Mobile01 爬蟲 ----
def crawl_mobile01(keyword):
    """Search Mobile01 for ``keyword`` and download the full text of each hit.

    NOTE(review): the run recorded with this cell found 0 links on this site
    for every keyword, so the search URL or the CSS selector probably does not
    match what the server returns to this client — confirm against the live
    page.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``["Mobile01", title, url, content]`` rows where ``content``
        is the visible text of the whole page. Empty when the search request
        fails or yields no links.
    """
    # Local import: the notebook's import cell only brings in ``quote``.
    from urllib.parse import urljoin

    base_url = "https://www.mobile01.com"
    search_url = f"https://www.mobile01.com/googlesearch.php?query={quote(keyword)}"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []

    # A failing search should cost this keyword only, not the whole run.
    try:
        res = requests.get(search_url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"[Mobile01] search failed for {keyword!r}: {exc}")
        return rows

    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(".c-listTableTd__title a")

    for a in tqdm(links[:MAX_PER_KEYWORD], desc=f"Mobile01: {keyword}"):
        try:
            # urljoin copes with absolute hrefs and with relative hrefs that
            # lack a leading slash; plain concatenation breaks on both.
            url = urljoin(base_url + "/", a['href'])
            title = a.text.strip()
            post = requests.get(url, headers=HEADERS, timeout=timeout)
            # The whole page text is stored, so an error page must be
            # rejected here or it would end up in the corpus.
            post.raise_for_status()
            post_soup = BeautifulSoup(post.text, "html.parser")
            content = post_soup.get_text("\n", strip=True)
            rows.append(["Mobile01", title, url, content])
        except (requests.RequestException, KeyError) as exc:
            # Best effort per post. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt stop the crawl.
            tqdm.write(f"[Mobile01] skipped {a.get('href')}: {exc}")
        finally:
            # Pause after failures too, so errors never become a tight loop.
            sleep(random.uniform(0.8, 1.5))
    return rows

# ---- my83.com.tw 爬蟲 ----
def crawl_my83(keyword):
    """Search my83.com.tw questions for ``keyword`` and download each hit.

    NOTE(review): the run recorded with this cell found 0 links on this site
    for every keyword, so the search URL or the CSS selector probably does not
    match what the server returns to this client — confirm against the live
    page.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``["my83", title, url, content]`` rows where ``content`` is
        the visible text of the whole page. Empty when the search request
        fails or yields no links.
    """
    # Local import: the notebook's import cell only brings in ``quote``.
    from urllib.parse import urljoin

    site_root = "https://my83.com.tw/"
    search_url = f"https://my83.com.tw/question/search?q={quote(keyword)}"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []

    # A failing search should cost this keyword only, not the whole run.
    try:
        res = requests.get(search_url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"[my83] search failed for {keyword!r}: {exc}")
        return rows

    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(".question-item .title a")

    for a in tqdm(links[:MAX_PER_KEYWORD], desc=f"my83: {keyword}"):
        try:
            # Resolve relative hrefs against the site root; absolute hrefs
            # pass through urljoin unchanged.
            url = urljoin(site_root, a['href'])
            title = a.text.strip()
            post = requests.get(url, headers=HEADERS, timeout=timeout)
            # The whole page text is stored, so an error page must be
            # rejected here or it would end up in the corpus.
            post.raise_for_status()
            post_soup = BeautifulSoup(post.text, "html.parser")
            content = post_soup.get_text("\n", strip=True)
            rows.append(["my83", title, url, content])
        except (requests.RequestException, KeyError) as exc:
            # Best effort per post. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt stop the crawl.
            tqdm.write(f"[my83] skipped {a.get('href')}: {exc}")
        finally:
            # Pause after failures too, so errors never become a tight loop.
            sleep(random.uniform(0.8, 1.3))
    return rows

# ---- finfo.tw 爬蟲 ----
def crawl_finfo(keyword):
    """Search finfo.tw posts for ``keyword`` and download each hit.

    NOTE(review): the run recorded with this cell found 0 links on this site
    for every keyword, so the search URL or the CSS selector probably does not
    match what the server returns to this client — confirm against the live
    page.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``["finfo", title, url, content]`` rows where ``content`` is
        the visible text of the whole page. Empty when the search request
        fails or yields no links.
    """
    # Local import: the notebook's import cell only brings in ``quote``.
    from urllib.parse import urljoin

    site_root = "https://finfo.tw/"
    search_url = f"https://finfo.tw/posts?post_q%5Bkeyword%5D={quote(keyword)}&button="
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []

    # A failing search should cost this keyword only, not the whole run.
    try:
        res = requests.get(search_url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"[finfo] search failed for {keyword!r}: {exc}")
        return rows

    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(".card-title a")

    for a in tqdm(links[:MAX_PER_KEYWORD], desc=f"finfo: {keyword}"):
        try:
            # Absolute hrefs pass through unchanged; relative ones (with or
            # without a leading slash) are resolved against the site root.
            url = urljoin(site_root, a['href'])
            title = a.text.strip()
            post = requests.get(url, headers=HEADERS, timeout=timeout)
            # The whole page text is stored, so an error page must be
            # rejected here or it would end up in the corpus.
            post.raise_for_status()
            post_soup = BeautifulSoup(post.text, "html.parser")
            content = post_soup.get_text("\n", strip=True)
            rows.append(["finfo", title, url, content])
        except (requests.RequestException, KeyError) as exc:
            # Best effort per post. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt stop the crawl.
            tqdm.write(f"[finfo] skipped {a.get('href')}: {exc}")
        finally:
            # Pause after failures too, so errors never become a tight loop.
            sleep(random.uniform(0.6, 1.2))
    return rows

# ---- 主流程 ----
# Run every site crawler for each keyword (fixed site order) and pool the rows.
all_data = []
for kw in KEYWORDS:
    for crawl_site in (crawl_ptt, crawl_mobile01, crawl_my83, crawl_finfo):
        all_data.extend(crawl_site(kw))

# Persist the pooled rows, then report how many were collected.
save_to_csv("保險社群語料.csv", all_data)
print(f"共抓取 {len(all_data)} 筆資料！已儲存為 CSV。")

PTT: 國泰 保險: 100%|██████████| 1/1 [00:01<00:00,  1.50s/it]
Mobile01: 國泰 保險: 0it [00:00, ?it/s]
my83: 國泰 保險: 0it [00:00, ?it/s]
finfo: 國泰 保險: 0it [00:00, ?it/s]
PTT: 國泰人壽: 100%|██████████| 20/20 [00:29<00:00,  1.45s/it]
Mobile01: 國泰人壽: 0it [00:00, ?it/s]
my83: 國泰人壽: 0it [00:00, ?it/s]
finfo: 國泰人壽: 0it [00:00, ?it/s]
PTT: 保單: 100%|██████████| 20/20 [00:27<00:00,  1.39s/it]
Mobile01: 保單: 0it [00:00, ?it/s]
my83: 保單: 0it [00:00, ?it/s]
finfo: 保單: 0it [00:00, ?it/s]
PTT: 保險理賠: 100%|██████████| 20/20 [00:30<00:00,  1.51s/it]
Mobile01: 保險理賠: 0it [00:00, ?it/s]
my83: 保險理賠: 0it [00:00, ?it/s]
finfo: 保險理賠: 0it [00:00, ?it/s]
PTT: 保險詐騙: 100%|██████████| 1/1 [00:01<00:00,  1.77s/it]
Mobile01: 保險詐騙: 0it [00:00, ?it/s]
my83: 保險詐騙: 0it [00:00, ?it/s]
finfo: 保險詐騙: 0it [00:00, ?it/s]
PTT: 業務推薦: 100%|██████████| 4/4 [00:06<00:00,  1.52s/it]
Mobile01: 業務推薦: 0it [00:00, ?it/s]
my83: 業務推薦: 0it [00:00, ?it/s]
finfo: 業務推薦: 0it [00:00, ?it/s]

共抓取 66 筆資料！已儲存為 CSV。





In [6]:
pip install requests bs4 tqdm

Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# 多平台保險討論爬蟲系統（PTT, Mobile01, my83, finfo）＋留言擴充＋多版面支援
# ---- 安裝必要套件 ----
# pip install requests beautifulsoup4 tqdm

import requests
from bs4 import BeautifulSoup
from urllib.parse import quote
from time import sleep
from tqdm import tqdm
import csv
import random

# ---- 共用設定 ----
# Browser-like User-Agent header passed to every requests.get call in this cell.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
}

# Search phrases; each one is submitted to every site crawler.
KEYWORDS = [
    "保險",
    "國泰",
    "國泰人壽",
    "保單",
    "理賠",
    "詐騙",
    "推薦",
    "壽險",
    "產險",
]

# PTT board names searched by crawl_ptt.
PTT_BOARDS = [
    "Insurance",
    "WomenTalk",
    "Stock",
    "Lifeismoney",
    "Gossiping",
]

# Mobile01 forum ids used by crawl_mobile01. The original note labels them
# "finance, workplace, general life" — TODO confirm the ids against the site.
M01_FORUMS = ["566", "376", "315"]

# Tag values appended to the finfo.tw search URL by crawl_finfo.
FIN_TAGS = ["保險", "保單比較", "醫療險", "業務"]

# Upper bound on posts per keyword; the multi-source crawlers split it evenly
# across their boards / forums / tags.
MAX_PER_KEYWORD = 5000

# ---- 儲存資料工具 ----
def save_to_csv(filename, rows):
    """Write ``rows`` to ``filename`` as CSV beneath a fixed four-column header.

    The target file is opened in write mode (existing content is replaced)
    and encoded as ``utf-8-sig``, i.e. UTF-8 with a leading byte-order mark.

    Args:
        filename: Destination path of the CSV file.
        rows: Iterable of row sequences, written in order after the header.
    """
    column_names = ["來源", "標題", "連結", "內容"]
    with open(filename, mode="w", encoding="utf-8-sig", newline="") as sink:
        table = csv.writer(sink)
        table.writerow(column_names)
        for entry in rows:
            table.writerow(entry)

# ---- PTT 爬蟲 ----
def crawl_ptt(keyword):
    """Search every board in ``PTT_BOARDS`` for ``keyword``; collect posts and pushes.

    For each board only the first page of search results is requested, and at
    most ``MAX_PER_KEYWORD // len(PTT_BOARDS)`` of its links are followed.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of rows. Every downloaded post contributes one
        ``[f"PTT-{board}", title, url, content]`` row followed by one
        ``[f"PTT-{board}-回文", title, url, push_text]`` row per ``.push``
        element on its page. Boards whose search request fails contribute
        nothing.
    """
    base_url = "https://www.ptt.cc"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    # PTT serves an age-confirmation page for some boards (Gossiping is the
    # usual example) unless this cookie is present; without it those boards
    # yield no ".title a" links. NOTE(review): confirm against the live site.
    cookies = {"over18": "1"}
    rows = []
    for board in PTT_BOARDS:
        search_url = f"https://www.ptt.cc/bbs/{board}/search?q={quote(keyword)}"
        try:
            res = requests.get(search_url, headers=HEADERS, cookies=cookies, timeout=timeout)
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f"[PTT-{board}] search failed for {keyword!r}: {exc}")
            continue

        soup = BeautifulSoup(res.text, "html.parser")
        links = soup.select(".title a")
        for a in links[:MAX_PER_KEYWORD // len(PTT_BOARDS)]:
            # Errors are handled per post so that one bad article no longer
            # discards the remaining results of the board.
            try:
                url = base_url + a['href']
                title = a.text.strip()
                post = requests.get(url, headers=HEADERS, cookies=cookies, timeout=timeout)
                # Reject HTTP error responses instead of parsing them as posts.
                post.raise_for_status()
                post_soup = BeautifulSoup(post.text, "html.parser")
                main_content = post_soup.select_one("#main-content")
                if main_content is None:
                    print(f"[PTT-{board}] no #main-content, skipped: {url}")
                    continue
                rows.append([f"PTT-{board}", title, url, main_content.text])
                # Each push (comment line) becomes its own row.
                for p in post_soup.select(".push"):
                    rows.append([f"PTT-{board}-回文", title, url, p.text.strip()])
            except (requests.RequestException, KeyError) as exc:
                # Best effort per post. Unlike a bare ``except``, this still
                # lets KeyboardInterrupt stop the crawl.
                print(f"[PTT-{board}] skipped {a.get('href')}: {exc}")
            finally:
                # Pause after failures too, so errors never become a tight loop.
                sleep(random.uniform(0.5, 1.2))
    return rows

# ---- Mobile01 爬蟲 ----
def crawl_mobile01(keyword):
    """Query each Mobile01 forum in ``M01_FORUMS`` for ``keyword`` and download the hits.

    At most ``MAX_PER_KEYWORD // len(M01_FORUMS)`` links are followed per
    forum.

    NOTE(review): whether ``forumtopic.php`` honours the ``s=`` query
    parameter as a search term is not visible from this notebook — confirm
    against the live site.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``["Mobile01", title, url, content]`` rows where ``content``
        is the visible text of the whole page. Forums whose listing request
        fails contribute nothing.
    """
    # Local import: the notebook's import cell only brings in ``quote``.
    from urllib.parse import urljoin

    base_url = "https://www.mobile01.com"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []
    for fid in M01_FORUMS:
        search_url = f"https://www.mobile01.com/forumtopic.php?c={fid}&s={quote(keyword)}"
        try:
            res = requests.get(search_url, headers=HEADERS, timeout=timeout)
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f"[Mobile01-{fid}] search failed for {keyword!r}: {exc}")
            continue

        soup = BeautifulSoup(res.text, "html.parser")
        links = soup.select(".c-listTableTd__title a")
        for a in links[:MAX_PER_KEYWORD // len(M01_FORUMS)]:
            # Errors are handled per post so that one bad topic no longer
            # discards the remaining results of the forum.
            try:
                # urljoin copes with absolute hrefs and with relative hrefs
                # that lack a leading slash; plain concatenation breaks on both.
                url = urljoin(base_url + "/", a['href'])
                title = a.text.strip()
                post = requests.get(url, headers=HEADERS, timeout=timeout)
                # The whole page text is stored, so an error page must be
                # rejected here or it would end up in the corpus.
                post.raise_for_status()
                post_soup = BeautifulSoup(post.text, "html.parser")
                content = post_soup.get_text("\n", strip=True)
                rows.append(["Mobile01", title, url, content])
            except (requests.RequestException, KeyError) as exc:
                # Best effort per post. Unlike a bare ``except``, this still
                # lets KeyboardInterrupt stop the crawl.
                print(f"[Mobile01-{fid}] skipped {a.get('href')}: {exc}")
            finally:
                # Pause after failures too, so errors never become a tight loop.
                sleep(random.uniform(0.8, 1.5))
    return rows

# ---- my83.com.tw 爬蟲 ----
def crawl_my83(keyword):
    """Search my83.com.tw questions for ``keyword`` and download each hit.

    At most ``MAX_PER_KEYWORD`` links from the search page are followed.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``["my83", title, url, content]`` rows where ``content`` is
        the visible text of the whole page. Empty when the search request
        fails or yields no links.
    """
    # Local import: the notebook's import cell only brings in ``quote``.
    from urllib.parse import urljoin

    site_root = "https://my83.com.tw/"
    search_url = f"https://my83.com.tw/question/search?q={quote(keyword)}"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []

    # Report a failed search instead of swallowing it silently.
    try:
        res = requests.get(search_url, headers=HEADERS, timeout=timeout)
        res.raise_for_status()
    except requests.RequestException as exc:
        print(f"[my83] search failed for {keyword!r}: {exc}")
        return rows

    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(".question-item .title a")
    for a in links[:MAX_PER_KEYWORD]:
        # Errors are handled per post so that one bad page no longer discards
        # every remaining result.
        try:
            # Resolve relative hrefs against the site root; absolute hrefs
            # pass through urljoin unchanged.
            url = urljoin(site_root, a['href'])
            title = a.text.strip()
            post = requests.get(url, headers=HEADERS, timeout=timeout)
            # The whole page text is stored, so an error page must be
            # rejected here or it would end up in the corpus.
            post.raise_for_status()
            post_soup = BeautifulSoup(post.text, "html.parser")
            content = post_soup.get_text("\n", strip=True)
            rows.append(["my83", title, url, content])
        except (requests.RequestException, KeyError) as exc:
            # Best effort per post. Unlike a bare ``except``, this still lets
            # KeyboardInterrupt stop the crawl.
            print(f"[my83] skipped {a.get('href')}: {exc}")
        finally:
            # Pause after failures too, so errors never become a tight loop.
            sleep(random.uniform(0.8, 1.3))
    return rows

# ---- finfo.tw 爬蟲 ----
def crawl_finfo(keyword):
    """Search finfo.tw posts for ``keyword`` once per tag in ``FIN_TAGS``.

    At most ``MAX_PER_KEYWORD // len(FIN_TAGS)`` links are followed per tag.

    NOTE(review): whether the site honours the ``tag=`` query parameter is
    not visible from this notebook — confirm against the live site.

    Args:
        keyword: Search phrase. It is URL-quoted before being sent.

    Returns:
        A list of ``[f"finfo-{tag}", title, url, content]`` rows where
        ``content`` is the visible text of the whole page. Tags whose search
        request fails contribute nothing.
    """
    # Local import: the notebook's import cell only brings in ``quote``.
    from urllib.parse import urljoin

    site_root = "https://finfo.tw/"
    timeout = 15  # seconds; without a timeout requests.get may block indefinitely
    rows = []
    for tag in FIN_TAGS:
        search_url = f"https://finfo.tw/posts?post_q%5Bkeyword%5D={quote(keyword)}&tag={quote(tag)}"
        try:
            res = requests.get(search_url, headers=HEADERS, timeout=timeout)
            res.raise_for_status()
        except requests.RequestException as exc:
            print(f"[finfo-{tag}] search failed for {keyword!r}: {exc}")
            continue

        soup = BeautifulSoup(res.text, "html.parser")
        links = soup.select(".card-title a")
        for a in links[:MAX_PER_KEYWORD // len(FIN_TAGS)]:
            # Errors are handled per post so that one bad page no longer
            # discards the remaining results of the tag.
            try:
                # Absolute hrefs pass through unchanged; relative ones (with
                # or without a leading slash) are resolved against the root.
                url = urljoin(site_root, a['href'])
                title = a.text.strip()
                post = requests.get(url, headers=HEADERS, timeout=timeout)
                # The whole page text is stored, so an error page must be
                # rejected here or it would end up in the corpus.
                post.raise_for_status()
                post_soup = BeautifulSoup(post.text, "html.parser")
                content = post_soup.get_text("\n", strip=True)
                rows.append([f"finfo-{tag}", title, url, content])
            except (requests.RequestException, KeyError) as exc:
                # Best effort per post. Unlike a bare ``except``, this still
                # lets KeyboardInterrupt stop the crawl.
                print(f"[finfo-{tag}] skipped {a.get('href')}: {exc}")
            finally:
                # Pause after failures too, so errors never become a tight loop.
                sleep(random.uniform(0.6, 1.2))
    return rows

# ---- 主流程 ----
# Run every site crawler for each keyword (fixed site order) and pool the rows.
all_data = []
for kw in KEYWORDS:
    for crawl_site in (crawl_ptt, crawl_mobile01, crawl_my83, crawl_finfo):
        all_data.extend(crawl_site(kw))

# Persist the pooled rows, then report how many were collected.
save_to_csv("保險社群語料_擴充版.csv", all_data)
print(f"共抓取 {len(all_data)} 筆資料（含留言與多版面）！已儲存為 CSV。")