In [2]:
import cloudscraper
from bs4 import BeautifulSoup as bs
import webbrowser
import time
import random
import json
import os
# Board listing page whose pagination bar is inspected.
url = "https://www.mobile01.com/topiclist.php?f=291"

# Browser-like User-Agent sent with the request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/111.0.0.0 Safari/537.36"
    )
}

# cloudscraper is used to get past Cloudflare's challenge.
scraper = cloudscraper.create_scraper()
res = scraper.get(url, headers=headers).text

# Parse the returned HTML with BeautifulSoup (lxml parser).
soup = bs(res, "lxml")

# Collect every pagination anchor that carries a data-page attribute.
pagination_elements = soup.select('li.l-pagination__page > a.c-pagination[data-page]')
if len(pagination_elements) == 0:
    raise ValueError("未找到任何分頁按鈕，請檢查選擇器是否正確。")

# The data-page value of the final pagination anchor is taken as the last page.
last_pagination_anchor = pagination_elements[-1]
last_page_number = int(last_pagination_anchor["data-page"])
print("最後一個頁碼是:", last_page_number)

最後一個頁碼是: 30


In [8]:
import re
import time
from urllib.parse import urljoin

import cloudscraper
import pandas as pd
from bs4 import BeautifulSoup

# ====== Board selection ======
# Board code to scrape; change freely, e.g. 804 or 802.
board_id = "291"

# ====== Scraper session ======
# Browser profile handed to cloudscraper when the session is created.
browser_profile = {'browser': 'chrome', 'platform': 'windows', 'mobile': False}
scraper = cloudscraper.create_scraper(browser=browser_profile)

# Request headers shared by every request made in this cell.
user_agent = " ".join([
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/123.0.0.0 Safari/537.36",
])
headers = {
    "User-Agent": user_agent,
    "Accept-Language": "zh-TW,zh;q=0.9",
    "Referer": "https://www.google.com",
}

# ====== 工具函式 ======

def get_total_pages(url):
    """Return the highest page number found in a thread's pagination links.

    The page at ``url`` is fetched with the module-level ``scraper`` and
    ``headers``, parsed with ``html.parser``, and every ``a.c-pagination``
    anchor is inspected for a ``p=<digits>`` fragment in its ``href``.

    Args:
        url: Address of the thread page to inspect.

    Returns:
        The largest ``p`` value seen, or 1 when no anchor carries one.
    """
    response = scraper.get(url, headers=headers)
    document = BeautifulSoup(response.text, "html.parser")

    page_numbers = []
    for anchor in document.select("a.c-pagination"):
        match = re.search(r'p=(\d+)', anchor.get('href', ''))
        if match:
            page_numbers.append(int(match.group(1)))

    if not page_numbers:
        return 1
    return max(page_numbers)

def extract_article_pages(base_url):
    """Scrape the floors (posts) from every page of one thread.

    The page count comes from ``get_total_pages(base_url)``. Each page is
    fetched with the module-level ``scraper`` and ``headers``, followed by a
    one-second pause. If any page raises, the error is printed and the
    remaining pages are abandoned; floors gathered so far are still returned.

    Args:
        base_url: Thread URL. ``&p=<page>`` is appended for each page unless
            the URL already contains ``?p=``, in which case it is used as is.

    Returns:
        A list of dicts with keys ``url``, ``page``, ``title``, ``post_time``
        and ``content``, one per floor block found.
    """
    collected = []
    last_page = get_total_pages(base_url)

    for page in range(1, last_page + 1):
        if "?p=" in base_url:
            full_url = base_url
        else:
            full_url = f"{base_url}&p={page}"

        try:
            print(f"📄 抓取第 {page} 頁：{full_url}")
            response = scraper.get(full_url, headers=headers)
            document = BeautifulSoup(response.text, "html.parser")

            heading = document.select_one("h1.t2")
            bodies = document.select('div[itemprop="articleBody"], article.c-articleLimit')
            stamps = document.select('span.o-fNotes.o-fSubMini')

            for index, body in enumerate(bodies):
                content = body.get_text(strip=True)
                # Timestamps are paired with floors purely by position;
                # floors beyond the last timestamp get an empty string.
                if index < len(stamps):
                    posted_at = stamps[index].text.strip()
                else:
                    posted_at = ""
                record = {
                    "url": base_url,
                    "page": page,
                    "title": heading.text.strip() if heading else "",
                    "post_time": posted_at,
                    "content": content,
                }
                collected.append(record)

            # Pause between page requests.
            time.sleep(1)
        except Exception as e:
            print(f"❌ 發生錯誤於頁 {page}：{e}")
            break

    return collected

# ====== Collect thread links from the board listing ======

base_url = "https://www.mobile01.com/"
# NOTE: listing pages 1..99 are always requested, regardless of how many
# pages the board actually has.
pages = [f"https://www.mobile01.com/topiclist.php?f={board_id}&p={i}" for i in range(1, 100)]

article_links = []   # unique thread URLs, in first-seen order
seen_links = set()   # companion set so the duplicate check is O(1)

for page in pages:
    res = scraper.get(page, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")
    links = soup.select(f'a.c-link.u-ellipsis[href*="topicdetail.php?f={board_id}"]')
    for link in links:
        # Drop any "#fragment" so the same thread is not recorded twice.
        href = link.get("href", "").split("#")[0]
        # Keep only first-page thread links (no explicit "&p=" page parameter).
        if not href or "topicdetail.php" not in href or "&p=" in href:
            continue
        # urljoin resolves relative, root-relative ("/topicdetail.php...") and
        # absolute hrefs correctly; plain string concatenation would produce a
        # double slash or a malformed URL for the latter two.
        full_url = urljoin(base_url, href)
        if full_url not in seen_links:
            seen_links.add(full_url)
            article_links.append(full_url)

print(f"\n✅ 共抓到 {len(article_links)} 筆主文連結")

# ====== Scrape every floor of every thread ======

all_results = []     # floor records gathered across all threads
skipped_links = []   # threads that raised or yielded no floors

total_links = len(article_links)
for position, link in enumerate(article_links, start=1):
    print(f"\n🔍 處理第 {position}/{total_links} 筆：{link}")
    try:
        floors = extract_article_pages(link)
        if not floors:
            # Nothing was extracted for this thread; remember it for the report.
            skipped_links.append(link)
            continue
        all_results.extend(floors)
    except Exception as e:
        print(f"[錯誤] {link} 發生異常：{e}")
        skipped_links.append(link)

# ====== Export results ======

# All floor records go into one CSV named after the board.
df = pd.DataFrame(all_results)
output_csv = f"mobile01_full_articles_f{board_id}.csv"
df.to_csv(output_csv, index=False, encoding='utf-8-sig')
print(f"\n✅ 已成功輸出 {len(df)} 筆樓層資料至 {output_csv}")

if not skipped_links:
    print("🎉 所有主文皆成功擷取！")
else:
    # Write the threads that could not be scraped, one URL per line.
    skipped_file = f"skipped_links_f{board_id}.txt"
    with open(skipped_file, "w", encoding="utf-8") as f:
        f.writelines(skipped + "\n" for skipped in skipped_links)
    print(f"⚠️ 有 {len(skipped_links)} 筆主文未成功處理，已記錄至 {skipped_file}")




✅ 共抓到 206 筆主文連結

🔍 處理第 1/206 筆：https://www.mobile01.com/topicdetail.php?f=291&t=4369995
📄 抓取第 1 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=1
📄 抓取第 2 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=2
📄 抓取第 3 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=3
📄 抓取第 4 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=4
📄 抓取第 5 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=5
📄 抓取第 6 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=6
📄 抓取第 7 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=7
📄 抓取第 8 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=8
📄 抓取第 9 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=9
📄 抓取第 10 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=10
📄 抓取第 11 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=11
📄 抓取第 12 頁：https://www.mobile01.com/topicdetail.php?f=291&t=4369995&p=12
📄 抓取第 13 頁：https://www.mobile01.com/topicdetail.php?f