In [1]:
import os
import shutil
import re
import subprocess
import urllib
import zipfile
import requests


"""
Scrapes and installs Chromium from the Linux Mint 21.3 (Virginia) package site.
Link: http://packages.linuxmint.com/pool/upstream/c/chromium/
Scrapes and installs chromedriver from Chrome for Testing page.
Link: https://googlechromelabs.github.io/chrome-for-testing/
"""

class CantGetLatestChromiumVersionError(Exception):
    """Raised when no Chromium .deb link can be found in the package listing.

    get_chromium_latest_version() raises this when its regular expression
    finds no match in the downloaded index page.
    """

class ChromiumInstallationFailedException(Exception):
    """Raised when Chromium is not available after the .deb installation step.

    check_chromium_installation() raises this when the ``chromium`` command
    cannot be found. If it occurs, inspect the downloaded Chromium .deb file.
    """

class CantGetChromeDriverError(Exception):
    """Raised when no matching chromedriver download URL can be found.

    get_chromedriver_url() raises this when its regular expression finds no
    linux64 chromedriver link for the installed Chromium version.
    """

# Directory listing of the Linux Mint package pool that hosts the Chromium .deb files.
main_url = "http://packages.linuxmint.com/pool/upstream/c/chromium/"
# Directory where downloaded archives are stored before installation
# (presumably the Google Colab working directory -- TODO confirm).
work_dir = "/content"

def get_chromium_latest_version() -> str:
    """Return the file name of the newest Chromium .deb for Mint "virginia".

    Downloads the directory listing at ``main_url`` and collects every link
    whose target looks like
    ``chromium_<version>...linuxmint1%2Bvirginia_amd64.deb``. When several
    builds are listed, the one with the highest numeric version is returned
    (not simply the first one in the page).

    Returns:
        The link target exactly as it appears in the page (still URL-encoded),
        e.g. ``"chromium_121.0.6167.160~linuxmint1%2Bvirginia_amd64.deb"``.

    Raises:
        Exception: If the listing request does not return HTTP 200.
        CantGetLatestChromiumVersionError: If no matching link is found.
    """
    # A timeout keeps the cell from hanging forever on an unresponsive mirror.
    r = requests.get(main_url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")
    text = r.text

    # Raw string so that "\s" is a regex escape, not an invalid string escape;
    # the dot before "deb" is escaped so it only matches a literal dot.
    pattern = r'<a\shref="(chromium_[^"]+linuxmint1%2Bvirginia_amd64\.deb)'
    candidates = re.findall(pattern, text)
    if not candidates:
        raise CantGetLatestChromiumVersionError("Failed to get latest chromium version!")

    def _version_key(file_name: str) -> tuple:
        # Compare versions numerically so that e.g. 131.x ranks above 99.x.
        found = re.match(r"chromium_(\d+(?:\.\d+)*)", file_name)
        if not found:
            return ()
        return tuple(int(part) for part in found.group(1).split("."))

    return max(candidates, key=_version_key)

def install_chromium(latest_version: str, deb_file: str, quiet: bool):
    # Full url of deb file
    url = f"{main_url}{latest_version}"

    # Download deb file
    if quiet:
        command = f"wget -q -O {work_dir}/{deb_file} {url}"
    else:
        command = f"wget -O {work_dir}/{deb_file} {url}"
    print(f"Downloading: {deb_file}")
    # os.system(command)
    !$command

    # Install deb file
    if quiet:
        command = f"apt-get install {work_dir}/{deb_file} >> apt.log"
    else:
        command = f"apt-get install {work_dir}/{deb_file}"
    print(f"Installing: {deb_file}")
    # os.system(command)
    !$command

def check_chromium_installation(deb_file: str):
    """Verify that ``chromium`` is installed, then delete the downloaded .deb.

    The check only looks the executable up on PATH; it does not start the
    browser, so it cannot block on a running Chromium process.

    Args:
        deb_file: Name of the downloaded package inside ``work_dir``.

    Raises:
        ChromiumInstallationFailedException: If no ``chromium`` executable is
            found on PATH.
    """
    import shutil  # local import keeps this block self-contained

    if shutil.which("chromium") is None:
        raise ChromiumInstallationFailedException("Chromium Installation Failed!")

    print("Chromium installation successfull.\n")

    # The package is no longer needed once Chromium is installed. A missing
    # file here is harmless and must not be reported as a failed installation.
    try:
        os.remove(f"{work_dir}/{deb_file}")
    except FileNotFoundError:
        pass

def get_chromedriver_url(deb_file: str) -> str:
    """Find the linux64 chromedriver download URL matching the Chromium build.

    The major version is taken from the .deb file name (the text between
    ``chromium_`` and the first dot) and looked up in the Chrome for Testing
    page.

    Args:
        deb_file: Chromium package file name, e.g.
            ``"chromium_121.0.6167.160~linuxmint1+virginia_amd64.deb"``.

    Returns:
        The first ``.../<major>.<rest>/linux64/chromedriver-linux64.zip`` URL
        found in the page.

    Raises:
        Exception: If the page request does not return HTTP 200.
        CantGetChromeDriverError: If no URL for that major version is found.
    """
    url = "https://googlechromelabs.github.io/chrome-for-testing/"
    # A timeout keeps the cell from hanging forever on an unresponsive host.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")
    text = r.text

    # Major version number from the deb file's name.
    version_number = deb_file.split("chromium_")[-1].split(".")[0]

    # Example: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip
    # - the version is escaped and must be followed by a dot, so "13" cannot
    #   match a "/131..." path segment;
    # - whitespace and quotes end a URL, so a match never spans two links.
    pattern = (
        r'https://[^<"\s]+/'
        + re.escape(version_number)
        + r'\.[^<"\s/]+/linux64/chromedriver-linux64\.zip'
    )
    chromedriver_url_search = re.search(pattern, text)
    if chromedriver_url_search is None:
        raise CantGetChromeDriverError("Failed to get chromedriver!")
    return chromedriver_url_search.group()

def install_chromedriver(deb_file: str, quiet: bool):
    url = get_chromedriver_url(deb_file)
    file_name = url.split("/")[-1]
    # Download chromedriver
    chromedriver_zip = f"{work_dir}/{file_name}"
    if quiet:
        command = f"wget -q -O {chromedriver_zip} {url}"
    else:
        command = f"wget -O {chromedriver_zip} {url}"
    print(f"Downloading: {file_name}")
    # os.system(command)
    !$command

    # Extract chromedriver from zip
    with zipfile.ZipFile(chromedriver_zip) as zpf:
        zpf.extract(member="chromedriver-linux64/chromedriver", path=work_dir)

    # Remove chromedriver-linux64.zip file
    os.remove(chromedriver_zip)

    # Move extracted chromedriver binary file to /usr/bin directory
    source = f"{work_dir}/chromedriver-linux64/chromedriver"
    destination = "/usr/bin/chromedriver"
    os.rename(source, destination)

    # Make chromedriver binary executable
    os.system(f"chmod +x {destination}")

    # Remove empty chromedriver-linux64 folder
    shutil.rmtree(f"{work_dir}/chromedriver-linux64")

    print("Chromedriver installed")

def install_selenium_package(quiet: bool) -> None:
    """Install the selenium package with pip through an IPython shell escape.

    Args:
        quiet: When True, pip runs with ``-qq`` and its standard output is
            appended to ``pip.log``; otherwise pip prints normally.
    """
    if quiet:
        !pip install selenium -qq >> pip.log
    else:
        !pip install selenium

def main(quiet: bool):
    """Install Chromium, the matching chromedriver, and the selenium package.

    Args:
        quiet: Forwarded to every installation step to reduce their output.
    """
    # "import urllib" alone does not load the urllib.parse submodule, so the
    # function is imported explicitly instead of relying on another package
    # having loaded it.
    from urllib.parse import unquote

    # Get the latest version of chromium from linux mint packages site
    latest_version = get_chromium_latest_version()
    # Name of the deb file (URL-decoded link target)
    deb_file = unquote(latest_version, "utf-8")
    # Download and install chromium
    install_chromium(latest_version, deb_file, quiet)
    # Check that the installation succeeded
    check_chromium_installation(deb_file)
    # Install chromedriver
    install_chromedriver(deb_file, quiet)
    # Finally install selenium package
    install_selenium_package(quiet)

# Run the installer when this cell is executed as the main module.
if __name__ == '__main__':
    # True: wget runs with -q, and apt-get/pip output is appended to log files.
    quiet = True # verboseness of wget and apt
    main(quiet)

Downloading: chromium_131.0.6778.85~linuxmint1+virginia_amd64.deb
Installing: chromium_131.0.6778.85~linuxmint1+virginia_amd64.deb
Chromium installation successfull.

Downloading: chromedriver-linux64.zip
Chromedriver installed


In [2]:
import os
import requests
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from threading import Thread

# 初始化 Selenium WebDriver
def init_driver():
    """Create a headless Chrome WebDriver.

    No explicit Service is passed, so Selenium resolves the chromedriver
    executable on its own. The previous version built a ``Service`` object but
    never handed it to ``webdriver.Chrome``, so it had no effect and has been
    removed.

    Returns:
        The started ``webdriver.Chrome`` instance; the caller must ``quit()`` it.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

# 通過年齡驗證
def pass_age_verification(driver):
    """Click through the over-18 confirmation on the currently loaded page.

    Waits up to 10 seconds for the confirmation container, clicks its "yes"
    button, pauses 3 seconds, and prints whether the resulting page source
    contains the text "Beauty".

    Args:
        driver: Selenium WebDriver already pointed at the page to confirm.
    """
    container_locator = (By.CSS_SELECTOR, '.over18-button-container')
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(container_locator))

    driver.find_element(
        By.CSS_SELECTOR, '.over18-button-container button[name="yes"]'
    ).click()

    # Give the board page time to load after the click.
    time.sleep(3)

    if "Beauty" not in driver.page_source:
        print("未成功通過年齡驗證，請檢查是否正確設置 cookies。")
    else:
        print("成功通過年齡驗證並進入 PTT 表特版。")

# 獲取最新五篇文章的標題和連結
def fetch_latest_articles(driver, url, limit=5):
    """Collect up to ``limit`` (title, link) pairs from a board index page.

    Entries without a usable link are skipped *before* the limit is applied,
    so link-less entries no longer reduce the number of articles returned.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the board index page.
        limit: Maximum number of articles to return; ``None`` means no limit.

    Returns:
        A list of ``(title, absolute_url)`` tuples in page order.
        NOTE(review): entries are taken from the top of the page; confirm that
        this is the ordering wanted for "latest" articles.
    """
    driver.get(url)
    pass_age_verification(driver)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    result = []
    for entry in soup.find_all("div", class_="r-ent"):
        if limit is not None and len(result) >= limit:
            break
        title_div = entry.find("div", class_="title")
        anchor = entry.find("a")
        # Skip entries with no title block or no link instead of failing on them.
        if title_div is None or anchor is None or not anchor.get("href"):
            continue
        full_link = "https://www.ptt.cc" + anchor["href"]
        result.append((title_div.text.strip(), full_link))
    return result

# 修正圖片 URL
def fix_image_url(img_url):
    """Turn a cache.ptt.cc proxy URL back into the direct https URL.

    Args:
        img_url: Image address taken from the page.

    Returns:
        ``img_url`` with ``https://cache.ptt.cc/c/https/`` replaced by
        ``https://`` when it starts with that prefix; otherwise unchanged.
    """
    cache_prefix = "https://cache.ptt.cc/c/https/"
    if not img_url.startswith(cache_prefix):
        return img_url
    return img_url.replace(cache_prefix, "https://")

# 獲取文章中的圖片連結
def fetch_images_from_article(driver, url):
    """Load an article page and return the addresses of its images.

    Args:
        driver: Selenium WebDriver used to load the page.
        url: Address of the article.

    Returns:
        A list with the ``src`` of every ``<img>`` tag that has a non-empty
        ``src``, each passed through fix_image_url(), in page order.
    """
    driver.get(url)
    page = BeautifulSoup(driver.page_source, "html.parser")

    img_urls = []
    for tag in page.find_all("img"):
        src = tag.get("src")
        if src:
            img_urls.append(fix_image_url(src))
    return img_urls

# 下載圖片
def download_image(img_url, folder_name, img_name, max_retries=3):
    """Download one image to ``folder_name/img_name`` with retries.

    Each attempt uses a 10 second timeout. A non-200 response or a request
    error counts as a failed attempt; attempts are separated by a 2 second
    pause. If every attempt fails, the URL is appended to
    ``failed_downloads.txt``.

    Args:
        img_url: Address of the image.
        folder_name: Existing directory to write the file into.
        img_name: File name to use inside ``folder_name``.
        max_retries: Number of attempts before giving up.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"}
    target_path = os.path.join(folder_name, img_name)

    for attempt in range(max_retries):
        try:
            # The context manager releases the streamed connection even when
            # the status is not 200 or writing fails.
            with requests.get(img_url, headers=headers, stream=True, timeout=10) as response:
                if response.status_code == 200:
                    with open(target_path, "wb") as f:
                        # Write chunk by chunk instead of buffering the whole body.
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                    print(f"Downloaded: {img_name}")
                    return
                print(f"Failed to download (HTTP {response.status_code}): {img_url}")
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} failed: {e}")
        # Pause only when another attempt will follow.
        if attempt < max_retries - 1:
            time.sleep(2)

    # Every attempt failed: record the URL for later inspection.
    print(f"Failed to download after {max_retries} attempts: {img_url}")
    with open("failed_downloads.txt", "a") as log_file:
        log_file.write(f"{img_url}\n")

# 多執行緒下載圖片
def download_images_multithread(img_urls, folder_name):
    """Download every distinct URL in its own thread and wait for all of them.

    Duplicate URLs are downloaded once. Files are named ``image_<idx>.jpg``,
    where ``idx`` is the position of the URL's first occurrence in
    ``img_urls``.

    Args:
        img_urls: Iterable of image addresses.
        folder_name: Directory passed on to download_image().
    """
    workers = []
    already_queued = set()

    for position, url in enumerate(img_urls):
        if url not in already_queued:
            already_queued.add(url)
            worker = Thread(
                target=download_image,
                args=(url, folder_name, f"image_{position}.jpg"),
            )
            workers.append(worker)
            worker.start()

    # Block until every download thread has finished.
    for worker in workers:
        worker.join()

# 主程式
def main():
    """Download the images of the first articles listed on the PTT Beauty board.

    Creates a ``PTT_Beauty`` folder with one sub-folder per article (named
    after the title, with "/" replaced by "_") and downloads each article's
    images into it. The WebDriver is always closed at the end.
    """
    def ensure_folder(path):
        # Create the folder only when nothing exists at that path yet.
        if not os.path.exists(path):
            os.makedirs(path)

    browser = init_driver()
    try:
        # Fetch the article list from the board index
        index_url = "https://www.ptt.cc/bbs/Beauty/index.html"
        article_list = fetch_latest_articles(browser, index_url)
        time.sleep(3)

        # Root folder for all downloads
        root_folder = "PTT_Beauty"
        ensure_folder(root_folder)

        for title, link in article_list:
            print(f"Processing Article: {title}")
            print(f"Link: {link}")

            # One folder per article
            article_folder = os.path.join(root_folder, title.replace("/", "_"))
            ensure_folder(article_folder)

            # Collect the image addresses of the article
            image_urls = fetch_images_from_article(browser, link)
            print(f"Found {len(image_urls)} images:")

            # Download them concurrently
            download_images_multithread(image_urls, article_folder)
            print("-" * 50)
    finally:
        browser.quit()

# Start the scraper when this cell is executed as the main module.
if __name__ == "__main__":
    main()


成功通過年齡驗證並進入 PTT 表特版。
Processing Article: [正妹] 太瘦
Link: https://www.ptt.cc/bbs/Beauty/M.1733836884.A.D3E.html
Found 10 images:
Downloaded: image_0.jpg
Downloaded: image_5.jpg
Downloaded: image_1.jpg
Downloaded: image_3.jpg
Downloaded: image_7.jpg
Downloaded: image_6.jpgDownloaded: image_8.jpg

Downloaded: image_2.jpg
Downloaded: image_4.jpg
Downloaded: image_9.jpg
--------------------------------------------------
Processing Article: [正妹] 反差感
Link: https://www.ptt.cc/bbs/Beauty/M.1733843302.A.FA1.html
Found 10 images:
Downloaded: image_0.jpg
Downloaded: image_1.jpg
Downloaded: image_5.jpgDownloaded: image_2.jpg

Downloaded: image_3.jpg
Downloaded: image_6.jpg
Downloaded: image_4.jpg
Downloaded: image_7.jpg
Downloaded: image_8.jpg
Downloaded: image_9.jpg
--------------------------------------------------
Processing Article: [正妹] Cosplay 12225 日本 雷姆 加奈
Link: https://www.ptt.cc/bbs/Beauty/M.1733870647.A.9FD.html
Found 9 images:
Downloaded: image_0.jpg
Downloaded: image_1.jpg
Downloaded: im