In [5]:
import os
import shutil
import re
import subprocess
import urllib
import zipfile
import requests


"""
Scrapes and installs Chromium from the Linux Mint 21.3 (Virginia) package repository.
Link: http://packages.linuxmint.com/pool/upstream/c/chromium/
Scrapes and installs ChromeDriver from the Chrome for Testing page.
Link: https://googlechromelabs.github.io/chrome-for-testing/
"""

class CantGetLatestChromiumVersionError(Exception):
    """Raised when no Chromium .deb link can be found in the package index page."""

class ChromiumInstallationFailedException(Exception):
    """
    Raised when the `chromium` executable cannot be found after the install step.

    If this is raised, inspect the downloaded Chromium .deb file.
    """

class CantGetChromeDriverError(Exception):
    """Raised when no matching chromedriver download URL is found on the page."""

# Directory that downloaded archives are written to before being installed.
work_dir = "/content"
# Package index page that lists the Chromium .deb builds.
main_url = "http://packages.linuxmint.com/pool/upstream/c/chromium/"

def get_chromium_latest_version() -> str:
    """Return the file name of the newest Chromium .deb listed on the package index.

    Downloads the directory listing at ``main_url`` and collects every link
    whose target looks like
    ``chromium_<version>~linuxmint1%2Bvirginia_amd64.deb``. The name is returned
    exactly as it appears in the ``href`` attribute (still percent-encoded),
    e.g. ``chromium_121.0.6167.160~linuxmint1%2Bvirginia_amd64.deb``.

    Returns:
        The percent-encoded .deb file name carrying the highest version number.

    Raises:
        Exception: if the index page does not answer with HTTP 200.
        CantGetLatestChromiumVersionError: if no matching link is found.
    """
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r = requests.get(main_url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")

    # Raw string: "\s" is a regex escape, not a Python string escape.
    pattern = r'<a\shref="(chromium_[^"]+linuxmint1%2Bvirginia_amd64\.deb)'
    candidates = re.findall(pattern, r.text)
    if not candidates:
        raise CantGetLatestChromiumVersionError("Failed to get latest chromium version!")

    def version_key(file_name: str) -> tuple:
        # Compare versions numerically so that e.g. 121.x sorts above 99.x.
        found = re.match(r"chromium_(\d+(?:\.\d+)*)", file_name)
        if found is None:
            return ()
        return tuple(int(part) for part in found.group(1).split("."))

    # The listing may contain several builds; pick the newest one rather than
    # whichever happens to be listed first.
    return max(candidates, key=version_key)

def install_chromium(latest_version: str, deb_file: str, quiet: bool):
    # Full url of deb file
    url = f"{main_url}{latest_version}"

    # Download deb file
    if quiet:
        command = f"wget -q -O {work_dir}/{deb_file} {url}"
    else:
        command = f"wget -O {work_dir}/{deb_file} {url}"
    print(f"Downloading: {deb_file}")
    # os.system(command)
    !$command

    # Install deb file
    if quiet:
        command = f"apt-get install {work_dir}/{deb_file} >> apt.log"
    else:
        command = f"apt-get install {work_dir}/{deb_file}"
    print(f"Installing: {deb_file}")
    # os.system(command)
    !$command

def check_chromium_installation(deb_file: str):
    """Verify that a ``chromium`` executable is on PATH, then delete the .deb.

    Args:
        deb_file: Name of the downloaded package inside ``work_dir``; it is
            removed once the executable has been found.

    Raises:
        ChromiumInstallationFailedException: if no ``chromium`` executable can
            be located on PATH.
    """
    # Look the binary up instead of launching the browser: starting Chromium
    # just to see whether it exists is slow and may block.
    if shutil.which("chromium") is None:
        raise ChromiumInstallationFailedException("Chromium Installation Failed!")

    print("Chromium installation successfull.\n")
    # Installation succeeded, so the package file is no longer needed.
    # Kept outside any try/except so that a missing .deb is not misreported
    # as a failed Chromium installation.
    os.remove(f"{work_dir}/{deb_file}")

def get_chromedriver_url(deb_file: str) -> str:
    """Find a linux64 chromedriver download URL matching Chromium's major version.

    The major version is taken from the .deb file name (the digits between
    ``chromium_`` and the first dot) and looked up on the Chrome for Testing
    page.

    Args:
        deb_file: Chromium package file name, e.g.
            ``chromium_121.0.6167.160~linuxmint1+virginia_amd64.deb``.

    Returns:
        The first ``.../linux64/chromedriver-linux64.zip`` URL on the page whose
        version directory starts with that major version.

    Raises:
        Exception: if the page does not answer with HTTP 200.
        CantGetChromeDriverError: if no matching URL is found.
    """
    # Get content of chromedriver page
    url = "https://googlechromelabs.github.io/chrome-for-testing/"
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")

    # Get chromium major version from deb file's name
    version_number = deb_file.split("chromium_")[-1].split(".")[0]

    # Example: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip
    # The literal dot after the major version stops "13" from matching
    # "130.x"; whitespace and quotes are excluded so a match cannot run from
    # one URL into another.
    pattern = (
        rf'https://[^<\s"]+/{re.escape(version_number)}\.[^<\s"]+'
        r'/linux64/chromedriver-linux64\.zip'
    )
    found = re.search(pattern, r.text)
    if found is None:
        raise CantGetChromeDriverError("Failed to get chromedriver!")
    return found.group()

def install_chromedriver(deb_file: str, quiet: bool):
    url = get_chromedriver_url(deb_file)
    file_name = url.split("/")[-1]
    # Download chromedriver
    chromedriver_zip = f"{work_dir}/{file_name}"
    if quiet:
        command = f"wget -q -O {chromedriver_zip} {url}"
    else:
        command = f"wget -O {chromedriver_zip} {url}"
    print(f"Downloading: {file_name}")
    # os.system(command)
    !$command

    # Extract chromedriver from zip
    with zipfile.ZipFile(chromedriver_zip) as zpf:
        zpf.extract(member="chromedriver-linux64/chromedriver", path=work_dir)

    # Remove chromedriver-linux64.zip file
    os.remove(chromedriver_zip)

    # Move extracted chromedriver binary file to /usr/bin directory
    source = f"{work_dir}/chromedriver-linux64/chromedriver"
    destination = "/usr/bin/chromedriver"
    os.rename(source, destination)

    # Make chromedriver binary executable
    os.system(f"chmod +x {destination}")

    # Remove empty chromedriver-linux64 folder
    shutil.rmtree(f"{work_dir}/chromedriver-linux64")

    print("Chromedriver installed")

def install_selenium_package(quiet: bool):
    if quiet:
        !pip install selenium -qq >> pip.log
    else:
        !pip install selenium

def main(quiet: bool):
    """Install Chromium, chromedriver and selenium, in that order.

    Args:
        quiet: Forwarded to the install steps to reduce their output.
    """
    # Import the submodule explicitly: a bare "import urllib" does not
    # guarantee that urllib.parse has been loaded.
    from urllib.parse import unquote

    # Get the latest version of chromium from linux mint packages site
    latest_version = get_chromium_latest_version()
    # Name of the deb file (the listing percent-encodes "+" as "%2B")
    deb_file = unquote(latest_version, "utf-8")
    # Download and install chromium
    install_chromium(latest_version, deb_file, quiet)
    # Check that the installation succeeded
    check_chromium_installation(deb_file)
    # Install chromedriver
    install_chromedriver(deb_file, quiet)
    # Finally install selenium package
    install_selenium_package(quiet)

if __name__ == "__main__":
    # True keeps the output of the install steps (wget, apt, pip) short.
    quiet = True
    main(quiet)

Downloading: chromium_130.0.6723.91~linuxmint1+virginia_amd64.deb
Installing: chromium_130.0.6723.91~linuxmint1+virginia_amd64.deb
Chromium installation successfull.

Downloading: chromedriver-linux64.zip
Chromedriver installed


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
from bs4 import BeautifulSoup

# Headless Chrome configuration.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('--disable-gpu')

# NOTE(review): not passed to webdriver.Chrome below; kept in case a later
# cell reads it.
chrome_driver_path = 'chromedriver'
driver = webdriver.Chrome(options=options)

url = "https://www.momoshop.com.tw/category/DgrpCategory.jsp?d_code=2001201588&p_orderType=6&showType=chessboardType"

# try/finally guarantees the browser is shut down on every path, including
# failures in driver.get() or while reading page_source.
try:
    driver.get(url)

    # Fixed pause before the explicit wait below.
    time.sleep(5)

    try:
        # Wait up to 10 s for at least one product image to be present.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'prdImg'))
        )
    except Exception as e:
        print("未能成功加載商品列表:", e)
        # Re-raise instead of calling exit(): exit() shuts down the notebook
        # kernel, while re-raising stops only this cell and keeps the traceback.
        raise

    page_source = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')
products = []

for item in soup.find_all('li', class_='eachGood'):
    # Look each element up once and fall back to a placeholder when missing.
    img_tag = item.find('img', class_='prdImg')
    title = img_tag.get('alt', '').strip() if img_tag is not None else '無標題'
    price_tag = item.find('b')
    price = price_tag.get_text(strip=True) if price_tag is not None else '無價格'

    product = {
        'id': len(products) + 1,  # 1-based running index
        'title': title,
        'price': price
    }
    print(product)
    products.append(product)

with open('momo_products.json', 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps the Chinese text readable in the file.
    json.dump(products, f, ensure_ascii=False, indent=4)
# Report success only after the file has been closed (and therefore flushed).
print("已成功保存商品數據到momo_products.json檔案中")


{'id': 1, 'title': '【福記】原味日式素豆干152gx1袋 植物性蛋白質', 'price': '95'}
{'id': 2, 'title': '【惠香】一口方塊乾(120g/包;人氣NO.1 傳承百年的道地手藝)', 'price': '41'}
{'id': 3, 'title': '【福記-官方直營】原味日式素豆干(152g/袋)(植物性蛋白質)', 'price': '85'}
{'id': 4, 'title': '【臻御行】千層豆干/方豆干/香辣方豆干/香辣豆干條/小豆干丁300g(道地古早味)', 'price': '114'}
{'id': 5, 'title': '【同正】沙茶豆干(80g/包)', 'price': '19'}
{'id': 6, 'title': '【惠香】小豆丁三角包220gx3包(台灣名產豆干 五香豆乾 肉粽包方便攜帶)', 'price': '265'}
{'id': 7, 'title': '【福記-官方直營】原味日式素豆干(152gX3袋)(組合系列)', 'price': '255'}
{'id': 8, 'title': '【4×6】五香豆干(35g×8p)', 'price': '89'}
{'id': 9, 'title': '【惠香】一口方塊乾(300g/包;人氣NO.1辣味豆干 300g大包裝夾鏈包)', 'price': '96'}
{'id': 10, 'title': '【老中醫】豆干-中丁250g(3包入/中元普渡/拜拜箱/澎派箱)', 'price': '267'}
{'id': 11, 'title': '【福記】香辣日式素豆干152gx1袋 植物性蛋白質', 'price': '90'}
{'id': 12, 'title': '【大溪廖心蘭】老道滷味豆干(110g/包)', 'price': '47'}
{'id': 13, 'title': '【惠香】素沙茶豆干(120g/包;特選黃豆與秘傳醬汁精心熬製 台灣名產)', 'price': '41'}
{'id': 14, 'title': '【臻御行】素蹄筋/辣味素蹄筋 250g(全素)', 'price': '129'}
{'id': 15, 'title': '【惠香】黑木柴豆干(120g/包;純素食 口感軟中帶Ｑ 