In [11]:
import os
import re
import shutil
import subprocess
import sys
import urllib
import urllib.parse
import zipfile

import requests


"""
Scrapes and installs Chromium from the Linux Mint 21.3 (Virginia) packages site.
Link: http://packages.linuxmint.com/pool/upstream/c/chromium/
Scrapes and installs ChromeDriver from the Chrome for Testing page.
Link: https://googlechromelabs.github.io/chrome-for-testing/
"""

class CantGetLatestChromiumVersionError(Exception):
    """Raised when no Chromium .deb link can be found in the package listing."""

class ChromiumInstallationFailedException(Exception):
    """Raised when the ``chromium`` executable is unavailable after installation.

    If this is raised, inspect the downloaded Chromium .deb file.
    """

class CantGetChromeDriverError(Exception):
    """Raised when no matching chromedriver download URL can be found."""

# Directory listing of the Linux Mint Chromium packages. It is scraped for the
# .deb file name and also used as the base URL when that file is downloaded.
main_url = "http://packages.linuxmint.com/pool/upstream/c/chromium/"
# Directory where downloaded files are stored and unpacked
# (presumably the Google Colab working directory -- TODO confirm).
work_dir = "/content"

def get_chromium_latest_version() -> str:
    """Return the (URL-encoded) file name of the newest Chromium .deb package.

    The directory listing at ``main_url`` is fetched and scanned for links such
    as ``chromium_121.0.6167.160~linuxmint1%2Bvirginia_amd64.deb``. When the
    listing contains several matching packages, the one with the highest
    version number is returned.

    Returns:
        The ``href`` value of the selected link, exactly as it appears in the
        listing (still percent-encoded).

    Raises:
        Exception: if the listing does not answer with HTTP 200.
        CantGetLatestChromiumVersionError: if no matching link is found.
    """
    # A timeout keeps the notebook from hanging forever on a stalled server.
    r = requests.get(main_url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")

    # Raw string: "\s" is not a valid escape sequence in a normal string literal.
    pattern = r'<a\shref="(chromium_[^"]+linuxmint1%2Bvirginia_amd64\.deb)'
    candidates = re.findall(pattern, r.text)
    if not candidates:
        raise CantGetLatestChromiumVersionError("Failed to get latest chromium version!")

    def version_key(file_name: str) -> tuple:
        # "chromium_121.0.6167.160~linuxmint1..." -> (121, 0, 6167, 160)
        match = re.match(r"chromium_(\d+(?:\.\d+)*)", file_name)
        if match is None:
            return ()
        return tuple(int(part) for part in match.group(1).split("."))

    # The first link in the listing is not necessarily the newest package, so
    # compare the version numbers numerically instead of taking the first hit.
    return max(candidates, key=version_key)

def install_chromium(latest_version: str, deb_file: str, quiet: bool):
    """Download the Chromium .deb package and install it with apt-get.

    Args:
        latest_version: Package file name as it appears in the listing
            (percent-encoded); appended to ``main_url`` to form the download URL.
        deb_file: Decoded file name used for the local copy in ``work_dir``.
        quiet: When true, wget runs with ``-q`` and the standard output of
            apt-get is appended to ``apt.log`` instead of being printed.

    Raises:
        ChromiumInstallationFailedException: if wget or apt-get exits with a
            non-zero status.
        FileNotFoundError: if ``wget`` or ``apt-get`` is not available.
    """

    def run(command, log_name=None):
        # Commands are passed as argument lists (no shell), so characters such
        # as "~" and "+" in the file name need no quoting. The output is
        # captured and re-emitted with print() so it still shows up in the
        # notebook cell.
        result = subprocess.run(command, capture_output=True, text=True)
        if log_name is not None:
            with open(log_name, "a") as log:
                log.write(result.stdout)
        elif result.stdout:
            print(result.stdout, end="")
        if result.stderr:
            print(result.stderr, end="")
        if result.returncode != 0:
            raise ChromiumInstallationFailedException(
                f"Command failed with exit code {result.returncode}: {' '.join(command)}"
            )

    # Full url of deb file
    url = f"{main_url}{latest_version}"
    deb_path = f"{work_dir}/{deb_file}"

    # Download deb file
    wget_command = ["wget", "-q"] if quiet else ["wget"]
    wget_command += ["-O", deb_path, url]
    print(f"Downloading: {deb_file}")
    run(wget_command)

    # Install deb file. "-y" answers the confirmation prompt, which nobody can
    # answer when apt-get is started from a script.
    apt_command = ["apt-get", "install", "-y", deb_path]
    print(f"Installing: {deb_file}")
    run(apt_command, log_name="apt.log" if quiet else None)

def check_chromium_installation(deb_file: str):
    """Verify that Chromium is installed and delete the downloaded .deb file.

    Args:
        deb_file: Name of the .deb file inside ``work_dir`` to remove once the
            installation has been confirmed.

    Raises:
        ChromiumInstallationFailedException: if no ``chromium`` executable is
            found on PATH.
    """
    # Look the binary up on PATH instead of launching it: starting a browser
    # only to find out whether it exists is slow and may block.
    if shutil.which("chromium") is None:
        raise ChromiumInstallationFailedException("Chromium Installation Failed!")

    print("Chromium installation successfull.\n")

    # The installation succeeded, so the package file is no longer needed.
    try:
        os.remove(f"{work_dir}/{deb_file}")
    except FileNotFoundError:
        # Already gone; a missing package file must not be reported as a
        # failed installation.
        pass

def get_chromedriver_url(deb_file: str) -> str:
    """Find the linux64 chromedriver download URL matching the Chromium package.

    The major version is taken from the .deb file name (the text between
    ``chromium_`` and the first dot) and the Chrome for Testing page is searched
    for a ``chromedriver-linux64.zip`` link whose version starts with it.

    Args:
        deb_file: Chromium package file name, e.g.
            ``chromium_121.0.6167.160~linuxmint1+virginia_amd64.deb``.

    Returns:
        The first matching download URL found in the page.

    Raises:
        Exception: if the page does not answer with HTTP 200.
        CantGetChromeDriverError: if the version cannot be read from
            ``deb_file`` or no matching URL is found.
    """
    # Get chromium version from deb file's name
    version_number = deb_file.split("chromium_")[-1].split(".")[0]
    if not version_number.isdigit():
        raise CantGetChromeDriverError("Failed to get chromedriver!")

    # Get content of chromedriver page; the timeout prevents an endless hang.
    url = "https://googlechromelabs.github.io/chrome-for-testing/"
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")
    text = r.text

    # Example: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip
    # The major version must be a complete path component prefix ("/121."), so
    # "12" can never match "121", and the literal dots are escaped.
    pattern = (
        rf'https://[^<>"\s]+/{re.escape(version_number)}\.[^/<>"\s]+'
        r'/linux64/chromedriver-linux64\.zip'
    )
    chromedriver_url_search = re.search(pattern, text)
    if chromedriver_url_search is None:
        raise CantGetChromeDriverError("Failed to get chromedriver!")
    return chromedriver_url_search.group()

def install_chromedriver(deb_file: str, quiet: bool):
    """Download chromedriver and install it as ``/usr/bin/chromedriver``.

    Args:
        deb_file: Chromium package file name; passed to
            ``get_chromedriver_url`` to select a matching chromedriver.
        quiet: When true, wget runs with ``-q``.

    Raises:
        CantGetChromeDriverError: if no download URL is found or wget exits
            with a non-zero status.
        FileNotFoundError: if ``wget`` is not available.
    """
    url = get_chromedriver_url(deb_file)
    file_name = url.split("/")[-1]

    # Download chromedriver. The command is an argument list (no shell) and its
    # output is re-emitted with print() so it shows up in the notebook cell.
    chromedriver_zip = f"{work_dir}/{file_name}"
    command = ["wget", "-q"] if quiet else ["wget"]
    command += ["-O", chromedriver_zip, url]
    print(f"Downloading: {file_name}")
    result = subprocess.run(command, capture_output=True, text=True)
    if result.stdout:
        print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, end="")
    if result.returncode != 0:
        raise CantGetChromeDriverError(f"Failed to download {url}")

    # Extract chromedriver from zip; the archive is removed even when the
    # extraction fails.
    try:
        with zipfile.ZipFile(chromedriver_zip) as zpf:
            zpf.extract(member="chromedriver-linux64/chromedriver", path=work_dir)
    finally:
        os.remove(chromedriver_zip)

    # Move extracted chromedriver binary file to /usr/bin directory.
    # shutil.move also works when both paths are on different filesystems,
    # where os.rename would fail.
    source = f"{work_dir}/chromedriver-linux64/chromedriver"
    destination = "/usr/bin/chromedriver"
    shutil.move(source, destination)

    # Make chromedriver binary executable (adds the execute bits, like chmod +x)
    os.chmod(destination, os.stat(destination).st_mode | 0o111)

    # Remove empty chromedriver-linux64 folder
    shutil.rmtree(f"{work_dir}/chromedriver-linux64")

    print("Chromedriver installed")

def install_selenium_package(quiet: bool):
    """Install the ``selenium`` package with pip.

    pip is started through ``sys.executable -m pip`` so the package is
    installed for the interpreter that runs this code.

    Args:
        quiet: When true, pip runs with ``-qq`` and its standard output is
            appended to ``pip.log`` instead of being printed.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    command = [sys.executable, "-m", "pip", "install", "selenium"]
    if quiet:
        command.append("-qq")

    # The output is captured and re-emitted with print() so it shows up in the
    # notebook cell.
    result = subprocess.run(command, capture_output=True, text=True)
    if quiet:
        with open("pip.log", "a") as log:
            log.write(result.stdout)
    elif result.stdout:
        print(result.stdout, end="")
    if result.stderr:
        print(result.stderr, end="")

    # Fail here rather than with a confusing ImportError in a later cell.
    result.check_returncode()

def main(quiet: bool):
    """Install Chromium, a matching chromedriver and the selenium package.

    Args:
        quiet: Passed on to every installation step to reduce the output of
            wget, apt and pip.
    """
    # Get the latest version of chromium from linux mint packages site
    latest_version = get_chromium_latest_version()
    # Name of the deb file: the link is percent-encoded ("%2B" for "+"), the
    # local file name is not. urllib.parse is imported explicitly at the top of
    # the file; "import urllib" alone does not guarantee the submodule is loaded.
    deb_file = urllib.parse.unquote(latest_version, encoding="utf-8")
    # Download and install chromium
    install_chromium(latest_version, deb_file, quiet)
    # Check that the installation was successful
    check_chromium_installation(deb_file)
    # Install chromedriver
    install_chromedriver(deb_file, quiet)
    # Finally install selenium package
    install_selenium_package(quiet)

# Run the complete installation when this code is executed directly.
if __name__ == '__main__':
    quiet = True # True reduces the output of the wget, apt and pip steps
    main(quiet)

Downloading: chromium_131.0.6778.85~linuxmint1+virginia_amd64.deb
Installing: chromium_131.0.6778.85~linuxmint1+virginia_amd64.deb
Chromium installation successfull.

Downloading: chromedriver-linux64.zip
Chromedriver installed


In [12]:
import os
import time
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Configure the Selenium Chrome options
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run in headless mode
options.add_argument('--no-sandbox')  # avoid permission problems
options.add_argument('--disable-dev-shm-usage')  # avoid resource problems
options.add_argument('--disable-gpu')  # disable GPU rendering (Windows only)

# ChromeDriver path (NOTE: defined but not passed to webdriver.Chrome below)
chrome_driver_path = 'chromedriver'

# Imgur search URL
search_url = "https://imgur.com/search?q=cat"

# How often to scroll to the bottom of the page, and how long to wait for new
# content after each scroll
SCROLL_COUNT = 10
SCROLL_PAUSE_SECONDS = 2

driver = webdriver.Chrome(options=options)
try:
    driver.get(search_url)

    # Scroll repeatedly to make the page load more images
    for _ in range(SCROLL_COUNT):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SECONDS)  # wait for new content to load

    page_source = driver.page_source
finally:
    # Always shut the browser down, even when loading or scrolling fails;
    # otherwise the headless Chrome process keeps running.
    driver.quit()

# Parse the page content
soup = BeautifulSoup(page_source, 'html.parser')

# Extract the image URLs
image_elements = soup.find_all("img")
image_urls = [img["src"] for img in image_elements if "src" in img.attrs]

# Complete protocol-relative URLs (those starting with "//") with "https:"
full_image_urls = ["https:" + url if url.startswith("//") else url for url in image_urls]

# Write the image URLs to a CSV file
csv_file = "cat_images.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["img-src"])  # header row
    writer.writerows([[url] for url in full_image_urls])

print(f"CSV 檔案已保存：{csv_file}")

CSV 檔案已保存：cat_images.csv


In [13]:
#使用requests
import csv
import requests
import os

from urllib.parse import urlsplit

# Create the output folder
os.makedirs("images", exist_ok=True)

# Read the CSV with the same encoding and newline handling it was written with
with open("cat_images.csv", "r", newline="", encoding="utf-8") as file:
    reader = csv.DictReader(file)
    for row in reader:
        url = row["img-src"]
        try:
            # Use the last path component as the file name, so a query string
            # never ends up in it
            image_name = os.path.basename(urlsplit(url).path)
            if not image_name:
                raise ValueError("URL has no file name")
            filename = os.path.join("images", image_name)

            # The timeout prevents an endless hang, and the context manager
            # releases the streamed connection.
            with requests.get(url, stream=True, timeout=30) as response:
                # Do not save an HTTP error page as if it were an image
                response.raise_for_status()
                with open(filename, "wb") as img_file:
                    for chunk in response.iter_content(1024):
                        img_file.write(chunk)
            print(f"下載完成: {filename}")
        except Exception as e:
            # Best effort: report the failure and continue with the next URL
            print(f"下載失敗: {url} - {e}")


下載完成: images/UXCBWlWb.jpg
下載完成: images/ZhP2zUWb.jpg
下載完成: images/NUyttbnb.jpg
下載完成: images/O3EIPHpb.jpg
下載完成: images/jIZKh2sb.jpg
下載完成: images/ZZpVeoYb.jpg
下載完成: images/6tCJw81b.jpg
下載完成: images/vGla0Wxb.jpg
下載完成: images/U0iADj9b.jpg
下載完成: images/ggQUrJ9b.jpg
下載完成: images/mtbl1crb.jpg
下載完成: images/lVlPvCBb.jpg
下載完成: images/c5puGf3b.jpg
下載完成: images/grXqcNwb.jpg
下載完成: images/0sa6jrVb.jpg
下載完成: images/pbao8mhb.jpg
下載完成: images/X6DesPeb.jpg
下載完成: images/jhcN1Keb.jpg
下載完成: images/35QDIiFb.jpg
下載完成: images/nqcMAk0b.jpg
下載完成: images/3Qmaql6b.jpg
下載完成: images/Ejn0Yvib.jpg
下載完成: images/fqd9uUjb.jpg
下載完成: images/DIhbSrbb.jpg
下載完成: images/ymg1iqyb.jpg
下載完成: images/frKhsH4b.jpg
下載完成: images/0LINzxsb.jpg
下載完成: images/f4jZRrtb.jpg
下載完成: images/VMDIqHjb.jpg
下載完成: images/0Y1b8Xvb.jpg
下載完成: images/jZiyxAUb.jpg
下載完成: images/L20W8JHb.jpg
下載完成: images/Jvh1OQmb.jpg
下載完成: images/KWvtdg0b.jpg
下載完成: images/lJJZVRvb.jpg
下載完成: images/JZn9Hysb.jpg
下載完成: images/qBtvSZsb.jpg
下載完成: images/vLPhacab.jpg
下載完成: images

In [14]:
# Using image_downloader
# NOTE(review): the recorded run of this cell failed with ModuleNotFoundError,
# so `image_downloader` must be installed in the kernel's environment before
# this cell can run -- TODO confirm which package provides it.
from image_downloader import download_csv_file_images
download_csv_file_images("cat_images.csv", output_folder="./images2")

ModuleNotFoundError: No module named 'image_downloader'