In [None]:
!pip install wget

In [None]:
import json
import os
import subprocess
from time import sleep
from urllib.parse import parse_qs, urlparse

import wget
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

'''
Selenium with Python 中文翻譯文檔
參考網頁：https://selenium-python-zh.readthedocs.io/en/latest/index.html
selenium 啓動 Chrome 的進階配置參數
參考網址：https://stackoverflow.max-everyday.com/2019/12/selenium-chrome-options/
'''

# Command-line switches passed to Chrome at launch.
# "--headless" (run in the background without a visible browser window) is
# intentionally not in the list; add it here to enable it.
chrome_switches = (
    "--start-maximized",         # open the window maximized
    "--incognito",               # use incognito mode
    "--disable-popup-blocking",  # do not block pop-ups
    "--disable-notifications",   # turn off Chrome push notifications
    "--lang=zh-TW",              # set the language to Traditional Chinese
)

my_options = webdriver.ChromeOptions()
for chrome_switch in chrome_switches:
    my_options.add_argument(chrome_switch)

# Chrome WebDriver shared by every function in this notebook; the driver
# binary path is whatever ChromeDriverManager().install() returns.
driver = webdriver.Chrome(
    service=Service(ChromeDriverManager().install()),
    options=my_options,
)

# Folder that receives the scraped metadata (youtube.json) and the videos.
folderPath = 'youtube'
# exist_ok=True makes this idempotent on re-runs and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(folderPath, exist_ok=True)

# Holds the scraped records (one dict per video).
listData = []

In [None]:
def visit():
    """Point the shared browser session at the YouTube home page."""
    home_url = 'https://www.youtube.com/'
    driver.get(home_url)
    
def search(keyword="張學友"):
    """Type a query into the page's search input and submit it.

    Args:
        keyword: Text to search for. Defaults to the query that used to be
            hard-coded, so a bare ``search()`` call behaves as before.
    """
    txtInput = driver.find_element(By.CSS_SELECTOR, "input#search")
    txtInput.send_keys(keyword)

    # Fixed pause between typing and submitting.
    sleep(1)

    txtInput.submit()

    # Fixed pause after submitting, before control returns to the caller.
    sleep(1)
    
def filterfunc(filter_index=9):
    """Open the search filter menu and click one of its filter links.

    Waits up to 10 seconds for the filter toggle button to be present,
    clicks it, pauses 2 seconds, then clicks the link at ``filter_index``
    among all filter links found on the page. A wait timeout is reported
    with a printed message instead of being raised.

    Args:
        filter_index: Position of the filter link to click within the list
            of matched links. Defaults to 9, the previously hard-coded
            entry. NOTE(review): which filter this is depends on the page's
            current layout - confirm before relying on it.

    Raises:
        IndexError: If fewer links than required by ``filter_index`` exist.
    """
    toggle_selector = (
        "ytd-toggle-button-renderer.style-scope.ytd-search-sub-menu-renderer"
    )
    link_selector = (
        "a#endpoint.yt-simple-endpoint.style-scope.ytd-search-filter-renderer"
    )

    try:
        # until() hands back the element the condition located, so the
        # toggle does not have to be looked up a second time.
        toggle = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, toggle_selector))
        )
        toggle.click()

        # Fixed pause before querying the filter links.
        sleep(2)

        links = driver.find_elements(By.CSS_SELECTOR, link_selector)
        try:
            target = links[filter_index]
        except IndexError:
            # Same exception type as a bare out-of-range index, but with a
            # message that says what was missing.
            raise IndexError(
                f"filter link {filter_index} requested but only "
                f"{len(links)} filter links were found"
            ) from None
        target.click()

    except TimeoutException:
        print("等待逾時!")

def scroll(limit=3, max_offset=600, pause=3):
    """Scroll the page to the bottom repeatedly so lazily loaded items appear.

    Each pass reads the document's scrollHeight, smooth-scrolls to it, waits
    ``pause`` seconds, then reads scrollHeight again. A pass whose height did
    not change is counted as ineffective; the counter is cumulative and is
    never reset. The loop runs while the counter is <= ``limit`` (so up to
    ``limit + 1`` ineffective passes are tolerated) and also stops as soon as
    the scroll target reaches ``max_offset``.

    To scroll inside a container instead of the window, the equivalent
    script is ``document.querySelector('div').scrollTo({...})``. Some sites
    hide earlier elements after a long jump; scrolling by a fixed step
    (e.g. ``offset += 600``) instead of to scrollHeight avoids that.

    Args:
        limit: Highest ineffective-pass count at which scrolling continues.
        max_offset: Stop once the scroll target is at least this many pixels.
            Defaults to 600, the previous hard-coded experimental cap; pass
            ``None`` to scroll until the page stops growing.
        pause: Seconds to wait after each scroll for new content to load.
    """
    height_script = 'return document.documentElement.scrollHeight;'
    count = 0

    while count <= limit:
        # Target of this pass: the current total height of the document.
        offset = driver.execute_script(height_script)

        driver.execute_script(f'''
            window.scrollTo({{
                top: {offset},
                behavior: 'smooth'
            }});
        ''')

        # Forced wait: if new elements are generated, the document height
        # grows during this time.
        sleep(pause)

        innerHeight = driver.execute_script(height_script)

        # Height unchanged after scrolling: nothing new was loaded.
        if offset == innerHeight:
            count += 1

        # Optional cap so experiments do not scroll indefinitely.
        if max_offset is not None and offset >= max_offset:
            break


def _extract_video_id(link):
    """Return the video id from a YouTube watch or shorts URL, or None."""
    if not link:
        return None
    parts = urlparse(link)
    if "/shorts/" in parts.path:
        # Shorts carry the id as the path segment right after /shorts/;
        # working on the parsed path keeps any ?query out of the id.
        return parts.path.split("/shorts/", 1)[1].split("/", 1)[0] or None
    # Regular videos carry the id in the "v" query parameter.
    ids = parse_qs(parts.query).get("v")
    return ids[0] if ids else None


def parse():
    """Collect id, title, link and thumbnail of every listed video.

    Empties the module-level ``listData`` and refills it with one dict per
    ``ytd-video-renderer`` element on the current page, using the keys
    ``id``, ``title``, ``link`` and ``img``. Each extracted value is also
    printed. Entries whose link is missing or carries no video id are
    skipped rather than aborting the whole run.
    """
    listData.clear()

    elements = driver.find_elements(
        By.CSS_SELECTOR,
        'ytd-video-renderer.style-scope.ytd-item-section-renderer'
    )

    for elm in elements:
        print("=" * 30)

        img = elm.find_element(By.CSS_SELECTOR, "a#thumbnail img")
        imgSrc = img.get_attribute("src")
        print(imgSrc)

        a = elm.find_element(By.CSS_SELECTOR, "a#video-title")
        aTitle = a.get_attribute("innerText")
        print(aTitle)

        aLink = a.get_attribute("href")
        print(aLink)

        youtube_id = _extract_video_id(aLink)
        if youtube_id is None:
            print("skipped: no video id found in link")
            continue
        print(youtube_id)

        listData.append({
            "id": youtube_id,
            "title": aTitle,
            "link": aLink,
            "img": imgSrc
        })

def JsonStatham():
    """Write listData to youtube.json inside folderPath as indented UTF-8 JSON."""
    output_path = f"{folderPath}/youtube.json"
    with open(output_path, "w", encoding="utf-8") as out_file:
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        serialized = json.dumps(listData, indent=4, ensure_ascii=False)
        out_file.write(serialized)
def close():
    """End the shared WebDriver session by calling quit() on the driver."""
    driver.quit()

In [None]:
# Scraping pipeline; the order of the calls matters because each step acts on
# the browser state left by the previous one.
if __name__ == "__main__":
    visit()         # open the YouTube home page
    search()        # type the query and submit it
    filterfunc()    # open the filter menu and click a filter link
    scroll()        # scroll down so more results are loaded
    parse()         # collect id/title/link/thumbnail into listData
    JsonStatham()   # write listData to youtube/youtube.json

In [None]:
# Quit the WebDriver session (close() calls driver.quit()).
close()

In [58]:
def download(limit=3):
    """Download scraped videos with yt-dlp.

    Fetches ``yt-dlp.exe`` into the working directory when it is not already
    there, reads the records from ``{folderPath}/youtube.json`` and runs
    yt-dlp once per record, printing "complete" or "FAIL" according to the
    process exit code. Files are written to ``{folderPath}/<id>.<ext>``.

    Args:
        limit: Maximum number of records to download, counted from the start
            of the file. Defaults to 3, the previously hard-coded cap; pass
            ``None`` to download every record. Expected to be non-negative.
    """
    if not os.path.exists("./yt-dlp.exe"):
        print("[下載 yt-dlp]")
        wget.download("https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp.exe")

    with open(f"{folderPath}/youtube.json", "r", encoding="utf-8") as file:
        listResult = json.load(file)

    for obj in listResult[:limit]:
        print(f'正在下載:{obj["link"]}')

        # Argument list (no shell involved): format selector "b[ext=mp4]"
        # and an output template based on the video id and extension.
        cmd = [
            "./yt-dlp.exe",
            obj["link"],
            "-f",
            "b[ext=mp4]",
            "-o",
            f"{folderPath}/%(id)s.%(ext)s"
        ]

        obj_sp = subprocess.run(cmd)

        if obj_sp.returncode == 0:
            print("complete")
        else:
            print("FAIL")

In [59]:
# Run the download step: reads youtube/youtube.json and invokes yt-dlp per entry.
download()

正在下載:https://www.youtube.com/watch?v=XHCBKSI1ppM&pp=ygUJ5by15a245Y-L
YO
正在下載:https://www.youtube.com/watch?v=1bDAC3gmO4A&pp=ygUJ5by15a245Y-L
YO
正在下載:https://www.youtube.com/watch?v=XNddVxU3Fzc&pp=ygUJ5by15a245Y-L
YO
