In [7]:
# !pip install -q selenium==4.3.0
# !pip install -q lxml==4.9.1
# !pip install -q beautifulsoup4==4.11.1
# !pip install -q backoff==2.1.2
# import backoff
# backoff.__version__
# !pip install -q pandas
# !pip install python-dotenv

In [8]:
from time import sleep
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys # 鍵盤事件
import pandas as pd
import backoff
from datetime import datetime
import pytz


# Timestamp of this notebook run in Taipei local time. `today` is reused for
# the per-row Datetime column and `date_` for the output file name.
tz = pytz.timezone('Asia/Taipei')
today = datetime.now(tz=tz)
date_ = f"{today:%Y%m%d_%H%M}"

In [9]:
def set_ua():
    """Return the fixed desktop-Chrome User-Agent string used by the scraper."""
    # Adjacent literals are concatenated at compile time into one string.
    return (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/103.0.0.0 Safari/537.36'
    )

In [10]:
@backoff.on_exception(backoff.expo, Exception, max_time=10)
def chrome_init():
    """Open a remote Chrome session on the Selenium hub and return the driver.

    The decorator retries the whole function on any ``Exception`` using
    exponential backoff, bounded by ``max_time=10``.

    Returns:
        The ``webdriver.Remote`` instance created against
        ``http://selenium-hub:4444/wd/hub``.
    """
    options = webdriver.ChromeOptions()

    # Switches that are currently disabled, kept for reference:
    #   '--headless'
    #   '--incognito'  (incognito mode; a Selenium-launched browser already
    #                   starts clean, so this is optional)
    enabled_flags = (
        '--no-sandbox',
        '--ignore-ssl-errors=yes',
        '--ignore-certificate-errors',
        f'user-agent={set_ua()}',
    )
    for flag in enabled_flags:
        options.add_argument(flag)

    return webdriver.Remote(
        command_executor='http://selenium-hub:4444/wd/hub',
        options=options,
    )

In [11]:
# Target page: the Steam Community broadcasts listing.
url = 'https://steamcommunity.com/?subsection=broadcasts'

driver = chrome_init()
# Implicit wait: element lookups keep polling for up to 5 seconds before
# failing, and return as soon as the element is found.
driver.implicitly_wait(5)
# Navigate to the target page.
driver.get(url)

In [12]:
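# # Use a temporary profile directory (would go inside chrome_init, on chrome_opt)
# chrome_opt.add_argument("--user-data-dir=/tmp/temporary-profile")
# # Set the default language to English
# chrome_opt.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})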

In [13]:
# General form: driver.<locate element>.send_keys(Keys.<KEY>, Keys.<KEY>, ...)
# Press END on the page 50 times with a 2-second pause after each press
# (presumably so the listing can load more cards between scrolls).
for i in range(50):
    # Targeting the root <html> element is usually sufficient for key events.
    page_root = driver.find_element(By.CSS_SELECTOR, 'html')
    page_root.send_keys(Keys.END)
    sleep(2)
    print(i, "success")

# Capture the page source after scrolling.
html_source = driver.page_source

0 success
1 success
2 success
3 success
4 success
5 success
6 success
7 success
8 success
9 success
10 success
11 success
12 success
13 success
14 success
15 success
16 success
17 success
18 success
19 success
20 success
21 success
22 success
23 success
24 success
25 success
26 success
27 success
28 success
29 success
30 success
31 success
32 success
33 success
34 success
35 success
36 success
37 success
38 success
39 success
40 success
41 success
42 success
43 success
44 success
45 success
46 success
47 success
48 success
49 success


In [14]:

# Parse the scrolled broadcast page into one row per broadcast card and
# de-duplicate the result. The remote browser session is closed in `finally`
# so it is released even when parsing raises.
try:
    html_source = driver.page_source

    # Parse the HTML with BeautifulSoup (lxml backend).
    soup = BeautifulSoup(html_source, 'lxml')

    # `cards` replaces the former name `all`, which shadowed the builtin all().
    cards = soup.find_all("div", class_="apphub_CardMetaData")
    pid = soup.find_all("div", class_="apphub_CardContentAuthorName")

    columns = ['GameName', 'SteamCurrentViewers', 'PersonID', "Datetime"]

    # Every row carries the same run timestamp, so format it once.
    d1 = today.strftime("%Y-%m-%d %H:%M")

    save_lst = []
    # NOTE(review): zip() pairs the two lists by position and stops at the
    # shorter one; this assumes each metadata div has exactly one matching
    # author div in the same order -- confirm against the page markup.
    for stream, person in zip(cards, pid):
        title_tag = stream.find("div", class_="apphub_CardContentTitle ellipsis")
        viewers_tag = stream.find("div", class_="apphub_CardContentViewers ellipsis")

        # A card lacking one of these elements yields a missing value instead
        # of raising AttributeError on `None.text`.
        title = title_tag.text if title_tag is not None else None
        Viewers = viewers_tag.text if viewers_tag is not None else None

        # First non-empty link text inside the author div. Defaults to None so
        # a card without one no longer reuses the previous row's id (or raises
        # NameError on the first row).
        link_texts = (a.text.strip() for a in person.find_all('a'))
        ppid = next((text for text in link_texts if text), None)

        save_lst.append([title, Viewers, ppid, d1])

    df = pd.DataFrame(
        data=save_lst,
        columns=columns
    )

    # Drop repeated (game, broadcaster) pairs, keeping the first occurrence.
    df = df.drop_duplicates(subset=['GameName', 'PersonID'])
finally:
    driver.quit()

In [15]:
# Write the scraped table to a timestamped CSV file.
import os

out_path = f'csv/steam_broadcast/steam_broadcast_{date_}.csv'
# to_csv does not create missing parent directories, so make sure the
# output folder exists first (no-op when it already does).
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df.to_csv(out_path, index=False, encoding="utf-8")