In [12]:
# !pip install -q selenium==4.3.0
# !pip install lxml
# !pip install -q beautifulsoup4==4.11.1
# !pip install -q backoff==2.1.2
# import backoff
# backoff.__version__
# !pip install -q pandas
# !pip install python-dotenv

In [1]:
from datetime import datetime
from pathlib import Path
from time import sleep

import backoff
import pandas as pd
import pytz
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard events


# Capture the run time once, in Taipei local time, so every later cell
# (row timestamps, output file name) refers to the same instant.
tz = pytz.timezone('Asia/Taipei')
today = datetime.now(tz=tz)
# Compact form used in the output file name, e.g. 20240804_0856
date_ = f"{today:%Y%m%d_%H%M}"

In [2]:
def set_ua():
    """Return the fixed desktop-Chrome User-Agent string sent by the browser session."""
    return (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/103.0.0.0 Safari/537.36'
    )

In [3]:
@backoff.on_exception(backoff.expo,
                      Exception,
                      max_time=10)
def chrome_init(command_executor='http://selenium-hub:4444/wd/hub', headless=False):
    """Open a Chrome session on a remote Selenium server and return its driver.

    Any exception raised while creating the session is retried by the
    ``backoff`` decorator with exponential delays, giving up (and re-raising)
    once about 10 seconds have elapsed in total.

    Args:
        command_executor: URL of the remote WebDriver endpoint. Defaults to
            the ``selenium-hub`` address this notebook was written against.
        headless: When true, pass ``--headless`` to Chrome. Defaults to False,
            matching the previous behaviour where the flag was commented out.

    Returns:
        The ``webdriver.Remote`` instance for the new session. The caller is
        responsible for calling ``quit()`` on it.
    """
    chrome_opt = webdriver.ChromeOptions()
    if headless:
        chrome_opt.add_argument('--headless')
    for flag in ('--no-sandbox',
                 '--ignore-ssl-errors=yes',
                 '--ignore-certificate-errors'):
        chrome_opt.add_argument(flag)
    chrome_opt.add_argument(f'user-agent={set_ua()}')
    # chrome_opt.add_argument("--incognito")  # Optional incognito mode; a Selenium-launched
    #                                          # browser profile is already clean, so this is
    #                                          # only for the extra cautious.

    return webdriver.Remote(
        command_executor=command_executor,
        options=chrome_opt
    )

In [4]:
url = 'https://store.steampowered.com/charts/mostplayed'  # target page
driver = chrome_init()
try:
    driver.get(url)  # navigate to the target page
    # Implicit wait applies to element lookups (find_element polls for up to
    # 5 seconds before failing); it does not delay or extend the page load.
    driver.implicitly_wait(5)
except BaseException:
    # Do not leave the remote session open on the Selenium server when
    # navigation fails or is interrupted.
    driver.quit()
    raise

In [5]:
# Send the END key to the root <html> element to scroll to the bottom of the
# page; targeting the root element is usually sufficient.
driver.find_element(
    By.CSS_SELECTOR,
    'html',
).send_keys(Keys.END)
# Fixed pause after scrolling before the page source is read in the next cell.
sleep(3)

In [6]:
# Grab the rendered page source, then release the remote browser session
# immediately: everything below works on the captured HTML string only, so
# a parsing error can no longer leave the session open on the Selenium server.
try:
    html_source = driver.page_source
finally:
    driver.quit()

# Parse the HTML with BeautifulSoup using the lxml parser
soup = BeautifulSoup(html_source, 'lxml')

# One <tr> per chart entry.
# NOTE(review): these class names look auto-generated; if the site changes
# them, `data` will be empty and `df` will have no rows -- confirm they are
# still current when results look wrong.
data = soup.find_all('tr', class_ = "_2-RN6nWOY56sNmcDHu069P")

columns = ['Rank', 'App_Id', 'Name', 'Current_Players', 'Peak_Players', "Datetime"]

# The snapshot time is identical for every row, so format it once.
d1 = today.strftime("%Y-%m-%d %H:%M")

save_lst = []
for stream in data:

    # Reset per row: a row without a usable link must not inherit the
    # previous row's id (or raise NameError on the very first row).
    app_id = None
    appid_href = stream.find('a', href=True)
    if appid_href:
        # The id is taken from the fifth '/'-separated piece of the href;
        # shorter hrefs leave app_id as None instead of raising IndexError.
        href_parts = appid_href['href'].split('/')
        if len(href_parts) > 4:
            app_id = href_parts[4]

    rank = stream.find("td", class_ = "_34h48M_x9S-9Q2FFPX_CcU").text
    title = stream.find("div", class_ = "_1n_4-zvf0n4aqGEksbgW9N").text
    current = stream.find("td", class_ = "_3L0CDDIUaOKTGfqdpqmjcy").text
    day_peak = stream.find("td", class_ = "yJB7DYKsuTG2AYhJdWTIk").text

    save_lst.append([rank, app_id, title, current, day_peak, d1])


# Build the frame in one go from the collected rows.
df = pd.DataFrame(
    data = save_lst,
    columns = columns
)

In [20]:
# Write the snapshot to a timestamped CSV. The output folder is created on
# first use because DataFrame.to_csv does not create missing directories.
out_dir = Path('csv/steam_mostplayed')
out_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(out_dir / f'steam_mostplayed_{date_}.csv', index=False, encoding="utf-8")

In [7]:
# Show the scraped table as this cell's output.
df

Unnamed: 0,Rank,App_Id,Name,Current_Players,Peak_Players,Datetime
0,1,730,Counter-Strike 2,666304,1273227,2024-08-04 08:56
1,2,2923300,Banana,404195,490769,2024-08-04 08:56
2,3,570,Dota 2,325043,726244,2024-08-04 08:56
3,4,1245620,ELDEN RING,117666,159961,2024-08-04 08:56
4,5,2139460,Once Human,117464,182655,2024-08-04 08:56
...,...,...,...,...,...,...
95,96,1151340,Fallout 76,10206,13090,2024-08-04 08:56
96,97,270880,American Truck Simulator,10146,10280,2024-08-04 08:56
97,98,386360,SMITE®,9978,10749,2024-08-04 08:56
98,99,292030,The Witcher 3: Wild Hunt,9860,24989,2024-08-04 08:56
