In [24]:
# !pip install -q selenium==4.3.0
# !pip install lxml
# !pip install -q beautifulsoup4==4.11.1
# !pip install -q backoff==2.1.2
# import backoff
# backoff.__version__
# !pip install -q pandas
# !pip install python-dotenv

In [25]:
from datetime import datetime
from pathlib import Path
from time import sleep

import backoff
import pandas as pd
import pytz
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys  # keyboard events


# Timestamp of this run in Taipei local time; `date_` is the compact form
# (YYYYMMDD_HHMM) used in output file names.
tz = pytz.timezone('Asia/Taipei')
today = datetime.now(tz=tz)
date_ = f"{today:%Y%m%d_%H%M}"

In [26]:
def set_ua():
    """Return the fixed User-Agent string (Chrome 103 on Windows 10 x64) for the browser session."""
    return (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/103.0.0.0 Safari/537.36'
    )

In [27]:
@backoff.on_exception(backoff.expo, Exception, max_time=10)
def chrome_init():
    """Create a remote Chrome WebDriver session on the Selenium hub.

    The call is retried with exponential backoff on any exception, for up to
    10 seconds in total (see the decorator).

    Returns:
        The ``webdriver.Remote`` instance connected to
        ``http://selenium-hub:4444/wd/hub``.
    """
    # Flags are applied in this order. Optional flags that are currently not
    # enabled: '--headless', '--incognito'.
    launch_flags = (
        '--no-sandbox',
        '--ignore-ssl-errors=yes',
        '--ignore-certificate-errors',
        f'user-agent={set_ua()}',
    )

    options = webdriver.ChromeOptions()
    for flag in launch_flags:
        options.add_argument(flag)

    return webdriver.Remote(
        command_executor='http://selenium-hub:4444/wd/hub',
        options=options,
    )

In [28]:
# Steam store search page filtered to top sellers.
url = 'https://store.steampowered.com/search/?filter=topsellers' # target URL
driver = chrome_init()
driver.get(url) # navigate to the target URL
# Set the driver's implicit wait to 5 seconds.
# NOTE(review): per the Selenium docs this timeout applies to element lookups
# (find_element), not to page loading, and a lookup returns as soon as the
# element is found rather than always waiting the full 5 seconds.
driver.implicitly_wait(5)

In [29]:
# General form: driver.<locate element>.send_keys(Keys.<key>, Keys.<key>, ...)
# Press END on the page 15 times, pausing 3 seconds after each press
# (presumably so that further results can load — confirm against the site).
for i in range(15):
    # Targeting the root <html> element is usually sufficient.
    page_root = driver.find_element(By.CSS_SELECTOR, 'html')
    page_root.send_keys(Keys.END)
    sleep(3)
    print(i)  # progress counter

# Snapshot of the page after scrolling.
html_source = driver.page_source

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [30]:
# Capture the rendered page source, then release the remote browser right
# away so that a parsing error further down cannot leave the Selenium
# session open on the hub.
try:
    html_source = driver.page_source
finally:
    driver.quit()

# Parse the HTML with BeautifulSoup.
soup = BeautifulSoup(html_source, 'lxml')

# Named `result_container` (not `all`) so the builtin `all` is not shadowed.
result_container = soup.find('div', id="search_result_container")
if result_container is None:
    raise RuntimeError(
        'Could not find <div id="search_result_container"> in the page source'
    )

name_blocks = result_container.find_all('div', class_="responsive_search_name_combined")
anchors = result_container.find_all('a', href=True)

columns = ['GameName', 'GameID', 'Platform', 'Datetime']

# One timestamp for the whole scrape; it does not change per row.
scraped_at = today.strftime("%Y-%m-%d %H:%M")

save_lst = []
for position, name_block in enumerate(name_blocks):
    # Prefer the link that encloses this result so title and URL cannot drift
    # apart when the container holds other <a href> elements.
    # NOTE(review): assumes each result block is nested inside its own
    # <a href> — confirm against the live markup. If it is not, fall back to
    # pairing by position, as before.
    link = name_block.find_parent('a', href=True)
    if link is None:
        if position >= len(anchors):
            continue  # no link available for this row
        link = anchors[position]

    # The id is the 5th '/'-separated segment of the URL; None if the link
    # is too short to contain it.
    url_parts = link['href'].split('/')
    app_id = url_parts[4] if len(url_parts) > 4 else None

    title_tag = name_block.find("span", class_="title")
    title = title_tag.text if title_tag is not None else None

    # The platform name is the second CSS class on each `platform_img` span;
    # spans that carry only one class are skipped.
    platform_list = [
        tag['class'][1]
        for tag in name_block.find_all('span', class_='platform_img')
        if len(tag['class']) > 1
    ]

    save_lst.append([title, app_id, platform_list, scraped_at])

df = pd.DataFrame(data=save_lst, columns=columns)


In [32]:

# Write the scraped table to a timestamped CSV. The output folder is created
# first because `to_csv` does not create missing directories.
output_dir = Path('csv/steam_platform_info')
output_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(output_dir / f'steam_platform_info_{date_}.csv', index=False, encoding="utf-8")

In [33]:
# Display the scraped DataFrame as the cell's output.
df

Unnamed: 0,Name,App_Id,Platform,Datetime
0,NARAKA: BLADEPOINT,1203220,[win],2024-08-03 21:05
1,The First Descendant,2074920,[win],2024-08-03 21:05
2,Once Human,2139460,[win],2024-08-03 21:05
3,PUBG: BATTLEGROUNDS,578080,[win],2024-08-03 21:05
4,Counter-Strike 2,730,"[win, linux]",2024-08-03 21:05
...,...,...,...,...
895,Hollow Knight - Official Soundtrack,598190,[music],2024-08-03 21:05
896,Platform 8,2903560,[win],2024-08-03 21:05
897,Pathless Woods,1726130,[win],2024-08-03 21:05
898,ZOMBIE RAID: No One Survives,688540,[win],2024-08-03 21:05
