In [1]:
import csv
import pprint
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait

# Automatic ChromeDriver download/update
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    """Error raised once an element lookup has used up all of its retries."""

    def __init__(self, message):
        # Forward the text to Exception so str(error) and error.args carry it.
        Exception.__init__(self, message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    """Locate an element, repeating the explicit wait until it succeeds.

    Each attempt waits up to ``waiting_sec`` seconds for the element identified
    by ``(by, value)`` to be present, and a failed attempt is retried up to
    ``max_retries`` times in total.

    Args:
        driver_: WebDriver instance used for the lookup.
        by: Locator strategy (for example ``By.XPATH``).
        value: Locator value matching the strategy.
        waiting_sec: Timeout in seconds of every single attempt.
        max_retries: Maximum number of attempts.

    Returns:
        The element returned by the wait condition.

    Raises:
        RetriesExceededError: If no attempt found the element.
    """
    last_error = None
    for _ in range(max_retries):
        try:
            return WebDriverWait(driver_, waiting_sec).until(
                EC.presence_of_element_located((by, value))
            )
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException) as exc:
            # WebDriverWait.until() reports an expired wait as TimeoutException;
            # without catching it a slow page would abort on the first attempt
            # and the retry budget would never be used.
            last_error = exc

    raise RetriesExceededError(
        f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도'
    ) from last_error

In [3]:
# Run-level switches.
save_file = True  # whether results are written to disk
waiting_sec = 2.5

# Browser identity sent with every request.
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"

chrome_options = Options()
# Prevent the browser from closing on its own.
chrome_options.add_experimental_option("detach", True)
chrome_options.add_argument('user-agent=' + user_agent)
# Get rid of unnecessary error messages.
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# The driver binary path comes from ChromeDriverManager().install().
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

In [4]:
# Crawl targets: the game page URLs live in the `link` column of this CSV.
links = pd.read_csv('gamesysreqlink230919.csv')
print(links['link'])

0       https://gamesystemrequirements.com/game/a-bird...
1       https://gamesystemrequirements.com/game/a-blin...
2       https://gamesystemrequirements.com/game/a-boy-...
3       https://gamesystemrequirements.com/game/a-city...
4       https://gamesystemrequirements.com/game/a-fare...
                              ...                        
7708      https://gamesystemrequirements.com/game/99vidas
7709    https://gamesystemrequirements.com/game/9th-co...
7710    https://gamesystemrequirements.com/game/esz-kerek
7711          https://gamesystemrequirements.com/game/ete
7712      https://gamesystemrequirements.com/game/reality
Name: link, Length: 7713, dtype: object


In [5]:
# Columns of the requirements table.
header_req = [
    'name',
    'cpu', 'ram', 'gpu', 'dx', 'os', 'sto',
    'spec_class',
]
# Columns of the programs table.
header_pro = ['name', 'usage', 'image']
# Section heading on a game page -> value stored in the `spec_class` column.
specc = dict([
    ('Minimum system requirements:', 0),
    ('Recommended system requirements:', 1),
])

In [6]:
# Column-oriented result tables: one (initially empty) list per column.
requirement = {column: [] for column in header_req}
program = {column: [] for column in header_pro}

In [7]:
# Results saved by an earlier run; they let the crawl resume instead of restarting.
load_pro = pd.read_csv('gameprograms230920.csv')
load_req = pd.read_csv('gamerequirements230920.csv')
# ntime: number of program rows already stored (the crawl loop slices the link list from here).
# total: number of links overall.
ntime, total = len(load_pro), len(links)

In [8]:
# Seed the in-memory tables with the previously saved columns.
program.update({column: load_pro[column].to_list() for column in header_pro})
requirement.update({column: load_req[column].to_list() for column in header_req})

In [9]:
# Start of the crawl: open the landing page and dismiss the dialog that covers it.
key_not_in = []  # (title, heading) pairs for sections whose heading is not in specc
url = 'https://gamesystemrequirements.com/'
driver.get(url)
# implicitly_wait() only configures how long element lookups keep polling; it
# does not pause by itself, so setting it once before the lookup is enough.
driver.implicitly_wait(10)
try:
    # Presumably the accept button of the cookie-consent dialog (id "qc-cmp2-ui").
    driver.find_element(By.XPATH, '//*[@id="qc-cmp2-ui"]/div[2]/div/button[2]').click()
except NoSuchElementException:
    # The crawl reads driver.page_source only, so a dialog that never showed up
    # (or whose markup changed) should not stop the notebook.
    print('Consent dialog button not found; continuing without dismissing it.')

In [10]:
def _parse_game_page(soup):
    """Extract everything the crawl stores from one parsed game page.

    Args:
        soup: BeautifulSoup document built from the page source.

    Returns:
        A tuple ``(title, image_src, rows, unknown)``:
        ``title`` is the text of the title element, ``image_src`` the ``src`` of
        the cover image (``None`` when the image element is absent), ``rows`` a
        list of dicts with one entry per recognised requirements section, and
        ``unknown`` a list of ``(title, heading)`` pairs for sections whose
        heading is not a key of ``specc``.

    Raises:
        AttributeError: If the page has no title element (the lookup is not guarded).
    """
    title = soup.select_one('body > div.main-container > div.headline > div > div.game_head > div.game_head_title').get_text()
    img = soup.select_one('body > div.main-container > div.headline > div > div.game_head > div.game_head_cover > img')
    image_src = img['src'] if img is not None else None

    rows, unknown = [], []
    sections = soup.select('body > div.main-container > div.main-column > div:nth-child(1) > div.gsr_container > div.gsr_section')
    for section in sections:
        if not section.h2:
            continue
        heading = section.h2.get_text()
        if heading not in specc:
            unknown.append((title, heading))
            continue
        labels = section.find_all('div', {'class': 'gsr_label'})
        texts = section.find_all('div', {'class': 'gsr_text'})
        # Labels and values are paired by position within the section.
        info = {label.get_text(): text.get_text() for label, text in zip(labels, texts)}
        rows.append({
            'spec_class': specc[heading],
            'name': title,
            'cpu': info.get('CPU:'),
            'ram': info.get('RAM:'),
            'gpu': info.get('GPU:'),
            'dx': info.get('DX:'),
            'os': info.get('OS:'),
            'sto': info.get('STO:'),
        })
    return title, image_src, rows, unknown


time.sleep(waiting_sec)
start = time.time()
nstart = ntime
for link in links.link[ntime:]:
    driver.get(link)
    # Fixed pause after the page load before the DOM is read.
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Parse into locals first: if anything above or inside the parser raises,
    # none of the shared result lists has been touched, so re-running this cell
    # cannot duplicate rows of a half-processed page.
    title, image_src, page_rows, unknown = _parse_game_page(soup)

    # Drop leftovers of an interrupted earlier run so program stays aligned with ntime.
    for key in program:
        del program[key][ntime:]

    # Commit the whole page in one go.
    program['name'].append(title)
    program['usage'].append(None)
    program['image'].append(image_src)
    for row in page_rows:
        for column, cell in row.items():
            requirement[column].append(cell)
    key_not_in.extend(unknown)
    ntime += 1

    # Checkpoint every 10 pages so an aborted crawl loses little work.
    if save_file and ntime % 10 == 0:
        today = time.strftime('%y%m%d')
        pd.DataFrame(requirement).to_csv(f'gamerequirements{today}.csv', index=False)
        pd.DataFrame(program).to_csv(f'gameprograms{today}.csv', index=False)

    # Estimate the remaining time from the average duration per page of this run.
    rest_time = (time.time() - start) / (ntime - nstart) * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:05.0f}s) / 예상 종료 시간: {time.strftime("%d, %H:%M:%S", time.localtime(time.time() + rest_time))})', end='')

end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
driver.quit()
# End of the crawl

7713/7713 (100.00%) (남은 시간: 00000s) / 예상 종료 시간: 20, 22:08:31)
걸린 시간: 586.07 sec


In [11]:
# Write the final tables to date-stamped CSV files.
if save_file:
    today = time.strftime('%y%m%d')
    outputs = {
        f'gamerequirements{today}.csv': requirement,
        f'gameprograms{today}.csv': program,
    }
    for path, table in outputs.items():
        pd.DataFrame(table).to_csv(path, index=None)

In [12]:
# Show the (title, heading) pairs collected for section headings that are not keys of specc.
key_not_in

[]