In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import pandas as pd
import pprint
import time
import csv
import re

# 크롬 드라이버 자동 업데이트
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    """Raised when an element lookup still fails after every allowed retry."""

    def __init__(self, message):
        # Delegate to the base class explicitly; the message ends up in ``args``.
        Exception.__init__(self, message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    """Wait for an element to be present in the DOM, retrying on failure.

    Every attempt waits up to ``waiting_sec`` seconds through ``WebDriverWait``
    with ``EC.presence_of_element_located``.

    Args:
        driver_: WebDriver instance used for the lookup.
        by: Locator strategy (for example ``By.CSS_SELECTOR``).
        value: Locator value handed to the strategy.
        waiting_sec: Timeout of a single attempt, in seconds.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The element returned by the wait condition.

    Raises:
        RetriesExceededError: If no attempt succeeded within ``max_retries``
            tries. The last underlying Selenium error is chained as the cause.
    """
    # Local import: TimeoutException is not among the notebook's top-level imports.
    from selenium.common.exceptions import TimeoutException

    last_error = None
    for _ in range(max_retries):
        try:
            return WebDriverWait(driver_, waiting_sec).until(
                EC.presence_of_element_located((by, value))
            )
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException) as exc:
            # WebDriverWait.until signals an expired wait with TimeoutException,
            # so it has to be caught here for the retry loop to retry at all.
            last_error = exc

    raise RetriesExceededError(
        f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도'
    ) from last_error

In [3]:
# Run-level settings read by later cells.
waiting_sec = 2.5  # seconds to sleep after each page load before parsing
save_file = True   # whether to write the collected data to a CSV file

# User-Agent header value sent by the browser session.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/116.0.0.0 Safari/537.36"
)

chrome_options = Options()
# Prevent the browser window from closing on its own.
chrome_options.add_experimental_option("detach", True)
# Suppress unnecessary error messages.
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
chrome_options.add_argument('user-agent=' + user_agent)

# ChromeDriverManager resolves the chromedriver executable path for the service.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(options=chrome_options, service=service)

In [4]:
# NOTE(review): presumably the pause between infinite-scroll steps; it is not
# used by any cell visible in this notebook - confirm before removing.
inf_scroll_wait = 2

# Progress counter (pages processed so far) and wall-clock start timestamp.
ntime, start = 0, time.time()

In [5]:
# Read the saved listing page in a single step. The ``with`` block guarantees
# that the file handle is closed even when reading raises.
with open("./Danawa_quotation.html", encoding='utf-8') as text_file:
    html_text = text_file.read()

In [7]:
# Parse the saved listing HTML and collect every anchor matched by the selector.
soup = BeautifulSoup(markup=html_text, features='html.parser')
quotation_list = soup.select(
    'div.board_image_list.item_3 > div > a'
)
# Report how many anchors were found.
print(f'{len(quotation_list)}')

1080


In [8]:
# Display the first anchor's href to check what kind of URL the crawl will visit.
quotation_list[0]['href']

'http://pc26.danawa.com/bbs/?controller=boardReview&methods=assemblyGalleryDetail&orderGoodsSeq=28858801&orderNumberSeq=7273967'

In [9]:
# Column names of the result table, in output order.
header = [
    'cpu',
    'mainboard',
    'ram',
    'gpu',
    'case',
    'power',
    'cooler',
    'ssd',
    'sys_cooler',
]

In [10]:
# One independent, initially empty list per output column.
quo = {col: [] for col in header}

In [11]:
# Reset progress tracking right before the crawl: pages done, pages in total,
# and the wall-clock start used for the remaining-time estimate.
ntime, total = 0, len(quotation_list)
start = time.time()

In [12]:
# Crawl every detail page, read the product id of each part from the spec
# table and append one row per page to ``quo``. The loop starts at
# ``quotation_list[ntime]``, so re-running this cell after an interruption
# resumes where the previous run stopped.

# CSS selector of the rows of the spec table on a detail page.
SPEC_ROW_SELECTOR = '#boardContainer > div.detail_content_wrap > div > div.detail_body_wrap > div.detail_spec > table > tbody > tr'
# Captures everything between "productSeq=" and the next single quote.
PRODUCT_SEQ_RE = re.compile(r"productSeq=([^']*)'")
# Output column -> row label used in the spec table.
COLUMN_LABELS = {
    'cpu': 'CPU',
    'mainboard': '메인보드',
    'ram': '메모리',
    'gpu': '그래픽카드',
    'case': '케이스',
    'power': '파워',
    'cooler': '쿨러',
    'ssd': 'SSD',
    'sys_cooler': '시스템 쿨러',
}

# The implicit wait is a driver-wide setting; configure it once, not per page.
driver.implicitly_wait(10)

for quotation in quotation_list[ntime:]:
    url = quotation['href']
    driver.get(url)

    # Fixed pause so the page can finish rendering before the HTML is read.
    time.sleep(waiting_sec)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    tbl_names = soup.select(SPEC_ROW_SELECTOR + ' > th')
    tbl_pseqs = soup.select(SPEC_ROW_SELECTOR + ' > td')

    # Row label -> product id, for the rows that link to a product.
    info = dict()
    for name, pseq in zip(tbl_names, tbl_pseqs):
        anchor = pseq.a
        onclick = anchor.get('onclick') if anchor is not None else None
        if not onclick:
            # Cell without a link, or a link without an onclick handler.
            continue
        match = PRODUCT_SEQ_RE.search(onclick)
        if match is None:
            # No product id in the handler: skip instead of storing garbage.
            continue
        info[name.get_text().strip()] = match.group(1)

    # Drop partially written rows left behind by an interrupted earlier run so
    # that every column holds exactly ``ntime`` entries before appending.
    if len(quo['cpu']) >= ntime:
        for key in quo:
            quo[key] = quo[key][:ntime]
    # Parts missing on the page are recorded as None.
    for column, label in COLUMN_LABELS.items():
        quo[column].append(info.get(label))

    ntime += 1
    # Estimate: average time per page so far multiplied by the pages left.
    rest_time = (time.time() - start) / ntime * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:.0f}s)', end='')

end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
driver.quit()

1080/1080 (100.00%) (남은 시간: 0s))
걸린 시간: 3502.50 sec


In [14]:
# Save the collected rows to a date-stamped CSV file.
if save_file:
    today = time.strftime('%y%m%d')  # two-digit year, month and day
    # ``index`` is documented as a bool; False omits the row index column.
    pd.DataFrame(quo).to_csv(f'../quotations{today}.csv', index=False)