In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import pandas as pd
import pprint
import time
import csv
import re

# 크롬 드라이버 자동 업데이트
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    """Raised by find_element_with_retry once every attempt has failed."""

    def __init__(self, message):
        # Hand the message straight to Exception so str(err) and err.args carry it.
        Exception.__init__(self, message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    """Wait for an element to be present in the DOM, retrying on failure.

    Args:
        driver_: WebDriver instance handed to WebDriverWait.
        by: Locator strategy (e.g. By.CSS_SELECTOR).
        value: Locator value matching the chosen strategy.
        waiting_sec: Timeout in seconds for each individual wait.
        max_retries: Maximum number of waits before giving up.

    Returns:
        The element returned by the first successful wait.

    Raises:
        RetriesExceededError: If no attempt finds the element
            (immediately when max_retries <= 0).
    """
    # Imported here so the function is self-contained; the notebook's import
    # cell only brings in NoSuchElementException / StaleElementReferenceException.
    from selenium.common.exceptions import TimeoutException

    locator = (by, value)
    for _ in range(max_retries):
        try:
            return WebDriverWait(driver_, waiting_sec).until(EC.presence_of_element_located(locator))
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
            # WebDriverWait.until reports "not found within the timeout" as
            # TimeoutException; without catching it the retry loop never retried.
            continue

    raise RetriesExceededError(f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도')

In [3]:
# Crawl settings
save_file = True   # whether to save the results to a file
waiting_sec = 2.5  # seconds to sleep after each page load in the crawl loop

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"

chrome_options = Options()
# Prevent the browser from closing
chrome_options.add_experimental_option("detach", True)
# Suppress unnecessary error messages
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
chrome_options.add_argument('user-agent=' + user_agent)

# webdriver_manager resolves the chromedriver binary and returns its path.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(
    service=service,
    options=chrome_options,
)

In [4]:
# Read the seq file
category = 112753  # graphics cards

# The CSV is expected to expose a `seq` column; the crawl loop iterates seq_df.seq.
seq_file = './seq_' + str(category) + '.csv'
seq_df = pd.read_csv(seq_file)
print(seq_df)

           seq
0     18668606
1     20274764
2     19773293
3     13463144
4     20315144
...        ...
1316   7424305
1317   6643450
1318   5406903
1319   3928707
1320   1029593

[1321 rows x 1 columns]


In [5]:
# Output columns, in the order the crawl loop fills them.
# The previous list described CPU-cooler fields (tdp, intel_socket, radiator, ...),
# while the crawl loop appends graphics-card fields (chipset, nm, base_clock, ...);
# a dict built from the old list raised KeyError on the first product.
header = [
    # product summary
    'name', 'price', 'link', 'company', 'product_seq', 'image',
    # GPU
    'chipset_company', 'chipset', 'nm', 'base_clock', 'boost_clock',
    'cuda_processor', 'stream_processor', 'interface',
    # memory
    'memory_type', 'memory_capacity', 'memory_clock', 'memory_bus',
    # outputs and extra functions (bitmask columns: port, additional_function)
    'port', 'monitor_support', 'additional_function',
    # power
    'usage_power', 'recommend_power',
    # cooling and dimensions ('pan_number' is the spelling the crawl loop uses)
    'cooling_type', 'pan_number', 'length', 'thickness', 'pin',
    # misc
    'feature', 'as_years', 'bench_mark', 'reg_date', 'bookmark',
]

In [6]:
# Labels for the `feature` bitmask column: the crawl loop sets bit i when
# feat_bitmask[i] appears as a title in the product's spec table.
feat_bitmask = [
    'LED 라이트',              # bit 0
    'PWM 지원',                # bit 1
    'RGB 컨트롤러',            # bit 2
    '팬 컨트롤러',             # bit 3
    '리모콘 지원',             # bit 4
    '펌프속도조절',            # bit 5
    '워터블록/로고 회전',      # bit 6
    'LCD',                     # bit 7
    '인디게이터',              # bit 8
    '데이지체인',              # bit 9
    '제로팬(0-dB기술)',        # bit 10
    '수랭 커스텀',             # bit 11
    '속도조절스위치',          # bit 12
    '라디에이터 양면팬 지원',  # bit 13
    '자석 고정형',             # bit 14
]

In [7]:
# Column-oriented result store: one empty list per output column in `header`.
cooler = {column: [] for column in header}

In [8]:
# Progress bookkeeping for the crawl loop.
start = time.time()      # wall-clock start, used for elapsed / remaining time
total = seq_df.shape[0]  # number of products to crawl
ntime = 0                # products processed so far; the loop starts from this offset

In [10]:
# Start crawling
#
# For every product sequence number from offset `ntime` onwards: load the
# Danawa product page, parse the summary area and the spec table, and append
# one value per output column to `cooler`.
#
# NOTE(review): port_bitmask, af_bitmask and ct_bitmask are not defined in any
# visible cell of this notebook; they must exist (as lists of spec-table
# titles, like feat_bitmask) before this cell runs - confirm and restore them.


def _strip_tokens(spec, key, *tokens):
    """Return spec[key] with every literal token removed, or None when the key is absent."""
    value = spec.get(key)
    if value is None:
        return None
    for token in tokens:
        value = value.replace(token, '')
    return value


def _strip_pattern(spec, key, pattern):
    """Return spec[key] with every regex match of pattern removed, or None when the key is absent."""
    value = spec.get(key)
    return None if value is None else re.sub(pattern, '', value)


def _bitmask(spec, labels):
    """Return an int whose bit i is set when labels[i] is a title present in spec."""
    bits = 0
    for i, label in enumerate(labels):
        if label in spec:
            bits |= 1 << i
    return bits


# Spec-table titles that may carry the chipset name, in lookup priority order.
_CHIPSET_KEYS = ('NVIDIA 칩셋', 'AMD 칩셋', '기타 칩셋', '인텔 칩셋')
# Power-connector labels; the character right after a label is taken as the count.
_PIN_LABELS = ('6핀 x', '8핀 x', '12핀 x', '16핀(12VHPWR) x')

# The implicit wait is a driver-wide setting, so set it once rather than per page.
driver.implicitly_wait(10)

for seq in seq_df.seq[ntime:]:
    url = f'https://prod.danawa.com/info/?pcode={seq}&cate={category}'
    driver.get(url)

    # Give the page time to render before taking the HTML snapshot.
    time.sleep(waiting_sec)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    spec_tbl_tit = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > th.tit")
    spec_tbl_dsc = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > td.dsc")
    name = soup.select_one("#blog_content > div.summary_info > div.top_summary > h3 > span")
    price = soup.select_one("#blog_content > div.summary_info > div.detail_summary > div.summary_left > div.lowest_area > div.lowest_top > div.row.lowest_price > span.lwst_prc > a > em")
    image_url = soup.select_one("#baseImage")
    as_year = soup.select("#blog_content > div.summary_info > div.top_summary > div > div.sub_dsc > div > dl > dd > div > div > span > u:-soup-contains('A/S')")

    # Flatten the page into {title: cleaned description}.
    info = dict()
    info['제품명'] = name.get_text()
    info['최저가'] = price.get_text() if price is not None else None
    info['링크'] = url
    info['이미지'] = image_url['src']
    for tit, dsc in zip(spec_tbl_tit, spec_tbl_dsc):
        text = dsc.get_text()
        text = text.replace('\n', '')
        text = text.replace('\t', '')
        text = text.replace('(제조사 웹사이트 바로가기)', '')
        info[tit.get_text()] = text.strip()

    # Chipset name: first matching title wins; None when no chipset row exists
    # (previously the variable was left unbound or kept the previous product's value).
    chipset = next((info[key] for key in _CHIPSET_KEYS if key in info), None)

    # Memory capacity, normalised to a GB figure without the unit.
    # NOTE(review): the lookup used to be info.get('memory_capacity'), which is
    # the output column name and never matches a spec title; '메모리 용량' is
    # assumed to be the spec-table title - confirm against a live page.
    mem_cap = info.get('메모리 용량')
    if mem_cap not in (None, '해당없음', '정보없음'):
        if mem_cap.endswith('MB'):
            # Convert the numeric part first; dividing the str itself raised TypeError.
            mem_cap = str(float(mem_cap[:-2].replace(',', '')) / 1024.)
        elif mem_cap.endswith('GB'):
            mem_cap = mem_cap[:-2]

    # Power connectors: one character per label in _PIN_LABELS, reset for every
    # product so counts do not leak between products.
    # NOTE(review): '0' as the "connector absent" placeholder is an assumption.
    pin = ['0'] * len(_PIN_LABELS)
    pin_txt = info.get('전원 포트')
    if pin_txt is not None:
        for i, label in enumerate(_PIN_LABELS):
            pos = pin_txt.find(label)
            if pos < 0:
                continue
            pin[i] = pin_txt[pos + len(label)]

    fan_txt = info.get('팬 개수')
    reg_txt = info.get('등록년월')

    row = {
        'name': name.get_text(),
        'price': price.get_text().replace(',', '') if price is not None else None,
        'link': url,
        'company': info.get('제조회사'),
        'product_seq': seq,
        'image': image_url['src'],
        'chipset_company': info.get('칩셋 제조사'),
        'chipset': chipset,
        'nm': _strip_tokens(info, 'GPU 제조 공정', 'nm'),
        'base_clock': _strip_tokens(info, '베이스클럭', 'MHz'),
        'boost_clock': _strip_tokens(info, '부스트클럭', 'MHz'),
        'cuda_processor': _strip_tokens(info, '쿠다 프로세서', '개'),
        'stream_processor': _strip_tokens(info, '스트림 프로세서', '개'),
        'interface': info.get('인터페이스'),
        'memory_type': info.get('메모리 종류'),
        'memory_capacity': mem_cap,
        'memory_clock': _strip_tokens(info, '메모리 클럭', 'MHz'),
        'memory_bus': _strip_tokens(info, '메모리 버스', '-bit'),
        # Bitmasks are computed from scratch per product.
        'port': _bitmask(info, port_bitmask),
        'monitor_support': _strip_pattern(info, '모니터 지원', '(최대 모니터 |개)'),
        'additional_function': _bitmask(info, af_bitmask),
        'usage_power': _strip_pattern(info, '사용전력', '(최대 |W)'),
        'recommend_power': _strip_pattern(info, '권장 파워용량', '(해당없음|정격파워|W|이상| )'),
        'cooling_type': _bitmask(info, ct_bitmask),
        'pan_number': (None if fan_txt is None
                       else re.sub('(개| |팬)', '', fan_txt.replace('무', '0'))),
        'length': _strip_tokens(info, '가로(길이)', 'mm'),
        'thickness': _strip_tokens(info, '두께', 'mm'),
        'pin': "".join(pin),
        'feature': _bitmask(info, feat_bitmask),
        'as_years': re.sub('(A/S |년)', '', as_year[0].get_text()) if as_year else None,
        'bench_mark': None,
        # Guarded like the other fields; a missing row used to raise TypeError in re.sub.
        'reg_date': None if reg_txt is None else re.sub('(년| |월)', '', reg_txt),
        'bookmark': None,
    }

    # Commit the row. Columns missing from `cooler` are created on the fly, and
    # every column is cut back / padded to exactly `ntime` entries first, so a
    # re-run after an interruption overwrites the partial row and all columns
    # stay the same length (required by pd.DataFrame).
    for column in row:
        cooler.setdefault(column, [])
    for column, values in cooler.items():
        del values[ntime:]
        values.extend([None] * (ntime - len(values)))
        values.append(row.get(column))

    # Progress report
    ntime += 1
    rest_time = (time.time() - start) / ntime * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:.0f}s)', end='')

end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
driver.quit()
# End of crawling

1321/1321 (100.00%)
걸린 시간: 8127.49 sec


In [11]:
# Save to file: write the collected columns to a date-stamped CSV
# (skipped entirely when save_file is falsy).
if save_file:
    today = time.strftime('%y%m%d')
    result_df = pd.DataFrame(cooler)
    result_df.to_csv('cooler' + today + '.csv', index=None)