In [1]:
import csv
import pprint
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait

# Keeps the Chrome driver binary up to date automatically
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    """Signals that an operation kept failing until its retry budget ran out."""

    def __init__(self, message):
        # Exactly one positional message is required; it becomes the exception text.
        Exception.__init__(self, message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    """Wait for an element to be present, retrying the wait a bounded number of times.

    Args:
        driver_: WebDriver instance to search in.
        by: Locator strategy (e.g. ``By.CSS_SELECTOR``).
        value: Locator value matching ``by``.
        waiting_sec: Timeout in seconds for each individual wait.
        max_retries: Maximum number of waits before giving up.

    Returns:
        The element returned by the first successful wait.

    Raises:
        RetriesExceededError: If no wait succeeded within ``max_retries`` attempts.
    """
    locator = (by, value)
    for _ in range(max_retries):
        try:
            return WebDriverWait(driver_, waiting_sec).until(EC.presence_of_element_located(locator))
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
            # An expired wait surfaces as TimeoutException; without catching it here
            # the loop could never retry and RetriesExceededError was unreachable.
            continue

    raise RetriesExceededError(f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도')

In [3]:
# Keep the browser window open instead of closing it with the script
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
chrome_options.add_argument(f'user-agent={user_agent}')
# Suppress unnecessary error messages from the driver
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
# Hand the driver path over through a Service object: passing it as the first
# positional argument of webdriver.Chrome is deprecated in Selenium 4 and no
# longer accepted by newer releases.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

save_file = True  # whether to write the result to a file
waiting_sec = 2.5  # seconds to pause after each page load

In [4]:
# Load the product sequence numbers that will be crawled
category = 11236855  # cooler category code

seq_file = f'./seq_{category}.csv'
seq_df = pd.read_csv(filepath_or_buffer=seq_file)
print(seq_df)

           seq
0     14110475
1     17221538
2     17658305
3     17393768
4     19690001
...        ...
4855   6060000
4856  20074436
4857  14728952
4858  18746018
4859  17151911

[4860 rows x 1 columns]


In [5]:
# Column names of the output table, in output order
header = [
    # general product information
    'name', 'price', 'link', 'company', 'product_seq', 'image', 'category',
    # cooling specification
    'cooling_type', 'aircool_form', 'tdp', 'intel_socket', 'amd_socket',
    # fan specification
    'fan_size', 'fan_count', 'airflow', 'noise',
    # dimensions
    'width', 'length', 'height',
    # radiator / hose specification
    'radiator', 'radiator_length', 'radiator_thickness', 'hose_length',
    # remaining fields
    'feature', 'as_years', 'reg_date', 'bookmark',
]

In [6]:
# Cooling type label -> numeric code
ctype_mask = {
    '공랭': 0,
    '수랭': 1,
}

# Air-cooler form label -> numeric code (code = position in the list)
aform_mask = {
    form: code
    for code, form in enumerate(['싱글타워형', '듀얼타워형', '일반형', '슬림형', '서버형'])
}

# Intel socket labels; the list position is the bit index in the encoded value
isocket_bitmask = [
    'LGA1700', 'LGA1200', 'LGA115x', 'LGA4677',
    'LGA4189-4/5(소켓P4/P5)', 'LGA2066', 'LGA3647', 'LGA2011-V3',
    'LGA2011', 'LGA1366', 'LGA775', 'LGA771',
    '소켓478', '소켓370',
]

# AMD socket labels; the list position is the bit index in the encoded value
asocket_bitmask = [
    'AM5', 'AM4', 'FMx/AMx(AM1/4외)', 'AM1',
    'SP5', 'TR4', 'sWRX8', 'sTRX4',
    'SP3', '소켓939', '소켓754', '소켓940',
    '소켓A', '소켓F',
]

# Feature labels; the list position is the bit index in the encoded value
feat_bitmask = [
    'LED 라이트', 'PWM 지원', 'RGB 컨트롤러',
    '팬 컨트롤러', '리모콘 지원', '펌프속도조절',
    '워터블록/로고 회전', 'LCD', '인디게이터',
    '데이지체인', '제로팬(0-dB기술)', '수랭 커스텀',
    '속도조절스위치', '라디에이터 양면팬 지원', '자석 고정형',
]

In [7]:
# One independent, initially empty list per output column
cooler = {col: [] for col in header}

In [8]:
# Progress bookkeeping for the crawl: start timestamp, number of products, products done
start = time.time()
total = len(seq_df)
ntime = 0

In [9]:
# Start crawling
def _numeric_max(values):
    """Return the entry of `values` that is largest when compared as a number.

    Plain max() on strings compares lexicographically ('92' > '120'), which picks the
    wrong entry. Falls back to string comparison when an entry is not a valid float.
    """
    try:
        return max(values, key=float)
    except ValueError:
        return max(values)


def _without_unit(info_, key, unit):
    """Return info_[key] with every occurrence of `unit` removed, or None if key is absent."""
    return info_[key].replace(unit, '') if key in info_ else None


def _bitmask(names, present):
    """Return an int whose bit i is set when names[i] is a key of `present`."""
    bits = 0
    for i, label in enumerate(names):
        if label in present:
            bits |= 1 << i
    return bits


def _value_after_label(soup_, label):
    """Return the <u> text in the sibling following the <span> that wraps the <u> `label`.

    Returns None when no <u> element with exactly that text exists on the page.
    """
    label_u = soup_.find('u', string=label)
    if label_u is None:
        return None
    return label_u.find_parent('span').find_next_sibling().find('u').get_text()


# The implicit wait is configured on the driver itself, so one call before the loop suffices
driver.implicitly_wait(10)

# Slicing from ntime lets a re-run of this cell resume where the previous run stopped
for seq in seq_df.seq[ntime:]:
    url = f'https://prod.danawa.com/info/?pcode={seq}&cate={category}'
    driver.get(url)

    # Pause before reading the page source
    time.sleep(waiting_sec)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    spec_tbl_tit = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > th.tit")
    spec_tbl_dsc = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > td.dsc")
    name = soup.select_one("#blog_content > div.summary_info > div.top_summary > h3 > span")
    price = soup.select_one("#blog_content > div.summary_info > div.detail_summary > div.summary_left > div.lowest_area > div.lowest_top > div.row.lowest_price > span.lwst_prc > a > em")
    image_url = soup.select_one("#baseImage")
    # The first summary item is either a link or a plain span
    cate_u = soup.select_one("#blog_content > div.summary_info > div.top_summary > div > div.sub_dsc > div > dl > dd > div > div > a:nth-child(1) > u")
    if cate_u is None:
        cate_u = soup.select_one("#blog_content > div.summary_info > div.top_summary > div > div.sub_dsc > div > dl > dd > div > div > span:nth-child(1) > u")
    # Neither selector matching used to crash on None.get_text(); treat it as "not a cooler"
    cate = cate_u.get_text() if cate_u is not None else None

    if cate is None or '쿨러' not in cate:
        ntime += 1
        rest_time = (time.time() - start) / ntime * (total - ntime)
        print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:.0f}s)', end='')
        continue

    name_text = name.get_text() if name is not None else None
    price_text = price.get_text() if price is not None else None
    image_src = image_url.get('src') if image_url is not None else None

    # Spec table flattened into {title: cleaned description}
    info = dict()
    info['제품명'] = name_text
    info['최저가'] = price_text
    info['링크'] = url
    info['이미지'] = image_src
    for tit, dsc in zip(spec_tbl_tit, spec_tbl_dsc):
        text = dsc.get_text()
        text = text.replace('\n','')
        text = text.replace('\t','')
        text = text.replace('(제조사 웹사이트 바로가기)','')
        info[tit.get_text()] = text.strip()

    aform_u = soup.select_one("#blog_content > div.summary_info > div.top_summary > div > div.sub_dsc > div > dl > dd > div > div > a:nth-child(2) > u")
    aform = aform_u.get_text() if aform_u is not None else None

    as_period = info.get('A/S기간')
    reg_date = info.get('등록년월')

    # Build the whole row first so a failure above never leaves the columns with
    # different lengths (which would break both resuming and the final DataFrame).
    row = {
        'name': name_text,
        'price': price_text.replace(',', '') if price_text is not None else None,
        'link': url,
        'company': info.get('제조회사'),
        'product_seq': seq,
        'image': image_src,
        'category': cate,
        # .get() yields None for a missing or unrecognised label instead of raising KeyError
        'cooling_type': ctype_mask.get(info.get('냉각 방식')),
        'aircool_form': aform_mask.get(aform),
        'tdp': _without_unit(info, 'TDP', 'W'),
        'intel_socket': _bitmask(isocket_bitmask, info),
        'amd_socket': _bitmask(asocket_bitmask, info),
        'fan_size': _numeric_max(info['팬 크기'].replace('mm', '').split(', ')) if '팬 크기' in info else None,
        'fan_count': _without_unit(info, '팬 개수', '개'),
        'airflow': _numeric_max(info['최대 풍량'].replace(' CFM', '').split(', ')) if '최대 풍량' in info else None,
        'noise': _without_unit(info, '최대 팬소음', 'dBA'),
        'width': _without_unit(info, '가로', 'mm'),
        'length': _without_unit(info, '세로', 'mm'),
        'height': _without_unit(info, '높이', 'mm'),
        'radiator': _without_unit(info, '라디에이터', '열'),
        'radiator_length': _value_after_label(soup, '라디에이터 길이'),
        'radiator_thickness': _value_after_label(soup, '라디에이터 두께'),
        'hose_length': _value_after_label(soup, '호스 길이'),
        'feature': _bitmask(feat_bitmask, info),
        # Warranty periods given in months ('개월') are not converted and are stored as None
        'as_years': (re.sub(r'(년| |이상|유상|누수보상|\+|,)', '', as_period)
                     if (as_period is not None and '개월' not in as_period) else None),
        'reg_date': re.sub('(년| |월)', '', reg_date) if reg_date is not None else None,
        'bookmark': None,
    }

    # Resume guard kept from the original: cut every column back to ntime entries
    # when more than that are stored (e.g. left over from an interrupted run).
    if len(cooler['name']) >= ntime:
        for key in cooler.keys(): cooler[key] = cooler[key][:ntime]
    for key, value in row.items():
        cooler[key].append(value)

    ## Progress output (for testing)
    ntime += 1
    rest_time = (time.time() - start) / ntime * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:05.0f}s) / 예상 종료 시간: {time.strftime("%d, %H:%M:%S", time.localtime(time.time() + rest_time))})', end='')

end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
# Only reached after a complete pass; on an error the driver stays alive so the cell can be re-run
driver.quit()
# End of crawling

4860/4860 (100.00%) (남은 시간: 00000s) / 예상 종료 시간: 19, 17:00:43)
걸린 시간: 24947.34 sec


In [10]:
# Write the collected columns to a date-stamped CSV file
if save_file:
    today = time.strftime('%y%m%d')
    pd.DataFrame(cooler).to_csv(path_or_buf=f'cooler{today}.csv', index=None)