In [1]:
import csv
import pprint
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException

# Automatically download / update the matching ChromeDriver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    """Raised when an element lookup has used up all of its retry attempts."""

    def __init__(self, message):
        # Hand the message to Exception so it shows up in ``args`` and ``str(exc)``.
        Exception.__init__(self, message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    """Wait for an element to be present in the DOM, retrying a bounded number of times.

    Args:
        driver_: WebDriver instance to search with.
        by: Locator strategy (e.g. ``By.CSS_SELECTOR``).
        value: Locator value passed together with ``by``.
        waiting_sec: Timeout in seconds for each individual wait attempt.
        max_retries: Maximum number of wait attempts before giving up.

    Returns:
        The element returned by the first successful wait.

    Raises:
        RetriesExceededError: If no attempt succeeded within ``max_retries`` tries.
    """
    locator = (by, value)
    for _ in range(max_retries):
        try:
            return WebDriverWait(driver_, waiting_sec).until(
                EC.presence_of_element_located(locator)
            )
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
            # WebDriverWait.until() reports an expired wait as TimeoutException
            # (it swallows NoSuchElementException internally while polling), so
            # TimeoutException must be caught here for the retry loop to do anything.
            continue

    raise RetriesExceededError(f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도')

In [3]:
# --- Chrome driver configuration -------------------------------------------
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"

chrome_options = Options()
# Keep the browser window from closing on its own
chrome_options.add_experimental_option("detach", True)
# Send the user agent string defined above
chrome_options.add_argument('user-agent=' + user_agent)
# Get rid of unnecessary error messages
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])

# The driver binary path is handed to Chrome through a Service object.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# --- Run settings -----------------------------------------------------------
save_file = True   # whether to write the result file
waiting_sec = 2.5  # seconds; used as a fixed delay after each page load

In [4]:
# Read the seq file: one product sequence number per row in column `seq`
category = 112777  # power supplies

seq_file = './seq_{}.csv'.format(category)
seq_df = pd.read_csv(seq_file)
print(seq_df)

           seq
0     18872015
1     14677028
2     20662874
3     18901058
4     18503426
...        ...
1172   1623228
1173   1543686
1174    952397
1175     73511
1176     73081

[1177 rows x 1 columns]


In [5]:
# Output column names, in the order they are written to the result table.
header = [
    # product identity
    'name', 'price', 'link', 'company', 'product_seq', 'image', 'category',
    # rating / certifications
    'rated_power', '80plus_certification', 'eta_certification', 'lambda_certification',
    # electrical characteristics
    'voltage_fluctuation', 'output_method', 'availability', 'pfc_circuit', 'pf_factor',
    # cooling
    'fan_size', 'fan_number', 'bearing',
    # output / form factor
    'output_12v', 'cable_connection', 'depth',
    # connectors
    'main_power', 'sub_power', 'pcie_16pin', 'pcie_8pin', 'pcie_6pin', 'sata', 'ide_4',
    'rgb_connector',
    # bit-encoded flag columns
    'feature', 'inside', 'protection',
    # misc
    'as_years', 'reg_date', 'bookmark',
]

In [6]:
# Spec-table labels for the flag columns. Order matters: a label's position in
# its list is the bit index used when the flags are packed into an integer.
feat_bitmask = [
    '팬리스모드',
    '자동 팬 조절',
    '수동 팬 조절',
    '대기전력 1W 미만',
    '프리볼트',
    '플랫케이블',
    'LED 라이트',
    '디지털제어',
]
inside_bitmask = [
    '85도 콘덴서',
    '105도 콘덴서',
    'DC to DC 설계',
    'LLC공진형컨버터',
    'ACRF',
]
prot_bitmask = [
    '과전압(OVP)',
    '저전압(UVP)',
    '과전류(OCP)',
    '과전력(OPP)',
    '과열(OTP/OHP)',
    '단락(SCP)',
    '서지+인러쉬(SIP)',
    '과부하(OLP)',
    '무부하(NLP)',
    '공회전(NLO)',
    'BOP',
]

In [7]:
# Column-oriented accumulator: one independent, initially empty list per output column.
power = {col: [] for col in header}

In [8]:
# Progress bookkeeping for the crawl loop.
ntime = 0                # number of products processed so far (also the resume offset)
total = seq_df.shape[0]  # number of products to crawl
start = time.time()      # wall-clock start, used for the remaining-time estimate

In [9]:
# Crawl start
def _strip_tokens(spec, key, pattern):
    """Return ``spec[key]`` with every match of regex ``pattern`` removed.

    Returns None when ``key`` is missing (or maps to None), so a product page
    without that spec row yields an empty cell instead of raising.
    """
    raw = spec.get(key)
    return re.sub(pattern, '', raw) if raw is not None else None


def _bitmask(spec, labels):
    """Pack label presence into an int: bit ``i`` is set when ``labels[i]`` is a key of ``spec``."""
    bits = 0
    for i, label in enumerate(labels):
        if label in spec:
            bits |= 1 << i
    return bits


# Spec-table text for the 16-pin PCIe connector -> stored code.
# Any other text is stored unchanged.
_PCIE16_CODES = {'12VHPWR 1개': 1, '12VHPWR 2개': 2, '12V2x6 1개': 3}

# The implicit wait is a session-wide driver setting, so apply it once
# instead of on every page.
driver.implicitly_wait(10)

for seq in seq_df.seq[ntime:]:
    url = f'https://prod.danawa.com/info/?pcode={seq}&cate={category}'
    driver.get(url)

    # Fixed delay before the page source is snapshotted.
    time.sleep(waiting_sec)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    spec_tbl_tit = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > th.tit")
    spec_tbl_dsc = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > td.dsc")
    name = soup.select_one("#blog_content > div.summary_info > div.top_summary > h3 > span")
    price = soup.select_one("#blog_content > div.summary_info > div.detail_summary > div.summary_left > div.lowest_area > div.lowest_top > div.row.lowest_price > span.lwst_prc > a > em")
    image_url = soup.select_one("#baseImage")

    # Any of these elements can be missing from a page; treat them all the way
    # the price already was, i.e. fall back to None instead of crashing.
    name_text = name.get_text() if name is not None else None
    price_text = price.get_text() if price is not None else None
    image_src = image_url.get('src') if image_url is not None else None

    # Spec label -> cleaned spec text for this product.
    info = dict()
    info['제품명'] = name_text
    info['최저가'] = price_text
    info['링크'] = url
    info['이미지'] = image_src
    for tit, dsc in zip(spec_tbl_tit, spec_tbl_dsc):
        text = dsc.get_text()
        text = text.replace('\n', '')
        text = text.replace('\t', '')
        text = text.replace('(제조사 웹사이트 바로가기)', '')
        info[tit.get_text()] = text.strip()

    pcie16 = info.get('PCIe 16핀(12+4)')
    pcie16 = _PCIE16_CODES.get(pcie16, pcie16)

    # Build the complete row first so that a failure while parsing one field
    # cannot leave the column lists with different lengths.
    row = {
        'name': name_text,
        'price': price_text.replace(',', '') if price_text is not None else None,
        'link': url,
        'company': info.get('제조회사'),
        'product_seq': seq,
        'image': image_src,
        'category': info.get('제품 분류'),
        'rated_power': _strip_tokens(info, '정격출력', 'W'),
        '80plus_certification': info.get('80PLUS인증'),
        'eta_certification': info.get('ETA인증'),
        'lambda_certification': info.get('LAMBDA인증'),
        'voltage_fluctuation': _strip_tokens(info, '전압변동', '(±|%)'),
        'output_method': info.get('+12V 출력방식'),
        'availability': _strip_tokens(info, '+12V 가용률', '%'),
        'pfc_circuit': info.get('PFC회로'),
        'pf_factor': _strip_tokens(info, 'PF(역률)', '%'),
        'fan_size': _strip_tokens(info, '쿨링팬 크기', '(mm| |팬)'),
        'fan_number': _strip_tokens(info, '쿨링팬 개수', '(개| |팬)'),
        'bearing': info.get('베어링'),
        'output_12v': _strip_tokens(info, '+12V 출력', 'A'),
        'cable_connection': info.get('케이블연결'),
        'depth': _strip_tokens(info, '깊이', 'mm'),
        'main_power': info.get('메인전원'),
        'sub_power': info.get('보조전원'),
        'pcie_16pin': pcie16,
        'pcie_8pin': _strip_tokens(info, 'PCIe 8핀(6+2)', '개'),
        'pcie_6pin': _strip_tokens(info, 'PCIe 6핀', '개'),
        'sata': _strip_tokens(info, 'SATA', '개'),
        'ide_4': _strip_tokens(info, 'IDE 4핀', '개'),
        'rgb_connector': _strip_tokens(info, 'RGB', '개'),
        'feature': _bitmask(info, feat_bitmask),
        'inside': _bitmask(info, inside_bitmask),
        'protection': _bitmask(info, prot_bitmask),
        'as_years': _strip_tokens(info, 'A/S 보증기간', '(무상| |년)'),
        # Previously unguarded: a page without this row raised TypeError in re.sub.
        'reg_date': _strip_tokens(info, '등록년월', '(년| |월)'),
        'bookmark': None,
    }

    # When the cell is re-run after an interruption, drop anything stored at or
    # beyond the resume offset so no product ends up recorded twice.
    if len(power['name']) >= ntime:
        for key in power.keys():
            power[key] = power[key][:ntime]
    for key, value in row.items():
        power[key].append(value)

    # Progress report (for testing)
    ntime += 1
    rest_time = (time.time() - start) / ntime * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:.0f}s)', end='')

end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
# quit() is intentionally not in a `finally`: after a failure the driver must
# stay alive so this cell can be re-run and resume from `ntime`.
driver.quit()
# Crawl end

1177/1177 (100.00%) (남은 시간: 0s)))
걸린 시간: 7026.63 sec


In [10]:
# Save the crawled table as power<YYMMDD>.csv in the working directory
if save_file:
    today = time.strftime('%y%m%d')
    # `index` is documented as a bool; pass False explicitly rather than relying
    # on None being falsy to leave the row index out of the file.
    pd.DataFrame(power).to_csv(f'power{today}.csv', index=False)