In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import pandas as pd
import pprint
import time
import csv
import re

# 크롬 드라이버 자동 업데이트
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    def __init__(self, message):
        super().__init__(message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    retries = 0
    while retries < max_retries:
        try:
            element = WebDriverWait(driver_, waiting_sec).until(EC.presence_of_element_located((by, value)))
            return element
        except (NoSuchElementException, StaleElementReferenceException):
            retries += 1
            
    raise RetriesExceededError(f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도')

In [3]:
# 브라우저 꺼짐 방지
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
chrome_options.add_argument(f'user-agent={user_agent}')
# 불필요한 에러 메세지 없애기
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
# service = Service(executable_path=ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

save_file = True  # 파일 저장 여부
waiting_sec = 2.5

In [4]:
# seq 파일 읽기
category = 112775  # 케이스

seq_file = f'./seq_{category}.csv'
seq_df = pd.read_csv(seq_file)
print(seq_df)

           seq
0     18212909
1     18419840
2     11884207
3     14705840
4     10423167
...        ...
1441   2186578
1442   1983723
1443   1983654
1444   1517662
1445   1517658

[1446 rows x 1 columns]


In [5]:
header = ['name', 'price', 'link', 'company', 'product_seq', 'image', 'category', 'size',
          'power_included', 'power_support', 'board_support', 'bay_133', 'bay_89', 'bay_64',
          'pci_horizontal', 'pci_vertical', 'cooling_fan', 'led_fan', 'front_type', 'side_open',
          'side_type', 'back_vent', 'front_vent', 'top_vent', 'bottom_vent', 'external_port',
          'width', 'height', 'depth', 'gpu_size', 'cpu_cooler_size', 'power_size', 'liquid_cooler',
          'radiator_top', 'radiator_front', 'radiator_rear', 'radiator_side', 'feature', 'led_color',
          'reg_date', 'bookmark']

In [6]:
bs_bitmask = ['Extended-ATX', '표준-ATX', 'Micro-ATX', 'Flex-ATX', '표준-ITX', 'Mini-ITX',
              'SSI-CEB', 'SSI-EEB', 'Mini-DTX']
ep_bitmask = ['USB', 'USB 3.0', 'USB 3.0 (Type-C)', 'USB 3.1 (Type-C)', 'USB 3.1 (Type-A)',
              'USB 3.2 (Type-C)', 'eSATA', 'HDMI', '카드리더기']
feat_bitmask = ['LED 라이트', '방음패드 내장', '상태표시 창', 'HDD 도킹', 'PC방 전용', 
                '수랭쿨링홀', '채굴용(마이닝)', '켄싱턴 락', '상단 패널 탈착', '그래픽카드 지지대']

In [7]:
case = dict()

for col in header:
    case[col] = []

In [8]:
ntime = 0
total = len(seq_df)
start = time.time()

In [37]:
# 크롤링 시작
for seq in seq_df.seq[ntime:]:
    url = f'https://prod.danawa.com/info/?pcode={seq}&cate={category}'
    driver.get(url)
    driver.implicitly_wait(10)
    
    time.sleep(waiting_sec)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    
    spec_tbl = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody")
    spec_tbl_tit = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > th.tit")
    spec_tbl_dsc = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > td.dsc")
    name = soup.select_one("#blog_content > div.summary_info > div.top_summary > h3 > span")
    price = soup.select_one("#blog_content > div.summary_info > div.detail_summary > div.summary_left > div.lowest_area > div.lowest_top > div.row.lowest_price > span.lwst_prc > a > em")
    image_url = soup.select_one("#baseImage")
    led_color = soup.select("#blog_content > div.summary_info > div.top_summary > div > div.sub_dsc > div > dl > dd > div > div > span > u:-soup-contains('LED 색상')")
    
    tbl = zip(spec_tbl_tit, spec_tbl_dsc)

    info = dict()
    info['제품명'] = name.get_text()
    info['최저가'] = price.get_text() if price is not None else None
    info['링크'] = url
    info['이미지'] = image_url['src']
    for tit, dsc in tbl:
        text = dsc.get_text()
        text = text.replace('\n','')
        text = text.replace('\t','')
        text = text.replace('(제조사 웹사이트 바로가기)','')
        info[tit.get_text()] = text.strip()
    
    bs_bit = 0
    ep_bit = 0
    feat_bit = 0

    if len(case['name']) >= ntime:
        for key in case.keys(): case[key] = case[key][:ntime]
    case['name'].append(name.get_text())
    case['price'].append(price.get_text().replace(',', '') if price is not None else None)
    case['link'].append(url)
    case['company'].append(info.get('제조회사'))
    case['product_seq'].append(seq)
    case['image'].append(image_url['src'])
    case['category'].append(info.get('제품 분류'))
    case['size'].append(info.get('케이스 크기'))
    case['power_included'].append(info.get('파워포함여부'))
    case['power_support'].append(info.get('지원파워규격'))
    for i in range(len(bs_bitmask)):
        if bs_bitmask[i] in info.keys(): bs_bit |= 1 << i
    case['board_support'].append(bs_bit)
    case['bay_133'].append(info.get('13.3cm베이').replace('개', '')
                           if '13.3cm베이' in info.keys() else None)
    case['bay_89'].append(info.get('8.9cm베이').replace('개', '')
                           if '8.9cm베이' in info.keys() else None)
    case['bay_64'].append(info.get('6.4cm베이').replace('개', '')
                           if '6.4cm베이' in info.keys() else None)
    pci_h = re.sub('(개| |이하|LP|타입|\([^)]*\))', '', info.get('PCI 슬롯(수평)')) if 'PCI 슬롯(수평)' in info.keys() else None
    case['pci_horizontal'].append(eval(pci_h) if pci_h != None else None)
    case['pci_vertical'].append(info.get('PCI 슬롯(수직)').replace('개', '')
                                if 'PCI 슬롯(수직)' in info.keys() else None)
    case['cooling_fan'].append(re.sub('(총|개)', '', info.get('쿨링팬'))
                               if '쿨링팬' in info.keys() else None)
    case['led_fan'].append(info.get('LED팬').replace('개', '') if 'LED팬' in info.keys() else None)
    case['front_type'].append(info.get('전면 패널 타입'))
    case['side_open'].append(info.get('측면 개폐 방식'))
    case['side_type'].append(info.get('측면'))
    case['back_vent'].append(info.get('후면'))
    case['front_vent'].append(info.get('전면'))
    case['top_vent'].append(info.get('상단'))
    case['bottom_vent'].append(info.get('하단'))
    for i in range(len(ep_bitmask)):
        if ep_bitmask[i] in info.keys(): ep_bit |= 1 << i
    case['external_port'].append(ep_bit)
    case['width'].append(info.get('너비(W)').replace('mm' ,'')
                         if '너비(W)' in info.keys() else None)
    case['height'].append(info.get('높이(H)').replace('mm' ,'')
                          if '높이(H)' in info.keys() else None)
    case['depth'].append(info.get('깊이(D)').replace('mm' ,'')
                         if '깊이(D)' in info.keys() else None)
    case['gpu_size'].append(info.get('GPU 장착').replace('mm' ,'')
                            if 'GPU 장착' in info.keys() else None)
    case['cpu_cooler_size'].append(info.get('CPU쿨러 장착').replace('mm' ,'')
                         if 'CPU쿨러 장착' in info.keys() else None)
    case['power_size'].append(info.get('파워 장착').replace('mm' ,'')
                         if '파워 장착' in info.keys() else None)
    case['liquid_cooler'].append(re.sub('(최대|열 지원)', '', info.get('수랭쿨러 규격'))
                                 if '수랭쿨러 규격' in info.keys() else None)
    case['radiator_top'].append(max(list(map(int, re.sub('(최대| |mm|\([^)]*\))', '', info.get('라디에이터(상단)')).split(','))))
                                if '라디에이터(상단)' in info.keys() else None)
    case['radiator_front'].append(max(list(map(int, re.sub('(최대| |mm|x2|x3|\([^)]*\))', '', info.get('라디에이터(전면)')).split(','))))
                                  if '라디에이터(전면)' in info.keys() else None)
    case['radiator_rear'].append(max(list(map(int, re.sub('(최대| |mm|\([^)]*\))', '', info.get('라디에이터(후면)')).split(','))))
                                 if '라디에이터(후면)' in info.keys() else None)
    case['radiator_side'].append(max(list(map(int, re.sub('(최대| |mm|\([^)]*\))', '', info.get('라디에이터(측면)')).split(','))))
                                 if '라디에이터(측면)' in info.keys() else None)
    for i in range(len(feat_bitmask)):
        if feat_bitmask[i] in info.keys(): feat_bit |= 1 << i
    case['feature'].append(feat_bit)
    case['led_color'].append(led_color[0].get_text().replace('LED 색상: ', '') if led_color else None)
    case['reg_date'].append(re.sub('(년| |월)', '', info.get('등록년월')))
    case['bookmark'].append(None)
    
    ## 테스트용
    ntime += 1
    rest_time = (time.time() - start) / ntime * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time:.0f}s)', end='')
    
end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
driver.quit()
# 크롤링 끝

1446/1446 (100.00%)
걸린 시간: 17568.48 sec


In [38]:
# 파일 저장
if save_file:
    today = time.strftime('%y%m%d')
    pd.DataFrame(case).to_csv(f'case{today}.csv', index=None)

ValueError: All arrays must be of the same length

In [40]:
for key in case.keys():
    print(f'{key}: {len(case[key])}')

name: 1446
price: 1446
link: 1446
company: 1446
product_seq: 1446
image: 1446
category: 1446
size: 1446
power_included: 1446
power_support: 1446
board_support: 1446
bay_133: 1446
bay_89: 1446
bay_64: 1446
pci_horizontal: 1446
pci_vertical: 1446
cooling_fan: 1446
led_fan: 1446
front_type: 1446
side_open: 1446
side_type: 1446
back_vent: 1446
front_vent: 1446
top_vent: 1446
bottom_vent: 1446
external_port: 1446
width: 1446
height: 1446
depth: 1446
gpu_size: 1446
cpu_cooler_size: 1446
power_size: 1446
liquid_cooler: 1446
radiator_top: 1446
radiator_front: 1446
radiator_rear: 1447
radiator_side: 0
feature: 1446
led_color: 1446
reg_date: 1446
bookmark: 1446
