In [1]:
import csv
import pprint
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait

# Automatically download/update the Chrome driver
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    """Signals that an operation was retried the maximum number of times without success."""

    def __init__(self, message):
        # Delegate straight to Exception so `message` ends up in `args` / `str(exc)`.
        Exception.__init__(self, message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    """Wait for an element to be present in the DOM, retrying the wait on failure.

    Args:
        driver_: WebDriver instance used for the lookup.
        by: Locator strategy (e.g. ``By.CSS_SELECTOR``).
        value: Locator value passed along with ``by``.
        waiting_sec: Timeout, in seconds, of each individual explicit wait.
        max_retries: Maximum number of waits attempted before giving up.

    Returns:
        The element returned by ``EC.presence_of_element_located``.

    Raises:
        RetriesExceededError: If the element was not found in ``max_retries`` attempts.
    """
    for _ in range(max_retries):
        try:
            return WebDriverWait(driver_, waiting_sec).until(
                EC.presence_of_element_located((by, value))
            )
        except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
            # WebDriverWait.until() ignores NoSuchElementException while polling and
            # reports failure by raising TimeoutException, so that is the exception
            # that has to be caught for a retry to ever happen.
            continue

    raise RetriesExceededError(f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도')

In [3]:
# Keep the browser window open ("detach") instead of closing it with the driver process
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
chrome_options.add_argument(f'user-agent={user_agent}')
# Suppress unnecessary error/log messages
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
# Selenium 4 expects the driver binary path to be supplied through a Service object;
# passing it as the first positional argument is deprecated and fails on recent releases.
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

save_file = True  # whether to write the crawled data to a file
waiting_sec = 2.5  # seconds to pause after each page load before parsing it

In [4]:
# Read the CSV that lists the product sequence numbers to crawl
category = 112763  # HDD

seq_file = './seq_{}.csv'.format(category)
seq_df = pd.read_csv(filepath_or_buffer=seq_file)
print(seq_df)

          seq
0    15765761
1     6545078
2    13522751
3     5173062
4     9721383
..        ...
466   2154603
467   1001366
468    974589
469    161437
470     66982

[471 rows x 1 columns]


In [5]:
# Column names of the output table, in the order they are written out.
header = [
    'name',
    'price',
    'link',
    'company',
    'product_seq',
    'image',
    'size',
    'capacity',
    'interface',
    'rpm',
    'transfer_rate',
    'buffer_capacity',
    'recording_method',
    'thickness',
    'as_years',
    'reg_date',
    'bookmark',
]

# Known recording-method labels; the crawl stores the matching index in 'recording_method'.
rec_m = [
    'CMR',
    'SMR',
    'TGMR',
]

In [6]:
# One independent, initially empty list per output column.
hdd = {col: [] for col in header}

In [7]:
# ntime: how many entries of seq_df have been processed so far (also the crawl's start offset)
# total: number of entries in seq_df, used for the progress display
ntime, total = 0, seq_df.shape[0]
# Wall-clock reference point for the elapsed-time report
start = time.time()

In [8]:
# Start crawling
def _strip_tokens(text, *tokens):
    """Return `text` with each of `tokens` removed in order, or None when `text` is None."""
    if text is None:
        return None
    for token in tokens:
        text = text.replace(token, '')
    return text


# The implicit wait is a session-wide driver setting, so set it once rather than per page.
driver.implicitly_wait(10)

for seq in seq_df.seq[ntime:]:
    url = f'https://prod.danawa.com/info/?pcode={seq}&cate={category}'
    driver.get(url)

    # Fixed pause before the page source is read.
    time.sleep(waiting_sec)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    spec_tbl_tit = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > th.tit")
    spec_tbl_dsc = soup.select("#productDescriptionArea > div > div.prod_spec > table > tbody > tr > td.dsc")
    name = soup.select_one("#blog_content > div.summary_info > div.top_summary > h3 > span")
    price = soup.select_one("#blog_content > div.summary_info > div.detail_summary > div.summary_left > div.lowest_area > div.lowest_top > div.row.lowest_price > span.lwst_prc > a > em")
    image_url = soup.select_one("#baseImage")
    as_year = soup.select("#blog_content > div.summary_info > div.top_summary > div > div.sub_dsc > div > dl > dd > div > div > span > u:-soup-contains('A/S')")

    # select_one() returns None when the node is missing; keep None instead of
    # failing with an AttributeError/TypeError on such a page.
    name_text = name.get_text() if name is not None else None
    image_src = image_url.get('src') if image_url is not None else None

    # Spec table flattened into {title: cleaned description}, plus the summary fields.
    info = dict()
    info['제품명'] = name_text
    info['최저가'] = price.get_text() if price is not None else None
    info['링크'] = url
    info['이미지'] = image_src
    for tit, dsc in zip(spec_tbl_tit, spec_tbl_dsc):
        text = dsc.get_text()
        for junk in ('\n', '\t', '(제조사 웹사이트 바로가기)'):
            text = text.replace(junk, '')
        info[tit.get_text()] = text.strip()

    # Only products classified as 'HDD (PC용)' are collected; anything else is
    # skipped but still counted as processed.
    if info.get('제품 분류') != 'HDD (PC용)':
        ntime += 1
        continue

    size_str = info.get('디스크 크기')

    cap_str = info.get('디스크 용량')
    if cap_str is not None:
        cap_str = cap_str.replace('GB', '').replace(' 이상', '')
        if 'TB' in cap_str:
            # TB values are converted with a factor of 1024 so they share the GB scale.
            cap_str = str(float(cap_str.replace('TB', '')) * 1024)

    # recording_method is stored as the index of the first rec_m label found in the text.
    rec_str = info.get('기록방식')
    rec = None
    if rec_str is not None:
        rec = next((i for i, method in enumerate(rec_m) if method in rec_str), None)

    reg_str = info.get('등록년월')

    # Build the complete row first: if any conversion above or below raises, nothing
    # has been appended yet, so the columns of `hdd` can never end up with
    # different lengths.
    row = {
        'name': name_text,
        'price': price.get_text().replace(',', '') if price is not None else None,
        'link': url,
        'company': info.get('제조회사'),
        'product_seq': seq,
        'image': image_src,
        'size': size_str[:3] if size_str is not None else None,
        'capacity': cap_str,
        'interface': info.get('인터페이스'),
        'rpm': _strip_tokens(info.get('회전수'), 'RPM', ','),
        'transfer_rate': _strip_tokens(info.get('전송 속도'), 'MB/s'),
        'buffer_capacity': _strip_tokens(info.get('버퍼 용량'), '메모리', 'MB'),
        'recording_method': rec,
        'thickness': _strip_tokens(info.get('두께'), 'mm'),
        'as_years': re.sub('(A/S |년)', '', as_year[0].get_text()) if as_year else None,
        'reg_date': re.sub('(년| |월)', '', reg_str) if reg_str is not None else None,
        'bookmark': None,
    }

    # If the cell is re-run with ntime moved back, drop the rows beyond that offset.
    if len(hdd['name']) >= ntime:
        for key in hdd.keys():
            hdd[key] = hdd[key][:ntime]

    for key, column in hdd.items():
        column.append(row.get(key))

    ## Progress display (for testing)
    ntime += 1
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%)', end='')

end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
driver.quit()
# End of crawling

471/471 (100.00%)
걸린 시간: 2633.25 sec


In [9]:
# Save the collected data to a CSV file
if save_file:
    today = time.strftime('%y%m%d')  # two-digit year, month, day
    # `index` is documented as a bool; pass False explicitly rather than relying on
    # None happening to be falsy.
    pd.DataFrame(hdd).to_csv(f'hdd{today}_2.csv', index=False)