In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import pprint
import time
import csv
import re

# 크롬 드라이버 자동 업데이트
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
class RetriesExceededError(Exception):
    def __init__(self, message):
        super().__init__(message)

        
def find_element_with_retry(driver_, by, value, waiting_sec=2.5, max_retries=10):
    retries = 0
    while retries < max_retries:
        try:
            element = WebDriverWait(driver_, waiting_sec).until(EC.presence_of_element_located((by, value)))
            return element
        except (NoSuchElementException, StaleElementReferenceException):
            retries += 1
            
    raise RetriesExceededError(f'{value}를 찾을 수 없음\n재시도 횟수 초과: {waiting_sec}초로 {max_retries}회 시도')

In [3]:
# 브라우저 꺼짐 방지
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
chrome_options.add_argument(f'user-agent={user_agent}')
# 불필요한 에러 메세지 없애기
chrome_options.add_experimental_option("excludeSwitches", ["enable-logging"])
driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)
# service = Service(executable_path=ChromeDriverManager().install())
# driver = webdriver.Chrome(service=service, options=chrome_options)

save_file = True  # 파일 저장 여부
waiting_sec = 2

In [4]:
# seq 파일 읽기
cate_dict = {'cpu': 112747, 'mainboard': 112751, 'ram': 112752, 'gpu': 112753, 'case': 112775,
             'power': 112777, 'cooler': 11236855, 'ssd': 112760}
quotations = pd.read_csv(f'../quotations231001.csv').drop(columns='sys_cooler')
quotations.head()

Unnamed: 0,cpu,mainboard,ram,gpu,case,power,cooler,ssd
0,18640280.0,18652877.0,16443341.0,27613427.0,21583046.0,19813847.0,21550985.0,12338942.0
1,18039569.0,18652877.0,18700841.0,18021914.0,18419840.0,19023932.0,,13538642.0
2,19627808.0,18021440.0,18911780.0,18668606.0,12969350.0,21206702.0,18714860.0,17454683.0
3,19174175.0,21628184.0,18911780.0,21629063.0,,19814027.0,,17001050.0
4,17913710.0,18021491.0,18911780.0,18303401.0,16571885.0,18442058.0,18076145.0,16587443.0


In [5]:
ntime = 0
total = len(quotations)
start = time.time()

In [6]:
headers = ['cpu', 'mainboard', 'gpu', 'case', 'power', 'cooler', 'ssd']

In [7]:
prices = dict()

for col in headers:
    prices[col] = []

In [8]:
prices

{'cpu': [],
 'mainboard': [],
 'gpu': [],
 'case': [],
 'power': [],
 'cooler': [],
 'ssd': []}

# 제품 번호만 뽑아서 번호 - 가격 연결 테이블 만들기

In [9]:
tbl = dict()

In [10]:
# 크롤링 시작
for i in range(len(quotations[ntime:])):
    quo = quotations.loc[i]
    for col in headers:
        category = cate_dict[col]
        pcode = quo[col]
        if pcode in tbl.keys():
            continue
        if np.isnan(pcode):
            prices[col].append(None)
            continue
        url = f'https://prod.danawa.com/info/?pcode={pcode}&cate={category}'
        driver.get(url)
        driver.implicitly_wait(10)
    
        time.sleep(waiting_sec)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        price = soup.select_one("#blog_content > div.summary_info > div.detail_summary > div.summary_left > div.lowest_area > div.lowest_top > div.row.lowest_price > span.lwst_prc > a > em")
        tbl[pcode] = price.get_text().replace(',', '') if price is not None else None
    ## 테스트용
    ntime += 1
    rest_time = int((time.time() - start) / ntime) * (total - ntime)
    print(f'\r{ntime}/{total} ({ntime/total*100:.2f}%) (남은 시간: {rest_time//3600}:{rest_time//3600%60}:{rest_time%60})', end='')
    # if ntime == 3: break
    
end = time.time()
print(f'\n걸린 시간: {end-start:.2f} sec')
driver.quit()
# 크롤링 끝

1080/1080 (100.00%) (남은 시간: 0:0:0)
걸린 시간: 6675.79 sec


In [21]:
pd.DataFrame.from_dict([tbl]).T

Unnamed: 0,0
18640280.0,269780
18652877.0,248980
27613427.0,545910
21583046.0,104160
19813847.0,118490
...,...
11578097.0,
10285407.0,
18894161.0,93250
18874781.0,201370


In [107]:
import pickle

In [108]:
pickle.dump(tbl, open("../quotation_price.pkl", "wb"))

In [105]:
# 파일 저장
if save_file:
    today = time.strftime('%y%m%d')
    pd.DataFrame.from_dict([tbl]).to_csv(f'../price{today}.csv', index=None)

In [23]:
quotations = pd.read_csv("../quotations231001.csv").drop(columns='sys_cooler')
quotations.head()

Unnamed: 0,cpu,mainboard,ram,gpu,case,power,cooler,ssd
0,18640280.0,18652877.0,16443341.0,27613427.0,21583046.0,19813847.0,21550985.0,12338942.0
1,18039569.0,18652877.0,18700841.0,18021914.0,18419840.0,19023932.0,,13538642.0
2,19627808.0,18021440.0,18911780.0,18668606.0,12969350.0,21206702.0,18714860.0,17454683.0
3,19174175.0,21628184.0,18911780.0,21629063.0,,19814027.0,,17001050.0
4,17913710.0,18021491.0,18911780.0,18303401.0,16571885.0,18442058.0,18076145.0,16587443.0


In [24]:
cpus = quotations.cpu

In [25]:
cpu_df = pd.read_csv("../part_cpu_wb230922.csv")
mainboard_df = pd.read_csv("../part_mainboard230926.csv")
ram_df = pd.read_csv("../part_ram230921.csv")
gpu_df = pd.read_csv("../part_gpu_wb230921.csv")
case_df = pd.read_csv("../part_case230920.csv")
power_df = pd.read_csv("../part_power230920.csv")
cooler_df = pd.read_csv("../part_cooler230920.csv")
ssd_df = pd.read_csv("../part_ssd230926.csv")

In [26]:
mcb = max(cpu_df['bench_mark'])
mgb = max(gpu_df['bench_mark'])
mrc = max(ram_df['capacity'])
mncb = min(cpu_df['bench_mark'])
mngb = min(gpu_df['bench_mark'])
mnrc = min(ram_df['capacity'])
mmp = max(mainboard_df['price'])
mnmp = min(mainboard_df['price'])
mpp = max(power_df['price'])
mnpp = min(power_df['price'])
mcap = max(case_df['price'])
mncap = min(case_df['price'])
mcop = max(cooler_df['price'])
mncop = min(cooler_df['price'])
msp = max(ssd_df['price'])
mnsp = min(ssd_df['price'])

In [47]:
type(quotations.values[0][0])

numpy.float64

In [55]:
pd.isnull(tbl[quotations.values[0][0]])

False

In [61]:
row[0] in tbl.keys()

True

In [93]:
prices_r = dict({'cpu': [], 'gpu': [], 'ram': [], 'mainboard': [], 'power': [], 'case': [], 'cooler': [], 'ssd': [], 'total': []})
prices = dict({'cpu': [], 'gpu': [], 'ram': [], 'mainboard': [], 'power': [], 'case': [], 'cooler': [], 'ssd': [], 'total': []})
pseqs = dict({'cpu': [], 'gpu': [], 'ram': [], 'mainboard': [], 'power': [], 'case': [], 'cooler': [], 'ssd': []})
bench_r = dict({'cpu': [], 'gpu': [], 'ram': [], 'total': []})
bench_rr = dict({'cpu': [], 'gpu': [], 'ram': [], 'total': []})

for row in quotations.values:
    try:
        cpu_bench = cpu_df.loc[cpu_df['product_seq'] == row[0], 'bench_mark'].iloc[0]
        ram_capa = ram_df.loc[ram_df['product_seq'] == row[2], 'capacity'].iloc[0]
        gpu_bench = gpu_df.loc[gpu_df['product_seq'] == row[3], 'bench_mark'].iloc[0]
        
        cpu_price = int(tbl[row[0]]) if row[0] in tbl.keys() else 0
        ram_price = int(tbl[row[2]]) if row[2] in tbl.keys() else 0
        gpu_price = int(tbl[row[3]]) if row[3] in tbl.keys() else 0
        mainboard_price = int(tbl[row[1]]) if row[1] in tbl.keys() else 0
        case_price = int(tbl[row[4]]) if row[4] in tbl.keys() else 0
        power_price = int(tbl[row[5]]) if row[5] in tbl.keys() else 0
        cooler_price = int(tbl[row[6]]) if row[6] in tbl.keys() else 0
        ssd_price = int(tbl[row[7]]) if row[7] in tbl.keys() else 0
        
        total = cpu_price+gpu_price+ram_price+mainboard_price+case_price+power_price+cooler_price+ssd_price
        prices_r['cpu'].append(cpu_price / total)
        prices_r['gpu'].append(gpu_price / total)
        prices_r['ram'].append(ram_price / total)
        prices_r['mainboard'].append(mainboard_price / total)
        prices_r['case'].append(case_price / total)
        prices_r['cooler'].append(cooler_price / total)
        prices_r['ssd'].append(ssd_price / total)
        prices_r['power'].append(power_price / total)
        prices_r['total'].append(np.log(total / 10000000))
        
        bench_r['cpu'].append(np.log((cpu_bench - mncb) / (mcb - mncb) + 1))
        bench_r['gpu'].append(np.log((gpu_bench - mngb) / (mgb - mngb) + 1))
        bench_r['ram'].append(np.log((ram_capa - mnrc) / (mrc - mnrc) + 1))
        bench_r['total'].append(np.log(total / 10000000 + 2))
        clog = np.log((cpu_bench - mncb) / (mcb - mncb) + 1)
        glog = np.log((gpu_bench - mngb) / (mgb - mngb) + 1)
        rlog = np.log((ram_capa - mnrc) / (mrc - mnrc) + 1)
        tlog = np.log(total / 10000000 + 2)
        total_log = clog + glog + rlog
        bench_rr['cpu'].append(clog / total_log)
        bench_rr['gpu'].append(glog / total_log)
        bench_rr['ram'].append(rlog / total_log)
        bench_rr['total'].append(tlog / total_log)
        
        prices['cpu'].append(cpu_price)
        prices['gpu'].append(gpu_price)
        prices['ram'].append(ram_price)
        prices['mainboard'].append(mainboard_price)
        prices['case'].append(case_price)
        prices['cooler'].append(cooler_price)
        prices['ssd'].append(ssd_price)
        prices['power'].append(power_price)
        prices['total'].append(total)
    except Exception as e:
        continue

In [94]:
bdata = np.array(pd.DataFrame(bench_rr))

In [95]:
bdata

array([[0.27222218, 0.54496674, 0.18281108, 0.91556764],
       [0.35896017, 0.5249914 , 0.11604843, 0.66269804],
       [0.30823763, 0.61068437, 0.08107799, 0.82250058],
       ...,
       [0.51326222, 0.38183498, 0.1049028 , 1.00129261],
       [0.33609368, 0.5860932 , 0.07781313, 0.79815448],
       [0.45078159, 0.40348498, 0.14573343, 0.75960378]])

In [97]:
price_rate = np.array(pd.DataFrame(prices_r))

In [100]:
len(bdata)

630

In [99]:
len(price_rate)

630

In [96]:
for i, row in enumerate(bdata):
    if 0 in row:
        print(i)

In [78]:
for key in prices_r.keys():
    print(key)
    print(len(prices_r[key]))
    print(len(prices[key]))
    print()
print(len(prices['total']))

cpu
948
948

gpu
948
948

ram
948
948

mainboard
948
948

power
948
948

case
948
948

cooler
948
948

ssd
948
948

total
948
948

948


In [80]:
pd.DataFrame(prices_r)

Unnamed: 0,cpu,gpu,ram,mainboard,power,case,cooler,ssd,total
0,0.175094,0.354310,0.0,0.161595,0.076903,0.067603,0.113288,0.051208,-1.870303
1,0.200560,0.596670,0.0,0.062435,0.074971,0.019374,0.000000,0.045990,-0.919338
2,0.263238,0.484896,0.0,0.112338,0.064798,0.021555,0.019747,0.033428,-1.481383
3,0.279504,0.513668,0.0,0.095678,0.057944,0.000000,0.000000,0.053205,-1.416927
4,0.144311,0.574209,0.0,0.098758,0.063479,0.024472,0.038592,0.056180,-1.266015
...,...,...,...,...,...,...,...,...,...
943,0.359504,0.276888,0.0,0.159570,0.041915,0.034728,0.000000,0.127395,-1.502950
944,0.242494,0.531932,0.0,0.097784,0.048209,0.037284,0.000000,0.042298,-1.573965
945,0.241134,0.483620,0.0,0.109239,0.101603,0.024973,0.000000,0.039430,-1.478749
946,0.228898,0.455157,0.0,0.119844,0.095259,0.051407,0.000000,0.049434,-1.729880
