### 상세 페이지에서 크롤링할 정보

1. 스냅샷 
<img src="https://image.musinsa.com/mfile_s01/_street_images/51263/800.street_5c94a6a2d06aa.jpg?20190322182728" width=400>
2. 관련 정보 
<img src="https://www.dropbox.com/s/3edp92aczpiiwhl/Screenshot%202019-03-25%2013.00.28.png?raw=1" width=400>
3. 연결된 제품 
<img src="https://www.dropbox.com/s/qhkjwieb5fzjue6/Screenshot%202019-03-25%2013.01.31.png?raw=1" width=400>

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import time
from selenium import webdriver
from sub_crawler import *

In [2]:
def page_scrapper(page_number):
    """
    Fetch one street-snap listing page and return its picture elements.

    argument : page_number -- value substituted into the ``p`` query
               parameter of the listing URL

    return : list of the elements matching ``div .articleImg`` on that page

    raises : whatever ``raise_for_status`` raises when the server answers
             with an error status
    """
    listing_template = "https://www.musinsa.com/index.php?m=street&_mon=&p={}#listStart"
    response = requests.get(listing_template.format(page_number))
    # Stop here on an HTTP error status instead of parsing an error page.
    response.raise_for_status()
    listing_soup = BeautifulSoup(response.content, 'html.parser')
    return listing_soup.select("div .articleImg")


def pic_link_parser(picture):
    """
    Build the absolute detail-page URL for one listing picture element.

    argument : picture -- element whose first ``a`` descendant carries the
               site-relative ``href``

    return : "https://www.musinsa.com" followed by the href with surrounding
             whitespace stripped
    """
    base_url = "https://www.musinsa.com"
    # Look the link up once; the original evaluated the same lookup twice
    # and threw the first result away.
    relative_link = picture.select_one("a")['href'].strip()
    return base_url + relative_link


def image_download(download_link, title, path, http=False):
    """
    Download one image and write it to ``title`` inside ``path``.

    arguments

    download_link : image URL; a scheme-less link such as "//host/img.jpg"
                    gets "http:" prepended
    title : file name to create inside ``path``
    path : directory the file is written to
    http : pass True to use ``download_link`` exactly as given; links that
           already start with "http://" or "https://" are also left alone

    return : None

    raises : whatever ``raise_for_status`` raises when the server answers
             with an error status, so an error page is never saved as an
             image
    """
    already_absolute = download_link.startswith(("http://", "https://"))
    if not http and not already_absolute:
        download_link = "http:" + download_link

    target = os.path.join(path, title)

    # The context manager releases the streamed connection even when the
    # status check or the file write fails.
    with requests.get(download_link, stream=True) as res:
        res.raise_for_status()
        with open(target, 'wb') as f:
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:
                    f.write(chunk)


def get_styleInfo(soup):
    """
    Collect the snapshot's info table and its related tags into one dict.

    argument : soup -- parsed detail page; only its first ``table`` is read

    return : dict pairing each ``th > span`` text with the ``td > span`` text
             at the same position, plus a 'tags' entry holding the
             ``.listItem span`` texts joined by single spaces
    """
    # Locate the table once instead of re-selecting it for every query.
    info_table = soup.select_one("table")

    keys = [span.text for span in info_table.select("th > span")]
    values = [span.text for span in info_table.select("td > span")]
    related_tags = [span.text for span in info_table.select(".listItem span")]

    # zip stops at the shorter list, so unmatched headers/cells are dropped.
    infoDict = dict(zip(keys, values))
    infoDict['tags'] = " ".join(related_tags)

    return infoDict


def get_product_img(link):
    """
    Fetch a product page and return the ``src`` of its product image.

    argument : link -- URL of the product page

    return : ``src`` attribute of the first ``div.product-img > img`` element
    """
    product_page = BeautifulSoup(requests.get(link).content, 'html.parser')
    product_img_tag = product_page.select_one(r"div.product-img > img")
    return product_img_tag["src"]


def item_spec(spec_url, id_=None, login_info=None, path=".", login_status=False, driver=None):
    """
    Crawl one snapshot detail page: its info, its images and the fashion
    items linked to the picture.

    arguments

    spec_url : URL of the snapshot detail page
    id_ : identifier used as the prefix of every file name saved here
    login_info : form data posted to the login URL; required
    path : directory handed to image_download for every saved image
    login_status : when falsy, login_info is also forwarded to get_item_img
    driver : passed through to get_item_img as its first argument

    return : the dict from get_styleInfo extended with the keys "Snapshot",
             "Part_snap", "Part_tag" and "recommend_item"

    raises : ValueError when login_info is missing or empty
    """
    if not login_info:
        print("you have no login infomation")
        raise ValueError

    session = requests.session()
    session.post("https://www.musinsa.com/?mod=login", data=login_info)
    detail_soup = BeautifulSoup(session.get(spec_url).content, 'html.parser')

    # Snapshot info
    style_info = get_styleInfo(detail_soup)

    # Snapshot image
    snapshot_src = detail_soup.select_one("div.snapImg > a > img")['src']
    snapshot_name = "{}_snap.png".format(id_)
    style_info["Snapshot"] = snapshot_name
    image_download(snapshot_src, snapshot_name, path=path)

    # Close-up images of the items in the snapshot, and their tags
    part_names, part_tags = [], []
    item_blocks = detail_soup.select_one("div > .styleItem-list").select(".itemImg")
    for index, item_block in enumerate(item_blocks):
        part_name = "{}_{}_part.png".format(id_, index)
        image_download(item_block.select_one("a > img")["src"], part_name, path=path)
        part_names.append(part_name)
        # Tags of one item are joined with "/"; items are joined with "&" below.
        part_tags.append("/".join([span.get_text() for span in item_block.select("span")]))

    style_info["Part_snap"] = " ".join(part_names)
    style_info["Part_tag"] = "&".join(part_tags)

    # Recommended item images
    extra_kwargs = {} if login_status else {"login_info": login_info}
    related_imgs_urls = get_item_img(driver, spec_url, **extra_kwargs)

    reco_groups = []
    reco_index = 0  # runs across all groups so file names stay unique
    for group in related_imgs_urls:
        group_names = []
        for product_url in group:
            reco_name = "{}_{}_reco.png".format(id_, reco_index)
            image_download(get_product_img(product_url), reco_name, path=path)
            group_names.append(reco_name)
            reco_index += 1
        reco_groups.append(" ".join(group_names))

    style_info["recommend_item"] = "&".join(reco_groups)

    return style_info


def _save_dataset(dataset, filename):
    """Write ``dataset`` to ./data/<filename>/<filename>.csv and return it as a DataFrame."""
    # Imported here because the file's import cell does not import pandas
    # itself; this keeps the function independent of the star import.
    import pandas as pd

    csv_path = "./data/{0}/{0}.csv".format(filename)
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    df = pd.DataFrame(dataset)
    df.to_csv(csv_path, index=False)
    return df


def main(start=1, end=10, filename="train", id_=None, login_info=None, driver=None):
    """
    Crawl listing pages ``start``..``end`` and save one row per snapshot.

    arguments

    start : first listing page to scrape
    end : last listing page to scrape (inclusive)
    filename : train or test; names the CSV and its sub-directory under ./data
    id_ : file identifier of the first snapshot, incremented by one per
          snapshot; None is treated as 0
    login_info : required dict object to login to the site
    driver : passed through to item_spec

    return : DataFrame holding the rows returned by item_spec

    The CSV is rewritten when the running id reaches 31 or a multiple of
    200, and once more at the end. If the crawl is interrupted or fails, the
    rows collected so far are written before the exception propagates.
    """
    if id_ is None:
        id_ = 0

    image_dir = "./data"
    os.makedirs(image_dir, exist_ok=True)

    dataset = []
    login_status = False

    try:
        for page_number in range(start, end + 1):
            pictures = page_scrapper(page_number)
            spec_urls = [pic_link_parser(picture) for picture in pictures]

            for url in spec_urls:
                pre = time.time()

                style_info = item_spec(url, id_=id_, login_info=login_info,
                                       path=image_dir, login_status=login_status,
                                       driver=driver)
                dataset.append(style_info)

                # Only the first item_spec call is made with login_status False.
                login_status = True
                id_ += 1

                # Fall back to the URL so a missing name field cannot abort
                # the crawl from inside a log line.
                print("{} is done, taken time : {}".format(
                    style_info.get("이름(나이)", url), time.time() - pre))

                if id_ == 31 or id_ % 200 == 0:
                    _save_dataset(dataset, filename)
                    print("***** {} dataset saved *****".format(filename))
    except BaseException:
        # Covers KeyboardInterrupt too: keep what was crawled, then re-raise.
        if dataset:
            _save_dataset(dataset, filename)
            print("***** {} dataset saved *****".format(filename))
        raise

    df = _save_dataset(dataset, filename)
    print("{} dataset complete".format(filename))

    return df

In [3]:
%%time
# Form fields posted to the site's login URL by item_spec.
login_info = {
    "r": "home",
    "a": "login",
    "referer": "https://www.musinsa.com/index.php?",
    "usessl": "0",
    "id": "",  # left blank here; presumably the account id goes in before running
    "pw": "",  # left blank here; presumably the account password goes in before running
}

driver = webdriver.Chrome()
try:
    df = main(start=1, end=10, filename="train", id_=0, login_info=login_info, driver=driver)
finally:
    # Shut the browser down even when the crawl fails or is interrupted,
    # so no Chrome process is left running.
    driver.quit()

강다희 (23) is done, taken time : 6.90335488319397
김민규 (25) is done, taken time : 5.284601926803589
사라 (26) is done, taken time : 5.627511024475098
조정흠 (22) is done, taken time : 5.263004302978516
김나율 (26) is done, taken time : 5.632097005844116
이지인 (20) is done, taken time : 5.772680997848511
김보미 (24) is done, taken time : 5.422263145446777
김성영 (21) is done, taken time : 6.0132317543029785
Mathis Dhérine (18) is done, taken time : 6.273971319198608
조정현 (26) is done, taken time : 6.200055122375488
임재완 (26) is done, taken time : 6.042568922042847
심나영 (25) is done, taken time : 4.2533769607543945
조은하늘 (23) is done, taken time : 8.443376064300537
이현빈 (27) is done, taken time : 8.614420890808105
한상덕 (25) is done, taken time : 14.396399736404419
이한혁 (27) is done, taken time : 15.005563259124756
이성신 (연령미상) is done, taken time : 12.90702509880066
치히로 (28) is done, taken time : 18.300533294677734
정하영 (18) is done, taken time : 14.41082501411438
유솔이 (30) is done, taken time : 16.200976848602295
남지

KeyboardInterrupt: 