## 29cm 크롤링

- product_info
    - main_category
    - sub_category
    - gender: 
    - name: itemName
    - price: consumerPrice
    - quantity: 크롤링
    - brand_name_kr, brand_name_en: frontBrandNameKor, frontBrandNameEng
    - size: 크롤링
    - color: 크롤링
    - fee: 
    - image: https://img.29cm.co.kr/{imageUrl}
    - code:
    - url: https://product.29cm.co.kr/catalog/{item_no}
    - detail_images: 이미지 url 리스트
    - detail_html: 상품 상세 정보 html
    - reviews: product_reviews

- product_reviews
    - count: 댓글 개수
    - average_point: 평균 점수
    - contents: 댓글 내용
    - created_at: 게시일
    - images: 이미지 리스트 (있으면 가져오고 없으면 안 가져오기), 없으면 null
    - point: 점수 (있으면 가져오고 없으면 안 가져오기) int, 없으면 null
    - productOption: 상품 상세 정보 [{'color', 'size'}] 없으면 null
    - userSize: ['키', '몸무게'] 없으면 null

In [47]:
!pip install selenium
!pip install pymongo
!pip install python-dotenv

import requests
import json
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pickle
import re
import os
from dotenv import load_dotenv
import pymongo
import time



In [48]:
def _fetch_twentyninecm_json(url, params):
    """Send a GET request and return the decoded JSON body.

    Returns None (after printing the reason) when the request fails, the
    server answers with an HTTP error status, or the body is not valid JSON.
    """
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error: {err}")
    except (requests.exceptions.RequestException, ValueError) as err:
        print(f"Request Error: {err}")
    return None


def get_twentyninecm_products_list(categoryLargeCode, categoryMediumCode, categorySmallCode, mainCategory, subCategory):
    """Collect every product of one 29cm category together with its details and reviews.

    The search API is queried twice: first for two items, only to read
    ``productsTotalCount``, then again with ``count`` set to that total so the
    whole category comes back in one response.

    Args:
        categoryLargeCode: Large category code; also used to derive ``gender``.
        categoryMediumCode: Medium category code.
        categorySmallCode: Small category code ('' when the category has none).
        mainCategory: Value stored under ``main_category``.
        subCategory: Value stored under ``sub_category``.

    Returns:
        A list of product dicts. Products whose detail page could not be
        crawled are skipped. The list is empty when the API request fails or
        the category has no products. ``reviews`` is always a dict with the
        keys ``count``, ``average_point`` and ``reviews``.
    """
    print(f"{categorySmallCode} crawling...")
    products_list = []
    url = 'https://search-api.29cm.co.kr/api/v4/products/category/'
    params = {
        'categoryLargeCode': categoryLargeCode,
        'categoryMediumCode': categoryMediumCode,
        'categorySmallCode': categorySmallCode,
        'count': 2,
        'page': 1,
        'sort': 'latest',
        'init': 'T',
        'excludeSoldOut': False,
    }
    # Requesting more than 5000 reviews at once makes the review API answer
    # with a 500 error, so a single request is capped at this size.
    max_reviews_per_request = 5000

    # Probe request: fetch only two products to learn productsTotalCount.
    results = _fetch_twentyninecm_json(url, params)
    if results is None:
        return products_list
    total_count = (results.get('data') or {}).get('productsTotalCount', 0)
    if not total_count:
        return products_list

    # Full request: ask for every product of the category at once.
    params['count'] = total_count
    results = _fetch_twentyninecm_json(url, params)
    if results is None:
        return products_list
    products = (results.get('data') or {}).get('products') or []
    if not products:
        return products_list

    # Large category codes of 272000000 and above are tagged 'male'; every
    # other code is tagged 'female'.
    # NOTE(review): confirm this mapping against 29cm's category codes.
    gender = 'male' if (categoryLargeCode // 1000000) >= 272 else 'female'

    for product in products:
        itemNo = product.get('itemNo', None)
        reviewCount = product.get('reviewCount', None)

        # Crawl the product detail page (options, detail images, detail HTML).
        detail_info = crawling_twentyninecm_product_info(itemNo)

        # Skip products without a detail page.
        if not detail_info:
            continue

        if reviewCount:
            # The helper returns a dict, so it is stored as-is rather than
            # concatenated onto a list (which would keep only the dict keys).
            # This call retrieves only the first page, i.e. at most
            # max_reviews_per_request reviews.
            product_reviews = get_twentyninecm_reviews_list(
                itemNo, min(reviewCount, max_reviews_per_request)
            )
        else:
            # Same shape as the helper's result so the stored field is uniform.
            product_reviews = {'count': 0, 'average_point': 0, 'reviews': []}

        image_path = product.get('imageUrl', None)

        products_list.append({
            'main_category': mainCategory,
            'sub_category': subCategory,
            'gender': gender,
            'name': product.get('itemName', None),
            'price': product.get('consumerPrice', None),
            'brand_name_kr': product.get('frontBrandNameKor', None),
            'brand_name_en': product.get('frontBrandNameEng', None),
            # Avoid producing ".../None" when the API gives no image path.
            'image': f"https://img.29cm.co.kr/{image_path}" if image_path else None,
            'url': f"https://product.29cm.co.kr/catalog/{itemNo}",
            'color': detail_info['color'],
            'size': detail_info['size'],
            'detail_images': detail_info['detail_images'],
            'detail_html': detail_info['detail_html'],
            'reviews': product_reviews,
        })

    return products_list

In [49]:
def crawling_twentyninecm_product_info(itemNo):
    """Crawl the 29cm product page for option values and detail content.

    Uses the module-level Selenium ``driver`` to open the product page, walks
    the option dropdowns one by one, and then reads the detail section.

    Args:
        itemNo: Product number used to build the catalog URL.

    Returns:
        A dict with the keys ``color``, ``size``, ``detail_images`` and
        ``detail_html`` (plus one key per additional option name found on the
        page, lower-cased), or None when the product page cannot be fetched.
    """
    # Imported here so the function brings its own exception type into scope.
    from selenium.common.exceptions import WebDriverException

    detail_info = {'color': [], 'size': [], 'detail_images': [], 'detail_html': ''}
    url = f'https://product.29cm.co.kr/catalog/{itemNo}'

    # Check that the product exists (a missing product answers with 404).
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        return None

    driver.get(url)

    # Walk the option dropdowns (color, size, ...) until no more are found.
    i = 0
    while True:
        i += 1
        try:
            # Locate the i-th dropdown input and open it.
            option_selector = f'div.css-129gw94.e1yaqq956 > div > div:nth-child({i}) > div > input'
            option_element = driver.find_element(By.CSS_SELECTOR, option_selector)
            option_element.click()

            # Collect the <li> entries of the opened dropdown list.
            ul_selector = 'ul.css-1sxz8vl.e15gsm0h4'
            ul_element = driver.find_element(By.CSS_SELECTOR, ul_selector)
            li_elements = ul_element.find_elements(By.TAG_NAME, 'li')

            # The first entry is the option name; the rest are its values.
            option_name = li_elements[0].text.lower()
            value_elements = li_elements[1:]
            option_values = [li.text for li in value_elements]
            detail_info[option_name] = option_values

            # A color has to be selected before the size options appear, so
            # pick the first color that is not sold out ('품절').
            selectable = None
            if option_name == 'color':
                for li, text in zip(value_elements, option_values):
                    if '품절' not in text:
                        selectable = li
                        break

            if selectable is not None:
                selectable.click()
            else:
                # Nothing to select: click the input again to close the dropdown.
                option_element.click()

        # No further dropdown (or the page did not behave as expected).
        except (WebDriverException, IndexError):
            break

    # Click the "show more" button of the detail section; it may be absent,
    # in which case the detail section is read as it is.
    detail_button_selector = 'button.efgb0b60.css-h7utre.e12h9sp60'
    try:
        driver.find_element(By.CSS_SELECTOR, detail_button_selector).click()
    except WebDriverException:
        pass

    # Read the div that holds the product detail content.
    detail_selector = 'div.e1jr1djm0.css-1wvn7e9.e1esfft0'
    try:
        detail_element = driver.find_element(By.CSS_SELECTOR, detail_selector)
        detail_html = detail_element.get_attribute("innerHTML") or ''
    except WebDriverException:
        # No detail section found: keep the empty defaults.
        return detail_info

    detail_info['detail_html'] = detail_html

    # Parse the detail HTML and keep the src of every <img> that has one.
    soup = BeautifulSoup(detail_html, 'html.parser')
    for img_tag in soup.find_all('img'):
        img_src = img_tag.get('src')
        if img_src:
            detail_info['detail_images'].append(img_src)

    return detail_info

In [50]:
def get_twentyninecm_reviews_list(itemNo, reviewCount, page=0):
    """Fetch one page of reviews for a product from the 29cm review API.

    Args:
        itemNo: Product number, sent as ``itemId``.
        reviewCount: Number of reviews to request, sent as ``size``.
        page: Page index to request. Defaults to 0, the value that was
            previously hard-coded.

    Returns:
        A dict with ``count``, ``average_point`` and ``reviews``. Each review
        is a dict with ``contents``, ``created_at``, ``images``, ``point``,
        ``productOption`` and ``userSize``. When the request fails or the
        response cannot be decoded, the dict holds zero counts and no reviews.
    """
    url = 'https://review-api.29cm.co.kr/api/v4/reviews/'
    params = {
        'itemId': itemNo,
        'page': page,
        'size': reviewCount,
    }
    product_reviews = {
        'count': 0,
        'average_point': 0,
        'reviews': []
    }

    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        results = response.json()
    except requests.exceptions.HTTPError as err:
        print(f"HTTP Error: {err}")
        return product_reviews
    except (requests.exceptions.RequestException, ValueError) as err:
        print(f"Request Error: {err}")
        return product_reviews

    # `or {}` / `or []` guard against fields that are present but null.
    reviews = results.get('data') or {}
    product_reviews['count'] = reviews.get('count', 0)
    product_reviews['average_point'] = reviews.get('averagePoint', 0)

    # Option strings carry their values in square brackets; the first bracket
    # is stored as color and the second as size.
    bracket_pattern = re.compile(r'\[(.*?)\]')

    for review in reviews.get('results') or []:
        # Review image URLs; entries without a path are skipped.
        images = [
            f"https://img.29cm.co.kr/{file.get('url')}"
            for file in review.get('uploadFiles') or []
            if file.get('url')
        ]

        # Purchased options of the reviewed product.
        product_option = []
        for option in review.get('optionValue') or []:
            matches = bracket_pattern.findall(option)
            product_option.append({
                'color': matches[0] if matches else None,
                'size': matches[1] if len(matches) > 1 else None,
            })

        product_reviews['reviews'].append({
            'contents': review.get('contents', None),
            'created_at': review.get('insertTimestamp', None),
            'images': images,
            'point': review.get('point', None),
            'productOption': product_option,
            'userSize': review.get('userSize', None)
        })

    return product_reviews

In [51]:
def save_to_mongodb(productList):
    """Insert the given product documents into the ``29cm.products`` collection.

    The password is read from the ``MONGODB_PASSWORD`` environment variable
    (loaded from a .env file when present). Failures are printed rather than
    raised so a failed save does not stop the caller.

    Args:
        productList: List of product dicts to insert. An empty list is
            skipped, because ``insert_many`` rejects empty input.

    Returns:
        None.
    """
    if not productList:
        print("No products to save to MongoDB.")
        return

    load_dotenv()
    password = os.getenv("MONGODB_PASSWORD")
    if password is None:
        print("An error occurred while saving to MongoDB: MONGODB_PASSWORD is not set")
        return

    # NOTE(review): a password containing URI-reserved characters must be
    # percent-escaped before it is placed in the connection string.
    uri = f"mongodb+srv://root:{password}@cluster0.stojj99.mongodb.net/?retryWrites=true&w=majority&appName=Cluster"

    try:
        # The context manager closes the client (and its connections) on exit,
        # so repeated calls do not accumulate open clients.
        with pymongo.MongoClient(uri) as client:
            collection = client["29cm"]["products"]
            collection.insert_many(productList)
        print("Product info saved successfully to MongoDB!")
    except Exception as e:
        print(f"An error occurred while saving to MongoDB: {e}")

In [None]:
# Record the start time.
start_time = time.time()

# Load the category tree.
# NOTE: pickle.load can execute arbitrary code; only load a file you created yourself.
with open("29cm_category.pkl", "rb") as f:
    category = pickle.load(f)

# Configure Chrome to run headless.
chrome_options = Options()
chrome_options.add_argument('--headless')

# Start the WebDriver; crawling_twentyninecm_product_info uses this module-level `driver`.
driver = webdriver.Chrome(options=chrome_options)

try:
    for category1 in category:
        categoryLargeCode = category1.get('category_code', 0)

        for category2 in category1['category2']:
            categoryMediumCode = category2.get('category_code', 0)

            if category2.get('category3'):  # crawl each small category when any exist
                for category3 in category2['category3']:
                    categorySmallCode = category3['category_code']
                    mainCategory, subCategory = category3['main_category'], category3['sub_category']
                    products_list = get_twentyninecm_products_list(categoryLargeCode, categoryMediumCode, categorySmallCode, mainCategory, subCategory)
                    save_to_mongodb(products_list)
            else:
                # No small categories: crawl the medium category with an empty small code.
                mainCategory, subCategory = category2['main_category'], category2['sub_category']
                products_list = get_twentyninecm_products_list(categoryLargeCode, categoryMediumCode, '', mainCategory, subCategory)
                save_to_mongodb(products_list)

    # Record the end time and report the elapsed time.
    end_time = time.time()
    execution_time = end_time - start_time
    print("Total elapsed time: {:.2f} seconds".format(execution_time))
finally:
    # Always shut the browser down, even when crawling fails midway.
    driver.quit()

268103101 crawling...
