### json file 관련 함수 

- save_json : 데이터를 json으로 저장
- load_json : json 파일 불러오기
- 경로를 따로 지정하지 않으면 현재 작업 폴더(작성 당시 'data' 폴더) 기준으로 저장/로드됨.

In [2]:
import os
import json

def save_json(data, filename):
    """Serialize *data* as indented UTF-8 JSON and write it to *filename*.

    The JSON text is built completely before the file is opened, so a
    serialization failure raises without truncating or partially overwriting
    a file that already exists at *filename*.

    Args:
        data: Any JSON-serializable object.
        filename: Path of the file to create or overwrite.

    Raises:
        TypeError: If *data* contains a value ``json`` cannot serialize.
        ValueError: If *data* contains a circular reference.
        OSError: If the file cannot be opened or written.
    """
    # ensure_ascii=False keeps non-ASCII text (e.g. Korean) readable on disk.
    text = json.dumps(data, indent=4, ensure_ascii=False)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)

def load_json(filename):
    """Read and parse the UTF-8 JSON file at *filename*.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when no file exists at *filename*.
    """
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as source:
            parsed = json.load(source)
        return parsed
    # A missing file is reported as None rather than as an exception.
    return None

### 마켓컬리 식품 항목 상품 크롤링

- 큰 카테고리 : "대분류", 그 다음 카테고리 : "소분류" 라고 정의

In [None]:
import time
import logging
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Logging configuration: emit INFO and above through a StreamHandler, with
# each record formatted as "<timestamp> [<LEVEL>] : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] : %(message)s',
    handlers=[logging.StreamHandler()]
)

# Category IDs to crawl. crawl_products_kurly substitutes each one into
# https://www.kurly.com/categories/{category_id}. They are kept as strings,
# which preserves the leading zeros of IDs such as '018' and '032'.
CATEGORY_IDS = ['722', '251','907', '908', '909', '910', '911', '912', '913', '914', '383', '249', '915', '018', '032']

# Login cookies
def login_and_save_cookies():
    """Open Chrome for a manual login and save the resulting cookies.

    The browser is started without the headless flag and pointed at the
    Kurly login page. The function then blocks on ``input()`` until the
    operator presses Enter, after which the browser's cookies are written to
    ``cookies_kurly.json`` via ``save_json``. The browser is always closed,
    even when an error occurs.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_service = Service(ChromeDriverManager().install())
    browser = webdriver.Chrome(service=chrome_service, options=chrome_options)

    try:
        browser.get("https://www.kurly.com/member/login")
        logging.info("브라우저가 열렸습니다. 로그인 후 콘솔에 Enter를 입력하세요.")
        # Wait here until the operator confirms the manual login is finished.
        input("로그인 후 Enter 키를 누르세요...")

        session_cookies = browser.get_cookies()
        save_json(session_cookies, "cookies_kurly.json")
        logging.info("쿠키 저장 완료")
    finally:
        browser.quit()

# Crawling
def crawl_products_kurly(cookies):
    """Crawl product names and URLs for every category in ``CATEGORY_IDS``.

    A headless Chrome session is started, the given cookies are installed,
    and each main category page is visited. For every sub-category link
    except the first list item, the result pages are walked one by one; each
    product link is opened to read its name and URL. A product reached from
    several sub-categories is stored once, with every sub-category name
    collected in its ``'소분류'`` list.

    Progress is written to ``crawling_products_kurly.json`` and
    ``crawling_error_products_kurly.json`` after each sub-category, and once
    more when the function exits (including on interruption) if anything was
    collected.

    Args:
        cookies: Iterable of cookie dicts, e.g. the list saved by
            ``login_and_save_cookies``. The dicts are not modified.

    Returns:
        A list of dicts with the keys ``'상품명'`` (name), ``'url'``,
        ``'대분류'`` (main category) and ``'소분류'`` (list of sub-category
        names).
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument("window-size=1920,1080")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    all_products = []
    error_products = []
    # URL -> product dict (the same objects stored in all_products), so the
    # duplicate check is a dict lookup instead of a scan over all products.
    products_by_url = {}
    # Mirror of error_products for O(1) membership tests; the list keeps the
    # insertion order and is what gets written to JSON.
    seen_error_urls = set()
    # Stop paging a sub-category after this many consecutive pages without
    # any product link, so a missing "no products" marker cannot cause an
    # endless loop while a single slow page is still tolerated.
    max_empty_pages = 3

    try:
        # Open the site before adding cookies, so the browser is already on
        # the target domain when they are installed.
        driver.get("https://www.kurly.com")
        time.sleep(2)

        for cookie in cookies:
            # Install a filtered copy without 'sameSite' and 'domain' so the
            # caller's cookie dicts are left untouched.
            driver.add_cookie({
                key: value
                for key, value in cookie.items()
                if key not in ('sameSite', 'domain')
            })

        driver.get("https://www.kurly.com/main")
        time.sleep(2)

        for category_id in CATEGORY_IDS:
            main_base_url = f'https://www.kurly.com/categories/{category_id}'
            driver.get(main_base_url)
            time.sleep(2)

            try:
                main_category = driver.find_element(By.XPATH, '//*[@id="container"]/h3').text.strip()
                logging.info(f'대분류({category_id}) - {main_category}')

                sub_category_count = len(driver.find_elements(By.CSS_SELECTOR, '#container > ul > li'))
                # Starts at the second <li>; presumably the first entry is an
                # "all products" link -- TODO confirm against the live page.
                for i in range(2, sub_category_count + 1):
                    try:
                        sub_xpath = f'//*[@id="container"]/ul/li[{i}]/a'
                        sub_element = driver.find_element(By.XPATH, sub_xpath)
                        sub_category = sub_element.text.strip()
                        logging.info(f'소분류: {sub_category}')

                        sub_element.click()
                        time.sleep(2)

                        sub_base_url = driver.current_url.split('&page=')[0]
                        # Append the page parameter with '?' when the base URL
                        # has no query string yet, otherwise with '&'.
                        separator = '&' if '?' in sub_base_url else '?'
                        page = 1
                        empty_pages = 0

                        while True:
                            page_url = f"{sub_base_url}{separator}page={page}"
                            driver.get(page_url)
                            time.sleep(2)

                            if "상품이 없습니다" in driver.page_source:
                                logging.info("더 이상 상품 없음.")
                                break

                            logging.info(f'{page}페이지 수집 중...')

                            # Sub-categories whose name starts with "더퍼플"
                            # use a different container index for the list.
                            if sub_category.startswith("더퍼플"):
                                product_xpath_base = '//*[@id="container"]/div[3]/div[2]/div[2]/a'
                            else:
                                product_xpath_base = '//*[@id="container"]/div[2]/div[2]/div[2]/a'

                            product_elements = driver.find_elements(By.XPATH, product_xpath_base)

                            if product_elements:
                                empty_pages = 0
                            else:
                                empty_pages += 1
                                if empty_pages >= max_empty_pages:
                                    logging.warning(
                                        f'상품 목록이 {max_empty_pages}페이지 연속 비어 있어 소분류 수집 중단'
                                    )
                                    break

                            for idx in range(len(product_elements)):
                                try:
                                    # Re-locate the links on every iteration:
                                    # the page is left and re-entered for each
                                    # product, so earlier references are not
                                    # reused.
                                    product_elements = driver.find_elements(By.XPATH, product_xpath_base)
                                    product = product_elements[idx]
                                    driver.execute_script("arguments[0].scrollIntoView();", product)
                                    product.click()
                                    time.sleep(2)

                                    product_url = driver.current_url
                                    product_name = driver.find_element(
                                        By.XPATH,
                                        '//*[@id="product-atf"]/section/div[1]/div[1]/div[2]/h1'
                                    ).text.strip()

                                    # Check whether the product was already
                                    # collected (keyed by product URL).
                                    existing = products_by_url.get(product_url)

                                    if existing:
                                        if sub_category not in existing['소분류']:
                                            existing['소분류'].append(sub_category)
                                    else:
                                        new_product = {
                                            '상품명': product_name,
                                            'url': product_url,
                                            '대분류': main_category,
                                            '소분류': [sub_category]
                                        }
                                        all_products.append(new_product)
                                        products_by_url[product_url] = new_product

                                    driver.back()
                                    time.sleep(2)

                                except Exception as e:
                                    logging.warning(f'상품 오류: {e}')

                                    # Record the URL the browser was on when
                                    # the product failed, once per URL.
                                    error_url = driver.current_url
                                    if error_url not in seen_error_urls:
                                        seen_error_urls.add(error_url)
                                        error_products.append(error_url)
                                    # Return to the listing page so the next
                                    # product can be located.
                                    driver.get(page_url)
                                    time.sleep(1)

                            page += 1

                        # Save progress whenever one sub-category is finished.
                        save_json(all_products, "crawling_products_kurly.json")
                        save_json(error_products, "crawling_error_products_kurly.json")
                        logging.info(f'현재까지 {len(all_products)}개 저장/{len(error_products)}개 오류')


                        driver.get(main_base_url)
                        time.sleep(2)

                    except Exception as e:
                        logging.error(f'소분류 {i} 오류: {e}')
                        driver.get(main_base_url)
                        time.sleep(2)

            except Exception as e:
                logging.error(f'대분류 {category_id} 오류: {e}')

    finally:
        try:
            # Persist whatever was collected, so an interruption in the middle
            # of a sub-category does not discard the products gathered since
            # the last save. Skipped when nothing was collected, to avoid
            # replacing earlier result files with empty lists.
            if all_products or error_products:
                save_json(all_products, "crawling_products_kurly.json")
                save_json(error_products, "crawling_error_products_kurly.json")
        finally:
            driver.quit()

    return all_products

# Run: capture cookies through a manual login, reload them from
# cookies_kurly.json, crawl with them, then log the number of products
# returned by the crawl.
if __name__ == "__main__":
    login_and_save_cookies()  
    cookies = load_json("cookies_kurly.json")  
    result = crawl_products_kurly(cookies)     
    logging.info(f"전체 크롤링 완료 - 총 {len(result)}개 상품 저장")

2025-06-12 22:47:06,622 [INFO] : Get LATEST chromedriver version for google-chrome
2025-06-12 22:47:06,648 [INFO] : Get LATEST chromedriver version for google-chrome
2025-06-12 22:47:06,675 [INFO] : Driver [C:\Users\User\.wdm\drivers\chromedriver\win64\137.0.7151.70\chromedriver.exe] found in cache
2025-06-12 22:47:10,417 [INFO] : 브라우저가 열렸습니다. 로그인 후 콘솔에 Enter를 입력하세요.
2025-06-12 22:47:24,212 [INFO] : 쿠키 저장 완료
2025-06-12 22:47:27,067 [INFO] : Get LATEST chromedriver version for google-chrome
2025-06-12 22:47:27,104 [INFO] : Get LATEST chromedriver version for google-chrome
2025-06-12 22:47:27,130 [INFO] : Driver [C:\Users\User\.wdm\drivers\chromedriver\win64\137.0.7151.70\chromedriver.exe] found in cache
2025-06-12 22:47:39,278 [INFO] : 대분류(722) - 와인·위스키·데낄라
2025-06-12 22:47:39,302 [INFO] : 소분류: ✨라빈리커 픽업 (NEW)✨
2025-06-12 22:47:43,894 [INFO] : 1페이지 수집 중...


KeyboardInterrupt: 