### 뷰티컬리 - 스킨/토너 100개 제품 리뷰 크롤링

In [None]:
# Import required libraries
import requests
import json

In [None]:
# Example: fetch one page (page=1, per_page=100) of products from
# beauty category 167001 and keep the "data" payload.
url = 'https://api.kurly.com/collection/v2/home/sites/beauty/product-categories/167001/products?sort_type=4&page=1&per_page=100&filters='
# A timeout keeps the cell from hanging forever on a stalled connection;
# raise_for_status() surfaces HTTP errors instead of failing later on a
# missing 'data' key.
req = requests.get(url, timeout=10)
req.raise_for_status()
print(req.content.decode('UTF-8'))

product_info = req.json()['data']

In [None]:
# Display how many product records the request returned.
len(product_info)

In [None]:
# Persist product_info as pretty-printed UTF-8 JSON (non-ASCII kept as-is).
serialized_product_info = json.dumps(product_info, ensure_ascii=False, indent=2)
with open("skin_product_info_example.json", "w", encoding="utf-8") as f:
    f.write(serialized_product_info)

#### 상품 number로 상품 고시 정보 크롤링

In [None]:
# Display the 'no' field of the first product record.
product_info[0]['no']

In [None]:
# Collect the 'no' field of every product record, in response order.
product_no = [item['no'] for item in product_info]

In [None]:
# Fetch the showroom record for every product number and keep each
# response's "data" payload, in the same order as product_no.
product_notice_data = []
for no in product_no:
    url = f"https://api.kurly.com/showroom/v2/products/{no}?join_order_code="
    # Timeout so one stalled request cannot hang the whole loop;
    # raise_for_status() stops on HTTP errors instead of a confusing KeyError.
    req = requests.get(url, timeout=10)
    req.raise_for_status()
    product_notice = req.json()
    product_notice_data.append(product_notice['data'])

In [None]:
# Persist product_notice_data as pretty-printed UTF-8 JSON.
serialized_product_notice = json.dumps(product_notice_data, ensure_ascii=False, indent=2)
with open("skin_product_notice_example.json", "w", encoding="utf-8") as f:
    f.write(serialized_product_notice)

#### 상품 number와 after 파라미터 리스트를 이용한 상품 리뷰 크롤링

In [None]:
# Example: fetch a single review page for one product using an explicit
# `after` cursor, then display the "data" payload.
url = "https://api.kurly.com/product-review/v3/contents-products/1000319181/reviews?sortType=RECOMMEND&size=10&onlyImage=false&after=MC4wOjE3NjkwNDY0MjkwMDA%3D&filters="
# Timeout + status check: fail fast and loudly instead of hanging or
# parsing an error body.
req = requests.get(url, timeout=10)
req.raise_for_status()
reviews = req.json()
reviews['data']

In [None]:
# Example: walk the review endpoint of one product and collect the `after`
# cursor reported by each page.
PRODUCT_NO = 1000319181
URL = f"https://api.kurly.com/product-review/v3/contents-products/{PRODUCT_NO}/reviews"

headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    # Add Cookie / Authorization here if needed
}

params_base = {
    "sortType": "RECOMMEND",
    "size": 10,
    "onlyImage": "false",
    "filters": "",
}

after_list = []
after = None

for i in range(1000):  # safety cap on the number of iterations
    params = dict(params_base)
    # The first request is sent without `after`; later ones pass the cursor
    # returned by the previous response.
    if after:
        params["after"] = after

    # Timeout so a stalled connection cannot hang the loop indefinitely.
    res = requests.get(URL, headers=headers, params=params, timeout=10)
    res.raise_for_status()

    data = res.json()

    # Key step: extract the cursor for the next page
    after = data["meta"]["pagination"]["after"]
    # NOTE: the final value appended is None (the end-of-pages marker).
    after_list.append(after)

    print(f"{i+1}번째 after:", after)

    # Stop once the response reports no further cursor
    if after is None:
        break

In [None]:
# Shallow copy so the crawler's input list is independent of product_no.
product_no_list = list(product_no)

In [None]:
# 상품 리뷰 전체 크롤링
import requests
import json
import time
from pathlib import Path
from typing import List, Dict, Optional
import logging
from datetime import datetime

# Logging setup: send records to both a log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8 so the Korean log messages are written correctly
        # regardless of the platform's default file encoding.
        logging.FileHandler('kurly_crawling.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)

class KurlyReviewCrawler:
    """Crawl product reviews from the Kurly review API with checkpointing.

    Each product's reviews are written to ``product_<no>.json`` inside
    ``output_dir``; a checkpoint file in the same directory records which
    product numbers completed or failed so an interrupted run can resume.
    """

    # Review endpoint; `{product_no}` is filled in per request.
    REVIEW_URL_TEMPLATE = "https://api.kurly.com/product-review/v3/contents-products/{product_no}/reviews"

    def __init__(self,
                 product_no_list: List[int],
                 output_dir: str = "kurly_reviews",
                 checkpoint_file: str = "checkpoint.json",
                 request_delay: float = 0.5,
                 product_delay: float = 1,
                 retry_backoff: float = 5,
                 timeout: float = 10):
        """
        Args:
            product_no_list: Product numbers to crawl.
            output_dir: Directory for per-product results and the checkpoint.
            checkpoint_file: File name (inside ``output_dir``) used to store
                progress.
            request_delay: Seconds to sleep between consecutive page requests.
            product_delay: Seconds to sleep between consecutive products.
            retry_backoff: Base wait in seconds between retries; attempt ``k``
                (1-based) waits ``k * retry_backoff``.
            timeout: Per-request timeout in seconds.
        """
        self.product_no_list = product_no_list
        self.output_dir = Path(output_dir)
        # parents=True so a nested output path works on first use.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.checkpoint_file = self.output_dir / checkpoint_file

        self.request_delay = request_delay
        self.product_delay = product_delay
        self.retry_backoff = retry_backoff
        self.timeout = timeout

        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "Accept": "application/json",
        }

        self.base_params = {
            "sortType": "RECOMMEND",
            "size": 10,
            "onlyImage": "false",
            "filters": "",
        }

        # Load progress from a previous run, if any.
        self.checkpoint = self._load_checkpoint()

    @staticmethod
    def _write_json(path: Path, payload) -> None:
        """Write ``payload`` as UTF-8 JSON to ``path`` via a temp file.

        The data is written to ``<name>.tmp`` first and then moved over the
        target, so an interruption mid-write does not leave a truncated file
        at ``path``.
        """
        tmp_path = path.with_name(path.name + ".tmp")
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)
        tmp_path.replace(path)

    @staticmethod
    def _next_cursor(payload: Dict) -> Optional[str]:
        """Return ``meta.pagination.after`` from a response, or None if absent."""
        meta = payload.get("meta") or {}
        pagination = meta.get("pagination") or {}
        return pagination.get("after")

    def _load_checkpoint(self) -> Dict:
        """Load the saved checkpoint, or return an empty one if none exists."""
        if not self.checkpoint_file.exists():
            return {"completed": [], "failed": []}

        with open(self.checkpoint_file, 'r', encoding='utf-8') as f:
            checkpoint = json.load(f)
        # Guarantee both keys so later code can index them directly.
        checkpoint.setdefault("completed", [])
        checkpoint.setdefault("failed", [])
        logging.info(f"체크포인트 로드: {len(checkpoint.get('completed', []))}개 제품 완료")
        return checkpoint

    def _save_checkpoint(self):
        """Persist the current progress to the checkpoint file."""
        self._write_json(self.checkpoint_file, self.checkpoint)

    def _fetch_page(self, product_no: int, after: Optional[str]) -> Dict:
        """Request one review page and return the decoded JSON body.

        Args:
            product_no: Product number whose reviews are requested.
            after: Cursor from the previous page; falsy for the first page.

        Raises:
            requests.exceptions.RequestException: On connection errors,
                timeouts, or a non-success HTTP status.
        """
        url = self.REVIEW_URL_TEMPLATE.format(product_no=product_no)
        params = dict(self.base_params)
        if after:
            params["after"] = after

        res = requests.get(url, headers=self.headers, params=params, timeout=self.timeout)
        res.raise_for_status()
        return res.json()

    def _get_after_list(self, product_no: int, max_pages: int = 1000) -> List[Optional[str]]:
        """Collect the `after` cursor reported by each page of a product.

        Best-effort: a request error is logged and ends the collection early,
        returning whatever was gathered. On a complete walk the last element
        is None (the end-of-pages marker).
        """
        after_list = []
        after = None

        for i in range(max_pages):
            try:
                data = self._fetch_page(product_no, after)
            except requests.exceptions.RequestException as e:
                logging.warning(f"Product {product_no} - after 수집 중 에러 (페이지 {i+1}): {e}")
                time.sleep(2)  # wait longer after an error
                break

            after = self._next_cursor(data)
            after_list.append(after)

            logging.debug(f"Product {product_no} - Page {i+1}: after={after}")

            # No further cursor means this was the last page.
            if after is None:
                break

            # Delay between requests to avoid rate limiting.
            time.sleep(self.request_delay)

        logging.info(f"Product {product_no}: {len(after_list)}개 페이지 발견")
        return after_list

    def _get_reviews_by_after(self, product_no: int, after: Optional[str]) -> List[Dict]:
        """Return the reviews of the page addressed by ``after``.

        Best-effort: a request error is logged and an empty list is returned.
        """
        try:
            data = self._fetch_page(product_no, after)
        except requests.exceptions.RequestException as e:
            logging.error(f"Product {product_no}, after={after}: 리뷰 수집 실패 - {e}")
            return []
        return data.get("data", [])

    def crawl_product(self, product_no: int, retry_count: int = 3, max_pages: int = 1000) -> Dict:
        """Collect every review of one product and save it to its own file.

        Pages are walked in a single pass: each response supplies both the
        reviews and the cursor for the next page, so every page is requested
        once and reviews are stored in the order the API returns them. When a
        request fails, the walk resumes from the failed page on the next
        attempt instead of silently dropping it.

        Args:
            product_no: Product number to crawl.
            retry_count: Maximum number of attempts (values below 1 are
                treated as 1).
            max_pages: Safety cap on the number of pages requested.

        Returns:
            On success, a dict with ``product_no``, ``reviews``,
            ``after_count`` (pages fetched), ``total_reviews`` and
            ``crawled_at``. After the final failed attempt, a dict with
            ``product_no``, empty ``reviews`` and ``error``; nothing is saved
            in that case.
        """
        logging.info(f"=== Product {product_no} 크롤링 시작 ===")

        attempts = max(1, retry_count)  # always try at least once, never return None
        all_reviews: List[Dict] = []
        after: Optional[str] = None
        page_count = 0
        finished = False  # set once the API reports no further cursor

        for attempt in range(attempts):
            try:
                while not finished and page_count < max_pages:
                    payload = self._fetch_page(product_no, after)
                    reviews = payload.get("data") or []
                    next_after = self._next_cursor(payload)

                    # Mutate state only after the page was fully parsed, so a
                    # retry never re-adds a page that was already stored.
                    all_reviews.extend(reviews)
                    page_count += 1
                    after = next_after

                    logging.info(f"Product {product_no} - Page {page_count}: {len(reviews)}개 리뷰 수집")

                    if after is None:
                        finished = True
                    else:
                        # Delay between requests.
                        time.sleep(self.request_delay)

                if not finished:
                    logging.warning(f"Product {product_no}: 최대 페이지 수({max_pages})에 도달하여 수집을 중단함")

                result = {
                    "product_no": product_no,
                    "reviews": all_reviews,
                    "after_count": page_count,
                    "total_reviews": len(all_reviews),
                    "crawled_at": datetime.now().isoformat()
                }

                # Save the intermediate result (one file per product).
                self._save_product_data(product_no, result)

                logging.info(f"Product {product_no}: 총 {len(all_reviews)}개 리뷰 수집 완료")
                return result

            except Exception as e:  # boundary: log, back off, retry or report
                logging.error(f"Product {product_no} - Attempt {attempt+1}/{attempts} 실패: {e}")
                if attempt < attempts - 1:
                    wait_time = (attempt + 1) * self.retry_backoff
                    logging.info(f"{wait_time}초 대기 후 재시도...")
                    time.sleep(wait_time)
                else:
                    logging.error(f"Product {product_no}: 최종 실패")
                    return {"product_no": product_no, "reviews": [], "error": str(e)}

    def _save_product_data(self, product_no: int, data: Dict):
        """Save one product's data to ``product_<product_no>.json``."""
        filepath = self.output_dir / f"product_{product_no}.json"
        self._write_json(filepath, data)

    def crawl_all(self):
        """Crawl every product not yet completed, checkpointing after each."""
        completed = set(self.checkpoint.get("completed", []))
        failed = set(self.checkpoint.get("failed", []))

        # Skip products already completed; dict.fromkeys drops duplicate
        # product numbers while keeping the input order.
        remaining = [pno for pno in dict.fromkeys(self.product_no_list) if pno not in completed]

        logging.info(f"총 {len(self.product_no_list)}개 제품 중 {len(remaining)}개 크롤링 예정")
        logging.info(f"완료: {len(completed)}개, 실패: {len(failed)}개")

        for idx, product_no in enumerate(remaining):
            logging.info(f"\n진행률: {idx+1}/{len(remaining)} ({(idx+1)/len(remaining)*100:.1f}%)")

            result = self.crawl_product(product_no)

            # Test for the key itself: an exception with an empty message
            # yields error == "", which must still count as a failure.
            if "error" in result:
                failed.add(product_no)
            else:
                completed.add(product_no)
                # A product that failed in an earlier run has now recovered.
                failed.discard(product_no)

            self.checkpoint["completed"] = list(completed)
            self.checkpoint["failed"] = list(failed)

            # Save the checkpoint after every product.
            self._save_checkpoint()

            # Delay between products to limit server load (not after the last).
            if idx < len(remaining) - 1:
                time.sleep(self.product_delay)

        logging.info("\n=== 크롤링 완료 ===")
        logging.info(f"성공: {len(completed)}개")
        logging.info(f"실패: {len(failed)}개")

    def merge_all_reviews(self, output_file: str = "all_reviews.json"):
        """Merge every per-product file into a single JSON list.

        Args:
            output_file: Name of the merged file inside ``output_dir``.

        Returns:
            Path of the merged file.
        """
        output_path = self.output_dir / output_file
        all_data = []

        # Sorted for a deterministic merge order.
        for json_file in sorted(self.output_dir.glob("product_*.json")):
            # Never re-ingest a previous merge result whose name happens to
            # match the per-product pattern.
            if json_file == output_path:
                continue
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    all_data.append(json.load(f))
            except (OSError, ValueError) as e:
                logging.error(f"파일 로드 실패 {json_file}: {e}")

        self._write_json(output_path, all_data)

        total_reviews = sum(d.get("total_reviews", 0) for d in all_data)
        logging.info(f"병합 완료: {len(all_data)}개 제품, 총 {total_reviews}개 리뷰")
        logging.info(f"저장 위치: {output_path}")

        return output_path


# Entry point: build the crawler from product_no_list, crawl everything,
# then merge the per-product files into one JSON file.
if __name__ == "__main__":
    # Initialize the crawler
    crawler = KurlyReviewCrawler(
        product_no_list=product_no_list,
        output_dir="kurly_reviews_data",
        checkpoint_file="checkpoint.json"
    )
    
    # Run the crawl
    crawler.crawl_all()
    
    # Merge all results
    crawler.merge_all_reviews("all_kurly_reviews.json")