# In [None]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
import time
from datetime import datetime
from urllib3.exceptions import ProtocolError
import requests.exceptions
import os

def process_keywords(keywords_text):
    """Split a run of keyword phrases that each end in "요" into a comma-separated string.

    The text is cut at every "요"; each non-empty fragment that was followed by
    "요" gets the "요" re-attached. Any non-empty text left after the final "요"
    (for example a "+3" counter) is kept as-is rather than having a "요"
    appended that was never in the input.

    Args:
        keywords_text: Raw keyword text, or a falsy value (None / "").

    Returns:
        The phrases joined with ", ", or "" when the input is falsy or
        contains no non-blank fragment.
    """
    if not keywords_text:
        return ""

    # Every fragment except the last one was terminated by "요" in the input.
    *terminated, remainder = keywords_text.split("요")

    keywords = [part.strip() + "요" for part in terminated if part.strip()]

    # The tail was not followed by "요", so do not fabricate one for it.
    remainder = remainder.strip()
    if remainder:
        keywords.append(remainder)

    return ", ".join(keywords)

def retry_on_exception(func, max_attempts=3, delay=5):
    """Call ``func`` and retry it on network / WebDriver errors with a doubling wait.

    Args:
        func: Zero-argument callable to invoke.
        max_attempts: Maximum number of calls to make.
        delay: Seconds to sleep after the first failure; doubled after each
            subsequent failure.

    Returns:
        Whatever ``func`` returns on its first successful call. Returns None
        if ``max_attempts`` is less than 1 (``func`` is never called).

    Raises:
        The caught exception from the final attempt, re-raised unchanged.
        Exceptions of any other type propagate immediately without a retry.
    """
    wait_seconds = delay
    for attempt_no in range(1, max_attempts + 1):
        try:
            return func()
        except (ProtocolError, WebDriverException, requests.exceptions.RequestException) as err:
            # Out of attempts: let the caller see the last failure.
            if attempt_no == max_attempts:
                raise
            print(f"Error occurred: {err}. Retrying in {wait_seconds} seconds... (Attempt {attempt_no}/{max_attempts})")
            time.sleep(wait_seconds)
            wait_seconds *= 2

# CSS selectors used on the review page. The three page-level selectors share
# the same long prefix, so it is factored out; the resulting strings are
# identical to the literals they replace.
_REVIEW_SECTION = (
    "#app-root > div > div > div > div:nth-child(6) > div:nth-child(2) "
    "> div.place_section.k1QQ5"
)
_SORT_LATEST_SELECTOR = (
    _REVIEW_SECTION
    + " > div.place_section_content > div.fHbwT > div.mlywZ > span:nth-child(2) > a"
)
_REVIEW_ITEM_SELECTOR = _REVIEW_SECTION + " > div.place_section_content > ul > li"
_LOAD_MORE_SELECTOR = _REVIEW_SECTION + " > div.NSTUp > div > a"
_REVIEW_DATE_SELECTOR = "div.pui__QztK4Q > div.Vk05k > div > span:nth-child(1) > time"


def _is_before_cutoff(date_text, cutoff=(24, 5)):
    """Return True when a "YY.M.D..." date string falls before ``cutoff`` (year, month).

    Only strings whose first three dot-separated fields are numeric and whose
    first field has exactly two digits are interpreted; anything else returns
    False, i.e. is treated as being inside the collection window.
    """
    parts = date_text.strip().split(".")
    if len(parts) < 3:
        return False
    year, month, day = parts[0], parts[1], parts[2]
    if len(year) != 2 or not (year.isdecimal() and month.isdecimal() and day.isdecimal()):
        return False
    # Tuple comparison orders by year first, then month.
    return (int(year), int(month)) < cutoff


def _click_if_present(driver, parent, selector):
    """Click the first element under ``parent`` matching ``selector``, if there is one."""
    try:
        button = parent.find_element(By.CSS_SELECTOR, selector)
    except NoSuchElementException:
        return
    # JavaScript click, same mechanism used for every other click in this crawler.
    driver.execute_script("arguments[0].click();", button)
    time.sleep(1)


def get_reviews(url, original_row):
    """Open ``url`` in Chrome and collect the reviews dated 24.5 or later.

    The page is switched to the "latest" sort order, the "load more" button is
    clicked until the last loaded review is older than the cutoff (or the
    button disappears), and each remaining review's text, keywords and date
    are merged into a copy of ``original_row``.

    Args:
        url: Address of the review page to open.
        original_row: Dict of values to carry into every output row; it is
            copied, not modified.

    Returns:
        A pandas DataFrame with one row per collected review (the keys of
        ``original_row`` plus 'review', 'keywords' and 'date'). Empty when no
        review qualified.

    Raises:
        Any exception that escapes the retry wrapper is printed and re-raised.
        The browser is always closed before returning or raising.
    """
    driver = None
    try:
        def crawl_reviews():
            nonlocal driver

            # A retry re-enters this function. Close the browser left over
            # from the failed attempt so Chrome processes do not pile up.
            if driver is not None:
                try:
                    driver.quit()
                except WebDriverException as quit_error:
                    print(f"Error closing previous browser: {quit_error}")
                driver = None

            options = webdriver.ChromeOptions()
            for flag in (
                '--disable-gpu',
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-infobars',
                '--disable-extensions',
                '--disable-popup-blocking',
            ):
                options.add_argument(flag)

            driver = webdriver.Chrome(options=options)
            driver.set_page_load_timeout(30)
            driver.get(url)

            # 1. Click the "latest first" sort button once the app root exists.
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "#app-root"))
            )
            sort_button = WebDriverWait(driver, 20).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, _SORT_LATEST_SELECTOR))
            )
            driver.execute_script("arguments[0].click();", sort_button)
            time.sleep(2)

            # 2. Keep clicking "load more" until the oldest loaded review is
            #    before the cutoff or the button is gone.
            click_count = 0
            while True:
                try:
                    review_elements = driver.find_elements(By.CSS_SELECTOR, _REVIEW_ITEM_SELECTOR)

                    # With no reviews at all there is no "last" review to inspect;
                    # indexing [-1] would raise IndexError and fail the whole URL.
                    if not review_elements:
                        print("리뷰가 없습니다. 더보기 중단")
                        break

                    last_review_date = review_elements[-1].find_element(
                        By.CSS_SELECTOR, _REVIEW_DATE_SELECTOR
                    ).text

                    click_count += 1
                    print(f"더보기 {click_count}번째 클릭 - 마지막 리뷰 날짜: {last_review_date}")

                    # Stop as soon as anything older than the cutoff is loaded,
                    # including reviews from earlier years.
                    if _is_before_cutoff(last_review_date):
                        print("목표 날짜(24.5.1) 이전 리뷰 발견. 더보기 중단")
                        break

                    more_button = WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, _LOAD_MORE_SELECTOR))
                    )
                    driver.execute_script("arguments[0].click();", more_button)
                    time.sleep(2)
                except (TimeoutException, NoSuchElementException):
                    print("더 이상 더보기 버튼이 없습니다.")
                    break

            # 3. Extract every loaded review that is inside the date window.
            reviews = []
            review_elements = driver.find_elements(By.CSS_SELECTOR, _REVIEW_ITEM_SELECTOR)
            print(f"\n총 {len(review_elements)}개의 리뷰를 찾았습니다.")

            collected_count = 0
            for element in review_elements:
                try:
                    date = element.find_element(By.CSS_SELECTOR, _REVIEW_DATE_SELECTOR).text

                    # Only reviews from 24.5 onward are collected.
                    if _is_before_cutoff(date):
                        continue

                    # Expand the truncated review body and keyword list when
                    # their "more" links are present.
                    _click_if_present(driver, element, "div.pui__vn15t2 > a.pui__wFzIYl")
                    _click_if_present(driver, element, "div.pui__HLNvmI > a")

                    review_text = element.find_element(
                        By.CSS_SELECTOR, "div.pui__vn15t2 > a:nth-child(1)"
                    ).text

                    try:
                        keywords = element.find_element(By.CSS_SELECTOR, "div.pui__HLNvmI").text
                        processed_keywords = process_keywords(keywords)
                    except NoSuchElementException:
                        # A review without a keyword block is still collected.
                        processed_keywords = ""

                    collected_count += 1
                    print(f"\n=== 리뷰 {collected_count} ===")
                    print(f"날짜: {date}")
                    print(f"리뷰: {review_text}")
                    print(f"키워드: {processed_keywords}")
                    print("="*50)

                    review_data = original_row.copy()
                    review_data.update({
                        'review': review_text,
                        'keywords': processed_keywords,
                        'date': date
                    })
                    reviews.append(review_data)

                except Exception as e:
                    # Best effort per review: one broken item must not discard
                    # the rest of the page.
                    print(f"Error processing review: {e}")
                    continue

            print(f"\n목표 날짜 이후의 리뷰 {collected_count}개를 수집했습니다.")
            return pd.DataFrame(reviews)

        return retry_on_exception(crawl_reviews)

    except Exception as e:
        print(f"Error during crawling: {e}")
        raise

    finally:
        if driver:
            driver.quit()

def main():
    """Crawl every URL in ncreview.csv and write all collected reviews to one CSV.

    Existing ``review_checkpoint_*`` files in the working directory are deleted
    first. Each row of ncreview.csv is passed to ``get_reviews``; a failure on
    one URL is printed and the loop moves on. After every 50th URL that yielded
    reviews, everything collected so far is written to a numbered checkpoint
    file. The final result goes to naver_reviews_combined.csv. Any exception
    outside the per-URL handling is printed, not raised.
    """
    try:
        # Clear checkpoint files left behind by a previous run.
        for entry in os.listdir():
            if not entry.startswith('review_checkpoint_'):
                continue
            os.remove(entry)
            print(f"기존 체크포인트 파일 삭제: {entry}")

        urls_df = pd.read_csv('ncreview.csv')
        collected_frames = []
        url_count = 0

        for idx, row in urls_df.iterrows():
            try:
                print(f"\n처리 중인 URL ({idx+1}/{len(urls_df)}): {row['url']}")
                reviews_df = get_reviews(row['url'], row.to_dict())

                has_reviews = not reviews_df.empty
                if has_reviews:
                    collected_frames.append(reviews_df)
                    url_count += 1

                # Intermediate save after every 50 URLs that produced reviews.
                if has_reviews and url_count % 50 == 0:
                    checkpoint_file = f'review_checkpoint_{url_count // 50}.csv'
                    snapshot = pd.concat(collected_frames, ignore_index=True)
                    snapshot.to_csv(checkpoint_file, index=False, encoding='utf-8-sig')
                    print(f"\n중간 저장 완료: {checkpoint_file} ({url_count}개 URL 처리됨)")

                print(f"Successfully collected reviews for URL: {row['url']}")
                time.sleep(3)
            except Exception as err:
                # One bad URL must not stop the remaining ones.
                print(f"Failed to process URL {row['url']}: {err}")

        # Final save.
        if not collected_frames:
            print("\n수집된 리뷰가 없습니다.")
            return

        final_df = pd.concat(collected_frames, ignore_index=True)
        final_df.to_csv('naver_reviews_combined.csv', index=False, encoding='utf-8-sig')
        print("\n모든 데이터가 naver_reviews_combined.csv 파일로 저장되었습니다.")

        # Summary: number of URLs that yielded reviews and total review rows.
        print(f"\n처리된 총 URL 수: {url_count}")
        print(f"수집된 총 리뷰 수: {len(final_df)}")

    except Exception as err:
        print(f"Main execution error: {err}")

if __name__ == "__main__":
    main()