# 영풍문고 리뷰 크롤링

## 절차
- ISBN 리스트 로드
- 링크에 ISBN 하나씩 넣어서 접속
- 리뷰 탭 버튼 누르기
- 본문, 별점, 닉네임 수집
- 첫 페이지에서 신규 데이터가 0개면 해당 ISBN 스킵

In [None]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# One Chrome session for the whole notebook; crawl_reviews_by_isbn() drives
# this module-level `driver`. The command-line switches are passed to Chrome
# in the order listed.
options = Options()
for chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

# Review crawler (de-duplicates, clicks "more", logs progress)
def crawl_reviews_by_isbn(isbn, max_empty_pages=3):
    """Collect the reviews listed for one ISBN using the module-level ``driver``.

    Opens the search page for ``isbn``, clicks the review link of the first
    search result, then repeatedly scrapes every ``div.review__item`` on the
    page and clicks the "more" button until that button is no longer found.

    Args:
        isbn: Value interpolated into the search URL's ``word`` parameter.
        max_empty_pages: Stop after this many consecutive passes that yield
            no new review. This guards against looping forever when the
            "more" button stays in the DOM although nothing more is loaded.
            Pass ``None`` to disable the guard.

    Returns:
        A list of dicts with keys ``'ISBN'``, ``'Star'``, ``'Nick'`` and
        ``'Content'``. The list is empty when the review link cannot be
        clicked or when the first pass finds no review.
    """
    url = f"https://www.ypbooks.co.kr/search/book?word={isbn}"
    driver.get(url)
    time.sleep(2)
    print(f"\n[ISBN: {isbn}] 진입 완료")

    try:
        review_tab = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="content"]/div[1]/div[1]/div[4]/ul/li[1]/div[1]/div[1]/div/div[2]/div[2]/a'))
        )
        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", review_tab)
        time.sleep(1)
        # Click through JavaScript rather than WebElement.click().
        driver.execute_script("arguments[0].click();", review_tab)
        time.sleep(2)
        print("리뷰탭 클릭 완료")
    except Exception as e:
        # Best effort: any failure here means this ISBN is skipped.
        print(f"리뷰탭 클릭 실패 ▶︎ {e}")
        return []

    reviews = []
    # NOTE(review): de-duplication is keyed on the review text alone, so two
    # different reviewers posting identical text are collapsed into one row.
    seen_reviews = set()
    page = 1
    empty_pages = 0  # consecutive passes that produced no new review

    while True:
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.review__item'))
            )
            time.sleep(1)
            review_elements = driver.find_elements(By.CSS_SELECTOR, 'div.review__item')
        except WebDriverException:
            # Covers the wait's TimeoutException (a WebDriverException
            # subclass) and other driver errors; return what was collected.
            # KeyboardInterrupt is deliberately NOT caught.
            break

        new_data = 0
        for elem in review_elements:
            try:
                try:
                    content = elem.find_element(By.CSS_SELECTOR, 'p.___reviewText.showAll').text.strip()
                except NoSuchElementException:
                    # Fall back to the generic content container.
                    content = elem.find_element(By.CSS_SELECTOR, 'div.review__content').text.strip()

                if content in seen_reviews:
                    continue
                seen_reviews.add(content)

                star = elem.find_element(By.CSS_SELECTOR, 'span.book__votes-point').text.strip()
                nick = elem.find_elements(By.CSS_SELECTOR, 'span.list-info__text')[0].text.strip()
            except (WebDriverException, IndexError):
                # Missing/stale sub-element or no nickname span: skip this item.
                continue

            reviews.append({
                'ISBN': isbn,
                'Star': star,
                'Nick': nick,
                'Content': content
            })
            new_data += 1

        if page == 1:
            if new_data == 0:
                print("리뷰 0개, 다음 ISBN으로 넘어갑니다")
                return []
            print(f"리뷰 {len(reviews)}개 수집 중...")

        # Bound the loop: the original only checked for an empty pass on
        # page 1 and could spin forever if the button never disappears.
        empty_pages = empty_pages + 1 if new_data == 0 else 0
        if max_empty_pages is not None and empty_pages >= max_empty_pages:
            print(f"최종 리뷰 {len(reviews)}개 크롤링 완료\n")
            break

        try:
            more_button = driver.find_element(By.CSS_SELECTOR, 'button.btn--review-more')
        except NoSuchElementException:
            # No "more" button left: everything has been loaded.
            print(f"최종 리뷰 {len(reviews)}개 크롤링 완료\n")
            break

        driver.execute_script("arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", more_button)
        time.sleep(1)
        driver.execute_script("arguments[0].click();", more_button)
        print(f"더보기 탭 클릭 (페이지 {page})")
        page += 1
        time.sleep(2.5)

    return reviews

# Load the ISBNs.
# The column is read as text on purpose: with numeric inference a column that
# contains empty cells becomes float64, and str() of such a value yields
# "9788936434267.0", which would then be sent as the search term.
df = pd.read_csv("베스트셀러_통합.csv", dtype={'ISBN': str})
isbn_list = [isbn for isbn in df['ISBN'].dropna().str.strip() if isbn]

OUTPUT_CSV = "ypbooks_reviews_full_best.csv"
# Fixed column order so that even an empty result produces a header row.
REVIEW_COLUMNS = ['ISBN', 'Star', 'Nick', 'Content']


def _save_reviews(rows):
    """Overwrite OUTPUT_CSV with ``rows`` and return the written DataFrame."""
    frame = pd.DataFrame(rows, columns=REVIEW_COLUMNS)
    frame.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
    return frame


# Crawl every ISBN, checkpointing to disk every 200 ISBNs.
all_reviews = []

try:
    for idx, isbn in enumerate(isbn_list, 1):
        print(f"[{idx}/{len(isbn_list)}] 크롤링 중: {isbn}")
        reviews = crawl_reviews_by_isbn(isbn)
        all_reviews.extend(reviews)

        # Checkpoint every 200 ISBNs and after the last one.
        if idx % 200 == 0 or idx == len(isbn_list):
            temp_df = _save_reviews(all_reviews)
            print(f"중간 저장 완료 ({idx}개까지 저장됨)")
except BaseException:
    # Crash or Ctrl-C: keep what has been collected since the last
    # checkpoint instead of losing it, then let the error propagate.
    temp_df = _save_reviews(all_reviews)
    print(f"크롤링 중단: 현재까지 수집한 리뷰 {len(all_reviews)}개 저장됨")
    raise
finally:
    # Always release the browser, even when the crawl did not finish.
    driver.quit()

# Wrap-up
print("전 ISBN 리뷰 크롤링 완료")