In [None]:
import os
import random
import shutil
import time
from urllib.parse import urljoin

import chromedriver_autoinstaller
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Ensure ChromeDriver is installed before any browser session is created.
# NOTE(review): the driver binary actually handed to webdriver.Chrome further
# down comes from ChromeDriverManager().install(), so this call looks
# redundant with it — confirm and keep only one install mechanism.
chromedriver_autoinstaller.install()

# Locate the Chrome binary: prefer the standard macOS install location and
# otherwise fall back to whatever "google-chrome" resolves to on PATH
# (shutil.which returns None when nothing is found).
chrome_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
chrome_path = chrome_path if os.path.exists(chrome_path) else shutil.which("google-chrome")

# Goodreads list to scrape; the page number is appended as a query parameter
# when each list page is requested.
GOODREADS_URL = (
    "https://www.goodreads.com/list/show/"
    "264.Books_That_Everyone_Should_Read_At_Least_Once"
)

# Configure a headless Chrome session and start the WebDriver.
options = Options()
for chrome_flag in (
    "--headless",
    "--disable-gpu",
    # Chrome's --window-size switch expects "width,height"; the previous
    # "1920x1080" spelling is not in that format.
    "--window-size=1920,1080",
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(chrome_flag)

# Only override the browser binary when one was actually located; otherwise
# let Selenium fall back to its own discovery.
if chrome_path:
    options.binary_location = chrome_path

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# Per-book result columns. The scraping loop appends to all five together,
# so entry i of each list describes the same book. Each name gets its own
# independent list object.
titles, authors, avg_ratings, genres, first_published_list = ([] for _ in range(5))

# Progress state: running count of scraped books and the list page to fetch next.
books_scraped, page = 0, 1

# Scrape the list pages one by one. For every book row on a list page, the
# title/author/rating come from the row itself, and the genres and a
# publication year come from the book's own detail page.
MAX_PAGES = 10         # last list page to request
MAX_PAGE_ATTEMPTS = 3  # timeouts tolerated on one list page before skipping it

page_attempts = 0
try:
    while page <= MAX_PAGES:
        print(f"\n🔄 Scraping Page {page}...\n")
        driver.get(f"{GOODREADS_URL}?page={page}")
        try:
            WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table.tableList tr"))
            )
        except TimeoutException:
            # Only a wait timeout is treated as retryable; anything else
            # (including Ctrl-C) propagates. Retries are capped so a page that
            # never loads cannot spin this loop forever.
            page_attempts += 1
            if page_attempts >= MAX_PAGE_ATTEMPTS:
                print(f"Giving up on page {page} after {page_attempts} timeouts. Skipping it.")
                page += 1
                page_attempts = 0
                continue
            print(f"Timeout waiting for books on page {page}. Retrying after delay...")
            time.sleep(random.uniform(5, 10))
            continue
        page_attempts = 0

        # Snapshot the list page now: the browser navigates away to each
        # book's detail page below, but this parsed copy stays valid.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        book_rows = soup.select("tr[itemtype='http://schema.org/Book']")

        if not book_rows:
            print(f"No books found on page {page}. Ending scraping.")
            break

        for row in book_rows:
            title_tag = row.select_one("a.bookTitle")
            author_tag = row.select_one("a.authorName")
            rating_tag = row.select_one("span.minirating")

            title = title_tag.text.strip() if title_tag else ""
            author = author_tag.text.strip() if author_tag else ""
            # Keep only the part of the mini-rating text before the " — " separator.
            avg_rating = rating_tag.text.strip().split(" — ")[0] if rating_tag else ""

            # Defaults used when the detail page cannot be visited or lacks the data.
            genre_str = ""
            first_pub = "Unknown"

            href = title_tag.get("href") if title_tag else None
            if href:
                # urljoin handles both site-relative and absolute hrefs.
                driver.get(urljoin(GOODREADS_URL, href))
                time.sleep(random.uniform(2, 4))
                book_soup = BeautifulSoup(driver.page_source, "html.parser")

                # Genres: first three distinct genre links, in page order
                # (dict.fromkeys de-duplicates while preserving order).
                genre_tags = book_soup.select("a.bookPageGenreLink")
                top_genres = list(dict.fromkeys(g.text.strip() for g in genre_tags))[:3]
                genre_str = ", ".join(top_genres)

                # First published: heuristic — the first standalone 4-digit
                # token in the details box. Splitting on any whitespace means
                # a year next to a newline/tab is still recognised.
                pub_info = book_soup.find("div", id="details")
                if pub_info:
                    year_tokens = [
                        tok
                        for tok in pub_info.get_text(" ", strip=True).split()
                        if tok.isdigit() and len(tok) == 4
                    ]
                    if year_tokens:
                        first_pub = year_tokens[0]
            else:
                # Previously a row without a title link crashed on title_tag['href'].
                print(f"No detail link for a row on page {page}; recording list-page data only.")

            titles.append(title)
            authors.append(author)
            avg_ratings.append(avg_rating)
            genres.append(genre_str)
            first_published_list.append(first_pub)

            books_scraped += 1
            print(f"Scraped: {books_scraped} - {title}")

            # Pause between books. No driver.back() is needed: the list rows
            # were parsed up front, and the next navigation is a fresh get().
            time.sleep(random.uniform(1, 2))

        print(f"✅ Finished Page {page}. Total books scraped so far: {books_scraped}\n")
        page += 1
        time.sleep(random.uniform(2, 4))
finally:
    # Close driver even if scraping fails, so no headless Chrome is left running.
    driver.quit()

# Assemble the collected columns into a table and write it out as CSV
# (without the DataFrame index column).
column_names = ("Title", "Author", "Average Rating", "Top 3 Genres", "First Published")
column_values = (titles, authors, avg_ratings, genres, first_published_list)
df = pd.DataFrame(dict(zip(column_names, column_values)))

df.to_csv("goodreads_books.csv", index=False)
print("Scraping complete! Saved as goodreads_books.csv")
