In [None]:
import random
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [None]:
# Configure Selenium to run with the browser visible (headless mode removed):
# the Chrome options are left at their defaults, i.e. no extra arguments are added.
options = webdriver.ChromeOptions()
# Start the Chrome session that every later step drives through `driver`.
driver = webdriver.Chrome(options=options)
# Maximize the browser window.
driver.maximize_window()

In [None]:
# Open the Goodreads sign-in page. The URL has no placeholders, so a plain
# string literal is used instead of an f-string.
url = "https://www.goodreads.com/user/sign_in"
driver.get(url)

# Manual step: sign in by hand in the opened browser window before continuing.

In [None]:
# Accumulator for the scraped book records (one dict is appended per book)
books = []
main_window = driver.current_window_handle  # store the main window handle

def _scrape_description(wait):
    """Return the description text of the open book page, or "Not found"."""
    selector = "div.TruncatedContent[tabindex='-1']"
    try:
        container = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, selector))
        )
    except WebDriverException:
        return "Not found"

    # Best effort: if the description has a "more" link, click it and re-read
    # the container once the link has gone stale. A page without such a link
    # (or a failed expansion) is not an error; the current container is used.
    try:
        more_link = container.find_element(By.XPATH, ".//a[contains(text(), 'more')]")
        more_link.click()
        wait.until(EC.staleness_of(more_link))
        container = driver.find_element(By.CSS_SELECTOR, selector)
    except WebDriverException:
        pass

    try:
        return container.text.strip()
    except WebDriverException:
        return "Not found"


def _scrape_genres(wait):
    """Return the non-empty genre labels of the open book page (possibly [])."""
    try:
        section = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='genresList']"))
        )
        elements = section.find_elements(
            By.CSS_SELECTOR, "ul.CollapsableList span[tabindex='-1']"
        )
        # Read each element's text once, then drop the empty ones.
        labels = (element.text.strip() for element in elements)
        return [label for label in labels if label]
    except WebDriverException:
        return []


def _reveal_book_details(wait):
    """Click the "Book details and editions" button and wait for the details panel."""
    try:
        button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Book details and editions']"))
        )
        button.click()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.BookDetails")))
    except WebDriverException as e:
        print("Details button not found or error clicking it:", e)


def _scrape_format_and_isbn():
    """Return {"Format_and_Page": ..., "ISBN": ...} read from the details panel."""
    result = {"Format_and_Page": "Not found", "ISBN": "Not found"}
    try:
        section = driver.find_element(By.CSS_SELECTOR, "div.BookDetails")
    except WebDriverException:
        return result

    # The two fields are read independently so that a missing page/format
    # element does not discard an ISBN that is present (and vice versa).
    try:
        pages_element = section.find_element(By.CSS_SELECTOR, "p[data-testid='pagesFormat']")
        result["Format_and_Page"] = pages_element.text.strip()
    except WebDriverException:
        pass  # keep the "Not found" default

    try:
        isbn_match = re.search(r"ISBN\s*([\dX]+)", section.text)
    except WebDriverException:
        isbn_match = None
    if isbn_match:
        result["ISBN"] = isbn_match.group(1)
    return result


def get_book_details(detail_url):
    """
    Extract additional book details from a book's detail page.

    Opens a new browser tab, loads ``detail_url`` in it, scrapes the
    description, genres, format/page line and ISBN, then closes the tab and
    switches back to ``main_window``. The tab is closed and the driver is
    switched back even if loading or scraping raises, so the caller always
    continues on the list page.

    Uses the module-level ``driver`` and ``main_window``.

    Args:
        detail_url: URL of the book's detail page.

    Returns:
        dict: A dictionary with keys "Description", "Genres",
              "Format_and_Page", and "ISBN". Fields that cannot be read are
              "Not found" (or an empty list for "Genres").
    """
    # Open an empty tab and make it the active window.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    try:
        driver.get(detail_url)
        wait = WebDriverWait(driver, 20)
        details = {
            "Description": _scrape_description(wait),
            "Genres": _scrape_genres(wait),
        }
        # The panel holding the ISBN / page info is revealed by a button click.
        _reveal_book_details(wait)
        details.update(_scrape_format_and_isbn())
        return details
    finally:
        # Always close the detail tab and return to the main window.
        driver.close()
        driver.switch_to.window(main_window)

# --- Scrape the Main List Pages ---
# The `page` query parameter is requested starting at 1 rather than 0
# (NOTE(review): assumes Goodreads list pages are 1-based - confirm).
# A single page is scraped, as before; raise LAST_PAGE (e.g. to 6) to scrape
# further pages of the list.
FIRST_PAGE = 1
LAST_PAGE = 1

try:
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        list_url = f"https://www.goodreads.com/list/show/1.Best_Books_Ever?page={page}&ref=ls_pl_car_0"
        driver.get(list_url)

        try:
            table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table.tableList"))
            )
        except Exception as e:
            print(f"Table not found on page {page}: {e}")
            continue

        rows = table.find_elements(By.TAG_NAME, "tr")
        for row in rows:
            try:
                # Extract basic info from the list page
                rank = row.find_element(By.CLASS_NAME, "number").text.strip().strip('.')
                title_elem = row.find_element(By.CSS_SELECTOR, "a.bookTitle")
                title = title_elem.text.strip()
                detail_url = title_elem.get_attribute("href")
                author = row.find_element(By.CSS_SELECTOR, "a.authorName").text.strip()
                rating_text = row.find_element(By.CSS_SELECTOR, "span.minirating").text.strip()

                # Take the number directly in front of "avg rating" so that any
                # leading words in the minirating text do not end up in the
                # rating column; fall back to the first token if that phrase
                # is absent.
                avg_match = re.search(r"(\d+(?:\.\d+)?)\s+avg rating", rating_text)
                avg_rating = avg_match.group(1) if avg_match else rating_text.split(" ")[0]

                # "ratings?" also accepts the singular form ("1 rating").
                num_ratings_match = re.search(r"—\s([\d,]+)\sratings?", rating_text)
                num_ratings = num_ratings_match.group(1).replace(',', '') if num_ratings_match else None

                # Retrieve additional details from the individual book page
                details = get_book_details(detail_url)

                books.append({
                    "Rank": rank,
                    "Title": title,
                    "Author": author,
                    "Avg_Rating": avg_rating,
                    "Num_Ratings": num_ratings,
                    "ISBN": details.get("ISBN"),
                    "Format_and_Page": details.get("Format_and_Page"),
                    # Default to an empty list so a missing key cannot break join().
                    "Genres": ", ".join(details.get("Genres", [])),
                    # Keep the scraped description instead of discarding it.
                    "Description": details.get("Description"),
                })
            except Exception as e:
                # Rows without the expected cells, or a failed detail scrape,
                # are reported and skipped.
                print(f"Error processing a row: {e}")
                continue

        # Sleep a random interval between 2 to 4 seconds between list pages
        time.sleep(random.uniform(2, 4))
finally:
    # Shut the browser down even if scraping is interrupted or fails.
    driver.quit()

# Create a pandas DataFrame with the scraped data and display the first 10 rows
df = pd.DataFrame(books)
print(df.head(10))