In [7]:
import time
import random
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Selenium to run with the browser visible (headless mode removed)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
driver.maximize_window()

url = "https://www.goodreads.com/user/sign_in"
driver.get(url)

# Sign in manually

In [9]:
books = []
main_window = driver.current_window_handle  # store the main window handle

def get_book_details(detail_url):
    """
    Extracts additional book details from the detail page.
    Opens the detail page in a new tab, scrapes the description, genres,
    page details, ISBN and ASIN, then closes the tab and returns to the main list page.
    
    Returns:
        dict: A dictionary with keys "Description", "Genres",
              "Format_and_Page", "ISBN", and "ASIN".
    """
    # Open detail page in a new tab
    driver.execute_script("window.open('');")
    # Switch to the new tab
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(detail_url)
    wait = WebDriverWait(driver, 20)
    details = {}
    
    # --- Extract Book Description ---
    try:
        description_container = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.TruncatedContent[tabindex='-1']"))
        )
        try:
            more_link = description_container.find_element(By.XPATH, ".//a[contains(text(), 'more')]")
            more_link.click()
            wait.until(EC.staleness_of(more_link))
            description_container = driver.find_element(By.CSS_SELECTOR, "div.TruncatedContent[tabindex='-1']")
        except Exception as e:
            pass
        details["Description"] = description_container.text.strip()
    except Exception as e:
        details["Description"] = "Not found"
    
    # --- Extract Genres ---
    try:
        genres_section = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='genresList']"))
        )
        genre_elements = genres_section.find_elements(By.CSS_SELECTOR, "ul.CollapsableList span[tabindex='-1']")
        genres = [elem.text.strip() for elem in genre_elements if elem.text.strip()]
        details["Genres"] = genres
    except Exception as e:
        details["Genres"] = []
    
    # --- Reveal Hidden Book Details (ISBN, ASIN, Page Numbers) ---
    try:
        details_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Book details and editions']"))
        )
        details_button.click()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.BookDetails")))
    except Exception as e:
        print("Details button not found or error clicking it:", e)
    
    # --- Extract ISBN, ASIN and Page Numbers ---
    try:
        book_details_section = driver.find_element(By.CSS_SELECTOR, "div.BookDetails")
        pages_element = book_details_section.find_element(By.CSS_SELECTOR, "p[data-testid='pagesFormat']")
        pages = pages_element.text.strip()
        details["Format_and_Page"] = pages
        
        details_text = book_details_section.text
        isbn_match = re.search(r"ISBN\s*([\dX]+)", details_text)
        asin_match = re.search(r"ASIN\s*([\w\d]+)", details_text)
        isbn = isbn_match.group(1) if isbn_match else "Not found"
        asin = asin_match.group(1) if asin_match else "Not found"
        details["ISBN"] = isbn
        details["ASIN"] = asin
    except Exception as e:
        details["Format_and_Page"] = "Not found"
        details["ISBN"] = "Not found"
        details["ASIN"] = "Not found"
    
    # Close the detail tab and switch back to the main window
    driver.close()
    driver.switch_to.window(main_window)
    
    return details

# --- Scrape the Main List Pages (aiming for 1000 books) ---
page = 1
while len(books) < 1000:
    list_url = f"https://www.goodreads.com/list/show/1.Best_Books_Ever?page={page}&ref=ls_pl_car_0"
    driver.get(list_url)
    
    try:
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table.tableList"))
        )
    except Exception as e:
        print(f"Table not found on page {page}: {e}")
        page += 1
        continue

    rows = table.find_elements(By.TAG_NAME, "tr")
    for row in rows:
        if len(books) >= 1000:
            break  # Stop if we've reached 1000 books
        try:
            # Extract basic info from the list page
            rank = row.find_element(By.CLASS_NAME, "number").text.strip().strip('.')
            title_elem = row.find_element(By.CSS_SELECTOR, "a.bookTitle")
            title = title_elem.text.strip()
            detail_url = title_elem.get_attribute("href")
            author_elem = row.find_element(By.CSS_SELECTOR, "a.authorName")
            author = author_elem.text.strip()
            rating_text = row.find_element(By.CSS_SELECTOR, "span.minirating").text.strip()
            rating_parts = rating_text.split(" ")
            avg_rating = rating_parts[0]
            num_ratings_match = re.search(r"—\s([\d,]+)\sratings", rating_text)
            num_ratings = num_ratings_match.group(1).replace(',', '') if num_ratings_match else None

            # Retrieve additional details from the individual book page
            details = get_book_details(detail_url)
            
            books.append({
                "Rank": rank,
                "Title": title,
                "Author": author,
                "Avg_Rating": avg_rating,
                "Num_Ratings": num_ratings,
                "ISBN": details.get("ISBN"),
                "ASIN": details.get("ASIN"),
                "Format_and_Page": details.get("Format_and_Page"),
                "Genres": ", ".join(details.get("Genres"))
            })
        except Exception as e:
            print(f"Error processing a row: {e}")
            continue
    
    print(f"Scraped {len(books)} books so far (from page {page}).")
    page += 1

driver.quit()

# Create a pandas DataFrame with the scraped data and display the first 10 rows
df = pd.DataFrame(books)
print(df.head(10))

Details button not found or error clicking it: Message: 
Stacktrace:
0   chromedriver                        0x0000000101111798 cxxbridge1$str$ptr + 2785964
1   chromedriver                        0x0000000101109d70 cxxbridge1$str$ptr + 2754692
2   chromedriver                        0x0000000100c5dea8 cxxbridge1$string$len + 92928
3   chromedriver                        0x0000000100ca51d0 cxxbridge1$string$len + 384552
4   chromedriver                        0x0000000100ce6678 cxxbridge1$string$len + 651984
5   chromedriver                        0x0000000100c9935c cxxbridge1$string$len + 335796
6   chromedriver                        0x00000001010d6c68 cxxbridge1$str$ptr + 2545532
7   chromedriver                        0x00000001010d9f34 cxxbridge1$str$ptr + 2558536
8   chromedriver                        0x00000001010b6c98 cxxbridge1$str$ptr + 2414508
9   chromedriver                        0x00000001010da794 cxxbridge1$str$ptr + 2560680
10  chromedriver                        0x00

In [11]:
display(df)

Unnamed: 0,Rank,Title,Author,Avg_Rating,Num_Ratings,ISBN,ASIN,Format_and_Page,Genres
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,4.34,9318734,9780439023481,0439023483,"374 pages, Hardcover",Genres\nYoung Adult\nFiction\nFantasy\nScience...
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.50,3616863,Not found,0439686520,"912 pages, Paperback",Genres\nFantasy\nYoung Adult\nFiction\nMagic\n...
2,3,Pride and Prejudice,Jane Austen,4.29,4517843,9781441341709,1441341706,"279 pages, Paperback",Genres\nFiction\nHistorical Fiction\nHistorica...
3,4,To Kill a Mockingbird,Harper Lee,4.26,6564282,9780060935467,0060935464,"323 pages, Paperback",Genres\nFiction\nHistorical Fiction\nSchool\nL...
4,5,The Book Thief,Markus Zusak,4.39,2746351,Not found,0375831002,"592 pages, Kindle Edition",Genres\nHistorical Fiction\nFiction\nYoung Adu...
5,6,"Twilight (The Twilight Saga, #1)",Stephenie Meyer,3.66,7001576,9780316015844,0316015849,"498 pages, Paperback",Genres\nFantasy\nYoung Adult\nRomance\nFiction...
6,7,Animal Farm,George Orwell,really,4227918,9780451526342,B0D7LQMF4N,"141 pages, Mass Market Paperback",Genres\nFiction\nDystopia\nFantasy\nSchool\nLi...
7,8,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit an...,J.R.R. Tolkien,4.61,139673,9780345538376,0345538374,"1728 pages, Mass Market Paperback",Genres\nFantasy\nFiction\nClassics\nAdventure\...
8,9,The Chronicles of Narnia (The Chronicles of Na...,C.S. Lewis,4.28,685690,9780066238500,0066238501,"767 pages, Paperback",Genres\nFantasy\nClassics\nFiction\nYoung Adul...
9,10,The Fault in Our Stars,John Green,4.13,5480695,Not found,0525478817,"313 pages, Hardcover",Genres\nYoung Adult\nFiction\nContemporary\nRe...
