In [25]:
import time
import random
import re
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Configure Selenium to run with the browser visible (headless mode removed)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)
driver.maximize_window()

url = "https://www.goodreads.com/user/sign_in"
driver.get(url)

# Sign in manually

In [29]:
books = []
main_window = driver.current_window_handle  # store the main window handle

def get_book_details(detail_url):
    """
    Extracts additional book details from the detail page.
    Opens the detail page in a new tab, scrapes the description, genres,
    page details, ISBN and ASIN, then closes the tab and returns to the main list page.
    
    Returns:
        dict: A dictionary with keys "Description", "Genres",
              "Format_and_Page", "ISBN", and "ASIN".
    """
    # Open detail page in a new tab
    driver.execute_script("window.open('');")
    # Switch to the new tab
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(detail_url)
    wait = WebDriverWait(driver, 20)
    details = {}
    
    # --- Extract Book Description ---
    try:
        description_container = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div.TruncatedContent[tabindex='-1']"))
        )
        try:
            more_link = description_container.find_element(By.XPATH, ".//a[contains(text(), 'more')]")
            more_link.click()
            wait.until(EC.staleness_of(more_link))
            description_container = driver.find_element(By.CSS_SELECTOR, "div.TruncatedContent[tabindex='-1']")
        except Exception as e:
            pass
        details["Description"] = description_container.text.strip()
    except Exception as e:
        details["Description"] = "Not found"
    
    # --- Extract Genres ---
    try:
        genres_section = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-testid='genresList']"))
        )
        genre_elements = genres_section.find_elements(By.CSS_SELECTOR, "ul.CollapsableList span[tabindex='-1']")
        genres = [elem.text.strip() for elem in genre_elements if elem.text.strip()]
        details["Genres"] = genres
    except Exception as e:
        details["Genres"] = []
    
    # --- Reveal Hidden Book Details (ISBN, ASIN, Page Numbers) ---
    try:
        details_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='Book details and editions']"))
        )
        details_button.click()
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.BookDetails")))
    except Exception as e:
        print("Details button not found or error clicking it:", e)
    
    # --- Extract ISBN, ASIN and Page Numbers ---
    try:
        book_details_section = driver.find_element(By.CSS_SELECTOR, "div.BookDetails")
        pages_element = book_details_section.find_element(By.CSS_SELECTOR, "p[data-testid='pagesFormat']")
        pages = pages_element.text.strip()
        details["Format_and_Page"] = pages
        
        details_text = book_details_section.text
        isbn_match = re.search(r"ISBN\s*([\dX]+)", details_text)
        asin_match = re.search(r"ASIN\s*([\w\d]+)", details_text)
        isbn = isbn_match.group(1) if isbn_match else "Not found"
        asin = asin_match.group(1) if asin_match else "Not found"
        details["ISBN"] = isbn
        details["ASIN"] = asin
    except Exception as e:
        details["Format_and_Page"] = "Not found"
        details["ISBN"] = "Not found"
        details["ASIN"] = "Not found"
    
    # Close the detail tab and switch back to the main window
    driver.close()
    driver.switch_to.window(main_window)
    
    return details

# --- Scrape the Main List Pages (aiming for 1000 books) ---
page = 7
while len(books) < 1000:
    list_url = f"https://www.goodreads.com/list/show/1.Best_Books_Ever?page={page}&ref=ls_pl_car_0"
    driver.get(list_url)
    
    try:
        table = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "table.tableList"))
        )
    except Exception as e:
        print(f"Table not found on page {page}")
        page += 1
        continue

    rows = table.find_elements(By.TAG_NAME, "tr")
    for row in rows:
        if len(books) >= 1000:
            break  # Stop if we've reached 1000 books
        try:
            # Extract basic info from the list page
            rank = row.find_element(By.CLASS_NAME, "number").text.strip().strip('.')
            title_elem = row.find_element(By.CSS_SELECTOR, "a.bookTitle")
            title = title_elem.text.strip()
            detail_url = title_elem.get_attribute("href")
            author_elem = row.find_element(By.CSS_SELECTOR, "a.authorName")
            author = author_elem.text.strip()
            rating_text = row.find_element(By.CSS_SELECTOR, "span.minirating").text.strip()
            rating_parts = rating_text.split(" ")
            avg_rating = rating_parts[0]
            num_ratings_match = re.search(r"—\s([\d,]+)\sratings", rating_text)
            num_ratings = num_ratings_match.group(1).replace(',', '') if num_ratings_match else None

            # Retrieve additional details from the individual book page
            details = get_book_details(detail_url)
            
            books.append({
                "Rank": rank,
                "Title": title,
                "Author": author,
                "Avg_Rating": avg_rating,
                "Num_Ratings": num_ratings,
                "ISBN": details.get("ISBN"),
                "ASIN": details.get("ASIN"),
                "Format_and_Page": details.get("Format_and_Page"),
                "Genres": ", ".join(details.get("Genres"))
            })
        except Exception as e:
            print(f"Error processing a row: {e}")
            continue
    
    print(f"Scraped {len(books)} books so far (from page {page}).")
    page += 1

driver.quit()

# Create a pandas DataFrame with the scraped data and display the first 10 rows
df = pd.DataFrame(books)
print(df.head(10))

Details button not found or error clicking it: Message: 
Stacktrace:
0   chromedriver                        0x0000000104a6b6c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x0000000104a63c9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x00000001045b5e30 cxxbridge1$string$len + 92928
3   chromedriver                        0x00000001045fd170 cxxbridge1$string$len + 384576
4   chromedriver                        0x000000010463e5f8 cxxbridge1$string$len + 651976
5   chromedriver                        0x00000001045f12fc cxxbridge1$string$len + 335820
6   chromedriver                        0x0000000104a306c4 cxxbridge1$str$ptr + 2549544
7   chromedriver                        0x0000000104a33988 cxxbridge1$str$ptr + 2562540
8   chromedriver                        0x0000000104a1071c cxxbridge1$str$ptr + 2418560
9   chromedriver                        0x0000000104a341e8 cxxbridge1$str$ptr + 2564684
10  chromedriver                        0x00

InvalidSessionIdException: Message: invalid session id
Stacktrace:
0   chromedriver                        0x0000000104a6b6c8 cxxbridge1$str$ptr + 2791212
1   chromedriver                        0x0000000104a63c9c cxxbridge1$str$ptr + 2759936
2   chromedriver                        0x00000001045b5ca4 cxxbridge1$string$len + 92532
3   chromedriver                        0x00000001045f0674 cxxbridge1$string$len + 332612
4   chromedriver                        0x0000000104618ecc cxxbridge1$string$len + 498588
5   chromedriver                        0x00000001046181cc cxxbridge1$string$len + 495260
6   chromedriver                        0x0000000104585c60 chromedriver + 89184
7   chromedriver                        0x0000000104a306c4 cxxbridge1$str$ptr + 2549544
8   chromedriver                        0x0000000104a33988 cxxbridge1$str$ptr + 2562540
9   chromedriver                        0x0000000104a1071c cxxbridge1$str$ptr + 2418560
10  chromedriver                        0x0000000104a341e8 cxxbridge1$str$ptr + 2564684
11  chromedriver                        0x0000000104a01750 cxxbridge1$str$ptr + 2357172
12  chromedriver                        0x000000010458465c chromedriver + 83548
13  dyld                                0x0000000198944274 start + 2840


In [31]:
df = pd.DataFrame(books)
display(df)

df.to_csv("GoodReads_500_pt3.csv")

Unnamed: 0,Rank,Title,Author,Avg_Rating,Num_Ratings,ISBN,ASIN,Format_and_Page,Genres
0,601,Much Ado About Nothing,William Shakespeare,4.06,252564,Not found,B0DSZLYTS1,"249 pages, Paperback",Genres\nPlays\nFiction\nClassics\nRomance\nSch...
1,602,Far From the Madding Crowd,Thomas Hardy,3.97,160741,Not found,Not found,"433 pages, Paperback",Genres\nClassics\nFiction\nRomance\nHistorical...
2,603,"Call Me By Your Name (Call Me By Your Name, #1)",André Aciman,4.1,553652,9781786495259,1786495252,"248 pages, Paperback",Genres\nFiction\nLGBT\nQueer\nContemporary\nAu...
3,604,"Empire of Storms (Throne of Glass, #5)",Sarah J. Maas,4.63,1002224,Not found,Not found,"689 pages, Hardcover",Genres\nFantasy\nYoung Adult\nRomance\nRomanta...
4,605,"Cannery Row (Cannery Row, #1)",John Steinbeck,4.06,146681,9780142000687,014200068X,"181 pages, Paperback",Genres\nFiction\nClassics\nLiterature\nHistori...
5,606,"Anne of Avonlea (Anne of Green Gables, #2)",L.M. Montgomery,4.23,214244,9780553213140,0553213148,"276 pages, Mass Market Paperback",Genres\nFiction\nYoung Adult\nHistorical Ficti...
6,607,"Bared to You (Crossfire, #1)",Sylvia Day,4.13,541984,Not found,Not found,"352 pages, Paperback",Genres\nRomance\nErotica\nContemporary\nContem...
7,608,"Perfect Chemistry (Perfect Chemistry, #1)",Simone Elkeles,4.04,262072,9780802798237,0802798233,"368 pages, Hardcover",Genres\nRomance\nYoung Adult\nContemporary\nHi...
8,609,A Monster Calls,Patrick Ness,4.35,279755,9781406361803,1406361801,"237 pages, Paperback",Genres\nFantasy\nYoung Adult\nFiction\nHorror\...
9,610,"Cry, the Beloved Country",Alan Paton,3.92,77614,9780743261951,074326195X,"316 pages, Hardcover",Genres\nFiction\nClassics\nHistorical Fiction\...
