In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
import random
import re

In [8]:
# Start a Chrome session through Selenium using default ChromeOptions.
# No headless flag is added to `options`, so the browser window stays visible;
# the Goodreads sign-in later in this notebook is done by hand in that window.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(options=options)

In [10]:
# Open the Goodreads sign-in page in the Selenium-controlled browser.
# The URL has no placeholders, so it is a plain string rather than an f-string.
url = "https://www.goodreads.com/user/sign_in"
driver.get(url)

# Manual step: sign in by hand in the browser window before continuing.

In [12]:
# --- Scrape the Goodreads "Best Books Ever" list into `books` and `df` ---

# Tunable scraping parameters.
FIRST_PAGE = 1         # first list page to fetch
LAST_PAGE = 10         # last list page to fetch (inclusive)
TABLE_TIMEOUT_S = 10   # max seconds to wait for the book table to appear
DELAY_RANGE_S = (2, 5) # bounds (seconds) of the random pause between page loads
LIST_URL = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page={page}&ref=ls_pl_car_0"


def _parse_minirating(rating_text):
    """Pull the average rating and the rating count out of a "minirating" string.

    The text is usually like "4.18 avg rating — 6,302,680 ratings".

    Args:
        rating_text: Visible text of the ``span.minirating`` element.

    Returns:
        Tuple ``(avg_rating, num_ratings)`` of strings. ``num_ratings`` has its
        thousands separators removed. Either element is ``None`` when the
        corresponding pattern is not present in ``rating_text``.
    """
    # Anchor on the "avg rating" label instead of taking the first token, so any
    # words that precede the number do not end up stored as the rating.
    avg_match = re.search(r"(\d+(?:\.\d+)?)\s+avg rating", rating_text)
    # "ratings?" also accepts the singular form ("1 rating").
    count_match = re.search(r"—\s*([\d,]+)\s+ratings?\b", rating_text)

    avg_rating = avg_match.group(1) if avg_match else None
    num_ratings = count_match.group(1).replace(",", "") if count_match else None
    return avg_rating, num_ratings


def _scrape_row(row):
    """Build one book record from a ``<tr>`` element of the list table.

    Args:
        row: Selenium element for a table row.

    Returns:
        Dict with keys "Rank", "Title", "Author", "Avg_Rating", "Num_Ratings".
        All values are kept as strings (or ``None`` for unparsed rating fields).

    Raises:
        Whatever Selenium raises when an expected child element is missing;
        the caller treats that as "not a book row" and skips it.
    """
    # Ranking number, with any trailing "." removed.
    rank = row.find_element(By.CLASS_NAME, "number").text.strip().strip('.')
    title = row.find_element(By.CSS_SELECTOR, "a.bookTitle").text.strip()
    author = row.find_element(By.CSS_SELECTOR, "a.authorName").text.strip()
    rating_text = row.find_element(By.CSS_SELECTOR, "span.minirating").text.strip()
    avg_rating, num_ratings = _parse_minirating(rating_text)

    return {
        "Rank": rank,
        "Title": title,
        "Author": author,
        "Avg_Rating": avg_rating,
        "Num_Ratings": num_ratings,
    }


books = []  # one dict per successfully parsed book row

try:
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        url = LIST_URL.format(page=page)
        driver.get(url)

        # Wait until the table of books is present on the page.
        try:
            table = WebDriverWait(driver, TABLE_TIMEOUT_S).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "table.tableList"))
            )
        except Exception as e:
            # Best effort: report the page and move on to the next one.
            print(f"Table not found on page {page}: {e}")
        else:
            # Each book is in a table row (<tr>) within the table.
            for row in table.find_elements(By.TAG_NAME, "tr"):
                try:
                    books.append(_scrape_row(row))
                except Exception as e:
                    # Rows lacking the expected elements are reported and skipped.
                    print(f"Error processing a row: {e}")

        # Pause between requests, including after a failed page, but not after
        # the final page where there is nothing left to fetch.
        if page < LAST_PAGE:
            time.sleep(random.uniform(*DELAY_RANGE_S))
finally:
    # Always close the browser, even if a page load fails or the run is interrupted.
    driver.quit()

# Create a pandas DataFrame from the list of dictionaries.
df = pd.DataFrame(books)

In [14]:
# Show the scraped table using the notebook's rich DataFrame rendering
# (the middle rows are elided in the rendered output).
display(df)

Unnamed: 0,Rank,Title,Author,Avg_Rating,Num_Ratings
0,1,The Hunger Games (The Hunger Games #1),Suzanne Collins,4.34,9317387
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling,4.50,3616457
2,3,Pride and Prejudice,Jane Austen,4.29,4517317
3,4,To Kill a Mockingbird,Harper Lee,4.26,6563540
4,5,The Book Thief,Markus Zusak,4.39,2746000
...,...,...,...,...,...
595,596,Vanity Fair,William Makepeace Thackeray,3.81,131019
596,597,Snow Crash,Neal Stephenson,4.02,290095
597,598,"Wizard's First Rule (Sword of Truth, #1)",Terry Goodkind,4.12,250619
598,599,The Left Hand of Darkness,Ursula K. Le Guin,4.10,201939


In [16]:
# Save the scraped list to CSV. index=False leaves out the auto-generated
# 0..N-1 row index: it carries no information beyond row order and would
# otherwise reappear as an "Unnamed: 0" column when the file is read back
# with pd.read_csv.
df.to_csv("GoodReads_Best_Books_Ever.csv", index=False)