In [1]:
import time
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import chromedriver_autoinstaller
from bs4 import BeautifulSoup
import shutil
import os

# Ensure ChromeDriver is installed
chromedriver_autoinstaller.install()

# Set Chrome binary location (adjust if needed)
chrome_path = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
if not os.path.exists(chrome_path):
    chrome_path = shutil.which("google-chrome")

# URL of The Greatest Books list
GREATEST_BOOKS_URL = "https://thegreatestbooks.org/v/table/lists/381"
BASE_URL = "https://thegreatestbooks.org"

# Set up Selenium WebDriver options
options = Options()
options.add_argument("--headless")  # Run in headless mode (no browser UI)
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920x1080")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
if chrome_path:
    options.binary_location = chrome_path

# Automatically install ChromeDriver
service = Service(ChromeDriverManager().install())
try:
    driver = webdriver.Chrome(service=service, options=options)
except Exception as e:
    print("Error: ChromeDriver failed to start.", e)
    exit()

driver.get(GREATEST_BOOKS_URL)

# Wait for the table class to load
try:
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".table")))
    print("Table loaded successfully.")
except:
    print("Error: Table did not load.")
    driver.quit()
    exit()

# Scroll to the table to ensure it's visible
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(random.uniform(3, 6))  # Add random delay

# Get page source and parse with BeautifulSoup
soup = BeautifulSoup(driver.page_source, "html.parser")

# Lists to store data
titles = []
authors = []
ranks = []
years = []
genres = []

# Find the table containing book data
table = soup.find("table", class_="table")
if not table:
    print("Error: Could not find the table on the page.")
    driver.quit()
    exit()

# Extract data from the table
for row in table.find_all("tr")[1:]:  # Skip header row
    columns = row.find_all("td")
    if len(columns) >= 4:
        rank = columns[0].text.strip()
        title = columns[1].text.strip()
        author = columns[2].text.strip()
        year = columns[3].text.strip()
        
        # Get book details page link
        book_link_tag = columns[1].find("a")
        if book_link_tag:
            book_url = BASE_URL + book_link_tag["href"]

            # Navigate to book detail page in the same browser
            driver.get(book_url)
            time.sleep(random.uniform(3, 7))  # Add random delay before parsing
            book_soup = BeautifulSoup(driver.page_source, "html.parser")

            genre_tag = book_soup.find("div", class_="category")
            genre = genre_tag.text.strip() if genre_tag else "Unknown"

            # Return to the table page to continue loop
            driver.get(GREATEST_BOOKS_URL)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".table")))
            time.sleep(random.uniform(2, 5))  # Add delay before parsing table again
            soup = BeautifulSoup(driver.page_source, "html.parser")
            table = soup.find("table", class_="table")
        else:
            genre = "Unknown"
        
        ranks.append(rank)
        titles.append(title)
        authors.append(author)
        years.append(year)
        genres.append(genre)

# Close the browser after all scraping is complete
driver.quit()

# Create a DataFrame
df_books = pd.DataFrame({
    "Rank": ranks,
    "Title": titles,
    "Author": authors,
    "Year": years,
    "Genre": genres
})

# Save the data
df_books.to_csv("greatest_books.csv", index=False)

print("Scraping complete! Data saved as greatest_books.csv")



Error: Table did not load.


MaxRetryError: HTTPConnectionPool(host='localhost', port=57469): Max retries exceeded with url: /session/5ce45594930f2d8ffdc5ce44ec503f3a/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x12ca1ef90>: Failed to establish a new connection: [Errno 61] Connection refused'))