In [1]:
import time
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL template for the Goodreads list; "{}" is filled with the page number.
BASE_URL = "https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once?page={}"

# Headers sent with every HTTP request (a desktop Chrome User-Agent string).
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.88 Safari/537.36"
    ),
}

# Column accumulators: one independent list per output column, filled in
# lockstep (one entry per scraped book).
titles, authors, avg_ratings, genres, first_published_list = (
    [] for _ in range(5)
)

# Seconds to wait for any single HTTP response before giving up; without a
# timeout a stalled connection would hang the whole scrape.
REQUEST_TIMEOUT = 15


def _fetch_soup(url):
    """Download *url* and return it parsed, or None if the request fails.

    Network errors and non-2xx responses are reported and turned into None
    so one bad page does not abort the run and discard collected rows.
    """
    try:
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ Request failed for {url}: {exc}")
        return None
    return BeautifulSoup(resp.content, "html.parser")


def _top_genres(detail_soup, limit=3):
    """Return up to *limit* distinct genre labels as a comma-separated string.

    Labels are taken from the "Button__labelItem" spans that follow the
    "Genres" heading span; "Unknown" is returned when none are found.
    """
    genre_section = detail_soup.find("span", string="Genres")
    if not genre_section:
        return "Unknown"

    top_genres = []
    seen = set()
    for span in genre_section.find_all_next("span", class_="Button__labelItem"):
        genre = span.get_text(strip=True)
        # Skip blank labels and repeats so only real, distinct genres count.
        if genre and genre not in seen:
            top_genres.append(genre)
            seen.add(genre)
        if len(top_genres) == limit:
            break
    return ", ".join(top_genres) if top_genres else "Unknown"


def _first_published(detail_soup):
    """Return the first 4-digit token of the publication info, else "Unknown"."""
    pub_tag = detail_soup.find("p", attrs={"data-testid": "publicationInfo"})
    if not pub_tag:
        return "Unknown"
    years = [s for s in pub_tag.text.strip().split() if s.isdigit() and len(s) == 4]
    return years[0] if years else "Unknown"


for page in range(1, 11):  # Pages 1 to 10
    print(f"\n🔄 Scraping page {page}...")
    soup = _fetch_soup(BASE_URL.format(page))
    if soup is None:
        # The list page could not be loaded; move on to the next one.
        continue

    book_rows = soup.select("tr[itemtype='http://schema.org/Book']")
    print(f"Found {len(book_rows)} books on page {page}.")

    for row in book_rows:
        title_tag = row.select_one("a.bookTitle")
        author_tag = row.select_one("a.authorName")
        rating_tag = row.select_one("span.minirating")

        title = title_tag.text.strip() if title_tag else ""
        author = author_tag.text.strip() if author_tag else ""
        avg_rating = rating_tag.text.strip().split(" — ")[0] if rating_tag else ""

        # Detail-page fields fall back to "Unknown" when the row has no link
        # or the detail page cannot be fetched.
        genre_str = "Unknown"
        first_pub = "Unknown"
        href = title_tag.get("href") if title_tag else None
        if href:
            detail_soup = _fetch_soup(f"https://www.goodreads.com{href}")
            if detail_soup is not None:
                genre_str = _top_genres(detail_soup)
                first_pub = _first_published(detail_soup)

        # Append to every column together so the lists stay the same length.
        titles.append(title)
        authors.append(author)
        avg_ratings.append(avg_rating)
        genres.append(genre_str)
        first_published_list.append(first_pub)

        print(f"Scraped: {title}")
        # Randomized pause between books to avoid hammering the server.
        time.sleep(random.uniform(1, 2))

# Pair each output column label with its accumulated values, build the table,
# and write it out without the index column.
column_labels = (
    "Title",
    "Author",
    "Average Rating",
    "Top 3 Genres",
    "First Published",
)
column_values = (titles, authors, avg_ratings, genres, first_published_list)
df = pd.DataFrame(dict(zip(column_labels, column_values)))

output_file = "goodreads_top1000_bs.csv"
df.to_csv(output_file, index=False)
print(f"\n✅ Scraping complete! Saved as {output_file}")



🔄 Scraping page 1...
Found 100 books on page 1.
Scraped: To Kill a Mockingbird
Scraped: Pride and Prejudice
Scraped: The Diary of a Young Girl
Scraped: Harry Potter and the Sorcerer’s Stone (Harry Potter, #1)
Scraped: Animal Farm
Scraped: 1984
Scraped: The Little Prince
Scraped: The Great Gatsby
Scraped: The Catcher in the Rye
Scraped: The Lord of the Rings
Scraped: The Book Thief
Scraped: Jane Eyre
Scraped: The Chronicles of Narnia (The Chronicles of Narnia, #1-7)
Scraped: Lord of the Flies
Scraped: Romeo and Juliet
Scraped: Harry Potter and the Deathly Hallows (Harry Potter, #7)
Scraped: The Kite Runner
Scraped: The Giver (The Giver, #1)
Scraped: The Giving Tree
Scraped: Little Women (Little Women, #1)
Scraped: Charlotte’s Web
Scraped: The Hunger Games (The Hunger Games, #1)
Scraped: The Hobbit, or There and Back Again
Scraped: Fahrenheit 451
Scraped: Of Mice and Men
Scraped: Green Eggs and Ham
Scraped: Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
Scraped: Wuthering H