In [14]:
# !pip install selenium

In [11]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By

# driver = webdriver.Chrome()
# driver.get("https://www.selenium.dev/selenium/web/web-form.html")

# title = driver.title
# driver.implicitly_wait(0.5)

# text_box = driver.find_element(by=By.NAME, value="my-text")
# submit_button = driver.find_element(by=By.CSS_SELECTOR, value="button")

# text_box.send_keys("Selenium")
# submit_button.click()

# message = driver.find_element(by=By.ID, value="message")
# text = message.text

# driver.quit()
# print(text)

## Scraping Goodreads

In [16]:
# Make imports
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

In [17]:
## Scrape the full page for most popular books published in a specific year
def scrape_full_page(year):
    """Scrape the Goodreads "popular by date" page for one year.

    Opens ``https://www.goodreads.com/book/popular_by_date/{year}`` in Chrome,
    clicks the "Show more books" button repeatedly until the click raises,
    then parses the rendered HTML with BeautifulSoup.

    Parameters
    ----------
    year : int or str
        Path segment appended to the URL. Only the part before the first
        ``/`` is stored in the ``Published`` column, so a value such as
        ``"2024/4"`` is recorded as ``"2024"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Title``, ``Author``, ``Score``, ``Ratings``, ``Shelvings``,
        ``Published``, ``Description`` and ``Image``. Values are the scraped
        strings, or ``None`` where an individual count or cover URL could not
        be extracted. An empty frame with the same columns is returned when
        the page contains no ``RankedBookList`` section.
    """
    columns = ["Title", "Author", "Score", "Ratings", "Shelvings", "Published", "Description", "Image"]

    # Initialize the Selenium webdriver
    driver = webdriver.Chrome()
    try:
        # Load the web page and give it time to render
        driver.get(f"https://www.goodreads.com/book/popular_by_date/{year}")
        time.sleep(5)

        # Locate the "Show more books" button. find_elements returns an empty
        # list instead of raising, so pages without the button are still scraped.
        show_more_books_xpath = "//button[@class='Button Button--secondary Button--small' and contains(span[@class='Button__labelItem'], 'Show more books')]"
        show_more_buttons = driver.find_elements(by=By.XPATH, value=show_more_books_xpath)
        bottom_left_x = 0
        bottom_left_y = driver.execute_script("return window.innerHeight")

        if show_more_buttons:
            show_more_button = show_more_buttons[0]
            clicks = 0
            # Keep clicking until the click itself raises; that exception is
            # the loop's only exit condition.
            while True:
                clicks += 1
                try:
                    driver.execute_script("arguments[0].click();", show_more_button)
                    time.sleep(2)
                except Exception:
                    break

                # After the first click, click near the bottom-left of the
                # viewport to dismiss the pop up box that appears.
                if clicks == 1:
                    try:
                        ActionChains(driver).move_by_offset(bottom_left_x, bottom_left_y).click().perform()
                    except Exception:
                        # Best effort only: a failed dismissal must not abort the scrape.
                        pass

        # Snapshot the DOM after the dynamic content has loaded
        page_source = driver.page_source
    finally:
        # Always close the browser, even when loading or clicking failed
        driver.quit()

    # Parse the page source with BeautifulSoup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Find the section containing popular books
    popular_books_section = soup.find('div', class_='RankedBookList')
    if popular_books_section is None:
        return pd.DataFrame(columns=columns)

    # Collect the elements holding each piece of metadata
    book_titles_html = popular_books_section.find_all('h3', class_='Text Text__title3 Text__umber')
    book_authors_html = popular_books_section.find_all('h3', class_='Text Text__h3 Text__regular')
    book_ratings_html = popular_books_section.find_all(class_='AverageRating__ratingValue')
    book_num_ratings_html = popular_books_section.find_all(class_='AverageRating__ratingsCount')
    book_num_shelvings_html = popular_books_section.find_all(class_='Text Text__body3')
    book_summary_html = popular_books_section.find_all(class_='Formatted')
    img_html = popular_books_section.find_all(class_='BookCover__image')

    # Extract metadata for each book and put them in separate lists
    book_titles = [tag.text for tag in book_titles_html]
    book_authors = [tag.text for tag in book_authors_html]
    book_ratings = [tag.text for tag in book_ratings_html]
    book_num_ratings = [_first_token(tag.text) for tag in book_num_ratings_html]
    # The shelvings count is the first token after the word 'ratings'
    book_num_shelvings = [_first_token(tag.text.split('ratings')[-1]) for tag in book_num_shelvings_html]
    book_summary = [tag.text for tag in book_summary_html]

    # First https URL found in each cover's <img> markup; None keeps the
    # list positions intact when a cover has no such URL.
    image_url_pattern = re.compile(r'(https://[^ "]*)')
    book_images = []
    for cover in img_html:
        url_match = image_url_pattern.search(str(cover.img))
        book_images.append(url_match.group(1) if url_match else None)

    YEAR = [str(year).split('/')[0]] * len(book_titles)

    # Dataframe of all book information for this specific year.
    # NOTE(review): the lists are combined by position, which assumes every
    # selector matched exactly once per book - confirm against the live page.
    year_df = pd.DataFrame([book_titles, book_authors, book_ratings, book_num_ratings, book_num_shelvings, YEAR, book_summary, book_images]).T
    year_df.columns = columns

    return year_df


def _first_token(text):
    """Return the first whitespace-separated token of ``text``, or None if there is none."""
    tokens = text.split()
    return tokens[0] if tokens else None

### Scraping all popular books (1900-2024)

In [3]:
## Main script to scrape all popular books from each of the years 1900 to current year
START_YEAR = 1900
CURRENT_YEAR = 2024

# Pause between successful requests. The expression does not depend on the
# loop variable, so it is computed once (4 seconds for 1900-2024).
PAUSE_SECONDS = (CURRENT_YEAR - START_YEAR) % 5

year_frames = []   # one DataFrame per successfully scraped year
failed_years = []  # years whose scrape raised, kept so they can be retried

for year in range(START_YEAR, CURRENT_YEAR + 1):
    print(f"#########------- Extracting books from year {year} -------#########")
    print("\t************** Starting to Extract ***************")
    try:
        year_df = scrape_full_page(year)
    except Exception as exc:
        # Report and skip this year. KeyboardInterrupt is not caught here,
        # so the run can still be stopped by hand.
        failed_years.append(year)
        print(f"\t!!!!!!!!------- Failed Extracting: {exc!r} -------!!!!!!!!\n")
        continue

    year_frames.append(year_df)
    print("\t########------- Finished Extracting -------#######")
    print(f"\nTotal books from year {year}: {len(year_df)}\n")
    time.sleep(PAUSE_SECONDS)

# Concatenate once at the end instead of growing the frame on every iteration;
# pd.concat raises on an empty list, hence the guard.
books_df = pd.concat(year_frames, ignore_index=True) if year_frames else pd.DataFrame()

if failed_years:
    print(f"Years that could not be scraped: {failed_years}")

books_df

#########------- Extracting books from year 1900 -------#########
	************** Starting to Extract ***************
	########------- Finished Extracting -------#######

Total books from year 1900: 200

#########------- Extracting books from year 1901 -------#########
	************** Starting to Extract ***************
	########------- Finished Extracting -------#######

Total books from year 1901: 200

#########------- Extracting books from year 1902 -------#########
	************** Starting to Extract ***************
	########------- Finished Extracting -------#######

Total books from year 1902: 200

#########------- Extracting books from year 1903 -------#########
	************** Starting to Extract ***************
	########------- Finished Extracting -------#######

Total books from year 1903: 200

#########------- Extracting books from year 1904 -------#########
	************** Starting to Extract ***************
	########------- Finished Extracting -------#######

Total books f

Unnamed: 0,0,1,2,3,4,5,6,7
0,"The Wonderful Wizard of Oz (Oz, #1)",L. Frank Baum ...more,4,454k,688k,1900,"Come along, Toto, she said. We will go to the ...",https://images-na.ssl-images-amazon.com/images...
1,Up from Slavery,Booker T. Washington ...more,4.1,33.9k,63.1k,1900,"Booker T. Washington, the most recognized nati...",https://images-na.ssl-images-amazon.com/images...
2,Sister Carrie,Theodore Dreiser,3.77,40.2k,62.9k,1900,"A landmark in American literature, presented i...",https://images-na.ssl-images-amazon.com/images...
3,Lord Jim,Joseph Conrad ...more,3.62,31.1k,54.6k,1900,"Jim, a young British seaman, becomes first mat...",https://images-na.ssl-images-amazon.com/images...
4,The Three Sisters,Anton Chekhov,3.72,22k,31.6k,1900,First performed at the Moscow Art Theatre in 1...,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...,...,...
23991,Apprentice to the Villain (Assistant to the Vi...,Hannah Nicole Maehrer,4.44,131,33.3k,2024,From the creator of the viral TikTok videos ab...,https://images-na.ssl-images-amazon.com/images...
23992,The Break-Up Pact,Emma Lord,3.56,1305,33.2k,2024,Two best friends who haven’t spoken in ten yea...,https://images-na.ssl-images-amazon.com/images...
23993,Slow Dance,Rainbow Rowell,4.18,435,33.2k,2024,From the #1 New York Times bestselling author ...,https://images-na.ssl-images-amazon.com/images...
23994,"Destroy the Day (Defy the Night, #3)",Brigid Kemmerer,4.24,8936,33.2k,2024,"Left for dead, but desperate to survive . . . ...",https://images-na.ssl-images-amazon.com/images...


In [51]:
# Scratch check: open a single month page, click "Show more books" three times
# and count how many book titles have been rendered.
driver = webdriver.Chrome()
try:
    # Load the web page
    driver.get("https://www.goodreads.com/book/popular_by_date/2024/4")

    # Let the page load
    time.sleep(5)

    # Mark the Show more books button
    show_more_books_xpath = "//button[@class='Button Button--secondary Button--small' and contains(span[@class='Button__labelItem'], 'Show more books')]"
    show_more_button = driver.find_element(by=By.XPATH, value=show_more_books_xpath)
    bottom_left_x = 0  # X-coordinate of the bottom left
    bottom_left_y = driver.execute_script("return window.innerHeight")

    # Click the show more books button exactly three times
    for i in range(1, 4):
        driver.execute_script("arguments[0].click();", show_more_button)
        time.sleep(2)
        if i == 1:
            # Click near the bottom-left of the viewport after the first click
            actions = ActionChains(driver)
            actions.move_by_offset(bottom_left_x, bottom_left_y).click().perform()

    page_source = driver.page_source
finally:
    # Close the browser whether or not the steps above succeeded, so repeated
    # runs of this cell do not pile up Chrome windows.
    driver.quit()

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(page_source, 'html.parser')

# Find the section containing popular books
popular_books_section = soup.find('div', class_='RankedBookList')

# Find all the book titles within the popular books section
book_titles_html = popular_books_section.find_all('h3', class_='Text Text__title3 Text__umber')

len(book_titles_html)

45

# Data Cleaning

In [27]:
import pandas as pd

# Read the saved book data back from disk and preview it
books_df = pd.read_csv(filepath_or_buffer="Popular-Books.csv")
books_df

Unnamed: 0,Title,Author,Score,Ratings,Shelvings,Published,Description,Image
0,"The Wonderful Wizard of Oz (Oz, #1)",L. Frank Baum,4.00,454k,688k,1900,"Come along, Toto, she said. We will go to the ...",https://images-na.ssl-images-amazon.com/images...
1,Up from Slavery,Booker T. Washington,4.10,33.9k,63.1k,1900,"Booker T. Washington, the most recognized nati...",https://images-na.ssl-images-amazon.com/images...
2,Sister Carrie,Theodore Dreiser,3.77,40.2k,62.9k,1900,"A landmark in American literature, presented i...",https://images-na.ssl-images-amazon.com/images...
3,Lord Jim,Joseph Conrad,3.62,31.1k,54.6k,1900,"Jim, a young British seaman, becomes first mat...",https://images-na.ssl-images-amazon.com/images...
4,The Three Sisters,Anton Chekhov,3.72,22k,31.6k,1900,First performed at the Moscow Art Theatre in 1...,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...,...,...
27616,"Beautiful Bitch (Beautiful Bastard, #1.5)",Christina Lauren,3.84,48.9k,89.4k,2013,"Picking up where Beautiful Bastard left off, C...",https://images-na.ssl-images-amazon.com/images...
27617,"Grounded (Up in the Air, #3)",R.K. Lilley,4.27,60.9k,89.1k,2013,James and Bianca’s story draws to a close in t...,https://images-na.ssl-images-amazon.com/images...
27618,"Down London Road (On Dublin Street, #2)",Samantha Young,4.20,54.2k,88.9k,2013,Johanna Walker is used to taking charge. But s...,https://images-na.ssl-images-amazon.com/images...
27619,"Frost Burned (Mercy Thompson, #7)",Patricia Briggs,4.36,74.6k,88.7k,2013,Patricia Briggs's novel River Marked was prais...,https://images-na.ssl-images-amazon.com/images...


In [28]:
# Keep only the first row for each (Title, Author) pair,
# then show how many books remain per publication year
books_df = books_df.drop_duplicates(subset=['Title', 'Author'], keep='first')
books_df['Published'].value_counts()

2023    2266
2024     768
1900     200
1980     200
1993     200
        ... 
1931     199
1967     199
2004     199
1928     198
2013     196
Name: Published, Length: 125, dtype: int64

In [33]:
# Number of distinct values in the Author column
books_df['Author'].nunique()

11362