<a href="https://colab.research.google.com/github/mc2398/metadata-analysis-vis/blob/main/goodreads_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The scripts below were written in spring 2025 as part of an academic project analysing metadata from books nominated for the Goodreads Readers' Favorite Awards from 2011 to 2024.

In [None]:
# building out the metadata test scraper (trying to get it to loop over a list of books)

In [None]:
# importing libraries I will need to use for the project
import requests
import re
import bs4
from time import sleep
from urllib.request import urlopen
from urllib.error import HTTPError
import random
import csv
import json
import pandas as pd

In [None]:
# Request headers that present the scraper as an ordinary desktop browser arriving from a
# search engine (helps avoid recognition as a bot). The User-Agent must match the format a
# real Firefox sends: single spaces between the product tokens, with no padding.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0",
    "Referer": "https://www.google.com",
}

In order to grab the required metadata for the project, I needed to generate a list of links to pages for all the books which were nominated for the awards. The appropriate methods to identify links for an alternate web scraping project will depend on the intended project scope.

In [None]:
# Step 1 - the award years to scrape, newest first. The years are kept as strings in a list
# so that each one can be appended directly to the link stem when the page URLs are built.
competition_links = [str(year) for year in range(2024, 2010, -1)]

In [None]:
# Build one awards landing-page URL per year by appending the year to the shared link stem.
url_list = [
    f"https://www.goodreads.com/choiceawards/best-books-{year}"
    for year in competition_links
]
print(url_list)  # "sanity check" to test my script

['https://www.goodreads.com/choiceawards/best-books-2024', 'https://www.goodreads.com/choiceawards/best-books-2023', 'https://www.goodreads.com/choiceawards/best-books-2022', 'https://www.goodreads.com/choiceawards/best-books-2021', 'https://www.goodreads.com/choiceawards/best-books-2020', 'https://www.goodreads.com/choiceawards/best-books-2019', 'https://www.goodreads.com/choiceawards/best-books-2018', 'https://www.goodreads.com/choiceawards/best-books-2017', 'https://www.goodreads.com/choiceawards/best-books-2016', 'https://www.goodreads.com/choiceawards/best-books-2015', 'https://www.goodreads.com/choiceawards/best-books-2014', 'https://www.goodreads.com/choiceawards/best-books-2013', 'https://www.goodreads.com/choiceawards/best-books-2012', 'https://www.goodreads.com/choiceawards/best-books-2011']


In [None]:
# functions to pull all of the links for every competition category in each year and output a new list of the urls
def getPage(url, timeout=30):
    """Download one page and then pause briefly before returning.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The ``requests`` response object for ``url``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Randomised pause of 1.0-1.9 seconds so consecutive requests are spaced out.
    sleep(random.randint(10, 19) / 10)
    return response

# Relative links (hrefs) to every category page found on the yearly landing pages.
all_cat_links = []

# Regex to pull only desired links. It matches an href that starts with "/choiceawards/"
# OR one that contains a hyphen followed by the final run of digits in the string.
# NOTE(review): because of the "|", either condition alone is enough - confirm that both
# were not meant to be required together.
search_term = r'^/choiceawards/|(-)(\d+)(?!.*\d)'

# map() is lazy, so each page is only downloaded when the loop asks for it.
results = map(getPage, url_list)
for result in results:
    soup = bs4.BeautifulSoup(result.content, "html.parser")
    # isolate the main content of the page (excludes nav/column)
    main_content = soup.find(id="landingLeft")
    if main_content is None:
        # find() returns None when the element is missing (e.g. an error or block page);
        # report it and carry on instead of crashing on None.find_all().
        print(f"Skipping {result.url}: no 'landingLeft' element found (HTTP {result.status_code})")
        continue
    # pull the links to pages for each category in a year
    for a in main_content.find_all('a', href=True):
        link_url = a['href']
        if re.search(search_term, link_url):
            all_cat_links.append(link_url)

In [None]:
# sanity check: show the relative category links collected by the loop above
print(all_cat_links)

['/choiceawards/readers-favorite-fiction-books-2024', '/choiceawards/readers-favorite-historical-fiction-books-2024', '/choiceawards/readers-favorite-mystery-thriller-books-2024', '/choiceawards/readers-favorite-romance-books-2024', '/choiceawards/readers-favorite-romantasy-books-2024', '/choiceawards/readers-favorite-fantasy-books-2024', '/choiceawards/readers-favorite-science-fiction-books-2024', '/choiceawards/readers-favorite-horror-books-2024', '/choiceawards/readers-favorite-debut-novel-2024', '/choiceawards/readers-favorite-audio-books-2024', '/choiceawards/readers-favorite-ya-fantasy-books-2024', '/choiceawards/readers-favorite-ya-fiction-books-2024', '/choiceawards/readers-favorite-nonfiction-books-2024', '/choiceawards/readers-favorite-memoir-books-2024', '/choiceawards/readers-favorite-history-bio-books-2024', '/choiceawards/best-fiction-books-2023', '/choiceawards/best-historical-fiction-books-2023', '/choiceawards/best-mystery-thriller-books-2023', '/choiceawards/best-roma

In [None]:
# The collected category links are relative, so prefix each with the site root to make it a
# full URL that can be requested.
category_list = [f"https://www.goodreads.com{path}" for path in all_cat_links]
print(category_list)

['https://www.goodreads.com/choiceawards/readers-favorite-fiction-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-historical-fiction-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-mystery-thriller-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-romance-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-romantasy-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-fantasy-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-science-fiction-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-horror-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-debut-novel-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-audio-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-ya-fantasy-books-2024', 'https://www.goodreads.com/choiceawards/readers-favorite-ya-fiction-books-2024', 'https://www.goodreads.com/choic

In [None]:
# function to pull the link to the page for every book that was nominated for the awards
def getCategories(url, timeout=30):
    """Download one award-category page and then pause briefly before returning.

    Args:
        url: Address of the category page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The ``requests`` response object for ``url``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Randomised pause of 1.0-1.9 seconds so consecutive requests are spaced out.
    sleep(random.randint(10, 19) / 10)
    return response

# Absolute URLs of the page for every nominated book, across all categories and years.
all_book_links = []

# map() is lazy, so each category page is only downloaded when the loop asks for it.
cat_results = map(getCategories, category_list)
for result in cat_results:
    soup = bs4.BeautifulSoup(result.content, "html.parser")
    # isolate the main content of the page (excludes nav/column)
    poll_content = soup.find('div', {'class': 'pollContents'})
    if poll_content is None:
        # find() returns None when the poll container is missing (e.g. an error or block
        # page); report it and carry on instead of crashing on None.find_all().
        print(f"Skipping {result.url}: no 'pollContents' element found (HTTP {result.status_code})")
        continue
    # isolate the nominee links in the main page content
    links = poll_content.find_all('a', {'class': 'pollAnswer__bookLink'}, href=True)
    for a in links:
        # hrefs are relative, so prefix the site root to get a requestable URL
        all_book_links.append('https://www.goodreads.com' + a['href'])


In [None]:
# check to make sure it worked! (prints every book URL collected by the loop above)
print(all_book_links)

In [None]:
# check to see how many books are included in my dataset (helps make sure my scraper is working)
# the count is the number of collected links, so repeated links (if any) are counted each time
book_list_length = len(all_book_links)
print(book_list_length)

5336


After collecting a list of links to the pages for every book nominated for the awards, the next step is to write functions which will extract specific metadata from the web page for each book. Fortunately, the structure of the web pages for the books explored in this project is identical. Different metadata categories are located (using Beautiful Soup) by specific classes and ids, which were found by inspecting Goodreads book pages with in-browser developer tools. In the future, this section of the code will likely need to be updated to reflect changes in Goodreads' book page layouts.

In [None]:
# functions for pulling specific metadata
# function to pull the title for every book
def get_title(soup):
    """Return the non-blank text pieces of a book page's title heading.

    Args:
        soup: Parsed book page (any object whose ``find`` behaves like
            BeautifulSoup's).

    Returns:
        A list of strings, one per non-blank child of the
        ``<h1 data-testid="bookTitle">`` element. The list is empty when the
        heading is missing, e.g. on an error page.
    """
    heading = soup.find('h1', {'data-testid': 'bookTitle'})
    if heading is None:
        # find() returns None when there is no match; iterating None would raise.
        return []

    titles = []
    for child in heading:
        # Plain text children are strings; nested tags (e.g. inline markup inside the
        # heading) are not, so take their rendered text instead of calling .strip() on them.
        text = str(child) if isinstance(child, str) else child.get_text()
        if text.strip():
            titles.append(text)
    return titles

In [None]:
# function to pull the genres or subject tag for every book
def get_genres(soup):
    """Return the genre labels of a book page, one comma-separated string per genre list.

    Each ``<div data-testid="genresList">`` on the page contributes one entry
    made of the text of its ``Button__labelItem`` spans joined with ", ".
    Genre lists that produce only blank text are left out.
    """
    label_groups = (
        container.find_all('span', {'class': 'Button__labelItem'})
        for container in soup.find_all('div', {'data-testid': 'genresList'})
    )
    joined_labels = (', '.join(label.text for label in group) for group in label_groups)
    return [entry for entry in joined_labels if entry.strip()]

In [None]:
# function to pull the description field for every book
def get_description(soup):
    """Return the description text found on a book page.

    Args:
        soup: Parsed book page (any object whose ``find`` behaves like
            BeautifulSoup's).

    Returns:
        A list with one string per tag directly inside the
        ``<div data-testid="description">`` element; each string is the
        concatenated text of that tag's ``Formatted`` spans. The list is empty
        when the description container is missing, e.g. on an error page.
    """
    container = soup.find('div', {'data-testid': 'description'})
    if container is None:
        # find() returns None when there is no match; iterating None would raise.
        return []

    descriptions = []
    for node in container:
        if isinstance(node, str):
            # Bare text between tags cannot contain the spans we are after.
            continue
        spans = node.find_all('span', {'class': 'Formatted'})
        descriptions.append(''.join(span.text for span in spans))
    return descriptions

The code below loops over the list of all nominated books, grabs the metadata for every field I want to collect, and outputs the data as a pandas data frame. Note that this cell will take several hours to run.

In [None]:
# Pulling the metadata for all nominated books into a data frame which I can save!
# Note the sleep time in this function is randomized- helps 1) not overload goodreads' servers and 2) appear human
def getBooks(url, timeout=30):
    """Download one book page and then pause briefly before returning.

    Args:
        url: Address of the book page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would freeze this
            several-hour run indefinitely.

    Returns:
        The ``requests`` response object for ``url``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    # Randomised pause of 1.0-1.9 seconds so consecutive requests are spaced out.
    sleep(random.randint(10, 19) / 10)
    return response

# Column name -> list of values. Every scraped book adds exactly one value to each list so
# that position i in every column always describes the same book.
all_book_metadata = {'title': [], 'description': [], 'genres': []}

# map() is lazy, so each book page is only downloaded when the loop asks for it.
book_results = map(getBooks, all_book_links)
for result in book_results:
    if not result.ok:
        # An error page has none of the expected elements; report it and move on rather
        # than losing a several-hour run to one bad response.
        print(f"Skipping {result.url}: HTTP {result.status_code}")
        continue
    soup = bs4.BeautifulSoup(result.content, "html.parser")
    for column, extractor in (
        ('title', get_title),
        ('description', get_description),
        ('genres', get_genres),
    ):
        values = extractor(soup)
        # The extractors return a list that may hold zero, one or several strings.
        # Collapse it to a single cell (None when nothing was found) so the rows stay aligned.
        all_book_metadata[column].append(' '.join(values) if values else None)

# Build the frame once, after the loop. If the run is interrupted, the data gathered so far
# is still available in all_book_metadata.
df = pd.DataFrame(all_book_metadata)

In [None]:
# saving the metadata as a csv (index=False keeps the data frame's row index out of the file)
df.to_csv('output.csv', index=False)

In [None]:
# check that my scraper worked #1: preview the first rows of the data frame
df.head()

Unnamed: 0,title,description,genres
0,The Wedding People,Alternate cover edition of ISBN 9781250899576....,"Fiction, Romance, Audiobook, Contemporary, Lit..."
1,Intermezzo,"An exquisitely moving story about grief, love,...","Fiction, Contemporary, Literary Fiction, Roman..."
2,Welcome to the Hyunam-Dong Bookshop,Yeongju is burned out. With her high-flying ca...,"Contemporary, Books About Books, Audiobook, Co..."
3,Blue Sisters,Three estranged siblings return to their famil...,"Fiction, Contemporary, Literary Fiction, Audio..."
4,Here One Moment,"\nIf you knew your future, would you try to fi...","Fiction, Audiobook, Mystery, Contemporary, Thr..."


In [None]:
# check that my scraper worked #2: list each column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        18 non-null     object
 1   description  18 non-null     object
 2   genres       18 non-null     object
dtypes: object(3)
memory usage: 564.0+ bytes
