In [1]:
# Install third-party packages into the environment via pip3 shell commands.
# beautifulsoup4 and lxml back the `bs4.BeautifulSoup(..., 'lxml')` calls below.
# NOTE(review): selenium, geckodriver-autoinstaller and chromedriver-py are not
# referenced by the cells shown here — confirm they are still required.
!pip3 install beautifulsoup4
!pip3 install selenium
!pip3 install lxml
!pip3 install geckodriver-autoinstaller
!pip3 install chromedriver-py

Collecting selenium
  Downloading selenium-3.141.0-py2.py3-none-any.whl (904 kB)
[K     |████████████████████████████████| 904 kB 3.9 MB/s eta 0:00:01     |██████▌                         | 184 kB 3.9 MB/s eta 0:00:01
Installing collected packages: selenium
Successfully installed selenium-3.141.0
Collecting geckodriver-autoinstaller
  Downloading geckodriver_autoinstaller-0.1.0-py3-none-any.whl (5.6 kB)
Installing collected packages: geckodriver-autoinstaller
Successfully installed geckodriver-autoinstaller-0.1.0
Collecting chromedriver-py
  Downloading chromedriver_py-88.0.4324.27-py3-none-any.whl (19.5 MB)
[K     |████████████████████████████████| 19.5 MB 18.4 MB/s eta 0:00:01
[?25hInstalling collected packages: chromedriver-py
Successfully installed chromedriver-py-88.0.4324.27


In [2]:
import argparse
from datetime import datetime
import json
import os
import re
import time

from urllib.request import urlopen
from urllib.request import HTTPError
import bs4
import pandas as pd

In [3]:
def get_all_lists(soup):
    """Collect the entries of the "More lists with this book..." pages.

    Looks for the 'More lists with this book...' link in ``soup``. When it is
    present, the linked page is fetched from www.goodreads.com and every
    ``div.cell`` on it is read; ``a.next_page`` links are then followed for up
    to 11 further pages, pausing 2 seconds before each request.

    Args:
        soup: Parsed book page (anything exposing BeautifulSoup's ``find``).

    Returns:
        dict mapping the first whitespace-separated token of each cell's text
        to the integer parsed from its second-to-last token (thousands
        separators removed). Empty when the book page has no lists link.
        Cells with fewer than three tokens, or whose second-to-last token is
        not a number, are skipped instead of aborting the whole scrape.
    """
    list_count_dict = {}

    more_lists_link = soup.find('a', text='More lists with this book...')
    if not more_lists_link:
        return list_count_dict

    # The original loop ran while its counter was <= 10, i.e. 11 follow-ups.
    max_follow_up_pages = 11

    # Close each HTTP response as soon as it has been parsed.
    with urlopen('https://www.goodreads.com' + more_lists_link['href']) as source:
        page = bs4.BeautifulSoup(source, 'lxml')
    cell_texts = [node.text for node in page.find_all('div', {'class': 'cell'})]

    for _ in range(max_follow_up_pages):
        next_link = page.find('a', {'class': 'next_page'})
        if not next_link:
            break

        time.sleep(2)
        with urlopen('https://www.goodreads.com' + next_link['href']) as source:
            page = bs4.BeautifulSoup(source, 'lxml')
        cell_texts += [node.text for node in page.find_all('div', {'class': 'cell'})]

    # Format lists text.
    for cell_text in cell_texts:
        tokens = cell_text.split()
        if len(tokens) < 3:
            # Blank or truncated cell: nothing to parse.
            continue
        try:
            count = int(tokens[-2].replace(',', ''))
        except ValueError:
            continue
        # NOTE(review): only the FIRST word of the cell is used as the key, so
        # lists whose names start with the same word overwrite each other.
        # An earlier version of this code used ' '.join(tokens[:-8]) as the
        # name — confirm which key downstream consumers expect.
        list_count_dict[tokens[0]] = count

    return list_count_dict

In [4]:
def get_shelves(soup):
    """Read the shelf counts from the page behind the 'See top shelves…' link.

    Args:
        soup: Parsed book page (anything exposing BeautifulSoup's ``find``).

    Returns:
        dict mapping the first whitespace-separated token of each
        ``div.shelfStat`` text to the integer parsed from its second-to-last
        token (thousands separators removed). Empty when the book page has no
        shelves link. Entries with fewer than three tokens, or whose
        second-to-last token is not a number, are skipped instead of aborting
        the whole scrape.
    """
    shelf_count_dict = {}

    shelves_link = soup.find('a', text='See top shelves…')
    if not shelves_link:
        return shelf_count_dict

    # Find shelves text; the response is closed as soon as it has been parsed.
    with urlopen('https://www.goodreads.com' + shelves_link['href']) as source:
        shelves_page = bs4.BeautifulSoup(source, 'lxml')

    # Format shelves text.
    for node in shelves_page.find_all('div', {'class': 'shelfStat'}):
        tokens = node.text.split()
        if len(tokens) < 3:
            continue
        try:
            shelf_count_dict[tokens[0]] = int(tokens[-2].replace(',', ''))
        except ValueError:
            continue

    return shelf_count_dict

In [5]:
def get_genres(soup):
    """Return the genre paths found on a parsed book page.

    For every ``div.left`` element, the texts of its
    ``a.actionLinkLite.bookPageGenreLink`` links are joined with ' > '.
    Paths that are empty or whitespace-only are left out.
    """
    link_attrs = {'class': 'actionLinkLite bookPageGenreLink'}
    genre_paths = (
        ' > '.join([link.text for link in container.find_all('a', link_attrs)])
        for container in soup.find_all('div', {'class': 'left'})
    )
    return [path for path in genre_paths if path.strip()]

In [6]:
def get_isbn(soup):
    """Extract the 10-character ISBN from the page's text.

    The string form of ``soup`` is searched for the literal text ``nisbn: ``
    followed by nine digits and a final digit or ``X`` (ISBN-10 check digits
    may be ``X``).
    NOTE(review): the leading ``n`` is presumably the tail of an escaped
    newline in the page's embedded script — confirm against a live page.

    Returns:
        The ISBN as a string, or "isbn not found" when the pattern is absent.
    """
    # An explicit no-match check replaces the former bare ``except:``, which
    # also swallowed KeyboardInterrupt and SystemExit.
    match = re.search(r'nisbn: ([0-9]{9}[0-9X])', str(soup))
    if match is None:
        return "isbn not found"
    return match.group(1)

In [7]:
def get_isbn13(soup):
    """Extract the 13-digit ISBN from the page's text.

    The string form of ``soup`` is searched for the literal text
    ``nisbn13: `` followed by exactly thirteen digits.

    Returns:
        The ISBN-13 as a string, or "isbn13 not found" when the pattern is
        absent.
    """
    # An explicit no-match check replaces the former bare ``except:``, which
    # also swallowed KeyboardInterrupt and SystemExit.
    match = re.search(r'nisbn13: ([0-9]{13})', str(soup))
    if match is None:
        return "isbn13 not found"
    return match.group(1)

In [8]:
def get_rating_distribution(soup):
    """Parse the star-rating counts passed to ``renderRatingGraph([...])``.

    The string form of ``soup`` is searched for the first
    ``renderRatingGraph([`` call; the leading numbers of its array argument
    are read as the 5-star down to 1-star counts.

    Returns:
        dict with keys '5 Stars', '4 Stars', '3 Stars', '2 Stars', '1 Star'.

    Raises:
        IndexError: if no such call is found or fewer than five numbers
            are present.
    """
    graph_call = re.findall(r'renderRatingGraph\([\s]*\[[0-9,\s]+', str(soup))[0]
    # Collapse runs of whitespace, then keep what follows the opening bracket.
    collapsed = ' '.join(graph_call.split())
    number_texts = collapsed.split('[')[1].split(',')
    counts = [int(text.strip()) for text in number_texts]

    star_labels = ('5 Stars', '4 Stars', '3 Stars', '2 Stars', '1 Star')
    return {label: counts[position] for position, label in enumerate(star_labels)}

In [9]:
def get_num_pages(soup):
    """Return the page count from ``span[itemprop=numberOfPages]``.

    The first whitespace-separated token of the element's text is converted
    to ``int``. When the element is missing, an empty string is returned.
    """
    pages_node = soup.find('span', {'itemprop': 'numberOfPages'})
    if not pages_node:
        return ''
    leading_token = pages_node.text.strip().split()[0]
    return int(leading_token)

In [10]:
def get_year_first_published(soup):
    """Return the first 3- or 4-digit number in the ``nobr.greyText`` element.

    Args:
        soup: Parsed book page (anything exposing BeautifulSoup's ``find``).

    Returns:
        The matched digits as a string, or '' when the element is missing,
        has no single string child, or contains no 3-4 digit number.
        Previously each of those cases raised AttributeError/TypeError and
        aborted the scrape of the whole book.
    """
    published_note = soup.find('nobr', attrs={'class': 'greyText'})
    if not published_note:
        return ''

    note_text = published_note.string
    if not note_text:
        return ''

    year_match = re.search('([0-9]{3,4})', note_text)
    return year_match.group(1) if year_match else ''

In [11]:
def get_id(bookid):
    """Return the first run of characters in ``bookid`` that contains neither '.' nor '-'.

    For an identifier such as '4671.The_Great_Gatsby' this is the part in
    front of the first separator. Raises AttributeError when ``bookid`` has
    no such run (for example an empty string).
    """
    leading_run = re.search(r"[^.-]+", bookid)
    return leading_run.group(0)

In [12]:
def scrape_book(book_id):
    """Download one Goodreads book page and extract its metadata.

    Args:
        book_id: Path component appended to
            'https://www.goodreads.com/book/show/'.

    Returns:
        dict with the keys 'book_id_title', 'book_id', 'book_title', 'isbn',
        'isbn13', 'year_first_published', 'author', 'num_pages', 'genres',
        'shelves', 'lists', 'num_ratings', 'num_reviews', 'average_rating'
        and 'rating_distribution'.

    Raises:
        urllib.error.HTTPError: propagated from ``urlopen``.
        AttributeError / TypeError: if a directly-read element (title,
            author, rating metadata) is missing from the page.
    """
    url = 'https://www.goodreads.com/book/show/' + book_id
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url) as source:
        soup = bs4.BeautifulSoup(source, 'html.parser')

    # Pause before the follow-up requests issued by get_shelves/get_all_lists.
    time.sleep(2)

    return {'book_id_title':        book_id,
            'book_id':              get_id(book_id),
            'book_title':           ' '.join(soup.find('h1', {'id': 'bookTitle'}).text.split()),
            'isbn':                 get_isbn(soup),
            'isbn13':               get_isbn13(soup),
            'year_first_published': get_year_first_published(soup),
            'author':               ' '.join(soup.find('span', {'itemprop': 'name'}).text.split()),
            'num_pages':            get_num_pages(soup),
            'genres':               get_genres(soup),
            'shelves':              get_shelves(soup),
            'lists':                get_all_lists(soup),
            'num_ratings':          soup.find('meta', {'itemprop': 'ratingCount'})['content'].strip(),
            'num_reviews':          soup.find('meta', {'itemprop': 'reviewCount'})['content'].strip(),
            'average_rating':       soup.find('span', {'itemprop': 'ratingValue'}).text.strip(),
            'rating_distribution':  get_rating_distribution(soup)}

In [13]:
def condense_books(books_directory_path):
    """Load every per-book JSON file in a directory into one list.

    Files are included when their name ends in '.json', does not start with
    '.', and is not 'all_books.json' (the combined output file itself).

    Args:
        books_directory_path: Directory containing the per-book JSON files.

    Returns:
        list of the decoded JSON documents, ordered by file name so that the
        result does not depend on the order ``os.listdir`` happens to return.
    """
    books = []

    for file_name in sorted(os.listdir(books_directory_path)):
        if not file_name.endswith('.json'):
            continue
        if file_name.startswith('.') or file_name == "all_books.json":
            continue
        # ``with`` closes each file promptly; the original leaked the handle.
        with open(os.path.join(books_directory_path, file_name), 'r') as book_file:
            books.append(json.load(book_file))

    return books

In [None]:
start_time = datetime.now()

# Run configuration. These names replace the command-line options of the
# script version (--book_ids_path, --output_directory_path, --format), which
# are not available in a notebook; the previous cell still referenced the
# undefined `args`, `condensed_books_path` and `script_name`.
book_ids_path         = 'book_ids.txt'
output_directory_path = 'scraped_books'
output_format         = 'json'              # 'json' or 'csv'
script_name           = 'goodreads_scraper'
condensed_books_path  = output_directory_path + '/all_books'

if output_format not in ('json', 'csv'):
    raise ValueError("output_format must be 'json' or 'csv', got " + repr(output_format))

os.makedirs(output_directory_path, exist_ok=True)

with open(book_ids_path, 'r') as book_ids_file:
    book_ids = [line.strip() for line in book_ids_file if line.strip()]
books_to_scrape = list(book_ids)

for book_id in books_to_scrape:
    try:
        print(str(datetime.now()) + ': Scraping ' + book_id + '...')

        book = scrape_book(book_id)
        with open(output_directory_path + '/' + book_id + '.json', 'w') as book_file:
            json.dump(book, book_file)

        print('=============================')

    except HTTPError as e:
        # Stop the run on the first HTTP failure. `break` skips the `else`
        # branch below, so nothing is condensed and no success message is
        # printed; unlike exit(0), it does not terminate the interpreter.
        print(e)
        break
else:
    # Reached only when every book was scraped without an HTTP error.
    books = condense_books(output_directory_path)
    with open(f"{condensed_books_path}.json", 'w') as condensed_file:
        json.dump(books, condensed_file)

    if output_format == 'csv':
        # The CSV is derived from the combined JSON file written just above.
        book_df = pd.read_json(f"{condensed_books_path}.json")
        book_df.to_csv(f"{condensed_books_path}.csv", index=False, encoding='utf-8')

    print(str(datetime.now()) + ' ' + script_name + f':\n\n🎉 Success! All book metadata scraped. 🎉\n\nMetadata files have been output to /{output_directory_path}\nGoodreads scraping run time = ⏰ ' + str(datetime.now() - start_time) + ' ⏰')