In [1]:
"""
book scraper is designed to target the 53 most popular books in my sample
more than 40000 reviews each, or more than 1000000 ratings
"""

import requests
import numpy as np
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import datetime
import pickle
import selenium
import os
import pyprind
import multiprocessing as multi
import pprint as pp

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import os

In [2]:
"""
Inputs: a Goodreads.com book review url
        https://www.goodreads.com/review/show/342857147
Returns: a pandas Series containing info about the review and the goodreads user who wrote it.
        index = ['url', 'book_title', 'author','reviewtext', 'shelves','review_date', 'review_score', 
                     'words', 'paragraphs', 'images', 'likes', 'comments',
                     'username', 'userurl', 'user_friends', 'user_followers'])
if this is changed, update matching dataframe in book_scraper
        
Process: Use Beautiful Soup to scrape relevant text from a Goodreads.com and the associated
        user page. Takes about 5 seconds total.


"""

def review_scraper(url) :
    
    #initializing some blank variables
    book_title = ''
    author = ''
    soup =''
    t0= time.time()
    
    while len(soup) == 0:
        try:
            response = requests.get(url)
            page = response.text
            soup = BeautifulSoup(page,"lxml")
        except:
            time.sleep(0.25)
            print('connection error')
            return
    
    #check for non-existent page
    try:
        if soup.find_all('a', class_='gr-button')[1].text == 'Back to the Goodreads homepage':
            print(url)
            return
    except:
        pass
    
    try:
        book_title = soup.find('a', class_='bookTitle').text
    except:
        pass
    username =  soup.find('a', class_='userReview').text
    author = soup.find('a', class_='authorName').text
    review_date = soup.find('span', itemprop="publishDate").text
    
    
    shelves = []
    for item in soup.find_all('a', class_='actionLinkLite'):
        shelves.append(item.text)

    #reviewsoup scrapes the review.  The several try/except loops are necessary
    #to deal with reviews that do not have any text
    reviewsoup= soup.find('div', class_="reviewText mediumText description readable")
    try:
        reviewtext = reviewsoup.text
    except:
        reviewtext = ''
    if len(reviewtext) == 0:
        print('No Review')
        return
    
    paragraphs = reviewtext.count('\n')
    reviewtext = reviewtext.replace('\n', ' ').replace('\t', ' ')
    words = reviewtext.count(' ')
    try:
        images = len(reviewsoup.find_all('img'))
    except:
        images = 0
    try:
        likes = int(soup.find('span', class_='likesCount').text.split(' ')[0])
    except:
        likes = 0
    try:
        comments = int(soup.find('span', class_='smallText').text.split(' ')[-1].split(')')[0])
    except:
        comments = 0
    try:    
        star_text = soup.find('div', class_='rating').text.strip()
    except:
        star_text = 'not rated'
    rating_dict = {'did not like it': 1,
                  'it was ok': 2,
                  'liked it': 3,
                  'really liked it': 4,
                  'it was amazing': 5,
                  'not rated': 0}
    try:
        review_score = rating_dict[star_text]
    except:
        review_score = star_text
        

    #usersoup looks at the user, and in particular how many friends and followers they have
    userurl =  'https://www.goodreads.com'+ soup.find('a', class_='userReview').get('href')    
    usersoup = BeautifulSoup(requests.get(userurl).text, 'lxml')
    user_followers= 0
    user_friends = 0
    for item in usersoup.find_all('a', rel='nofollow'):
        if 'Friends' in item.text:
            user_friends = item.text.split(' ')[-1].replace(',','').strip('()')
            if user_friends == 'Friends':
                user_friends = 0
        if 'people are' in item.text:
            user_followers = int(item.text.split(' ')[0])
    
    t1 = time.time()
    #print('Time %.3f :' % (t1-t0), book_title)
    return(pd.Series([url, book_title, author, reviewtext, shelves, review_date, review_score, words, paragraphs, images, likes, comments,
           username, userurl, user_friends, user_followers],
                    index = ['url', 'book_title', 'author','reviewtext', 'shelves','review_date', 'review_score', 
                                 'words', 'paragraphs', 'images', 'likes', 'comments',
                                 'username', 'userurl', 'user_friends', 'user_followers']))

#, times_reviewed, times_rated, average_rating, isbn, book_date, book_url

In [3]:
"""
input: driver, counter
returns: list of book_urls

        sel_scrape is a helper function used to grab book review links using Selenium.
        The page numbers at the bottom of a book page are dynamic Ajak objects, not hyperlinks
        next_find locates the 'next' button on the page
        review_finder returns 30 links
        the while loop below gives the page 20 seconds to fully load, and then scrapes the rest
        if next.click() doesn't do anything, the function returns
    
"""

def sel_scrape(driver, i):

    def next_find(driver):
        element = driver.find_element_by_link_text('next »')
        if element:
            return element

    def review_finder(driver):
        element = driver.find_elements_by_link_text('see review')
        if len(element)==30:
            return element
    
    t_end = time.time() + 20
    while time.time() < t_end:
        links = review_finder(driver)
        if links == False:
            links = driver.find_elements_by_link_text('see review')
        new_urls = []
        try:
            for item in links:
                new_urls.append(item.get_attribute('href').split('?')[0])
                print(i, end=' ')
                i = i +1
        except:
            pass
        if len(new_urls)==30:
            print('')
            break
    try:
        next_find(driver).click()
        time.sleep(3)
        driver.find_element_by_tag_name('body').send_keys(Keys.CONTROL + Keys.HOME)
    except:
        return (new_urls, 'END')
    return (new_urls, i)

In [4]:
"""
Inputs: A Goodreads book URL
        https://www.goodreads.com/book/show/11324722-the-righteous-mind
        pages: how many pages to scrape, an int 1-10
        
Returns: save a dataframe of review and book info to disk, as {Book Title} book.pkl
        if there is an error, return the book_url for re-scraping at a later date.
        Prints the book_url, number of reviews, and total time to terminal.

        
Process:  grab the first 30 reviews using Beautiful Soup.  
        Selenium is required to view the next pages of reviews.  By default, 
        this grabs a total of 90 reviews.  Reviews added to review_urls, a list which
        is processed using the multiprocessing.map function and review_scraper.  Series
        from review_scraper are assembled into a pandas DataFrame and pickled to disk.
        
        Multiprocessing gives an increase in speed equal to number of CPU cores.  
        On my laptop and with decent internet, scraping 300 reviews takes about 400 seconds.
        The function checks the duration between beginning and end.
"""        


def book_scraper(book_url, pages):
    t2 = time.time()
    booksoup = BeautifulSoup(requests.get(book_url).text, 'lxml')
    print(book_url)

    #first we grab all book review links visible on the page
    review_urls = []
    i = 0
            
    #we need to use Selenium to get reviews on pages 2 and 3
    chromedriver = "/usr/lib/chromium-browser/chromedriver" # path to the chromedriver executable
    os.environ["webdriver.chrome.driver"] = chromedriver

    driver = webdriver.Chrome(chromedriver)
    driver.get(book_url)
    
    for j in range(10):
        new_urls, i= sel_scrape(driver, i)
        review_urls = review_urls + new_urls
        if i == 'END':
            break
    driver.close()
    
    #quick filter to remove some none-review things Selenium throws in the list
    #and yes, there is a cool lambda function way to do this.  
    #and no, I didn't implement it past midnight Sunday
    filtered = []
    for item in review_urls:
        if 'https:' in item:
            filtered.append(item)
        
    review_urls = filtered
    print('aiming for %d reviews' % len(review_urls))

    #booksoup returns information about the book
    times_rated = int(booksoup.find('span', class_='votes value-title').text.strip().replace(',',''))
    times_reviewed = int(booksoup.find('span', class_='count value-title').text.strip().replace(',',''))
    average_rating = float(booksoup.find('span', class_='average').text)
    isbn =''
    for item in booksoup.find_all('div', class_='infoBoxRowItem'):
        try:
            a = item.text.strip().split('\n')[0]
            b = re.search(r'\w\w\w\w\w\w\w\w\w\w', a)
            if b != None:
                isbn = b[0]
        except: 
            pass 
        for item in booksoup.find_all('div', class_='row'):
            if 'Published' in item.text:
                a = item.text
                book_date = a.strip().split('\n')[1].strip()

    #df is a dataframe of book info scraped from 90 reviews
    #we initialize it with the columns that will be passed by review_scraper
    #make sure to update this and review_scraper output in sync
    df = pd.DataFrame(columns= ['url', 'book_title', 'author', 'reviewtext', 'shelves', 'review_date', 'review_score', 
                                 'words', 'paragraphs', 'images', 'likes', 'comments',
                                 'username', 'userurl', 'user_friends', 'user_followers'])
    
    times = []
    i = 1
    
    #we call review_scraper for every url on the page, passing it info about the book
    #pool is the most elegant way to multithread
    
    reviews = []
    try:
        pool = multi.Pool()
        reviews = pool.map(review_scraper, review_urls)
        pool.terminate()
    except:
        print('SCRAPE ERROR', book_url)
        return (book_url)

    
    filtered = []
    for item in reviews:
        if str(type(item)) != "<class 'NoneType'>":
            filtered.append(item)
        
    reviews = filtered
    
    t3 = time.time()
    print('REVIEWS: ', len(reviews), 'TOTAL TIME: ', t3-t2)
    
    #create a dataframe from the reviews, and add on book info
    df = pd.DataFrame.from_records(reviews)
    df['times_reviewed'] = times_reviewed
    df['times_rated'] =  times_rated
    df['average_rating'] =average_rating
    df['isbn'] = isbn
    df['book_date'] = book_date
    df['book_url']= book_url    
    
    #remove restricted characters from file name
    try:
        book_title = df.book_title[0]
    except:
        book_title = 'Bad Title'
    book_title = re.sub('\/|\\|\?|\%|\*|\:|\<|\>|\.|\"|\|', "", book_title)
    
    #and save as {title} book.pkl
    filename = book_title+' book.pkl'
    outfile = open(filename,'wb')
    pickle.dump(df,outfile)
    outfile.close()

In [6]:
"""
Goodreads.com list represent large collections of books liked by the community. 
For this prototype, I've left this code outside a function.  It takes a page of a list
(100 books) and adds the book urls to a set, which prevents duplicate scrapes
"""


list_urls = ['https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century',
            'https://www.goodreads.com/list/show/824.Best_Non_fiction_War_Books',
            'https://www.goodreads.com/list/show/134.Best_Non_Fiction_no_biographies_',
            'https://www.goodreads.com/list/show/3.Best_Science_Fiction_Fantasy_Books',
            'https://www.goodreads.com/list/show/397.Best_Paranormal_Romance_Series',
            'https://www.goodreads.com/list/show/348.Thrillers',
            'https://www.goodreads.com/list/show/5490.YA_Books_Far_Better_than_Twilight',
            'https://www.goodreads.com/list/show/29013.Best_Biographies',]

list_urls2 = ['https://www.goodreads.com/list/show/10942.Our_Favorite_Indie_Reads',
             'https://www.goodreads.com/list/show/16.Best_Books_of_the_19th_Century',
             'https://www.goodreads.com/list/show/1083.The_Most_Influential_Books_in_History',
             'https://www.goodreads.com/list/show/397.Best_Paranormal_Romance_Series?page=2',
             'https://www.goodreads.com/list/show/281.Best_Memoir_Biography_Autobiography',
             'https://www.goodreads.com/list/show/3.Best_Science_Fiction_Fantasy_Books?page=2',
             'https://www.goodreads.com/list/show/3.Best_Science_Fiction_Fantasy_Books?page=3',
             'https://www.goodreads.com/list/show/824.Best_Non_fiction_War_Books?page=2',
             'https://www.goodreads.com/list/show/6.Best_Books_of_the_20th_Century?page=2',
             'https://www.goodreads.com/list/show/952.1001_Books_You_Must_Read_Before_You_Die',
             'https://www.goodreads.com/list/show/952.1001_Books_You_Must_Read_Before_You_Die?page=2',
             'https://www.goodreads.com/list/show/134.Best_Non_Fiction_no_biographies_?page=2']

book_urls = set()

for thing in [list_urls2[5]]:
    listsoup = BeautifulSoup(requests.get(thing).text, 'lxml')
    for item in listsoup.find_all('a', class_='bookTitle'):
        book_urls.add('https://www.goodreads.com'+item.get('href'))

In [7]:
#load books already scraped

jar = []
df = pd.DataFrame()

for file in os.listdir():
    if file.split(' ')[-1]=='book.pkl':
        jar.append(file)

jar.append('Benson_Data_Sample.pkl')        

for file in jar:
    d = pickle.load(open(file, "rb" ))
    df = pd.concat([df,d], sort=False, ignore_index=True)

books_done = set(df.book_url.unique())

In [10]:
#example target
#note if cell output is zero, either change the url below to something not yet scraped
#or rerun the notebook and skip the cell above
targets = set(['https://www.goodreads.com/book/show/37976541-bad-blood'])

#alternatively, if you have several days
#targets = book_urls

errors =set()
targets = (targets-books_done).union(errors)-{None}
print('targets:', len(targets)

1

In [11]:
"""
This cell does the actual scraping
"""
errors = set()

for book in targets:
    errors.add(book_scraper(book, 10))
    books_done.add(book)

https://www.goodreads.com/book/show/37976541-bad-blood
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 
120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 
180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 
210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 
240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261

In [18]:
books_done

{'https://www.goodreads.com/book/show/100365.The_Mote_in_God_s_Eye',
 'https://www.goodreads.com/book/show/108942.Bad_Luck_and_Trouble',
 'https://www.goodreads.com/book/show/13831.Alanna',
 'https://www.goodreads.com/book/show/32067.Lady_Chatterley_s_Lover',
 'https://www.goodreads.com/book/show/32530.The_Third_Victim',
 'https://www.goodreads.com/book/show/350.Stranger_in_a_Strange_Land',
 'https://www.goodreads.com/book/show/45032.Mansfield_Park',
 'https://www.goodreads.com/book/show/6487308-fallen'}

In [20]:
errors

{None, 'https://www.goodreads.com/book/show/45032.Mansfield_Park'}