In [453]:
import random
import time
import csv
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from dateutil.parser import parse
import numpy as np

In [454]:
def getTitle():
    """Return the dictionary key / CSV column name for the book title."""
    return 'title'


def getSeries():
    """Return the dictionary key / CSV column name for the series name."""
    return 'series'


def getSeriesPosition():
    """Return the key for the book's position within its series."""
    return 'seriesPosition'


def getSeriesNumBooks():
    """Return the key for the number of books in the series."""
    return 'seriesNumBooks'


def getAuthor():
    """Return the key for the author name."""
    return 'author'


def getAuthorNumFollowers():
    """Return the key for the author's follower count."""
    return 'authorNumFollowers'


def getRating():
    """Return the key for the book rating."""
    return 'rating'


def getNumRatings():
    """Return the key for the number of ratings."""
    return 'numRatings'


def getNumReviews():
    """Return the key for the number of reviews."""
    return 'numReviews'


def getNumOneStars():
    """Return the key for the count of one-star ratings."""
    return 'numOneStars'


def getNumTwoStars():
    """Return the key for the count of two-star ratings."""
    return 'numTwoStars'


def getNumThreeStars():
    """Return the key for the count of three-star ratings."""
    return 'numThreeStars'


def getNumFourStars():
    """Return the key for the count of four-star ratings."""
    return 'numFourStars'


def getNumFiveStars():
    """Return the key for the count of five-star ratings."""
    return 'numFiveStars'


def getPercOneStars():
    """Return the key for the percentage of one-star ratings."""
    return 'percOneStars'


def getPercTwoStars():
    """Return the key for the percentage of two-star ratings."""
    return 'percTwoStars'


def getPercThreeStars():
    """Return the key for the percentage of three-star ratings."""
    return 'percThreeStars'


def getPercFourStars():
    """Return the key for the percentage of four-star ratings."""
    return 'percFourStars'


def getPercFiveStars():
    """Return the key for the percentage of five-star ratings."""
    return 'percFiveStars'


def getNumPages():
    """Return the key for the page count."""
    return 'numPages'


def getPublishDate():
    """Return the key for the publish date."""
    return 'publishDate'


def getISBN():
    """Return the key for the ISBN."""
    return 'isbn'


def getISBN13():
    """Return the key for the ISBN13."""
    return 'isbn13'


def getASIN():
    """Return the key for the ASIN."""
    return 'asin'


def getPublisher():
    """Return the key for the publisher name."""
    return 'publisher'


def getNumAwards():
    """Return the key for the number of awards."""
    return 'numAwards'


def getAwards():
    """Return the key for the list of award names."""
    return 'awards'


def getGenres():
    """Return the key for the list of genre names."""
    return 'genres'


def getGenresCount():
    """Return the key for the list of per-genre counts."""
    return 'genresCount'

In [455]:
# Wait between browser actions
def wait(minSeconds=.5, maxExtraSeconds=2):
    """Sleep for a randomized interval.

    The pause lasts ``minSeconds`` plus a uniformly random fraction of
    ``maxExtraSeconds``. With the defaults this is between 0.5 and 2.5
    seconds, matching the previously hard-coded values.

    Args:
        minSeconds: Fixed lower bound of the pause, in seconds.
        maxExtraSeconds: Upper bound of the random extra delay, in seconds.
    """
    time.sleep(minSeconds + maxExtraSeconds * random.random())

In [456]:
def getFieldNames():
    """Return the ordered list of column names used for data.csv.

    The order of the getters below fixes the column order of the CSV file.
    A new list is built on every call.
    """
    columnGetters = (
        getTitle, getSeries, getSeriesPosition, getSeriesNumBooks,
        getAuthor, getAuthorNumFollowers, getRating, getNumRatings,
        getNumReviews,
        getNumOneStars, getNumTwoStars, getNumThreeStars,
        getNumFourStars, getNumFiveStars,
        getPercOneStars, getPercTwoStars, getPercThreeStars,
        getPercFourStars, getPercFiveStars,
        getNumPages, getPublishDate, getISBN, getISBN13, getASIN,
        getPublisher, getNumAwards, getAwards, getGenres, getGenresCount,
    )
    return [getter() for getter in columnGetters]

In [457]:
def addHeadersToFile():
    """Create (or truncate) data.csv and write the CSV header row.

    The file is opened in 'w' mode, so any existing content is discarded.
    The encoding is pinned to UTF-8 rather than left to the platform
    default, which is also what pandas.read_csv assumes when no encoding
    is given.
    """
    with open('data.csv', 'w', newline='', encoding='utf-8') as csvFile:
        headerWriter = csv.DictWriter(csvFile, fieldnames=getFieldNames())
        headerWriter.writeheader()

In [458]:
def appendToFile(dictionary):
    """Append one scraped book as a row of data.csv.

    Args:
        dictionary: Mapping from column name (see getFieldNames) to value,
            or None when scraping the book was abandoned.

    Returns:
        True if a row was written; False if ``dictionary`` is None or does
        not hold exactly as many entries as there are columns.

    Raises:
        ValueError: Raised by csv.DictWriter when ``dictionary`` contains a
            key that is not one of the field names.
    """
    fieldNames = getFieldNames()

    # Validate before touching the file so a rejected row has no side effects
    # (previously the file was opened, and created if missing, first).
    if dictionary is None or len(dictionary) != len(fieldNames):
        return False

    # Explicit UTF-8: scraped text may contain characters the platform default
    # codec cannot encode, and pandas.read_csv reads UTF-8 by default.
    with open('data.csv', 'a', newline='', encoding='utf-8') as csvFile:
        rowWriter = csv.DictWriter(csvFile, fieldnames=fieldNames)
        rowWriter.writerow(dictionary)
    return True

In [459]:
def readFile():
    """Load data.csv from the current working directory as a DataFrame."""
    return pd.read_csv('data.csv')

In [460]:
# Scraping Books. Currently works when browser opened to page n. Should be easy to
# extend to automatically iterate through pages
def _parseRatingRow(cellText):
    """Split the text of one rating-distribution cell into (percent, count).

    The text is expected to have a percentage such as '45%' as its first
    whitespace-separated token and a parenthesised count such as '(1234)'
    as its second token.
    """
    tokens = cellText.split()
    # Fetch both tokens before converting so a missing token is reported
    # before a malformed one.
    percentToken, countToken = tokens[0], tokens[1]
    percent = int(percentToken.replace('%', ''))
    count = int(countToken.replace('(', '').replace(')', ''))
    return percent, count


def getBookInfo(driver, dictionary):
    """Scrape the book page currently loaded in ``driver`` into ``dictionary``.

    Fills the title, series, author, rating, star distribution, page count,
    publish date, publisher, ISBN/ISBN13, ASIN and awards entries. Clicks the
    'rating details', '...more' details and '...more' awards controls as
    needed, pausing via wait() after each click.

    Args:
        driver: Selenium WebDriver positioned on a book page.
        dictionary: Dict that is updated in place with the scraped values.

    Returns:
        The same ``dictionary`` on success, or None when the series size or
        the position in the series cannot be parsed as an integer. In that
        case ``dictionary`` may already hold some of the entries.

    Raises:
        NoSuchElementException: If an element looked up outside the
            try-blocks below (title, author, rating, ...) is missing.
    """
    # Get book title
    title = driver.find_element(By.ID, 'bookTitle')
    dictionary[getTitle()] = title.text

    try:
        # Get series name
        series = driver.find_element(
            By.XPATH, "//div[@class='seriesList']/div[1]/div[2]/div[1]")
        seriesName = series.find_element(By.XPATH, "./span[1]/a[1]")
        dictionary[getSeries()] = seriesName.text

        # Get number of books in series: take the text after the first '(',
        # and if it contains 'of', the part after 'of'.
        seriesNumBooksParts = series.text.split('(')[1].split('of')
        if len(seriesNumBooksParts) == 1:
            seriesNumBooks = seriesNumBooksParts[0]
        else:
            seriesNumBooks = seriesNumBooksParts[1]
        seriesNumBooks = seriesNumBooks.replace(')', '').strip()
        dictionary[getSeriesNumBooks()] = int(seriesNumBooks.split()[0])

        # Get position of book in series (the text after the single '#')
        seriesLink = driver.find_element(By.XPATH, "//h2[@id='bookSeries']/a")
        seriesPositionParts = seriesLink.text.split('#')
        if len(seriesPositionParts) != 2:
            return None
        # A non-integer position raises ValueError, handled below.
        seriesPosition = seriesPositionParts[1].replace(')', '').strip()
        dictionary[getSeriesPosition()] = int(seriesPosition)
    except NoSuchElementException:
        # No series markup found: record the book as a one-book series.
        dictionary[getSeries()] = np.nan
        dictionary[getSeriesNumBooks()] = 1
        dictionary[getSeriesPosition()] = 1
    except ValueError:
        return None

    # Get name of author
    author = driver.find_element(By.XPATH, "//a[@class='authorName']")
    dictionary[getAuthor()] = author.text

    # Get number of followers of author
    followerCount = driver.find_element(
        By.XPATH, "//div[@class='bookAuthorProfile__followerCount']")
    followerCount = followerCount.text.split()[0]
    dictionary[getAuthorNumFollowers()] = int(followerCount.replace(',', ''))

    # Get book rating
    rating = driver.find_element(By.XPATH, "//span[@itemprop='ratingValue']")
    dictionary[getRating()] = float(rating.text)

    # Get number of ratings
    numRatings = driver.find_element(By.XPATH, "//div[@id='bookMeta']/a[2]")
    numRatings = numRatings.text.split()[0]
    dictionary[getNumRatings()] = int(numRatings.replace(',', ''))

    # Get number of reviews
    numReviews = driver.find_element(By.XPATH, "//div[@id='bookMeta']/a[3]")
    numReviews = numReviews.text.split()[0]
    dictionary[getNumReviews()] = int(numReviews.replace(',', ''))

    # Click on 'Rating details' to get more information
    ratingDetailsButton = driver.find_element(By.XPATH, "//a[@id='rating_details']")
    ratingDetailsButton.click()
    wait()

    # Get number and percentage for each star level. Table row 5 is read as
    # the 1-star row and row 1 as the 5-star row.
    starRows = (
        (5, getNumOneStars(), getPercOneStars()),
        (4, getNumTwoStars(), getPercTwoStars()),
        (3, getNumThreeStars(), getPercThreeStars()),
        (2, getNumFourStars(), getPercFourStars()),
        (1, getNumFiveStars(), getPercFiveStars()),
    )
    for rowIndex, numKey, percKey in starRows:
        cell = driver.find_element(
            By.XPATH,
            "//table[@id='rating_distribution']/tbody[1]/tr[%d]/td[2]" % rowIndex)
        percent, count = _parseRatingRow(cell.text)
        dictionary[numKey] = count
        dictionary[percKey] = percent

    # Click on 'Rating details' to close more information
    ratingDetailsButton.click()
    wait()

    try:
        # Get number of pages in book
        numPages = driver.find_element(By.XPATH, "//span[@itemprop='numberOfPages']")
        dictionary[getNumPages()] = int(numPages.text.split()[0])
    except NoSuchElementException:
        dictionary[getNumPages()] = 0

    # Get Publisher and publish date
    try:
        details = driver.find_element(By.XPATH, "//div[@id='details']/div[2]")
        details_list = details.text.split('(')
        if len(details_list) == 1:
            dateText = details_list[0]
        else:
            # When a parenthesised part exists, the date is taken from it.
            dateText = details_list[1].replace(')', '')
        dictionary[getPublishDate()] = parse(dateText, fuzzy=True)

        # The publisher is whatever follows the standalone word 'by'.
        # Matching whole words avoids cutting inside names that merely
        # contain the letters 'by'. Runs of whitespace inside the name are
        # collapsed to single spaces.
        words = details_list[0].split()
        if 'by' in words:
            dictionary[getPublisher()] = ' '.join(words[words.index('by') + 1:])
        else:
            dictionary[getPublisher()] = np.nan
    except (ValueError, NoSuchElementException):
        dictionary[getPublishDate()] = np.nan
        dictionary[getPublisher()] = np.nan

    # Click on '...more' if only partial data is visible
    moreDetailsButton = driver.find_element(By.XPATH, "//a[@id='bookDataBoxShow']")
    if moreDetailsButton.is_displayed():
        moreDetailsButton.click()
        wait()

    try:
        # Get book ISBN and ISBN13
        isbn_data = driver.find_element(
            By.XPATH, "//div[contains(text(), 'ISBN')]/following-sibling::div")
        isbn_list = isbn_data.text.split('(')

        dictionary[getISBN()] = isbn_list[0].strip()
        if len(isbn_list) == 1:
            dictionary[getISBN13()] = np.nan
        else:
            # The parenthesised part is read as '<label>: <isbn13>)'.
            isbn13_list = isbn_list[1].split(':')
            dictionary[getISBN13()] = isbn13_list[1].replace(')', '').strip()
    except NoSuchElementException:
        dictionary[getISBN()] = np.nan
        dictionary[getISBN13()] = np.nan

    try:
        # Get book ASIN
        asin_data = driver.find_element(
            By.XPATH, "//div[contains(text(), 'ASIN')]/following-sibling::div")
        dictionary[getASIN()] = asin_data.text.strip()
    except NoSuchElementException:
        dictionary[getASIN()] = np.nan

    awardsXpath = "//div[contains(text(), 'Literary Awards')]/following-sibling::div"

    try:
        # Click on '...more' in Awards if all awards not visible
        awards_data = driver.find_element(By.XPATH, awardsXpath)
        more_rewards_button = awards_data.find_element(By.XPATH, ".//span[1]/a[1]")

        if more_rewards_button.is_displayed():
            more_rewards_button.click()
            wait()
    except NoSuchElementException:
        # Best effort: no awards section or no expander means nothing to open.
        pass

    try:
        # Get list of awards and number of awards. The container is looked up
        # again after the optional click above.
        awards_data = driver.find_element(By.XPATH, awardsXpath)
        # './/' keeps the search inside the awards container; a leading '//'
        # would search the whole document.
        awards_list = awards_data.find_elements(By.XPATH, ".//a[@class='award']")
        awards = [award.text.split('(')[0].strip() for award in awards_list]

        dictionary[getNumAwards()] = len(awards_list)
        dictionary[getAwards()] = awards
    except NoSuchElementException:
        dictionary[getNumAwards()] = 0
        dictionary[getAwards()] = np.nan

    # Return dictionary object
    return dictionary

In [461]:
def getBookGenres(driver, dictionary):
    """Collect genre names and their counts from the page loaded in ``driver``.

    Every ``div`` with class 'shelfStat' inside a ``div`` with class 'left'
    contributes one genre: its first child ``div`` supplies the name and the
    first whitespace-separated token of its second child ``div`` supplies
    the count (thousands separators are removed).

    Args:
        driver: Selenium WebDriver positioned on the page to read.
        dictionary: Dict that is updated in place with the genre lists.

    Returns:
        The same ``dictionary``, with two parallel lists stored under the
        genres and genresCount keys.
    """
    genres = []
    genresCount = []

    # Uses find_element(s)(By.XPATH, ...) because the find_element(s)_by_xpath
    # helpers are no longer available in current Selenium releases.
    for table in driver.find_elements(By.XPATH, "//div[@class='left']"):
        for shelf in table.find_elements(By.XPATH, ".//div[@class='shelfStat']"):
            genreName = shelf.find_element(By.XPATH, ".//div[1]").text
            countText = shelf.find_element(By.XPATH, ".//div[2]").text
            numPeople = int(countText.split()[0].replace(',', '').strip())
            genres.append(genreName)
            genresCount.append(numPeople)

    dictionary[getGenres()] = genres
    dictionary[getGenresCount()] = genresCount
    return dictionary