# Data Collection

Scrape book data from the Goodreads website. For each book, collect the title, author name, publication year, synopsis, average rating, number of ratings, and book length (page count).

## Import Necessary Libraries

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re

## Create a list of HTML files

In [2]:
# Names of the locally saved listing pages: romance1.html ... romance25.html
html_files = ['romance' + str(page_no) + '.html' for page_no in range(1, 26)]

## Define functions

In [3]:
# function to normalize rating, number of ratings, and release year
def norm_ratings(doc):
    """Reduce ``doc`` to just its digits and decimal points.

    ``doc`` is converted with ``str()`` first, so any object with a string
    form is accepted. Every character other than a digit or ``.`` is
    dropped (markup, words, thousands separators, whitespace), leaving the
    numeric fields run together in their original order for the caller to
    slice apart.

    Args:
        doc: Object whose string form contains the numbers to keep.

    Returns:
        str: The remaining digits and dots, or ``''`` if there are none.
    """
    # The previous pattern r'[^.\d+]' had a stray '+' inside the character
    # class, which let literal plus signs survive the cleanup. No whitespace
    # can remain after the substitution, so no strip() is needed.
    return re.sub(r'[^.\d]', '', str(doc))

In [4]:
# a function to get a book's synopsis and length
def get_synop_length(link, timeout=30):
    """Fetch one Goodreads book page and extract its synopsis and page count.

    Args:
        link: Site-relative URL of the book page (for example the ``href``
            of a book-title anchor). It is appended to the Goodreads base
            URL after conversion with ``str()``.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error, so a single stalled request cannot hang the run.

    Returns:
        tuple: ``(synopsis, length)``. ``synopsis`` is the description text,
        or ``None`` when no description span is found. ``length`` is the
        page count as a string of digits, or ``''`` when it is absent.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # request the html code for the individual book page
    base = 'http://www.goodreads.com'
    url = base + str(link)
    req = requests.get(url, timeout=timeout)
    soup2 = bs(req.text, 'html.parser')

    # get the book synopsis
    # The old pattern '[^freeText\d+]' was a negated character class: it
    # matched the first <span> whose id contained any character outside that
    # set, not ids of the form "freeText<digits>".
    # NOTE(review): assumes the full description sits in a span with id
    # "freeText<digits>" and the truncated one in
    # "freeTextContainer<digits>" -- confirm against a saved book page.
    synopsis = soup2.find('span', id=re.compile(r'^freeText\d+$'))
    if synopsis is None:
        synopsis = soup2.find('span', id=re.compile(r'^freeTextContainer\d+$'))
    if synopsis is not None:
        synopsis = synopsis.text

    # get the book length
    length = soup2.find('span', itemprop='numberOfPages')
    # Work on the element's text rather than its markup and keep digits only
    # (the old r'[^\d+]' class also let literal '+' characters through).
    length = re.sub(r'\D', '', length.text) if length is not None else ''

    return synopsis, length

## Loop through all the files and collect each book's fields into lists

In [5]:
# loop through each page of the Romance list
# get lists of titles, authors, ratings, number of ratings, and release years,
# then fetch every book's own page for its synopsis and length
book_list = []
author_list = []
rating_list = []
num_ratings_list = []
year_list = []
synopsis_list = []
length_list = []

for file_name in html_files:
    # Read the saved listing page. The context manager closes the file
    # handle, which the previous bare open() never did.
    with open(file_name, "r") as html_page:
        soup = bs(html_page.read(), 'html.parser')

    titles = soup.find_all(class_='bookTitle')
    authors = soup.find_all(class_='authorName')
    ratings_nums_years = soup.find_all(class_='greyText smallText')
    book_links = soup.find_all('a', class_='bookTitle', href=True)

    # Titles and authors are paired purely by position.
    # NOTE(review): assumes exactly one authorName element per bookTitle
    # element -- confirm, otherwise authors shift against titles.
    for x, title in enumerate(titles):
        book_list.append(title.text)
        author_list.append(authors[x].text)

    for stats in ratings_nums_years:
        # norm_ratings leaves only digits and dots, run together.
        rny = norm_ratings(stats)
        # Fixed-width slicing: first 4 characters are the rating, last 4 the
        # year, everything in between the number of ratings.
        # NOTE(review): assumes every entry has a rating written with four
        # characters and a four-digit year -- confirm against the saved pages.
        rating_list.append(float(rny[:4]))
        num_ratings_list.append(int(rny[4:-4]))
        year_list.append(int(rny[-4:]))

    # One HTTP request per book: this is the slow part of the notebook.
    for book_link in book_links:
        book_syn, book_len = get_synop_length(book_link['href'])
        synopsis_list.append(book_syn)
        length_list.append(book_len)

## Compile data in a data frame

In [6]:
# Gather the per-field lists into one mapping, one entry per output column
# (insertion order is the column order of the resulting frame).
data = dict([
    ('title', book_list),
    ('author', author_list),
    ('release year', year_list),
    ('synopsis', synopsis_list),
    ('book length', length_list),
    ('rating', rating_list),
    ('number of ratings', num_ratings_list),
])
book_df = pd.DataFrame(data=data)

# show the top 10 rows as a quick look at the result
book_df.head(n=10)

Unnamed: 0,title,author,release year,synopsis,book length,rating,number of ratings
0,Pride and Prejudice (Paperback),Jane Austen,1813,,,4.28,3732237
1,The Fault in Our Stars (Hardcover),John Green,2012,,,4.16,4501032
2,"Red, White & Royal Blue (Paperback)",Casey McQuiston,2019,,,4.16,607767
3,"Twilight (The Twilight Saga, #1)",Stephenie Meyer,2005,About three things I was absolutely positive.F...,498.0,3.63,5901197
4,The Hating Game (Paperback),Sally Thorne,2016,Nemesis (n.) 1) An opponent or rival whom a pe...,365.0,3.98,537504
5,Beach Read (Paperback),Emily Henry,2020,A romance writer who no longer believes in lov...,361.0,4.06,607628
6,The Kiss Quotient (Kindle Edition),Helen Hoang,2018,,,3.94,331121
7,It Ends with Us (Kindle Edition),Colleen Hoover,2016,,,4.38,1641804
8,The Love Hypothesis (Paperback),Ali Hazelwood,2021,"As a third-year Ph.D. candidate, Olive Smith d...",356.0,4.29,724017
9,The Unhoneymooners (Paperback),Christina Lauren,2019,,,3.98,525789


In [7]:
# Count the books whose synopsis is missing.
# isna() flags None as well as NaN, so the count stays correct even if pandas
# stores the missing values as NaN; the previous row-by-row `== None` check
# only caught literal None and iterated in Python.
count = int(book_df['synopsis'].isna().sum())
print(count)

605


## Save data to a csv file

In [9]:
# Write the scraped table to disk. With to_csv's defaults the row index is
# written too, as an unnamed first column.
book_df.to_csv(path_or_buf='books.csv')