In [3]:
import json

# Load the Goodreads API credentials from a local JSON file so the key never
# appears in the notebook itself. The encoding is given explicitly so the file
# is decoded the same way on every platform.
with open("data/credentials.json", "r", encoding="utf-8") as file:
    credentials = json.load(file)

# The file is only needed for the load above; the lookups below read
# credentials["goodreads"]["USER_ID"] and credentials["goodreads"]["API_KEY"].
goodreads_cr = credentials['goodreads']
user_id = goodreads_cr['USER_ID']
api_key = goodreads_cr['API_KEY']

In [1]:
# Dependencies
import urllib.parse
import urllib.request
from collections import OrderedDict

import numpy as np, string, re, pytz
import pandas as pd
import xmltodict

In [2]:
def get_books_url(api_key, user_id, page, per_page, shelf_name=None):
    """Build the Goodreads ``review/list`` API URL for one page of a user's books.

    Args:
        api_key: Goodreads developer key, sent as the ``key`` query parameter.
        user_id: Goodreads user id; converted with ``str()`` and placed in the
            URL path.
        page: Page number to request.
        per_page: Number of reviews requested per page.
        shelf_name: Optional shelf to restrict the listing to. When ``None``
            (the default) the ``shelf`` parameter is left out of the query.

    Returns:
        str: The request URL including the url-encoded query string.
    """
    base_url = "https://www.goodreads.com/review/list/"
    params = {"key": api_key, "v": 2}
    # The shelf used to come from a global named ``shelf_name`` that is never
    # defined in this notebook, so the function raised NameError on a fresh
    # kernel. It is now an explicit, optional argument.
    if shelf_name is not None:
        params["shelf"] = shelf_name
    params["page"] = page
    params["per_page"] = per_page
    return base_url + str(user_id) + ".xml?" + urllib.parse.urlencode(params)

def get_books_dict(**kwargs):
    """Download a user's reviews ("books") from the Goodreads API page by page.

    Keyword Args:
        api_key: Goodreads developer key, passed on to ``get_books_url``.
        user_id: Goodreads user id, passed on to ``get_books_url``.
        user_info: Mapping with ``"user_name"`` and ``"books_total"`` entries;
            ``books_total`` decides when the download is finished.
        per_page: Optional number of reviews per request; defaults to 200.

    Returns:
        list: One entry per downloaded page. Each entry is a list of the
        review mappings produced by ``xmltodict`` for that page.
    """
    user_info = kwargs["user_info"]
    books_total = user_info["books_total"]

    print("Collecting %s books via API for username \"%s\", be patient!" % (books_total, user_info["user_name"]))

    per_page = kwargs.get("per_page", 200)
    books_collected, page, pages = 0, 1, []

    while books_collected < books_total:
        books_url = get_books_url(kwargs["api_key"], kwargs["user_id"], page, per_page)
        # Close the HTTP response as soon as the payload has been read.
        with urllib.request.urlopen(books_url) as response:
            books_data = response.read()

        reviews_node = xmltodict.parse(books_data)["GoodreadsResponse"]["reviews"] or {}
        page_reviews = reviews_node.get("review") or []
        # xmltodict returns a single mapping (not a list) when an element
        # occurs only once, so a one-review page must be wrapped to be counted
        # as one book rather than as the number of keys in that mapping.
        if not isinstance(page_reviews, list):
            page_reviews = [page_reviews]

        # An empty page means the API has nothing more to return; stop here
        # instead of requesting further pages forever.
        if not page_reviews:
            print("No more books returned by the API, stopping at %s/%s" % (books_collected, books_total))
            break

        pages.append(page_reviews)
        books_collected += len(page_reviews)
        print("Books collected = %s/%s" % (books_collected, books_total))
        page += 1

    print("Book collection complete")
    return pages

def get_user_info_url(api_key, user_id):
    """Return the Goodreads ``user/show`` API URL for ``user_id``.

    The developer key is url-encoded and sent as the ``key`` query parameter.
    """
    query_string = urllib.parse.urlencode({"key": api_key})
    url_parts = ["https://www.goodreads.com/user/show/", str(user_id), ".xml?", query_string]
    return "".join(url_parts)

def get_user_info(api_key, user_id):
    """Fetch a user's name and total review count from the Goodreads API.

    Args:
        api_key: Goodreads developer key.
        user_id: Goodreads user id.

    Returns:
        dict: ``{"user_name": ..., "books_total": int}`` where ``books_total``
        is the integer value of the response's ``reviews_count`` element.
    """
    user_info_url = get_user_info_url(api_key, user_id)
    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(user_info_url) as response:
        user_info_data = response.read()

    user = xmltodict.parse(user_info_data)["GoodreadsResponse"]["user"]

    reviews_count = user["reviews_count"]
    # xmltodict wraps the text in a mapping under "#text" only when the element
    # carries attributes; accept the plain-string form as well.
    if isinstance(reviews_count, dict):
        reviews_count = reviews_count["#text"]

    return {
        "user_name": user["user_name"],
        "books_total": int(reviews_count),
    }

def get_shelves(book):
    """Return the ``shelf`` entry stored under ``book["shelves"]``."""
    shelves_node = book["shelves"]
    return shelves_node["shelf"]

def format_shelf_names(shelves):
    """Turn the parsed ``shelf`` data of a review into a readable string.

    Args:
        shelves: Either a single shelf mapping, a list of shelf mappings, or
            ``None``. Shelf names are read from each mapping's ``"@name"`` key.

    Returns:
        str: The single shelf name, or all names joined with ``", "``. Shelves
        in a list that lack ``"@name"`` are skipped. ``None`` gives ``""``.
    """
    if shelves is None:
        return ""
    # With one shelf, xmltodict yields a single mapping instead of a list.
    # Depending on the xmltodict/Python version that mapping is an OrderedDict
    # or a plain dict; checking for dict covers both (OrderedDict subclasses
    # dict), whereas checking only for OrderedDict let plain dicts fall through
    # to the loop below and fail.
    if isinstance(shelves, dict):
        return shelves["@name"]
    return ", ".join(
        shelf["@name"] for shelf in shelves if "@name" in shelf
    )

def get_author(review):
    """Return the author name(s) of the book attached to ``review``.

    Args:
        review: Parsed review mapping; the value at
            ``review["book"]["authors"]["author"]`` is read.

    Returns:
        The ``"name"`` of the author when there is a single author mapping, or
        the names joined with ``", "`` when the author entry is a list.
    """
    authors = review["book"]["authors"]["author"]
    # xmltodict turns repeated elements into a list, so a book with several
    # <author> elements cannot be indexed with ["name"] directly.
    if isinstance(authors, list):
        return ", ".join(author["name"] for author in authors)
    return authors["name"]

def get_title(review):
    """Return the ``title`` field of the book attached to ``review``."""
    book = review["book"]
    return book["title"]

def get_title_without_series(review):
    """Return the ``title_without_series`` field of the review's book."""
    book = review["book"]
    return book["title_without_series"]

def get_image_url(review):
    """Return the ``image_url`` field of the book attached to ``review``."""
    book = review["book"]
    return book["image_url"]

def get_num_pages(review):
    """Return the ``num_pages`` field of the book attached to ``review``, unconverted."""
    book = review["book"]
    return book["num_pages"]

def get_publication_year(review):
    """Return the ``publication_year`` field of the review's book, unconverted."""
    book = review["book"]
    return book["publication_year"]

def get_average_rating(review):
    """Return the ``average_rating`` field of the review's book, unconverted."""
    book = review["book"]
    return book["average_rating"]

In [6]:
# Look up the account's user name and total book count, then display them.
user_info = get_user_info(api_key=api_key, user_id=user_id)
user_info

{'user_name': 'waagles', 'books_total': 995}

In [7]:
# Download all of the user's books, one API page per list entry.
books_dict_list = get_books_dict(
    api_key=api_key,
    user_id=user_id,
    user_info=user_info,
)

Collecting 995 books via API for username "waagles", be patient!
Books collected = 200/995
Books collected = 400/995
Books collected = 600/995
Books collected = 800/995
Books collected = 995/995
Book collection complete


In [8]:
# Combine the per-page review lists into one DataFrame.
books_to_concat = [pd.DataFrame(item) for item in books_dict_list]

# ignore_index=True gives the combined frame one continuous 0..n-1 index.
# Without it every page keeps its own 0-based labels, so the same label would
# refer to several different rows.
books = pd.concat(books_to_concat, ignore_index=True)
books.tail()

In [9]:
# Pull the nested per-book fields out of every downloaded review and attach
# them to `books` as flat columns, then convert the date and numeric columns.

# Flatten the pages so the reviews are visited in download order, which is
# the row order of `books`.
all_reviews = [review for page_reviews in books_dict_list for review in page_reviews]

shelves = [format_shelf_names(get_shelves(review)) for review in all_reviews]
authors = [get_author(review) for review in all_reviews]
titles = [get_title(review) for review in all_reviews]
titles_without_series = [get_title_without_series(review) for review in all_reviews]
images_urls = [get_image_url(review) for review in all_reviews]
nums_pages = [get_num_pages(review) for review in all_reviews]
publication_years = [get_publication_year(review) for review in all_reviews]
average_ratings = [get_average_rating(review) for review in all_reviews]

# New columns, added in this order.
extracted_columns = [
    ("shelves", shelves),
    ("author", authors),
    ("title", titles),
    ("title_without_series", titles_without_series),
    ("image_url", images_urls),
    ("num_page", nums_pages),
    ("publication_year", publication_years),
    ("average_rating", average_ratings),
]
for column_name, column_values in extracted_columns:
    books[column_name] = column_values

# type addition
for date_column in ("started_at", "read_at", "date_added", "date_updated"):
    books[date_column] = pd.to_datetime(books[date_column])

for numeric_column in ("rating", "average_rating", "read_count", "num_page"):
    books[numeric_column] = pd.to_numeric(books[numeric_column])

In [10]:
# The nested "book" data is no longer needed, so the column is dropped.
# Rebinding `books` (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second run finds no "book" column and
# simply leaves the frame unchanged instead of raising KeyError.
books = books.drop(columns=["book"], errors="ignore")

In [11]:
# Number of rows (one per book) in the combined table.
len(books.index)

995

In [69]:
# Write the table to disk without the index column, then confirm.
books.to_csv(path_or_buf="data/books.csv", index=False)
print("books.csv created")

books.csv created


In [71]:
# Reload the saved table from disk. No parse_dates argument is passed, so the
# date columns are not converted to datetimes by this call.
books = pd.read_csv("data/books.csv")

In [72]:
# Keep only the first 10 characters of each timestamp string
# (presumably the YYYY-MM-DD date part; confirm against the CSV contents).
for date_column in ('started_at', 'read_at', 'date_added', 'date_updated'):
    books[date_column] = books[date_column].str[:10]

In [73]:
# Convert the date columns to pandas datetimes.
for date_column in ('started_at', 'read_at', 'date_added', 'date_updated'):
    books[date_column] = pd.to_datetime(books[date_column])

In [74]:
# Make sure the count and rating columns hold numeric values.
for numeric_column in ('rating', 'average_rating', 'read_count', 'num_page'):
    books[numeric_column] = pd.to_numeric(books[numeric_column])

In [79]:
# Disabled debugging snippet, kept as a bare string literal so it is not
# executed: it would display the whole 'read_at' column with the row limit
# raised to 1000. As the cell's only expression, the string itself is what the
# cell displays.
'''
from IPython.display import display
with pd.option_context('display.max_rows', 1000):
    display(books['read_at'])
'''

"\nfrom IPython.display import display\nwith pd.option_context('display.max_rows', 1000):\n    display(books['read_at'])\n"

In [75]:
# Print the column dtypes and non-null counts to check the conversions.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    995 non-null    int64         
 1   rating                995 non-null    int64         
 2   votes                 995 non-null    int64         
 3   spoiler_flag          995 non-null    bool          
 4   spoilers_state        995 non-null    object        
 5   shelves               995 non-null    object        
 6   recommended_for       0 non-null      float64       
 7   recommended_by        0 non-null      float64       
 8   started_at            48 non-null     datetime64[ns]
 9   read_at               105 non-null    datetime64[ns]
 10  date_added            995 non-null    datetime64[ns]
 11  date_updated          995 non-null    datetime64[ns]
 12  read_count            995 non-null    int64         
 13  body                

In [78]:
# Overwrite the saved table with the converted columns (no index column).
books.to_csv(path_or_buf="data/books.csv", index=False)
print("books.csv created")

books.csv created
