# Goodreads Books Read Downloader

A simple notebook to download and process a user's books from Goodreads.com, especially the books you have read.

----

## Acknowledgement 

* Still a work in progress. 
* This code is heavily indebted to, and adapted from, https://github.com/nladwa/goodreads-api

-----

## Authentication

* Get your user id and API Key from Goodreads.com.
* Copy credentials-sample.json to credentials.json
* Add your user id and api key and save.

In [1]:
import json

# Read the Goodreads user id and API key from the local credentials file.
# Only the JSON parsing needs the open file handle; the lookups happen afterwards.
with open("credentials.json", "r") as file:
    credentials = json.load(file)

goodreads_cr = credentials['goodreads']
user_id, api_key = goodreads_cr['USER_ID'], goodreads_cr['API_KEY']

---

In [2]:
import urllib.parse
import urllib.request
from collections import OrderedDict
from pathlib import Path

import xmltodict

In [3]:
import numpy as np, string, re, pytz
import pandas as pd

----

## Helper Functions

In [4]:
def get_books_url(api_key, user_id, page, per_page):
    """Build the Goodreads review-list URL for one page of a user's books.

    The query string carries, in order, the API key, the API version (2),
    the page number and the page size.
    """
    query = urllib.parse.urlencode([
        ("key", api_key),
        ("v", 2),
        # ("shelf", shelf_name),
        ("page", page),
        ("per_page", per_page),
    ])
    return "https://www.goodreads.com/review/list/" + str(user_id) + ".xml?" + query

def get_books_dict(**kwargs):
    """Download all of a user's reviews (books), one API page at a time.

    Keyword Args:
        api_key: Goodreads API key.
        user_id: Goodreads user id.
        user_info: dict with "books_total" (int) and "user_name", in the
            shape returned by get_user_info().
        per_page: optional page size; defaults to 200.

    Returns:
        A list with one entry per downloaded page. Each entry is a list of
        review dicts as parsed by xmltodict.
    """
    user_info = kwargs["user_info"]
    books_total = user_info["books_total"]
    per_page = kwargs.get("per_page", 200)

    print("Collecting %s books via API for username \"%s\", be patient!" % (books_total, user_info["user_name"]))

    books_collected, page, requests = 0, 1, []  # initialise variables

    while books_collected < books_total:
        books_url = get_books_url(kwargs["api_key"], kwargs["user_id"], page, per_page)
        # Close the HTTP response as soon as the payload has been read.
        with urllib.request.urlopen(books_url) as response:
            books_data = response.read()

        reviews_node = xmltodict.parse(books_data)["GoodreadsResponse"]["reviews"] or {}
        page_reviews = reviews_node.get("review") or []
        # xmltodict yields a single mapping (not a list) when the page holds
        # exactly one review; normalise so every page is a list of reviews.
        if isinstance(page_reviews, dict):
            page_reviews = [page_reviews]
        if not page_reviews:
            # The API returned fewer reviews than the advertised total;
            # stop instead of requesting empty pages forever.
            break

        requests.append(page_reviews)
        books_collected += len(page_reviews)
        print("Books collected = %s/%s" % (books_collected, books_total))
        page += 1
    print("Book collection complete")
    return requests

def get_user_info_url(api_key, user_id):
    """Build the Goodreads "user/show" XML URL for ``user_id``.

    The API key is the only query parameter.
    """
    query = urllib.parse.urlencode({"key": api_key})
    return "https://www.goodreads.com/user/show/" + str(user_id) + ".xml?" + query

def get_user_info(api_key, user_id):
    """Fetch a user's name and total review count from the Goodreads API.

    Args:
        api_key: Goodreads API key.
        user_id: Goodreads user id.

    Returns:
        dict with "user_name" (as found in the response) and "books_total"
        (the response's reviews_count converted to int).
    """
    user_info_url = get_user_info_url(api_key, user_id)
    # Use a context manager so the HTTP response is always closed.
    with urllib.request.urlopen(user_info_url) as response:
        user_info_data = response.read()

    user = xmltodict.parse(user_info_data)["GoodreadsResponse"]["user"]
    return {
        "user_name": user["user_name"],
        "books_total": int(user["reviews_count"]["#text"]),
    }

def get_shelves(book):
    """Return the raw ``shelf`` entry nested under ``book["shelves"]``."""
    shelves_node = book["shelves"]
    return shelves_node["shelf"]

def format_shelf_names(shelves):
    """Turn the parsed shelf entry (or entries) into a comma-separated string.

    Args:
        shelves: a single shelf mapping, a list of shelf mappings, or
            None/empty when there are no shelves.

    Returns:
        The shelf names joined with ", " ("" when there are none). List
        entries without an "@name" key are skipped.
    """
    if not shelves:
        return ""
    # A lone shelf is parsed as one mapping rather than a list. Check for
    # `dict` (OrderedDict is a subclass) because newer xmltodict releases
    # return plain dicts, which the OrderedDict-only check missed.
    if isinstance(shelves, dict):
        return shelves["@name"]
    return ", ".join(shelf["@name"] for shelf in shelves if "@name" in shelf)

def get_author(review):
    """Return the author name(s) of the reviewed book.

    Returns the single author's name, or the names joined with ", " when
    the parsed ``author`` entry is a list (several authors).
    """
    author = review["book"]["authors"]["author"]
    # Repeated <author> elements are parsed as a list; indexing that list
    # with "name" would raise a TypeError.
    if isinstance(author, list):
        return ", ".join(entry["name"] for entry in author)
    return author["name"]

def get_title(review):
    """Return the value stored under ``review["book"]["title"]``."""
    book = review["book"]
    return book["title"]

def get_title_without_series(review):
    """Return the value stored under ``review["book"]["title_without_series"]``."""
    book = review["book"]
    return book["title_without_series"]

def get_image_url(review):
    """Return the value stored under ``review["book"]["image_url"]``."""
    book = review["book"]
    return book["image_url"]

def get_num_pages(review):
    """Return the value stored under ``review["book"]["num_pages"]``."""
    book = review["book"]
    return book["num_pages"]

def get_publication_year(review):
    """Return the value stored under ``review["book"]["publication_year"]``."""
    book = review["book"]
    return book["publication_year"]

def get_average_rating(review):
    """Return the value stored under ``review["book"]["average_rating"]``."""
    book = review["book"]
    return book["average_rating"]

---

## Get User's Book Collection in GoodReads

In [5]:
# Look up the user's name and total number of reviews before downloading.
user_info = get_user_info(api_key=api_key, user_id=user_id)
user_info

{'books_total': 1307, 'user_name': 'markwkoester'}

In [6]:
# get user's books: one list of reviews per downloaded page
books_dict_list = get_books_dict(
    api_key=api_key,
    user_id=user_id,
    user_info=user_info,
)

Collecting 1307 books via API for username "markwkoester", be patient!
Books collected = 200/1307
Books collected = 400/1307
Books collected = 600/1307
Books collected = 800/1307
Books collected = 1000/1307
Books collected = 1200/1307
Books collected = 1307/1307
Book collection complete


In [7]:
# combine books into df
# Build one DataFrame per downloaded page and stack them into a single frame.
# ignore_index=True gives every book a unique row label; without it each page
# restarts at 0 and the combined index contains duplicate labels.
books = pd.concat(
    [pd.DataFrame(page) for page in books_dict_list],
    ignore_index=True,
)
# books.tail()

In [8]:
# more processing and adding additional info
# Flatten the per-page lists into a single sequence of reviews.
reviews = [review for page in books_dict_list for review in page]

# Values pulled out of each nested review, keyed by the column they populate.
extracted_columns = {
    "shelves": [format_shelf_names(get_shelves(review)) for review in reviews],
    "author": [get_author(review) for review in reviews],
    "title": [get_title(review) for review in reviews],
    "title_without_series": [get_title_without_series(review) for review in reviews],
    "image_url": [get_image_url(review) for review in reviews],
    "num_page": [get_num_pages(review) for review in reviews],
    "publication_year": [get_publication_year(review) for review in reviews],
    "average_rating": [get_average_rating(review) for review in reviews],
}
for column_name, values in extracted_columns.items():
    books[column_name] = values

# type addition
for date_column in ("started_at", "read_at", "date_added", "date_updated"):
    books[date_column] = pd.to_datetime(books[date_column])

for numeric_column in ("rating", "average_rating", "read_count", "num_page"):
    books[numeric_column] = pd.to_numeric(books[numeric_column])

In [9]:
# The nested "book" payload has already been unpacked into its own columns,
# so it is no longer needed.
books.drop(columns=["book"], inplace=True)

In [10]:
# books.columns

In [11]:
# total books (number of rows in the frame)
books.shape[0]

1307

-----

## Additional Columns to Break Down the Date Read

In [20]:
# TODO
# Unfortunately, the functions below currently fail because many fields contain NaT or missing data

In [13]:
# functions to convert UTC to Shanghai time zone and extract date/time elements
#convert_tz = lambda x: x.to_pydatetime().replace(tzinfo=pytz.utc).astimezone(pytz.timezone('Asia/Shanghai'))
#get_year = lambda x: convert_tz(x).year
#get_month = lambda x: '{}-{:02}'.format(convert_tz(x).year, convert_tz(x).month) #inefficient
#get_day = lambda x: convert_tz(x).day
#get_hour = lambda x: convert_tz(x).hour
#get_day_of_week = lambda x: convert_tz(x).weekday()

In [14]:
# parse out date and time elements as Shanghai time
#books['ReadYear'] = books['read_at'].map(get_year)
#books['ReadMonth'] = books['read_at'].map(get_month)
#books['ReadDay'] = books['read_at'].map(get_day)
#books['ReadHour'] = books['read_at'].map(get_hour)
#books['ReadDOW'] = books['read_at'].map(get_day_of_week)
# past_tasks = past_tasks.drop(labels=['completed_date'], axis=1)

-----

## Books Read

In [15]:
# total books read
# `shelves` holds a ", "-joined list of shelf names, so test membership rather
# than comparing the whole string: a book shelved as
# "currently-reading, fiction" must be excluded as well.
on_currently_reading = books["shelves"].apply(
    lambda names: "currently-reading" in str(names).split(", ")
).astype(bool)
books_read = books[(books["read_count"] >= 1) & ~on_currently_reading]
len(books_read)

866

In [16]:
# Summary statistics for the books counted as read.
books_read.describe()

Unnamed: 0,rating,read_count,num_page,average_rating
count,866.0,866.0,810.0,866.0
mean,3.262125,1.004619,360.044444,3.986028
std,1.920831,0.067845,244.911281,0.338544
min,0.0,1.0,2.0,0.0
25%,2.0,1.0,217.25,3.86
50%,4.0,1.0,320.0,4.01
75%,5.0,1.0,432.0,4.18
max,5.0,2.0,4100.0,4.74


In [19]:
# books_read.head()

----

## Export to CSV

In [18]:
# Create the output folder first so the export does not fail on a fresh
# checkout where "data/" does not exist yet.
output_path = Path("data") / "books.csv"
output_path.parent.mkdir(parents=True, exist_ok=True)
books.to_csv(output_path, index=False)
print("books.csv created")

books.csv created
