#  Installing and importing packages

In [1]:
pip install betterreads

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
from betterreads import client
from goodreads_data_gather import helper

Gathering data... 
[   10    11    12 ...  9998  9999 10000]
getting 10 out of missing 9991 descriptions
10,Pride and Prejudice
saving the results to the file....
11,The Kite Runner
12,Divergent (Divergent, #1)
13,1984
14,Animal Farm
15,The Diary of a Young Girl
16,The Girl with the Dragon Tattoo (Millennium, #1)
17,Catching Fire (The Hunger Games, #2)
18,Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
19,The Fellowship of the Ring (The Lord of the Rings, #1)


# Data Gathering

In [2]:
h = helper()

Gathering data... 


In [3]:
### Note : Only run this very first run so we can get the orginal books set

In [4]:
books = pd.read_csv('../data/books.csv')
print( f"shape of the books dataframe is {books.shape}")
#We are going to add. 3 more columns and populate them using the GoodReads client
books['description'] =np.nan
books['num_pages'] =0
books['e_book'] =False
books.to_csv("../data/books_desc.csv", index=False)
books.columns

In [6]:
books =pd.read_csv("../data/books_desc.csv")

ratings = pd.read_csv("../data/ratings.csv")
print( f"shape of the ratings dataframe is {ratings.shape}")

book_tags = pd.read_csv("../data/book_tags.csv")
print( f"shape of the books_tags dataframe is {book_tags.shape}")

tags = pd.read_csv("../data/tags.csv")
print( f"shape of the tags dataframe is {tags.shape}")

### Combining booktag ids with the actual names
pd.merge(left=book_tags,right =tags, how ="left", on ='tag_id')
book_tags.to_csv("../data/book_tags_combined",index =False)
books.head(1)

shape of the ratings dataframe is (5976479, 3)
shape of the books_tags dataframe is (999912, 3)
shape of the tags dataframe is (34252, 2)


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,description,num_pages,e_book,is_ebook
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"Could you survive on your own, in the wild, wi...",374,False,False


In [7]:
tags.head(1)

Unnamed: 0,tag_id,tag_name
0,0,-


In [8]:
book_tags.head(1)

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697


In [9]:
ratings.head(1)

Unnamed: 0,user_id,book_id,rating
0,1,258,5


In [10]:
### We are going to populate description , num_of_pages and is_ebook from goodreads Api

In [11]:
books =h.fill_missing_columns("../data/books_desc.csv",10)

[   10    11    12 ...  9998  9999 10000]
getting 10 out of missing 9991 descriptions
10,Pride and Prejudice
saving the results to the file....
11,The Kite Runner
12,Divergent (Divergent, #1)
13,1984
14,Animal Farm
15,The Diary of a Young Girl
16,The Girl with the Dragon Tattoo (Millennium, #1)
17,Catching Fire (The Hunger Games, #2)
18,Harry Potter and the Prisoner of Azkaban (Harry Potter, #3)
19,The Fellowship of the Ring (The Lord of the Rings, #1)


585 books that are missing orig title, replace them with title
book = gc.book(3)
book.original_title

### Collecting  recent books  based on Denver DSI classmates reading preferences

In [15]:
collect_books = pd.DataFrame()
topics =['python','of Thrones','Patrick Rothfuss','A Song of Ice and Fire','Inspector Rebus',
      'Data Science', 'mechine learning','Data Visualization','Robert T. Kiyosaki','Brené Brown',
      'weapons of math destruction','Girl, Wash Your Face','recommender systems']
for topic in topics :
    df2 =h.get_books(topic)
    books = pd.concat([books,df2],sort=True)
books.reset_index(inplace=True, drop=True)  
books.to_csv("../data/books_desc.csv", index=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys


In [19]:
books.columns

Index(['authors', 'average_rating', 'book_id', 'description', 'e_book',
       'goodreads_book_id', 'is_ebook', 'language_code', 'num_pages',
       'original_publication_year', 'ratings_1', 'ratings_2', 'ratings_3',
       'ratings_4', 'ratings_5', 'ratings_count', 'title'],
      dtype='object')

### Data Cleaning

In [20]:
###### How many of the  columns are missing data ?
books.isna().sum()

authors                         0
average_rating                  0
book_id                       138
description                  9981
e_book                        138
goodreads_book_id               0
is_ebook                     9981
language_code                1138
num_pages                       0
original_publication_year     159
ratings_1                     138
ratings_2                     138
ratings_3                     138
ratings_4                     138
ratings_5                     138
ratings_count                   0
title                           0
dtype: int64

In [18]:
print(f"{len(books)}, {len(books[books['original_title'].isna()])},{len(books[books['original_title'] != books ['title']])}")
# Title seems to be including original title in all these cases, we are going to drop original_title
books[books['original_title'] != books ['title']][['goodreads_book_id','title','original_title']]

KeyError: 'original_title'

In [14]:
#Drop columns that are not relevant for the recommender
# books = books.drop(['best_book_id','work_id','isbn13','isbn','original_title',
#                     'books_count','work_ratings_count','work_text_reviews_count',
#                     'image_url','small_image_url'], axis =1)

In [None]:
###### If the book description was empty ,  adding a string so that  Vecotrizer can  work with the data
books['description'] = books['description'].fillna('-')

In [None]:
books['original_publication_year'] = books['original_publication_year'].fillna(0).astype('int')
books['original_publication_year'].describe()

In [None]:
books['language_code'] = books['language_code'].apply(lambda x : 'eng' if x in  ['eng','en-US','en-GB','en-CA'] else 'other')
books['language_code'].value_counts()

In [None]:
books.loc[books['original_publication_year'] <0,'original_publication_year'] =0
def publication_year_categorize(original_publication_year):
    if original_publication_year >=2015 :
        return('2015-2020')
    elif original_publication_year >= 2010:
        return('2010-2014')
    elif original_publication_year >= 2000:
        return('2000-2013')
    elif original_publication_year >= 1990:
        return('1990-2000')
    elif original_publication_year >= 1950:
        return('1950-1990')
    elif original_publication_year >= 1:
        return('<1950')
    else:
         return('unknown')
books['publication_timeframe'] =books['original_publication_year'].map(publication_year_categorize)
books['publication_timeframe'].value_counts()

In [None]:
###### Are the books unique ?

we have books unique isbn numbers  but  they have similar titles.
I am little wary of this data set now. It might  be a good  for proof of concept but  we may need to get cleaner data

In [139]:
#De-duplicating the  records as we only care about title and content of the book , 
books_with_same_title = (books['title'].value_counts()  )
print(f"duplicate book count = {len(books_with_same_title.index[books_with_same_title >1])}")

dup_books =books_with_same_title.index[books_with_same_title >1]
dup_book_df = books[books['title'].isin(dup_books)]

books = books.groupby('title').agg(
                             {
                              'book_id':'first',
                              'goodreads_book_id':'first',
                              'authors':'first', 
                              'description' :'first',
                              'original_publication_year': 'first',
                              'language_code' :'first',
                              'average_rating':'mean',
                              'ratings_count': 'sum',                              
                              'num_pages':'first',
                              'is_ebook':'first',
                              'ratings_count':'first'
                              }).reset_index()

duplicate book count = 163


In [None]:
###### Consolidate all Audio tags under same category
def consolidate_tags(tag_lst,target_tag):
    target_tag_id = book_tags[book_tags['tag_name']==target_tag]['tag_id'].values[0]  
    l= book_tags[(book_tags['tag_name'].isin(tag_lst))]['tag_name'].to_list()
    print(f"replacing {len(l)} tags with {target_tag}, tag_id= {target_tag_id}")
    book_tags['tag_name']= book_tags['tag_name'].apply(lambda x : target_tag if x in tag_lst else x)
    book_tags.loc[book_tags['tag_name'] ==target_tag,'tag_id']= target_tag_id 

In [None]:
##### Consolidate all childrens books under same category
#tag_lst= book_tags[(book_tags['tag_name'].str.contains('children')) ]['tag_name'].to_list()
#consolidate_tags(list(set(tag_lst)),'childrens')

#tag_lst= ['audio','audiobooks','audiobook','audible','audio-book','audio-books'] +(book_tags[(book_tags['tag_name'].str.contains('audiobook'))]['tag_name'].to_list())
#consolidate_tags(set(tag_lst),'audiobook')

# tag_lst = book_tags[(book_tags['tag_name'].str.contains('history'))]['tag_name'].to_list()
# consolidate_tags(set(['war'] +tag_lst),'history')

#suspense
# tag_lst= book_tags[(book_tags['tag_name'].str.contains('mystery'))]['tag_name'].to_list()
# consolidate_tags(tag_lst,'mystery-thriller')


# tag_lst=set(book_tags[(book_tags['tag_name'].str.contains('novel'))]['tag_name'].to_list())
# consolidate_tags(tag_lst,'novels')

# consolidate_tags(['teen','juvenile','youth','new-adult'],'young-adult')
# consolidate_tags(['general-fiction','speculative-fiction','magic','literary-fiction','contemporary-fiction','modern-fiction','vampires','vampire','fiction-to-read'],'fiction')
# consolidate_tags(['paranormal','paranormal-romance','paranormal-fantasy','fantasy-paranormal'],'supernatural') 
# consolidate_tags(['fantasy-sci-fi,sci-fi-fantasy','science-fiction-fantasy','fantasy-scifi','sci-fi-and-fantasy'],'scifi-fantasy') 
# consolidate_tags(['funny','humour','comedy','humorous'],'humor') 
# consolidate_tags(['20th-century','19th-century','historicals'],'historical') 
# consolidate_tags(['sci-fi','scifi'],'science-fiction') 

#consolidate_tags(['non-fiction','to-read-nonfiction','non-fic','nonfiction-to-read'],'nonfiction') 
#consolidate_tags(['ya-fiction','harry-potter'],'fiction') 

# consolidate_tags(['urban-fantasy','ya-fantasy','high-fantasy','sf-fantasy','guilty-pleasures'],'fantasy')
# consolidate_tags(['thrillers'],'thriller')
# consolidate_tags(['childhood-reads','kids','kids-books','childhood-books','childhood','childhood-favorites'],'childrens')
# consolidate_tags(['listened-to'],'audiobook')
# consolidate_tags(['to-read-non-fiction','non-fiction-to-read'],'nonfiction') 
# consolidate_tags(['contemporary-romance'],'romance')
# consolidate_tags(['urban-fantasy','ya-fantasy'],'fantasy')


# tag_lst=set(book_tags[(book_tags['tag_name'].str.contains('memoir'))]['tag_name'].to_list())
# consolidate_tags(tag_lst,'biography')

# consolidate_tags(['teen-fiction','juvenile-fiction'],'young-adult-fiction')
# consolidate_tags(['classic','literature','literary','classics','american-lit','american-literature','british-literature','chic-lit','kid-lit','chicklit','to-read-classics'],'classic-literature') 
# consolidate_tags(['dystopia'],'dystopian')



# consolidate_tags(['realistic'],'realistic-fiction')
# consolidate_tags(['erotica','erotic'],'adult-romance')
# consolidate_tags(['modern'],'contemporary')
consolidate_tags(['suspense-thriller','thriller-mystery','thriller-suspense','mystery-thrillers','epic-fantasy','mystery-suspense-thriller','mystery-thriller-suspense'],'mystery-thriller')


In [145]:
books.to_csv("../data/books_desc.csv", index=False)

######  references
https://pypi.org/project/Goodreads/

https://betterreads.readthedocs.io/en/latest/

Kaggle data source : https://www.kaggle.com/zygmunt/goodbooks-10k

data set location : https://github.com/zygmuntz/goodbooks-10k

#https://pandas.pydata.org/pandas-docs/stable/reference/groupby.html