In [1]:
import json
import pandas as pd

# Books Data

### Reading in one line of `goodreads_books.json`:

In [56]:
# Peek at the first line of the dump to inspect the shape of a single record.
with open("goodreads_books.json") as books_file:
    line = books_file.readline()

# Each line is parsed on its own as a JSON document.
data = json.loads(line)
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

### Extract Genre from `popular_shelves` field

In [57]:
def get_genre(data):
    """Return the shelf names listed under ``data['popular_shelves']``.

    Parameters
    ----------
    data : dict
        Parsed book record. Must contain a ``'popular_shelves'`` list whose
        entries are dicts with a ``'name'`` key.

    Returns
    -------
    list
        The ``'name'`` value of every shelf, in the order the shelves appear.
        Empty when the record has no shelves.

    Raises
    ------
    KeyError
        If ``'popular_shelves'`` is missing or a shelf entry has no ``'name'``.
    """
    # Look the name up by key rather than by position in ``.values()``;
    # a positional lookup returns the wrong field whenever a shelf's keys
    # arrive in a different order (e.g. 'name' before 'count').
    return [shelf['name'] for shelf in data['popular_shelves']]

get_genre(data)

['to-read', 'p', 'collection', 'w-c-fields', 'biography']

### Extract `author_id`

In [58]:
def get_author(data):
    """Return the ``author_id`` of the first entry in ``data['authors']``.

    Only the first listed author is used; any further authors on the record
    are ignored.

    Parameters
    ----------
    data : dict
        Parsed book record. Must contain an ``'authors'`` list whose entries
        are dicts with an ``'author_id'`` key.

    Returns
    -------
    The first author's ``author_id``, or ``None`` when the list is empty.

    Raises
    ------
    KeyError
        If ``'authors'`` is missing or the first entry has no ``'author_id'``.
    """
    authors = data['authors']
    # An empty author list yields None rather than raising.
    if len(authors) == 0:
        return None
    return authors[0]['author_id']

get_author(data)

'604031'

### Extract fields to set up a pandas DataFrame

In [59]:
def parse_fields(line):
    """Parse one JSON line and return the subset of fields used downstream.

    Parameters
    ----------
    line : str
        A single JSON-encoded book record.

    Returns
    -------
    dict
        The selected fields under shortened names. ``author_id`` comes from
        ``get_author`` and ``genres`` from ``get_genre``; every other value is
        copied from the record unchanged.

    Raises
    ------
    KeyError
        If the record lacks any of the fields read below.
    """
    record = json.loads(line)

    # Key order is kept deliberately: it becomes the column order of any
    # DataFrame built from these dicts.
    selected = {
        "book_id": record["book_id"],
        "isbn": record["isbn"],
        "author_id": get_author(record),
        "publisher": record["publisher"],
        "title": record["title_without_series"],
        "genres": get_genre(record),
        "description": record["description"],
        "avg_rating": record["average_rating"],
        "ratings_count": record["ratings_count"],
        "num_pages": record["num_pages"],
        "pub_year": record["publication_year"],
        "language_code": record["language_code"],
        "similar_books": record["similar_books"],
        "url": record["url"],
        "cover_image": record["image_url"],
    }
    return selected

In [60]:
# Testing out parse_fields function
parse_fields(line)

{'book_id': '5333265',
 'isbn': '0312853122',
 'author_id': '604031',
 'publisher': "St. Martin's Press",
 'title': 'W.C. Fields: A Life on Film',
 'genres': ['to-read', 'p', 'collection', 'w-c-fields', 'biography'],
 'description': '',
 'avg_rating': '4.00',
 'ratings_count': '3',
 'num_pages': '256',
 'pub_year': '1984',
 'language_code': '',
 'similar_books': [],
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'cover_image': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg'}

In [61]:
# Loop through goodreads_books.json to extract fields of interest
books_titles = []

with open("goodreads_books.json") as f:
    # Iterate the file object directly: it yields one line (one JSON record)
    # at a time without loading the whole file into memory.
    for line in f:
        # A blank line (e.g. a trailing newline at the end of the file) is not
        # a record; skip it instead of letting json.loads raise on it.
        if not line.strip():
            continue
        fields = parse_fields(line)
        try:
            ratings = int(fields["ratings_count"])
        except ValueError:
            # ratings_count is not an integer string (e.g. empty): skip record.
            continue
        # Keep only books that have more than 5 ratings.
        if ratings > 5:
            books_titles.append(fields)

In [62]:
# Build the DataFrame in one shot from the accumulated list of record dicts;
# each dict becomes a row and its keys become the columns.
books = pd.DataFrame(books_titles)

In [63]:
books.head()

Unnamed: 0,book_id,isbn,author_id,publisher,title,genres,description,avg_rating,ratings_count,num_pages,pub_year,language_code,similar_books,url,cover_image
0,1333909,743509986.0,626222,Simon & Schuster Audio,Good Harbor,"[to-read, fiction, currently-reading, contempo...","Anita Diamant's international bestseller ""The ...",3.23,10,,2001.0,,"[8709549, 17074050, 28937, 158816, 228563, 112...",https://www.goodreads.com/book/show/1333909.Go...,https://s.gr-assets.com/assets/nophoto/book/11...
1,7327624,,10333,"Nelson Doubleday, Inc.","The Unschooled Wizard (Sun Wolf and Starhawk, ...","[to-read, fantasy, fiction, owned, hardcover, ...",Omnibus book club edition containing the Ladie...,4.03,140,600.0,1987.0,eng,"[19997, 828466, 1569323, 425389, 1176674, 2627...",https://www.goodreads.com/book/show/7327624-th...,https://images.gr-assets.com/books/1304100136m...
2,6066819,743294297.0,9212,Atria Books,Best Friends Forever,"[to-read, chick-lit, currently-reading, fictio...",Addie Downs and Valerie Adler were eight when ...,3.49,51184,368.0,2009.0,eng,"[6604176, 6054190, 2285777, 82641, 7569453, 70...",https://www.goodreads.com/book/show/6066819-be...,https://s.gr-assets.com/assets/nophoto/book/11...
3,287140,850308712.0,149918,,Runic Astrology: Starcraft and Timekeeping in ...,"[to-read, runes, owned, nonfiction, kill-it-wi...",,3.4,15,,,,[],https://www.goodreads.com/book/show/287140.Run...,https://images.gr-assets.com/books/1413219371m...
4,287141,1599150603.0,3041852,Yesterday's Classics,The Aeneid for Boys and Girls,"[to-read, currently-reading, history, classics...","Relates in vigorous prose the tale of Aeneas, ...",4.13,46,162.0,2006.0,,[],https://www.goodreads.com/book/show/287141.The...,https://s.gr-assets.com/assets/nophoto/book/11...


In [64]:
books.shape

(1782579, 15)

In [75]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1782579 entries, 0 to 1782578
Data columns (total 15 columns):
 #   Column         Dtype   
---  ------         -----   
 0   book_id        object  
 1   isbn           object  
 2   author_id      object  
 3   publisher      object  
 4   title          object  
 5   genres         object  
 6   description    object  
 7   avg_rating     float64 
 8   ratings_count  int64   
 9   num_pages      int64   
 10  pub_year       object  
 11  language_code  category
 12  similar_books  object  
 13  url            object  
 14  cover_image    object  
dtypes: category(1), float64(1), int64(2), object(11)
memory usage: 193.8+ MB


### Change some data types

In [67]:
# Cast the columns to numeric / categorical dtypes.
# An empty num_pages value is replaced with 0 before the integer cast.
books = books.assign(
    num_pages=books['num_pages'].replace('', 0),
).astype({
    'avg_rating': float,
    'language_code': 'category',
    'ratings_count': int,
    'num_pages': int,
})

### Subset the data:

* We will only include books that have a description available and that are in English (language code `eng`) or have no language code recorded

In [78]:
# Keep only rows whose description is longer than one character.
has_description = books['description'].str.len() > 1
books_with_desc = books.loc[has_description].reset_index(drop=True)

In [79]:
# Keep rows tagged "eng" as well as rows whose language code is empty.
is_eng_or_untagged = books_with_desc['language_code'].isin(['eng', ''])
books_eng = books_with_desc[is_eng_or_untagged].reset_index(drop=True)

In [81]:
books_eng.shape

(1115445, 15)

In [82]:
# Write the filtered frame to books.csv, omitting the index column.
# NOTE(review): list-valued columns (genres, similar_books) are presumably
# written as their string form in CSV — confirm downstream readers parse them.
books_eng.to_csv('books.csv', index=False)