# Display Sample Records

In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd

**Specify the directory that contains the downloaded `*.json.gz` data files:**

In [2]:
# Base directory for the dataset files; every load below builds its path with os.path.join(DIR, <file name>).
DIR = './data'

**Helper that loads a gzipped JSON-lines dataset (only the first `head` records by default)**

In [9]:
def load_data(file_name, head = 500):
    '''
        Load records from a gzip-compressed JSON-lines file (*.json.gz).

        Each non-blank line of the file is parsed as one JSON document.

        Parameters
        ----------
        file_name : str or path-like
            Path to the *.json.gz file.
        head : int or None, default 500
            Maximum number of records to return. ``None`` reads the whole
            file; zero or a negative value returns an empty list.

        Returns
        -------
        list
            The parsed records, in file order (dictionaries for the
            files loaded in this notebook).

        Raises
        ------
        OSError
            If the file cannot be opened or read as gzip data.
        json.JSONDecodeError
            If a non-blank line is not valid JSON.
    '''
    data = []
    # The file is read in binary mode; json.loads accepts bytes directly.
    with gzip.open(file_name) as fin:
        for line in fin:
            # Check the limit *before* parsing so that head=0 yields no
            # records (checking after the append always kept at least one).
            if head is not None and len(data) >= head:
                break
            # Tolerate blank lines, e.g. an extra newline at end of file.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data

**Load and display sample records of books/authors/works/series**

In [10]:
# Load the poetry book records; load_data's default `head` caps this at the first 500.
poetry = load_data(os.path.join(DIR, 'goodreads_books_poetry.json.gz'))

# Other metadata files, loaded the same way; uncomment the ones that are present in DIR.
# books = load_data(os.path.join(DIR, 'goodreads_books.json.gz'))
# authors = load_data(os.path.join(DIR, 'goodreads_book_authors.json.gz'))
# works = load_data(os.path.join(DIR, 'goodreads_book_works.json.gz'))
# series = load_data(os.path.join(DIR, 'goodreads_book_series.json.gz'))

In [11]:
# Number of records loaded; at most 500 with load_data's default `head`.
len(poetry)

500

In [12]:
# Inspect the first loaded book record.
poetry[0]

{'isbn': '',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': 'eng',
 'popular_shelves': [{'count': '8', 'name': 'to-read'},
  {'count': '3', 'name': 'poetry'},
  {'count': '2', 'name': 'currently-reading'},
  {'count': '1', 'name': '01-kindle'},
  {'count': '1', 'name': 'real-books'},
  {'count': '1', 'name': 'personal-library'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '3.83',
 'kindle_asin': '',
 'similar_books': [],
 'description': 'Number 30 in a series of literary pamphlets published monthly and available at the price of 15 cents per copy, or a yearly subscription (19 numbers) for $1.25',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/16037549-vision-of-sir-launfal-and-other-poems',
 'authors': [{'author_id': '15585', 'role': ''}],
 'publisher': 'Houghton, Mifflin and Company',
 'num_pages': '80',
 'publication_day': '1',
 'isbn13': '',
 'publication_month': '11',
 'edition_information': '',
 'publication_yea

In [16]:
# Disabled: shows one random record from each of books/authors/works/series.
# These names are only defined if the matching (commented-out) load_data calls
# in the loading cell are enabled first.
# print(' == sample record (books) ==')
# display(np.random.choice(books))
# print(' == sample record (authors) ==')
# display(np.random.choice(authors))
# print(' == sample record (works) ==')
# display(np.random.choice(works))
# print(' == sample record (series) ==')
# display(np.random.choice(series))

**Load and display sample records of user-book interactions (shelves)**

In [18]:
# Load the interaction records; load_data's default `head` caps this at the first 500.
interactions = load_data(os.path.join(DIR, 'goodreads_interactions_poetry.json.gz'))
# Show one record picked at random. No seed is set in the cells shown, so the
# record displayed changes from run to run.
np.random.choice(interactions)

{'user_id': 'e65bfb998920844f263f5cbb499cf203',
 'book_id': '820305',
 'review_id': '73b58b3eaa020123efbe806398a3fdda',
 'is_read': True,
 'rating': 4,
 'review_text_incomplete': '',
 'date_added': 'Mon Aug 05 05:53:22 -0700 2013',
 'date_updated': 'Mon Aug 05 05:53:22 -0700 2013',
 'read_at': '',
 'started_at': ''}

**Load and display sample records of book reviews**

In [19]:
# Load the review records; load_data's default `head` caps this at the first 500.
reviews = load_data(os.path.join(DIR, 'goodreads_reviews_poetry.json.gz'))
# Show one record picked at random. No seed is set in the cells shown, so the
# record displayed changes from run to run.
np.random.choice(reviews)

{'user_id': '13a8074a4cf64337edcf084b30b9a72a',
 'book_id': '770591',
 'review_id': 'd0d1ee0faf4ce414749e63ebd400a949',
 'rating': 4,
 'review_text': "People have compared my poetry to Mary Oliver's enough times that I felt I should learn more about her. After reading this book, I'm honored by the comparison. She writes very simple, clean poetry about the natural world, and I can only hope that one day I achieve some measure of the control that she exhibits here. \n One oddity about this book: I'd expected the poems to be arranged in chronological order, and it took me some time to realize they were actually arranged in reverse chronological order, with the newest poems first. I suspect this was done for the benefit of her fans who wanted to get to the new stuff first, but it was somewhat confusing for me, as I was reading the book expecting to see the unfolding of her style over time. Once I figured out what was going on, I started reading the book from the back (the way I almost alwa

**Load and display sample records of book reviews (with spoiler tags)**

In [23]:
# Load the spoiler-annotated review records; load_data's default `head` caps this at the first 500.
spoilers = load_data(os.path.join(DIR, 'goodreads_reviews_spoiler.json.gz'))
# Keep only records whose 'has_spoiler' value is truthy, then show one at random.
# np.random.choice raises ValueError if none of the loaded records match.
np.random.choice([s for s in spoilers if s['has_spoiler']])

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'timestamp': '2014-07-26',
 'review_sentences': [[0, 'What a fun series.'],
  [0,
   'I loved Wool, and Dust and Shift both gave us the backstory to explain the world and how it ended up.'],
  [0,
   'I think the first book was by far the best, but this gave us a nice conclusion.'],
  [1,
   'It was the conclusion we wanted to see - the people finally get outside!'],
  [1, 'My problem with this book is there were lots of holes.'],
  [1,
   "A lot of the other reviews have pointed this out too, and I'm not sure if the fact I know it was self-published is biasing me to say it could have used more editing, but feels that way a little."],
  [1, 'But the writing was great.'],
  [1,
   "There were lots of things that weren't cleared up or never really fully made sense."],
  [1,
   'The major one is why Thurman really felt the need to destroy the whole world - feels like there could have been a lot more to that.'],
  [1,
   "It also wasn't clea

In [9]:
# Disabled: loads the raw spoiler reviews and shows a random one whose
# 'review_text' contains the substring 'view spoiler'.
# NOTE: enabling this rebinds `spoilers`, replacing the records loaded from
# goodreads_reviews_spoiler.json.gz.
# spoilers = load_data(os.path.join(DIR, 'goodreads_reviews_spoiler_raw.json.gz'))
# np.random.choice([s for s in spoilers if 'view spoiler' in s['review_text']])

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '28684704',
 'review_id': '2ede853b14dc4583f96cf5d120af636f',
 'rating': 3,
 'review_text': 'A fun, fast paced science fiction thriller. I read it in 2 nights and couldn\'t put it down. The book is about the quantum theory of many worlds which states that all decisions we make throughout our lives basically create branches, and that each possible path through the decision tree can be thought of as a parallel world. And in this book, someone invents a way to switch between these worlds. This was nicely alluded to/foreshadowed in this quote: \n "I think about all the choices we\'ve made that created this moment. Us sitting here together at this beautiful table. Then I think of all the possible events that could have stopped this moment from ever happening, and it all feels, I don\'t know..." "What?" "So fragile." Now he becomes thoughtful for a moment. He says finally, "It\'s terrifying when you consider that every thought we ha