In [9]:
import wget
import json
import gzip
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Review Data

In [4]:
# Download review data
url1 = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_reviews_dedup.json.gz'
file1 = wget.download(url1)

In [12]:
# Look at a sample
with gzip.open("/home/ubuntu/goodreads_reviews_dedup.json.gz") as f:
    line = f.readline()
    
data = json.loads(line)
data

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '24375664',
 'review_id': '5cd416f3efc3f944fce4ce2db2290d5e',
 'rating': 5,
 'review_text': "Mind blowingly cool. Best science fiction I've read in some time. I just loved all the descriptions of the society of the future - how they lived in trees, the notion of owning property or even getting married was gone. How every surface was a screen. \n The undulations of how society responds to the Trisolaran threat seem surprising to me. Maybe its more the Chinese perspective, but I wouldn't have thought the ETO would exist in book 1, and I wouldn't have thought people would get so over-confident in our primitive fleet's chances given you have to think that with superior science they would have weapons - and defenses - that would just be as rifles to arrows once were. \n But the moment when Luo Ji won as a wallfacer was just too cool. I may have actually done a fist pump. Though by the way, if the Dark Forest theory is right - and I

In [14]:
# Function to extract important features from review data
def parse_review_data(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"],
        "user_id" : data['user_id'],
        'rating' : data['rating']}

parse_review_data(line)

{'book_id': '24375664',
 'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'rating': 5}

In [15]:
# Extract review data from entire dataset
books_reviews = []

with gzip.open("/home/ubuntu/goodreads_reviews_dedup.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_review_data(line)
        books_reviews.append(fields)

In [16]:
# Save as a dataframe
reviews = pd.DataFrame.from_dict(books_reviews)

print(reviews.shape)
reviews.head()

(15739967, 3)


Unnamed: 0,book_id,user_id,rating
0,24375664,8842281e1d1347389f2ab93d60773d4d,5
1,18245960,8842281e1d1347389f2ab93d60773d4d,5
2,6392944,8842281e1d1347389f2ab93d60773d4d,3
3,22078596,8842281e1d1347389f2ab93d60773d4d,4
4,6644782,8842281e1d1347389f2ab93d60773d4d,4


# Book Data

In [10]:
# Download book data
url2 = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_books.json.gz'
file2 = wget.download(url2)

In [17]:
# Look at a sample
with gzip.open("/home/ubuntu/goodreads_books.json.gz") as f:
    line = f.readline()
    
data = json.loads(line)
data

{'isbn': '0312853122',
 'text_reviews_count': '1',
 'series': [],
 'country_code': 'US',
 'language_code': '',
 'popular_shelves': [{'count': '3', 'name': 'to-read'},
  {'count': '1', 'name': 'p'},
  {'count': '1', 'name': 'collection'},
  {'count': '1', 'name': 'w-c-fields'},
  {'count': '1', 'name': 'biography'}],
 'asin': '',
 'is_ebook': 'false',
 'average_rating': '4.00',
 'kindle_asin': '',
 'similar_books': [],
 'description': '',
 'format': 'Paperback',
 'link': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'authors': [{'author_id': '604031', 'role': ''}],
 'publisher': "St. Martin's Press",
 'num_pages': '256',
 'publication_day': '1',
 'isbn13': '9780312853129',
 'publication_month': '9',
 'edition_information': '',
 'publication_year': '1984',
 'url': 'https://www.goodreads.com/book/show/5333265-w-c-fields',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg',
 'book_id': '5333265',
 'ratings_count': '3',
 'work_id': '5400751',
 'title': '

In [18]:
# Function to extract author id from books data
def get_author(data):
    for i in range(len(data['authors'])):
        author = data['authors'][i]['author_id']
        return author


# Function to extract important book features
def parse_book_data(line):
    data = json.loads(line)
    return {
        "book_id": data["book_id"],
        'title': data['title_without_series'],
        'language':data['language_code'],
        'description': data['description'],
        'reviews_count': data['text_reviews_count'],
        'avg_rating' :data['average_rating'],
        'ratings_count' : data['ratings_count'],
        'author_id' : get_author(data),
        'image_url' : data['image_url']
    }

parse_book_data(line)

{'book_id': '5333265',
 'title': 'W.C. Fields: A Life on Film',
 'language': '',
 'description': '',
 'reviews_count': '1',
 'avg_rating': '4.00',
 'ratings_count': '3',
 'author_id': '604031',
 'image_url': 'https://images.gr-assets.com/books/1310220028m/5333265.jpg'}

In [19]:
# Extract features from full book dataset
books_data = []

with gzip.open("/home/ubuntu/goodreads_books.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_book_data(line)
        books_data.append(fields)

In [20]:
# Save as a dataframe
books = pd.DataFrame.from_dict(books_data)
print(books.shape)
books.head()

(2360655, 9)


Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id,image_url
0,5333265,W.C. Fields: A Life on Film,,,1,4.0,3,604031,https://images.gr-assets.com/books/1310220028m...
1,1333909,Good Harbor,,"Anita Diamant's international bestseller ""The ...",6,3.23,10,626222,https://s.gr-assets.com/assets/nophoto/book/11...
2,7327624,"The Unschooled Wizard (Sun Wolf and Starhawk, ...",eng,Omnibus book club edition containing the Ladie...,7,4.03,140,10333,https://images.gr-assets.com/books/1304100136m...
3,6066819,Best Friends Forever,eng,Addie Downs and Valerie Adler were eight when ...,3282,3.49,51184,9212,https://s.gr-assets.com/assets/nophoto/book/11...
4,287140,Runic Astrology: Starcraft and Timekeeping in ...,,,5,3.4,15,149918,https://images.gr-assets.com/books/1413219371m...


In [21]:
# We will only keep books that are labeled as english (or unknown)
books['language'].unique()

array(['', 'eng', 'ger', 'spa', 'en-US', 'ita', 'per', 'en-GB', 'tur',
       'ind', 'mon', 'fre', 'por', 'ara', 'en-CA', 'tha', 'lav', 'jpn',
       'pol', 'swe', 'kor', 'fin', 'msa', 'bul', 'nl', 'gre', 'slo',
       'nor', 'heb', 'hun', 'ben', 'scr', 'zho', 'fil', 'rus', 'lit',
       'rum', 'cze', 'dan', 'slv', 'nno', 'pes', 'hye', 'nob', 'cat',
       'en', 'vie', 'nep', 'mar', 'srp', 'urd', 'guj', 'est', 'sqi',
       'ukr', 'afr', 'mul', 'grc', 'kat', 'mkd', 'hin', 'tam', 'mus',
       '--', 'bos', 'enm', 'gla', 'isl', 'glg', 'mal', 'kur', 'wel',
       'pt-BR', 'crh', 'tel', 'es-MX', 'kan', 'mya', 'fao', 'aze', 'ota',
       'arw', 'pra', 'tgl', 'lat', 'dum', 'eus', 'sin', 'mlt', 'ada',
       'apa', 'udm', 'peo', 'bel', 'iro', 'nld', 'ori', 'smn', 'amh',
       'tut', 'frs', 'arg', 'ang', 'abk', 'epo', 'snd', 'pan', 'egy',
       'dut', 'vls', 'jav', 'tlh', 'din', 'gle', 'alg', 'gsw', 'nah',
       'her', 'aus', 'aka', 'chm', 'ace', 'oci', 'ast', 'kok', 'tib',
       'frm', 'i

In [22]:
# Subset english books
language_list = ['eng', 'en', 'en-US','en-GB','']

books_eng = books[books['language'].isin(language_list)]
books_eng.shape

(1918645, 9)

In [23]:
# Check for null values
books_eng.isna().sum()

book_id            0
title              0
language           0
description        0
reviews_count      0
avg_rating         0
ratings_count      0
author_id        503
image_url          0
dtype: int64

In [36]:
# The books that are missing the author id are also completely blank in the other columns, so we will just drop these
books_eng.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_eng.dropna(inplace=True)


In [37]:
# Check dtypes
books_eng.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1918142 entries, 0 to 2360654
Data columns (total 9 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   book_id        object
 1   title          object
 2   language       object
 3   description    object
 4   reviews_count  object
 5   avg_rating     object
 6   ratings_count  object
 7   author_id      object
 8   image_url      object
dtypes: object(9)
memory usage: 146.3+ MB


In [39]:
# Change some data types
books_eng = books_eng.astype({'reviews_count':int,
                              'avg_rating':float,
                              'ratings_count':int,
                              'author_id':int})

In [41]:
# Check for duplicates
books_eng.duplicated(subset=['title', 'author_id']).value_counts()

False    1492788
True      425354
Name: count, dtype: int64

In [42]:
# Remove duplicates, we will keep the book with the highest rating count
books_no_dup = books_eng.sort_values(by='ratings_count', ascending=False)\
                    .drop_duplicates(subset=['title', 'author_id'], keep='first')\
                    .reset_index(drop=True)

books_no_dup.shape

(1492788, 9)

# Author Data

In [13]:
# Download author data
url3 = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_book_authors.json.gz'
file3 = wget.download(url3)

In [43]:
# look at a sample
with gzip.open("/home/ubuntu/goodreads_book_authors.json.gz") as f:
    line = f.readline()
    
data = json.loads(line)
data

{'average_rating': '3.98',
 'author_id': '604031',
 'text_reviews_count': '7',
 'name': 'Ronald J. Fields',
 'ratings_count': '49'}

In [46]:
# Function to extract author info
def parse_author_data(line):
    data = json.loads(line)
    return {
        'name' : data['name'],
        'author_id' : data['author_id']
    }

parse_author_data(line)

{'name': 'Ronald J. Fields', 'author_id': '604031'}

In [47]:
# Extract author data
author_data = []

with gzip.open("/home/ubuntu/goodreads_book_authors.json.gz") as f:
    while True:
        line = f.readline()
        if not line:
            break
        fields = parse_author_data(line)
        author_data.append(fields)

In [48]:
# Save as a dataframe
authors = pd.DataFrame.from_dict(author_data)
print(authors.shape)
authors.head()

(829529, 2)


Unnamed: 0,name,author_id
0,Ronald J. Fields,604031
1,Anita Diamant,626222
2,Barbara Hambly,10333
3,Jennifer Weiner,9212
4,Nigel Pennick,149918


In [50]:
authors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 829529 entries, 0 to 829528
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   name       829529 non-null  object
 1   author_id  829529 non-null  object
dtypes: object(2)
memory usage: 12.7+ MB


# Merging the Data

In [52]:
# Change data type
authors = authors.astype({'author_id':int})

In [55]:
# Merge with book data so that we have the actual author name, and not just author id
books_with_author = books_no_dup.merge(authors, on='author_id', how='left')
print(books_with_author.shape)
books_with_author.head()

(1492788, 10)


Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id,image_url,name
0,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,eng,Harry Potter's life is miserable. His parents ...,59856,4.45,4765497,1077326,https://images.gr-assets.com/books/1474154022m...,J.K. Rowling
2,41865,"Twilight (Twilight, #1)",en-US,About three things I was absolutely positive.\...,90766,3.57,3941381,941441,https://images.gr-assets.com/books/1361039443m...,Stephenie Meyer
3,2657,To Kill a Mockingbird,eng,The unforgettable novel of a childhood in a sl...,59827,4.26,3255518,1825,https://images.gr-assets.com/books/1361975680m...,Harper Lee
4,4671,The Great Gatsby,eng,"THE GREAT GATSBY, F. Scott Fitzgerald's third ...",43881,3.89,2758812,3190,https://images.gr-assets.com/books/1490528560m...,F. Scott Fitzgerald


In [57]:
# Merge books and users
full_data = books_with_author.merge(reviews, on='book_id', how='inner')
print(full_data.shape)
full_data.head()

(12681324, 12)


Unnamed: 0,book_id,title,language,description,reviews_count,avg_rating,ratings_count,author_id,image_url,name,user_id,rating
0,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,8842281e1d1347389f2ab93d60773d4d,5
1,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,704eb93a316aff687a93d5215882eb21,5
2,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,4b3636a043e5c99fa27ac897ccfa1151,5
3,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,012aa353140af13109d00ca36cdc0637,5
4,2767052,"The Hunger Games (The Hunger Games, #1)",eng,Winning will make you famous.\nLosing means ce...,142645,4.34,4899965,153394,https://images.gr-assets.com/books/1447303603m...,Suzanne Collins,2f6af21d14c83a5df6cdcef5e6af0b3e,4


In [58]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12681324 entries, 0 to 12681323
Data columns (total 12 columns):
 #   Column         Dtype  
---  ------         -----  
 0   book_id        object 
 1   title          object 
 2   language       object 
 3   description    object 
 4   reviews_count  int64  
 5   avg_rating     float64
 6   ratings_count  int64  
 7   author_id      int64  
 8   image_url      object 
 9   name           object 
 10  user_id        object 
 11  rating         int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 1.1+ GB


In [60]:
# Save to csv file
full_data.to_csv('../ubuntu/book_data.csv.gzip', compression='gzip', index=False)

# Top 50 books

In [None]:
top50 = books_with_author.sort_values(by='ratings_count', ascending=False).head(50)

In [None]:
pickle.dump(top50, open('top50.pkl', 'wb'))