In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import sklearn.model_selection

In [2]:
# Specify your directory
DIR = r'C:\Users\Jackie\OneDrive - Georgia Institute of Technology\ISYE6740 Computational Data Analytics\Project\Data\Genres\YoungAdult'

In [3]:
def load_data(file_name):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
    return data

In [4]:
# The Young Adult genre has the highest interaction / book ratio: 34,919,254 interactions and 93,398 books
#interactions = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'))

In [5]:
def load_data(file_name, head = 1000000):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the "head"th line
            if (head is not None) and (count > head):
                break
    return data

In [6]:
interactions_sample = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'))

In [7]:
print(' == sample record (interaction) ==')
display(np.random.choice(interactions_sample))

 == sample record (interaction) ==


{'user_id': '9daf8b2193b374032ad73e468bd117fd',
 'book_id': '13425784',
 'review_id': 'ed00548ee8f23a69bd541f8add4541f1',
 'is_read': True,
 'rating': 1,
 'review_text_incomplete': '',
 'date_added': 'Wed Jun 20 09:17:19 -0700 2012',
 'date_updated': 'Tue Jun 26 11:23:16 -0700 2012',
 'read_at': 'Tue Jun 26 11:23:16 -0700 2012',
 'started_at': 'Sun Jun 24 18:17:58 -0700 2012'}

In [8]:
df = pd.json_normalize(interactions_sample)

In [9]:
# Filter to books that the user has read
is_read = np.logical_and(df['is_read'] == True, df['rating'] != 0) # a book can be read but still have a rating of 0
is_read_df = df[is_read]

In [10]:
is_read_df

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
3,8842281e1d1347389f2ab93d60773d4d,8684868,d29b8238762d70b7c2b67941bc81fbe0,True,3,,Tue Dec 17 13:42:25 -0800 2013,Tue Dec 17 13:47:26 -0800 2013,Sun Dec 15 00:00:00 -0800 2013,Sat Dec 14 00:00:00 -0800 2013
4,8842281e1d1347389f2ab93d60773d4d,8423493,357c8c178fd0e06cff5c025649231672,True,2,,Sun Dec 08 01:26:12 -0800 2013,Tue Dec 27 05:37:48 -0800 2016,Tue Dec 10 00:00:00 -0800 2013,
5,8842281e1d1347389f2ab93d60773d4d,87976,e6306259819c47f278e78d32a1b73ddf,True,5,,Wed Oct 17 17:49:11 -0700 2012,Wed Oct 17 17:49:11 -0700 2012,,
6,8842281e1d1347389f2ab93d60773d4d,18116,8aa100c7b681e9fedbe2c6bd2c25696a,True,5,,Fri Apr 29 13:14:05 -0700 2011,Fri Apr 29 13:14:05 -0700 2011,,
7,8842281e1d1347389f2ab93d60773d4d,2767052,248c011811e945eca861b5c31a549291,True,5,I cracked and finally picked this up. Very enj...,Wed Jan 13 13:38:25 -0800 2010,Wed Mar 22 11:46:36 -0700 2017,Sun Mar 25 00:00:00 -0700 2012,Fri Mar 23 00:00:00 -0700 2012
...,...,...,...,...,...,...,...,...,...,...
999982,96508a4e9a88670c6c6d7a2bc80ac682,11581475,8a78254298ea9233671ffc568020f220,True,3,,Tue Oct 07 21:09:30 -0700 2014,Tue Oct 07 21:09:33 -0700 2014,,
999988,96508a4e9a88670c6c6d7a2bc80ac682,8175604,fcc4d01a106de628b1097cc335332265,True,3,,Tue Oct 07 20:57:28 -0700 2014,Tue Oct 07 20:57:30 -0700 2014,,
999989,96508a4e9a88670c6c6d7a2bc80ac682,8606706,81030e9852a7312f916e23748534cd44,True,4,,Tue Oct 07 20:54:48 -0700 2014,Sat Dec 31 13:10:17 -0800 2016,Fri Dec 23 00:00:00 -0800 2016,Fri Dec 23 00:00:00 -0800 2016
999998,96508a4e9a88670c6c6d7a2bc80ac682,1902241,a29ee904f836a9d7462663dd6a0cf3b6,True,3,The Adoration of Jenna Fox explores the ethics...,Tue Oct 07 16:30:42 -0700 2014,Wed Feb 24 17:20:50 -0800 2016,Thu Jun 09 00:00:00 -0700 2011,


In [11]:
# Define user-book rating matrix
mat = pd.pivot(is_read_df, index='user_id', columns='book_id', values='rating')
mat.replace(0, np.nan, inplace=True)

In [12]:
mat

book_id,10000600,10001576,10001793,10001905,10002219,10004138,1000589,10011780,1001346,10016274,...,9989287,999346,9994194,9996290,9996645,9996853,9997510,9998891,9999403,9999813
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00130b2e4d35f9b0edee72c620d9a687,,,,,,,,,,,...,,,,,,,,,,
001aa4da5b2c1c53765b168224c7f106,,,,,,,,,,,...,,,,,,,,,,
0021e047a599f9827d75628db22097b6,,,,,,,,,,,...,,,,,,,,,,
0029b250cad1be96d172447b0386ad6d,,,,,,,,,,,...,,,,,,,,,,
002e4cdbb30a0fd5c00ef486c6e3bd6b,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffdcb1c27933801046330f847da830d4,,,,,,,,,,,...,,,,,,,,,,
ffe883170a3d48f22a4bacf678c9d2bd,,,,,,,,,,,...,,,,,,,,,,
ffec6cbb0db016bc371fc30b902c3166,,,,,,,,,,,...,,,,,,,,,,
ffeca3c2b7d21f3b57b0b7c3cfe86c5d,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Train/test split
train, test = sklearn.model_selection.train_test_split(mat, test_size=0.1, random_state=1111)

In [14]:
train

book_id,10000600,10001576,10001793,10001905,10002219,10004138,1000589,10011780,1001346,10016274,...,9989287,999346,9994194,9996290,9996645,9996853,9997510,9998891,9999403,9999813
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19c6e2ce1f91607d9149fcf3319d045b,,,,,,,,,,,...,,,,,,,,,,
3df3f656cd25d7ae965c13fdd92e23f5,,,,,,,,,,,...,,,,,,,,,,
ca8652e14aadfd7603cd025ea6fc45f7,,,,,,,,,,,...,,,,,,,,,,
460e86c02d120d4d518dd37deec5621a,,,,,,,,,,,...,,,,,,,,,,
df680eb1ede972206731c34ee5171f4c,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eed608afc40a87cf08efadf1a915743f,,,,,,,,,,,...,,,,,,,,,,
edb4b879aabde5c27c480e4a01dca765,,,,,,,,,,,...,,,,,,,,,,
39cb0fa5f5b9deb02829d7abf04513ce,,,,,,,,,,,...,,,,,,,,,,
063fa0e06132f22bb86d9119542c2b95,,,,,,,,,,,...,,,,,,,,,,


In [15]:
test

book_id,10000600,10001576,10001793,10001905,10002219,10004138,1000589,10011780,1001346,10016274,...,9989287,999346,9994194,9996290,9996645,9996853,9997510,9998891,9999403,9999813
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b798789f0340d5a27c591ea2f1d9be7e,,,,,,,,,,,...,,,,,,,,,,
d806b8c325b223a2a3af1bda90ea2649,,,,,,,,,,,...,,,,,,,,,,
90004fcd139213e2d36b4751e0c8c481,,,,,,,,,,,...,,,,,,,,,,
4e7a029070687339e55fbb53968a1795,,,,,,,,,,,...,,,,,,,,,,
b42835e0335b800c8eeeaecc6e4dc210,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3f0d71dfbde89e2caa0c71a628e90a5f,,,,,,,,,,,...,,,,,,,,,,
7401b628cff54cf1ce8d8c62d7ba7e57,,,,,,,,,,,...,,,,,,,,,,
401a45ef1cc21a055786659e589df132,,,,,,,,,,,...,,,,,,,,,,
a8ceaad452451fb240556c801ec5f16f,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Calculate average rating for each book over users
avg_book_rating = np.nanmean(train, axis=0)
# For empty columns, fill with avg rating of 3 
np.nan_to_num(avg_book_rating, copy=False, nan=3.0)

# Fill missing value with the appropriate average for that book
filled = np.where(np.isnan(train), avg_book_rating, train)

  avg_book_rating = np.nanmean(train, axis=0)


In [18]:
# Remove from each user (row) the mean over all books
avg_user_rating = np.average(filled, axis=1)
avg_user_rating_col_vec = avg_user_rating.reshape(len(avg_user_rating),1)
removed = filled - avg_user_rating_col_vec

In [19]:
removed

array([[ 0.1807187 ,  0.29182981, -0.70817019, ..., -0.42245591,
         0.29182981,  0.29182981],
       [ 0.18059709,  0.2917082 , -0.7082918 , ..., -0.42257752,
         0.2917082 ,  0.2917082 ],
       [ 0.18073712,  0.29184823, -0.70815177, ..., -0.42243749,
         0.29184823,  0.29184823],
       ...,
       [ 0.18060549,  0.2917166 , -0.7082834 , ..., -0.42256912,
         0.2917166 ,  0.2917166 ],
       [ 0.18040073,  0.29151184, -0.70848816, ..., -0.42277388,
         0.29151184,  0.29151184],
       [ 0.18064004,  0.29175115, -0.70824885, ..., -0.42253457,
         0.29175115,  0.29175115]])

In [None]:
# Perform SVD, decide on a rank k, and calculate a reduced rating matrix to remove noise
# Set up cross-validation?