In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import sklearn.model_selection
import scipy
import sklearn.metrics.pairwise as pw
from time import time

In [2]:
# Specify your directory
DIR = r'C:\Users\Jackie\OneDrive - Georgia Institute of Technology\ISYE6740 Computational Data Analytics\Project\Data\Genres\YoungAdult'

In [3]:
def load_data(file_name):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
    return data

In [4]:
# The Young Adult genre has the highest interaction / book ratio: 34,919,254 interactions and 93,398 books
#interactions = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'))

In [5]:
def load_data(file_name, head = 10000): # should be 1000000
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the "head"th line
            if (head is not None) and (count > head):
                break
    return data

In [6]:
interactions_sample = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'))

In [7]:
print(' == sample record (interaction) ==')
display(np.random.choice(interactions_sample))

 == sample record (interaction) ==


{'user_id': '012515e5802b2e0f42915118c90fa04b',
 'book_id': '7198988',
 'review_id': '2a2f9ea76a4c752d7d7a16be0ff296e1',
 'is_read': True,
 'rating': 2,
 'review_text_incomplete': '',
 'date_added': 'Mon Jun 04 19:50:10 -0700 2012',
 'date_updated': 'Sun Aug 03 12:05:22 -0700 2014',
 'read_at': '',
 'started_at': ''}

In [8]:
df = pd.json_normalize(interactions_sample)

In [9]:
# Filter to books that the user has read
is_read = np.logical_and(df['is_read'] == True, df['rating'] != 0) # a book can be read but still have a rating of 0
is_read_df = df[is_read]

In [10]:
is_read_df

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
3,8842281e1d1347389f2ab93d60773d4d,8684868,d29b8238762d70b7c2b67941bc81fbe0,True,3,,Tue Dec 17 13:42:25 -0800 2013,Tue Dec 17 13:47:26 -0800 2013,Sun Dec 15 00:00:00 -0800 2013,Sat Dec 14 00:00:00 -0800 2013
4,8842281e1d1347389f2ab93d60773d4d,8423493,357c8c178fd0e06cff5c025649231672,True,2,,Sun Dec 08 01:26:12 -0800 2013,Tue Dec 27 05:37:48 -0800 2016,Tue Dec 10 00:00:00 -0800 2013,
5,8842281e1d1347389f2ab93d60773d4d,87976,e6306259819c47f278e78d32a1b73ddf,True,5,,Wed Oct 17 17:49:11 -0700 2012,Wed Oct 17 17:49:11 -0700 2012,,
6,8842281e1d1347389f2ab93d60773d4d,18116,8aa100c7b681e9fedbe2c6bd2c25696a,True,5,,Fri Apr 29 13:14:05 -0700 2011,Fri Apr 29 13:14:05 -0700 2011,,
7,8842281e1d1347389f2ab93d60773d4d,2767052,248c011811e945eca861b5c31a549291,True,5,I cracked and finally picked this up. Very enj...,Wed Jan 13 13:38:25 -0800 2010,Wed Mar 22 11:46:36 -0700 2017,Sun Mar 25 00:00:00 -0700 2012,Fri Mar 23 00:00:00 -0700 2012
...,...,...,...,...,...,...,...,...,...,...
9996,4a44f603cc3df339acc48590044a2db0,17261628,2310bbf77a0d72234e4739a3adf7194d,True,3,,Sat Jan 24 21:56:03 -0800 2015,Sat Dec 19 17:16:14 -0800 2015,Sun Jan 25 17:01:54 -0800 2015,Sat Jan 24 21:56:03 -0800 2015
9997,4a44f603cc3df339acc48590044a2db0,18190208,c1c267254139f66d2b7d26d27a423c1d,True,4,See full review for The Witch Hunter at: <a ta...,Thu Jan 15 09:52:14 -0800 2015,Sat Jul 18 09:16:21 -0700 2015,Fri Jul 10 00:00:00 -0700 2015,Fri Jul 10 00:00:00 -0700 2015
9998,4a44f603cc3df339acc48590044a2db0,22308716,0c71f23060e6513f28cad1496b6a7e61,True,5,"See full review for Red Queen at: <a target=""_...",Thu Jan 15 09:51:38 -0800 2015,Sat Dec 19 17:20:26 -0800 2015,Tue Mar 03 00:00:00 -0800 2015,Thu Feb 26 00:00:00 -0800 2015
9999,4a44f603cc3df339acc48590044a2db0,18967205,911a06b2f968713818e78544434a330e,True,4,"See full review for Hellhole at: <a target=""_b...",Thu Jan 15 09:50:59 -0800 2015,Sat Feb 28 15:58:19 -0800 2015,Fri Feb 27 00:00:00 -0800 2015,Fri Feb 27 00:00:00 -0800 2015


In [11]:
# Define user-book rating matrix
mat = pd.pivot(is_read_df, index='user_id', columns='book_id', values='rating')
mat.replace(0, np.nan, inplace=True)

In [12]:
mat

book_id,10001793,10004138,10025007,10025305,1003318,1008231,10118172,10151730,10165727,10165761,...,9917938,9917945,9918053,9918133,9939115,9947386,99561,9961796,9972882,9975679
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
012515e5802b2e0f42915118c90fa04b,,,,,,,,,,,...,,,,,,,3.0,,,
012aa353140af13109d00ca36cdc0637,,,,,,,,,,,...,,,,,,,,,,
01d02898170634e6e7232650ebbf2e43,,,,,,,,,,,...,,,,,,,,,,
01ec1a320ffded6b2dd47833f2c8e4fb,,,,,,,,,,,...,,,,,,,,,,
040b31603912dc03f19e0b76d58c3660,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
f4d16ea4ac59af59d257631398af39f4,,,,5.0,,,,,,,...,,,,2.0,,,,,,
f88032f4ad97b46654fe59ce3387cf5d,,,,,,,,,,,...,,,,,,,,,,
f8a89075dc6de14857561522e729f82c,3.0,5.0,,,,,,,,,...,,,,,,,,,,
faa322d2624b0e7eb3064e39dac4af9c,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Train/test split
train, test = sklearn.model_selection.train_test_split(is_read_df, test_size=0.1, random_state=1111)
train.shape, test.shape

((3247, 10), (361, 10))

In [70]:
# Set all test values to nan
zipped_test = np.c_[test['user_id'], test['book_id'], test['rating']]
test_mask = np.full(mat.shape, False)
for i in range(len(zipped_test)):
    row_idx = mat.index.get_loc(zipped_test[i][0])
    col_idx = mat.columns.get_loc(zipped_test[i][1])
    test_mask[row_idx, col_idx] = True
mat[test_mask] = np.nan

In [15]:
mat.at[zipped_test[0][0], zipped_test[0][1]]

nan

In [16]:
# Calculate sparsity of data
not_null = mat.count().sum()
sparsity = 1 - not_null / (mat.shape[0] * mat.shape[1])
sparsity

0.9875425861698536

In [17]:
mat.count().sort_values(ascending=False)

book_id
2767052     66
7260188     47
6148028     44
41865       38
11870085    35
            ..
353900       0
354520       0
35504431     0
35712937     0
15817997     0
Length: 2102, dtype: int64

In [18]:
# Calculate average rating for each book over users
avg_book_rating = np.nanmean(mat, axis=0) # column avg r_j
# For empty columns, fill with avg rating of 3 
np.nan_to_num(avg_book_rating, copy=False, nan=3.0)

# Fill missing value with the appropriate average for that book
filled = np.where(np.isnan(mat), avg_book_rating, mat)

  avg_book_rating = np.nanmean(mat, axis=0) # column avg r_j


In [19]:
# Remove from each user (row) the mean over all books
avg_user_rating = np.average(filled, axis=1) # row avg r_i
avg_user_rating_col_vec = avg_user_rating.reshape(len(avg_user_rating),1)
normalized = filled - avg_user_rating_col_vec

In [20]:
normalized

array([[-0.77845839,  1.22154161,  0.22154161, ...,  0.22154161,
        -0.77845839,  0.22154161],
       [-0.77398938,  1.22601062,  0.22601062, ...,  0.22601062,
        -0.77398938,  0.22601062],
       [-0.77620505,  1.22379495,  0.22379495, ...,  0.22379495,
        -0.77620505,  0.22379495],
       ...,
       [-0.77829132,  1.22170868,  0.22170868, ...,  0.22170868,
        -0.77829132,  0.22170868],
       [-0.77349298,  1.22650702,  0.22650702, ...,  0.22650702,
        -0.77349298,  0.22650702],
       [-0.77703074,  1.22296926,  0.22296926, ...,  0.22296926,
        -0.77703074,  0.22296926]])

In [21]:
normalized.shape

(124, 2102)

In [22]:
# Perform SVD, decide on a rank k, and calculate a reduced rating matrix to remove noise
# Can use sparse matrix algorithms because of special structure
# How to choose k: if top few values are big and the rest are small, take the big values. So chose k=11
U, S, VT = scipy.sparse.linalg.svds(normalized, k=6) # CHANGE BACK TO k=11

In [23]:
#U, S, VT = np.linalg.svd(normalized) #k=10

In [24]:
# U = U[:, :9]
# S = S[:9]
# VT = VT[:9, :]

In [25]:
S

array([  8.15754905,   9.12891417,  10.07289629,  11.20894659,
        12.35870379, 465.37869548])

In [26]:
U.shape, S.shape, VT.shape

((124, 6), (6,), (6, 2102))

In [27]:
reduced_rating = U @ np.diag(S) @ VT

In [28]:
reduced_rating # SHOULD THIS BE >=1? Maybe not, because it's already been reduced

array([[-0.78097943,  1.21557161,  0.21729609, ...,  0.29100328,
        -0.78097943,  0.21729609],
       [-0.77545698,  1.22618193,  0.22536247, ...,  0.23957788,
        -0.77545698,  0.22536247],
       [-0.77599848,  1.22341376,  0.22370764, ...,  0.28675779,
        -0.77599848,  0.22370764],
       ...,
       [-0.77808863,  1.22307744,  0.2224944 , ...,  0.23132573,
        -0.77808863,  0.2224944 ],
       [-0.77421055,  1.2250016 ,  0.22539553, ...,  0.22551034,
        -0.77421055,  0.22539553],
       [-0.77707112,  1.22373605,  0.22333247, ...,  0.22989943,
        -0.77707112,  0.22333247]])

In [29]:
reduced_rating.shape

(124, 2102)

In [30]:
# Calculate matrix product for dimensionality-reduced rating matrix
sqrtS = np.diag(np.sqrt(S))
meta_on_users = U @ np.transpose(sqrtS)
meta_on_items = sqrtS @ VT

In [31]:
meta_on_items

array([[ 1.46689679e-03, -1.41129057e-03,  2.78031088e-05, ...,
         7.41854287e-02,  1.46689679e-03,  2.78031088e-05],
       [ 5.35151524e-03,  4.16348816e-03,  4.75750170e-03, ...,
         2.98421476e-02,  5.35151524e-03,  4.75750170e-03],
       [ 1.85245439e-03,  1.63359021e-03,  1.74302230e-03, ...,
        -1.31967158e-01,  1.85245439e-03,  1.74302230e-03],
       [-7.43680689e-03, -2.61581008e-03, -5.02630848e-03, ...,
         6.17763648e-03, -7.43680689e-03, -5.02630848e-03],
       [ 1.10500733e-02,  1.47305844e-02,  1.28903288e-02, ...,
        -3.77956080e-02,  1.10500733e-02,  1.28903288e-02],
       [-3.99789222e-01,  6.32584853e-01,  1.16397816e-01, ...,
         1.16429489e-01, -3.99789222e-01,  1.16397816e-01]])

In [32]:
# # Calculate similarity of 2 books j and f based on the reduced rating matrix
# # Similarity is the correlation of 2 movies based on average over users
# def similarity(j, f, matrix):
#     j_col = matrix[:, j]
#     f_col = matrix[:, f]
#     numerator = np.sum(np.multiply(j_col, f_col))
#     denominator = np.sqrt(np.multiply(np.sum(np.square(j_col)), np.sum(np.square(f_col))))
#     return numerator / denominator

In [33]:
# num_books = reduced_rating.shape[1]
# t1 = time()
# similarities = np.zeros((num_books, num_books))
# # TODO: is there a faster way to do this?
# for i in range(num_books):
#     for j in range(num_books):
#         similarities[i, j] = similarity(i, j, meta_on_items)
# t2 = time()

In [34]:
# t2-t1

In [35]:
# num_books = reduced_rating.shape[1]
# t1 = time()
# col_prods = meta_on_items.T.dot(meta_on_items)

# similarities2 = np.zeros((num_books, num_books))
# # TODO: is there a faster way to do this?
# for i in range(num_books):
#     for j in range(i, num_books):
#         similarities2[i, j] = col_prods[i, j] / np.sqrt(col_prods[i, i] * col_prods[j, j])
#         similarities2[j, i] = similarities2[i, j]
# t2 = time()

In [36]:
# similarities.shape, t2-t1, np.allclose(similarities, similarities2)

In [37]:
num_books = reduced_rating.shape[1]
t1 = time()
similarities = pw.cosine_similarity(meta_on_items.T, meta_on_items.T)
t2 = time()

In [38]:
similarities.shape, t2-t1

((2102, 2102), 0.017405271530151367)

In [39]:
# # Without reducing dimensionality
# num_books = reduced_rating.shape[1]
# t1 = time()
# similarities = pw.cosine_similarity(reduced_rating.T, reduced_rating.T)
# t2 = time()
# similarities.shape, t2-t1

In [40]:
# TODO: try optimizing
# Calculate prediction of rating by user i of book j, by avging over the ratings of all books by the same user, each weighted according to its similarity to book j
num_users = reduced_rating.shape[0]
predictions = np.zeros((num_users, num_books))

# Isolate the set of items which appear to be the most similar to the active item. Vozalis suggests 10 items is best.
most_similar_idx = np.argsort(similarities, axis=1)[:,-10:]
for a in range(num_users):
    for j in range(num_books):
        numerator = 0
        denominator = 0
        # check that similarities are non-negative
        for k in most_similar_idx[j,:]:
            sim = similarities[j,k]
            if sim < 0:
                break
            numerator += sim * (reduced_rating[a,k] + avg_user_rating[a])
            denominator += sim
        predictions[a,j] = numerator/denominator

In [41]:
print(np.min(predictions), np.max(predictions))
predictions

0.9879153962892226 5.036432347838605


array([[2.99747896, 4.99402999, 3.99575448, ..., 4.23895594, 2.99747896,
        3.99575448],
       [2.9985324 , 5.00017132, 3.99935186, ..., 4.1355657 , 2.9985324 ,
        3.99935186],
       [3.00020658, 4.99961881, 3.99991269, ..., 4.27345462, 3.00020658,
        3.99991269],
       ...,
       [3.00020269, 5.00136876, 4.00078572, ..., 4.10097384, 3.00020269,
        4.00078572],
       [2.99928243, 4.99849458, 3.9988885 , ..., 4.09274205, 2.99928243,
        3.9988885 ],
       [2.99995962, 5.00076679, 4.0003632 , ..., 4.12769229, 2.99995962,
        4.0003632 ]])

In [42]:
np.shape(predictions)

(124, 2102)

In [59]:
# Define error function
def rmse(predictions, actuals):
    return np.sqrt(((predictions - actuals) ** 2).mean())

In [60]:
# Calculate training error
training_mask = np.array(mat.notna())
train_predictions = predictions[training_mask]
train_actuals = np.array(mat)[training_mask]
train_rmse = rmse(train_predictions, train_actuals)
train_rmse

0.5368157321911367

In [61]:
train_predictions, train_actuals

(array([2.99747896, 3.99575448, 3.6512771 , ..., 2.71384157, 5.00076679,
        4.06424344]),
 array([3., 4., 5., ..., 3., 5., 5.]))

In [71]:
# Calculate test error
#zipped_test = np.c_[test['user_id'], test['book_id'], test['rating']]
test_predictions = predictions[test_mask]

In [72]:
test_predictions.shape

(361,)

# Debugging

In [68]:
np.argsort(similarities, axis=0)

array([[ 771, 1050, 1971, ..., 1999, 1818, 1971],
       [1569,  246,   65, ..., 1943,  660,   65],
       [1677, 1855, 1959, ...,  769, 1863, 1959],
       ...,
       [ 662, 1662, 1342, ...,  956,  192,  749],
       [ 672,  403, 1319, ..., 1682, 1819, 1267],
       [ 530, 1328, 2101, ...,  838, 1050,  555]], dtype=int64)

In [69]:
testing = np.array([[1,5,8,3,7],[1,2,3,4,5]])
sorted_testing = np.argsort(testing, axis=1)
sorted_testing[:,-2:]

array([[4, 2],
       [3, 4]], dtype=int64)

In [70]:
np.all(predictions < 5), np.all(predictions > 1)

(False, False)

In [71]:
np.all(reduced_rating < 5), np.all(reduced_rating > 1)

(True, False)

In [72]:
reduced_rating

array([[-0.79095619,  1.21049331,  0.20976856, ...,  0.27381826,
        -2.7924057 ,  0.20976856],
       [-0.79390461,  1.20649989,  0.20629764, ...,  0.24127527,
        -2.7943091 ,  0.20629764],
       [-0.79534857,  1.20311795,  0.20388469, ...,  0.17779601,
        -2.79381509,  0.20388469],
       ...,
       [-0.79395224,  1.20310346,  0.20457561, ...,  0.21655754,
        -2.79100794,  0.20457561],
       [-0.79684507,  1.20337675,  0.20326584, ...,  0.21446582,
        -2.79706689,  0.20326584],
       [-0.79508945,  1.20493898,  0.20492476, ...,  0.1678162 ,
        -2.79511788,  0.20492476]])

In [73]:
avg_user_rating

array([3.79291392, 3.79634311, 3.79667185, 3.79423653, 3.82073502,
       3.80307328, 3.80522777, 3.78977195, 3.79678001, 3.80490058,
       3.79654926, 3.79643033, 3.78226176, 3.79556661, 3.79673956,
       3.79552167, 3.797025  , 3.79397581, 3.7940936 , 3.79589396,
       3.79709636, 3.80385936, 3.77688461, 3.80664769, 3.79654121,
       3.79709609, 3.80236371, 3.79563743, 3.80403545, 3.79764826,
       3.79629806, 3.79637901, 3.78530414, 3.79519341, 3.79754804,
       3.79309643, 3.79790997, 3.79535992, 3.7974874 , 3.80162379,
       3.79676614, 3.79765419, 3.79619246, 3.79614461, 3.79812172,
       3.79654926, 3.79077052, 3.79762333, 3.7998803 , 3.79110997,
       3.79663648, 3.79647483, 3.80145392, 3.7932477 , 3.79439511,
       3.79698535, 3.79627175, 3.79746109, 3.79625174, 3.79643033,
       3.79643033, 3.79643033, 3.79515466, 3.79688876, 3.79754998,
       3.79486843, 3.79547885, 3.79717565, 3.7966682 , 3.79745788,
       3.80610679, 3.79229618, 3.79733423, 3.79721809, 3.79787

In [48]:
# TODO: 