In [1]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd
import sklearn.model_selection
import scipy
import sklearn.metrics.pairwise as pw
from time import time

In [2]:
# Specify your directory
DIR = r'C:\Users\Jackie\OneDrive - Georgia Institute of Technology\ISYE6740 Computational Data Analytics\Project\Data\Genres\YoungAdult'

In [3]:
def load_data(file_name):
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
    return data

In [4]:
# The Young Adult genre has the highest interaction / book ratio: 34,919,254 interactions and 93,398 books
#interactions = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'))

In [5]:
def load_data(file_name, head = 10000): # should be 1000000
    count = 0
    data = []
    with gzip.open(file_name) as fin:
        for l in fin:
            d = json.loads(l)
            count += 1
            data.append(d)
            
            # break if reaches the "head"th line
            if (head is not None) and (count > head):
                break
    return data

In [6]:
interactions_sample = load_data(os.path.join(DIR, 'goodreads_interactions_young_adult.json.gz'))

In [7]:
print(' == sample record (interaction) ==')
display(np.random.choice(interactions_sample))

 == sample record (interaction) ==


{'user_id': '6e3fbaf57959cb9900714c249bce173f',
 'book_id': '6624871',
 'review_id': 'ea2a3279f98c4972c40359a55b1d60ed',
 'is_read': False,
 'rating': 0,
 'review_text_incomplete': '',
 'date_added': 'Wed Jul 25 08:29:53 -0700 2012',
 'date_updated': 'Wed Jul 25 08:29:53 -0700 2012',
 'read_at': '',
 'started_at': ''}

In [8]:
df = pd.json_normalize(interactions_sample)

In [9]:
# Filter to books that the user has read
is_read = np.logical_and(df['is_read'] == True, df['rating'] != 0) # a book can be read but still have a rating of 0
is_read_df = df[is_read]

In [10]:
is_read_df

Unnamed: 0,user_id,book_id,review_id,is_read,rating,review_text_incomplete,date_added,date_updated,read_at,started_at
3,8842281e1d1347389f2ab93d60773d4d,8684868,d29b8238762d70b7c2b67941bc81fbe0,True,3,,Tue Dec 17 13:42:25 -0800 2013,Tue Dec 17 13:47:26 -0800 2013,Sun Dec 15 00:00:00 -0800 2013,Sat Dec 14 00:00:00 -0800 2013
4,8842281e1d1347389f2ab93d60773d4d,8423493,357c8c178fd0e06cff5c025649231672,True,2,,Sun Dec 08 01:26:12 -0800 2013,Tue Dec 27 05:37:48 -0800 2016,Tue Dec 10 00:00:00 -0800 2013,
5,8842281e1d1347389f2ab93d60773d4d,87976,e6306259819c47f278e78d32a1b73ddf,True,5,,Wed Oct 17 17:49:11 -0700 2012,Wed Oct 17 17:49:11 -0700 2012,,
6,8842281e1d1347389f2ab93d60773d4d,18116,8aa100c7b681e9fedbe2c6bd2c25696a,True,5,,Fri Apr 29 13:14:05 -0700 2011,Fri Apr 29 13:14:05 -0700 2011,,
7,8842281e1d1347389f2ab93d60773d4d,2767052,248c011811e945eca861b5c31a549291,True,5,I cracked and finally picked this up. Very enj...,Wed Jan 13 13:38:25 -0800 2010,Wed Mar 22 11:46:36 -0700 2017,Sun Mar 25 00:00:00 -0700 2012,Fri Mar 23 00:00:00 -0700 2012
...,...,...,...,...,...,...,...,...,...,...
9996,4a44f603cc3df339acc48590044a2db0,17261628,2310bbf77a0d72234e4739a3adf7194d,True,3,,Sat Jan 24 21:56:03 -0800 2015,Sat Dec 19 17:16:14 -0800 2015,Sun Jan 25 17:01:54 -0800 2015,Sat Jan 24 21:56:03 -0800 2015
9997,4a44f603cc3df339acc48590044a2db0,18190208,c1c267254139f66d2b7d26d27a423c1d,True,4,See full review for The Witch Hunter at: <a ta...,Thu Jan 15 09:52:14 -0800 2015,Sat Jul 18 09:16:21 -0700 2015,Fri Jul 10 00:00:00 -0700 2015,Fri Jul 10 00:00:00 -0700 2015
9998,4a44f603cc3df339acc48590044a2db0,22308716,0c71f23060e6513f28cad1496b6a7e61,True,5,"See full review for Red Queen at: <a target=""_...",Thu Jan 15 09:51:38 -0800 2015,Sat Dec 19 17:20:26 -0800 2015,Tue Mar 03 00:00:00 -0800 2015,Thu Feb 26 00:00:00 -0800 2015
9999,4a44f603cc3df339acc48590044a2db0,18967205,911a06b2f968713818e78544434a330e,True,4,"See full review for Hellhole at: <a target=""_b...",Thu Jan 15 09:50:59 -0800 2015,Sat Feb 28 15:58:19 -0800 2015,Fri Feb 27 00:00:00 -0800 2015,Fri Feb 27 00:00:00 -0800 2015


In [11]:
# Define user-book rating matrix
mat = pd.pivot(is_read_df, index='user_id', columns='book_id', values='rating')
mat.replace(0, np.nan, inplace=True)

In [12]:
mat

book_id,10001793,10004138,10025007,10025305,1003318,1008231,10118172,10151730,10165727,10165761,...,9917938,9917945,9918053,9918133,9939115,9947386,99561,9961796,9972882,9975679
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
012515e5802b2e0f42915118c90fa04b,,,,,,,,,,,...,,,,,,,3.0,,,
012aa353140af13109d00ca36cdc0637,,,,,,,,,,,...,,,,,,,,,,
01d02898170634e6e7232650ebbf2e43,,,,,,,,,,,...,,,,,,,,,,
01ec1a320ffded6b2dd47833f2c8e4fb,,,,,,,,,,,...,,,,,,,,,,
040b31603912dc03f19e0b76d58c3660,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
f4d16ea4ac59af59d257631398af39f4,,,,5.0,,,,,,,...,,,,2.0,,,,,,
f88032f4ad97b46654fe59ce3387cf5d,,,,,,,,,,,...,,,,,,,,,,
f8a89075dc6de14857561522e729f82c,3.0,5.0,,,,,,,,,...,,,,,,,,,,
faa322d2624b0e7eb3064e39dac4af9c,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Calculate sparsity of data
not_null = mat.count().sum()
sparsity = 1 - not_null / (mat.shape[0] * mat.shape[1])
sparsity

0.986157576501642

In [14]:
mat.count().sort_values(ascending=False)

book_id
2767052     70
6148028     52
7260188     50
41865       43
11870085    37
            ..
17899392     1
17862206     1
17860222     1
17860199     1
9975679      1
Length: 2102, dtype: int64

In [15]:
# Train/test split
train, test = sklearn.model_selection.train_test_split(mat, test_size=0.1, random_state=1111)

In [16]:
train

book_id,10001793,10004138,10025007,10025305,1003318,1008231,10118172,10151730,10165727,10165761,...,9917938,9917945,9918053,9918133,9939115,9947386,99561,9961796,9972882,9975679
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8b6ad6654855f6d11b276d41df968de5,,,,,,,,,,,...,,,,,,,,,,
bafc2d50014200cda7cb2b6acd60cd73,,,,,,,,,,,...,,,,,,,,,,
7f3fb103a5597aae9ff34e5ac49a77bb,,,,,,,,,,,...,,,,,,,,,,
9b5dc6789acb444bef4f9efbfdb3654f,,,,,,,,,,,...,,,,,,,,,,
dcaf63d82422e351590aba2b36950a17,,,,,,,,,,,...,,,,,,,5.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0f191806c278059bcc7a1015ff7c5420,,,,,,,,,,,...,,,,,,,,,,
a34e60cb88cea96fd767a9db81f6f5c8,,,,,,,,,,,...,,,,,,,,,,
d20102b939fd040355d6c2452230f709,,,,,,,,,,,...,,,,,,,,,1.0,
74923eed26118d852196f9aa18070272,,,,,,,,,,,...,,,,,,,,,,


In [17]:
test

book_id,10001793,10004138,10025007,10025305,1003318,1008231,10118172,10151730,10165727,10165761,...,9917938,9917945,9918053,9918133,9939115,9947386,99561,9961796,9972882,9975679
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ac66a7e4003e36fb1c360e0b375e9d07,,,,,,,,,,,...,,,,,,,,,,
8ecbc8ba108ccf1be156d99a7415c44d,,,,,,,,,,,...,,,,,,,,,,
ab2923b738ea3082f5f3efcbbfacb218,,,,,,,,,,,...,,,,,,,,,,
24f6896efc60c9e52a3df7e77d3524c7,,,,,,,,,,,...,,,,,,,,,,
4b3636a043e5c99fa27ac897ccfa1151,,,,,,,,,,,...,,,,,,,,,,
197ee8c6624f698557603a05ad670d45,,,,,,,,,,,...,,,,,,,,,,
c7b9a63678911865302a204f45d6cc2d,,,,,,,,,,,...,,,,,,,,,,
7b2e5fe9fd353fecf3eeebb4850b88d3,,,,,,,,,,,...,4.0,,,,,,,,,
72fb0d0087d28c832f15776b0d936598,,,,,,,,,,,...,,,,,,,,,,
fd379cf294fc1937e41f3f7df3c9eabe,,,,,,,,,,,...,,,,,,,,,,


In [18]:
# Calculate average rating for each book over users
avg_book_rating = np.nanmean(train, axis=0) # column avg r_j
# For empty columns, fill with avg rating of 3 
np.nan_to_num(avg_book_rating, copy=False, nan=3.0)

# Fill missing value with the appropriate average for that book
filled = np.where(np.isnan(train), avg_book_rating, train)

  avg_book_rating = np.nanmean(train, axis=0)


In [71]:
# Remove from each user (row) the mean over all books
avg_user_rating = np.average(filled, axis=1) # row avg r_i
avg_user_rating_col_vec = avg_user_rating.reshape(len(avg_user_rating),1)
normalized = filled - avg_user_rating_col_vec

In [72]:
normalized

array([[-0.79291392,  1.20708608,  0.20708608, ...,  0.20708608,
        -2.79291392,  0.20708608],
       [-0.79634311,  1.20365689,  0.20365689, ...,  0.20365689,
        -2.79634311,  0.20365689],
       [-0.79667185,  1.20332815,  0.20332815, ...,  0.20332815,
        -2.79667185,  0.20332815],
       ...,
       [-0.79713961,  1.20286039,  0.20286039, ...,  0.20286039,
        -2.79713961,  0.20286039],
       [-0.79616853,  1.20383147,  0.20383147, ...,  0.20383147,
        -2.79616853,  0.20383147],
       [-0.79484286,  1.20515714,  0.20515714, ...,  0.20515714,
        -2.79484286,  0.20515714]])

In [73]:
normalized.shape

(111, 2102)

In [22]:
# Perform SVD, decide on a rank k, and calculate a reduced rating matrix to remove noise
# Can use sparse matrix algorithms because of special structure
# How to choose k: if top few values are big and the rest are small, take the big values. So chose k=11
U, S, VT = scipy.sparse.linalg.svds(normalized, k=6) # CHANGE BACK TO k=11

In [75]:
#U, S, VT = np.linalg.svd(normalized) #k=10

In [79]:
# U = U[:, :9]
# S = S[:9]
# VT = VT[:9, :]

In [80]:
S

array([449.74509418,  12.41419825,  11.3572002 ,  10.0172646 ,
         9.2065746 ,   8.35799759,   7.93735936,   7.81951299,
         7.04573656])

In [81]:
U.shape, S.shape, VT.shape

((111, 9), (9,), (9, 2102))

In [82]:
reduced_rating = U @ np.diag(S) @ VT

In [91]:
reduced_rating # SHOULD THIS BE >=1?

array([[-0.79156237,  1.21044871,  0.20944317, ...,  0.25708654,
        -2.79357344,  0.20944317],
       [-0.79475066,  1.20589657,  0.20557296, ...,  0.25979132,
        -2.79539789,  0.20557296],
       [-0.79520382,  1.20325208,  0.20402413, ...,  0.18426777,
        -2.79365972,  0.20402413],
       ...,
       [-0.79395108,  1.2030251 ,  0.20453701, ...,  0.22599539,
        -2.79092726,  0.20453701],
       [-0.7983921 ,  1.20320116,  0.20240453, ...,  0.16230349,
        -2.79998536,  0.20240453],
       [-0.79503577,  1.20487234,  0.20491828, ...,  0.17570092,
        -2.79494389,  0.20491828]])

In [83]:
reduced_rating.shape

(111, 2102)

In [84]:
# Calculate matrix product for dimensionality-reduced rating matrix
sqrtS = np.diag(np.sqrt(S))
meta_on_users = U @ np.transpose(sqrtS)
meta_on_items = sqrtS @ VT

In [85]:
meta_on_items

array([[-3.95663787e-01,  5.97927582e-01,  1.01131898e-01, ...,
         1.01165920e-01, -1.38925516e+00,  1.01131898e-01],
       [ 1.00084477e-02,  1.44539259e-02,  1.22311868e-02, ...,
        -4.81789509e-02,  5.56296951e-03,  1.22311868e-02],
       [ 9.81846018e-03,  5.73191873e-03,  7.77518945e-03, ...,
        -4.38219709e-02,  1.39050016e-02,  7.77518945e-03],
       ...,
       [ 1.57411751e-03,  3.27378165e-03,  2.42394958e-03, ...,
        -1.36740053e-01, -1.25546630e-04,  2.42394958e-03],
       [-2.73117187e-03,  1.45773024e-03, -6.36720817e-04, ...,
        -2.44326340e-01, -6.92007399e-03, -6.36720817e-04],
       [ 6.78411526e-04,  7.58956242e-04,  7.18683884e-04, ...,
        -7.51428547e-02,  5.97866810e-04,  7.18683884e-04]])

In [29]:
# # Calculate similarity of 2 books j and f based on the reduced rating matrix
# # Similarity is the correlation of 2 movies based on average over users
# def similarity(j, f, matrix):
#     j_col = matrix[:, j]
#     f_col = matrix[:, f]
#     numerator = np.sum(np.multiply(j_col, f_col))
#     denominator = np.sqrt(np.multiply(np.sum(np.square(j_col)), np.sum(np.square(f_col))))
#     return numerator / denominator

In [30]:
# num_books = reduced_rating.shape[1]
# t1 = time()
# similarities = np.zeros((num_books, num_books))
# # TODO: is there a faster way to do this?
# for i in range(num_books):
#     for j in range(num_books):
#         similarities[i, j] = similarity(i, j, meta_on_items)
# t2 = time()

In [31]:
# t2-t1

In [32]:
# num_books = reduced_rating.shape[1]
# t1 = time()
# col_prods = meta_on_items.T.dot(meta_on_items)

# similarities2 = np.zeros((num_books, num_books))
# # TODO: is there a faster way to do this?
# for i in range(num_books):
#     for j in range(i, num_books):
#         similarities2[i, j] = col_prods[i, j] / np.sqrt(col_prods[i, i] * col_prods[j, j])
#         similarities2[j, i] = similarities2[i, j]
# t2 = time()

In [33]:
# similarities.shape, t2-t1, np.allclose(similarities, similarities2)

In [86]:
num_books = reduced_rating.shape[1]
t1 = time()
similarities = pw.cosine_similarity(meta_on_items.T, meta_on_items.T)
t2 = time()

In [87]:
similarities.shape, t2-t1

((2102, 2102), 0.009905815124511719)

In [36]:
# # Without reducing dimensionality
# num_books = reduced_rating.shape[1]
# t1 = time()
# similarities = pw.cosine_similarity(reduced_rating.T, reduced_rating.T)
# t2 = time()
# similarities.shape, t2-t1

In [88]:
# Calculate prediction of rating by user i of book j, by avging over the ratings of all books by the same user, each weighted according to its similarity to book j
num_users = reduced_rating.shape[0]
predictions = np.zeros((num_users, num_books))
addends = np.zeros(num_books)

# Isolate the set of items which appear to be the most similar to the active item. Vozalis suggests 10 items is best.
most_similar_idx = np.argsort(similarities, axis=1)[:,-10:]
for a in range(num_users):
    for j in range(num_books):
        for k in most_similar_idx[j,:]:
            addends[k] = similarities[j,k] * (reduced_rating[a,k] + avg_user_rating[a])
        numerator = np.sum(addends)
        denominator = np.sum(np.absolute(similarities[j,:]))
        predictions[a,j] = numerator/denominator

In [89]:
predictions

array([[0.01541752, 0.0410281 , 0.0619847 , ..., 3.11454429, 0.89468282,
        0.90041843],
       [0.8968088 , 0.89481871, 0.90125114, ..., 3.12542109, 0.89780727,
        0.90356291],
       [0.89993595, 0.89793282, 0.90438363, ..., 3.12804407, 0.89856075,
        0.90432123],
       ...,
       [0.89952094, 0.8975393 , 0.90400883, ..., 3.12684731, 0.89821699,
        0.90397526],
       [0.90031954, 0.89832336, 0.90476725, ..., 3.13288994, 0.89995274,
        0.90572214],
       [0.90209452, 0.90010026, 0.90657908, ..., 3.12474608, 0.89761336,
        0.90336776]])

In [51]:
np.argsort(similarities, axis=0)

array([[1929, 1510, 1969, ...,  168, 1743, 1969],
       [ 451,   65, 1790, ...,  867,   26, 1790],
       [1777, 1025, 1959, ...,  860,  451, 1959],
       ...,
       [ 850,  922, 1345, ..., 1682,  192, 1345],
       [ 884, 1642,  280, ..., 1635,  164,  280],
       [   0,  262, 2101, ..., 2099, 1050, 2101]], dtype=int64)

In [56]:
testing = np.array([[1,5,8,3,7],[1,2,3,4,5]])
sorted_testing = np.argsort(testing, axis=1)
sorted_testing[:,-2:]

array([[4, 2],
       [3, 4]], dtype=int64)

In [67]:
np.all(predictions < 5), np.all(predictions > 1)

(False, False)

In [68]:
np.all(reduced_rating < 5), np.all(reduced_rating > 1)

(True, False)

In [90]:
reduced_rating

array([[-0.79156237,  1.21044871,  0.20944317, ...,  0.25708654,
        -2.79357344,  0.20944317],
       [-0.79475066,  1.20589657,  0.20557296, ...,  0.25979132,
        -2.79539789,  0.20557296],
       [-0.79520382,  1.20325208,  0.20402413, ...,  0.18426777,
        -2.79365972,  0.20402413],
       ...,
       [-0.79395108,  1.2030251 ,  0.20453701, ...,  0.22599539,
        -2.79092726,  0.20453701],
       [-0.7983921 ,  1.20320116,  0.20240453, ...,  0.16230349,
        -2.79998536,  0.20240453],
       [-0.79503577,  1.20487234,  0.20491828, ...,  0.17570092,
        -2.79494389,  0.20491828]])

In [44]:
avg_user_rating

array([3.79291392, 3.79634311, 3.79667185, 3.79423653, 3.82073502,
       3.80307328, 3.80522777, 3.78977195, 3.79678001, 3.80490058,
       3.79654926, 3.79643033, 3.78226176, 3.79556661, 3.79673956,
       3.79552167, 3.797025  , 3.79397581, 3.7940936 , 3.79589396,
       3.79709636, 3.80385936, 3.77688461, 3.80664769, 3.79654121,
       3.79709609, 3.80236371, 3.79563743, 3.80403545, 3.79764826,
       3.79629806, 3.79637901, 3.78530414, 3.79519341, 3.79754804,
       3.79309643, 3.79790997, 3.79535992, 3.7974874 , 3.80162379,
       3.79676614, 3.79765419, 3.79619246, 3.79614461, 3.79812172,
       3.79654926, 3.79077052, 3.79762333, 3.7998803 , 3.79110997,
       3.79663648, 3.79647483, 3.80145392, 3.7932477 , 3.79439511,
       3.79698535, 3.79627175, 3.79746109, 3.79625174, 3.79643033,
       3.79643033, 3.79643033, 3.79515466, 3.79688876, 3.79754998,
       3.79486843, 3.79547885, 3.79717565, 3.7966682 , 3.79745788,
       3.80610679, 3.79229618, 3.79733423, 3.79721809, 3.79787

In [41]:
# TODO: 