In [22]:
import operator
import numpy as np
import pandas as pd
import csv
import surprise
import time
import sklearn.preprocessing as prepro
from surprise import SVD
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
import collections
from surprise.model_selection import cross_validate
from IPython.display import Image, display
from IPython.core.display import HTML

In [3]:
denserRatings = pd.read_csv('denser_ratings_1.csv', sep=';', error_bad_lines=False, encoding="latin-1")
denserRatings.columns = ['userID', 'ISBN', 'bookRating']

ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']

time_MF = []
time_SVD = []
RMS_MF = []
RMS_SVD = []

print(denserRatings.head())
# print(dRatings.describe())
print(ratings.head())
# print(ratings.describe())

   userID        ISBN  bookRating
0  276725  034545104X         0.0
1  276726  0155061224         5.0
2  276727  0446520802         0.0
3  276729  052165615X         3.0
4  276729  0521795028         6.0
   userID        ISBN  bookRating
0  276725  034545104X           0
1  276726  0155061224           5
2  276727  0446520802           0
3  276729  052165615X           3
4  276729  0521795028           6


In [9]:
class MatrixFacto(surprise.AlgoBase):
    '''A basic rating prediction algorithm based on matrix factorization.'''
    skip_train=0

    def __init__(self, learning_rate, n_epochs, n_factors):

        self.lr = learning_rate  # learning rate for SGD
        self.n_epochs = n_epochs  # number of iterations of SGD
        self.n_factors = n_factors  # number of factors

    def train(self, trainset):
        '''Learn the vectors p_u and q_i with SGD'''

        print('Fitting data with SGD...')

        # Randomly initialize the user and item factors.
        p = np.random.normal(0, .1, (trainset.n_users, self.n_factors))
        q = np.random.normal(0, .1, (trainset.n_items, self.n_factors))

        # SGD procedure
        for _ in range(self.n_epochs):
            for u, i, r_ui in trainset.all_ratings():
                err = r_ui - np.dot(p[u], q[i])
                # Update vectors p_u and q_i
                p[u] += self.lr * err * q[i]
                q[i] += self.lr * err * p[u]
                # Note: in the update of q_i, we should actually use the previous (non-updated) value of p_u.
                # In practice it makes almost no difference.

        self.p, self.q = p, q
        self.trainset = trainset

    def estimate(self, u, i):
        '''Return the estmimated rating of user u for item i.'''

        # return scalar product between p_u and q_i if user and item are known,
        # else return the average of all ratings
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return np.dot(self.p[u], self.q[i])
        else:
            return self.trainset.global_mean

In [4]:
def overWatch(tdata):
    data_df = tdata.loc[tdata['bookRating'] != 0]
    print(data_df['bookRating'].count())
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(data_df[['userID', 'ISBN', 'bookRating']], reader)
    data.split(2)
    s_SVD(data)
    MatFac(data)

In [5]:
def MatFac(dat):
    temp = time.time()
    algo = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)
    res = surprise.evaluate(algo, dat, measures=['RMSE'])
    time_MF.append(time.time() - temp)
    RMS_MF.append(np.mean(res['rmse']))

In [6]:
def s_SVD(dat):
    temp = time.time()
    algo = surprise.SVD()
    res = surprise.evaluate(algo, dat, measures=['RMSE'])
    time_SVD.append(time.time() - temp)
    RMS_SVD.append(np.mean(res['rmse']))

In [10]:
overWatch(denserRatings)

487018




Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 1.5734
------------
Fold 2
RMSE: 1.5710
------------
------------
Mean RMSE: 1.5722
------------
------------
Evaluating RMSE of algorithm MatrixFacto.

------------
Fold 1
Fitting data with SGD...




RMSE: 2.4712
------------
Fold 2
Fitting data with SGD...




RMSE: 2.4641
------------
------------
Mean RMSE: 2.4676
------------
------------


In [13]:
no_inter = 73101
no_rati = ratings['bookRating'].count()
per1 = (100.0*no_inter)/no_rati
per2 = (100.0*(487018-433671))/no_rati
print("% data used in ESVD's first part: ", per1)
print("% data increase for ESVD's second part: ", per2)

% data used in ESVD's first part:  6.357824975212649
% data increase for ESVD's second part:  4.639757170937049


In [14]:
print("Original Ratings: ")
print("Mean MF_RMSE: ", RMS_MF[1], "Mean SVD_RMSE: ", RMS_SVD[1])
print("Denser Ratings(Enhanced): ")
print("Mean MF_RMSE: ", RMS_MF[0], "Mean SVD_RMSE: ", RMS_SVD[0])
print("Note: Rating Range is 1 to 10 and there is 4.63% more data(total 487K) in the denser ratings matrix, in this case.")

Original Ratings: 
Mean MF_RMSE:  2.4910633790239727 Mean SVD_RMSE:  1.572203667942619
Denser Ratings(Enhanced): 
Mean MF_RMSE:  2.4676401137616546 Mean SVD_RMSE:  1.5735536291280057
Note: Rating Range is 1 to 10 and there is 4.63% more data(total 487K) in the denser ratings matrix, in this case.


In [16]:
data_df = denserRatings.loc[denserRatings['bookRating'] != 0]
print(data_df['bookRating'].count())
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(data_df[['userID', 'ISBN', 'bookRating']], reader)
data.split(2)
algo = surprise.SVD()
res = surprise.evaluate(algo, data, measures=['RMSE'])

487018




Evaluating RMSE of algorithm SVD.

------------
Fold 1
RMSE: 1.5752
------------
Fold 2
RMSE: 1.5697
------------
------------
Mean RMSE: 1.5725
------------
------------


In [17]:
books = pd.read_csv("BX-Books.csv", sep=';', error_bad_lines=False, encoding="latin-1")
books.columns = ['ISBN', 'Book-Title', 'Book-Author','Year-Of-Publication','Publisher','is','im', 'il']
print(books['ISBN'].head())

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


0    0195153448
1    0002005018
2    0060973129
3    0374157065
4    0393045218
Name: ISBN, dtype: object


  interactivity=interactivity, compiler=compiler, result=result)


In [18]:
u1_td = ratings.loc[(ratings['userID'] == 8) & (ratings['bookRating'] != 0)]
allBooks = books['ISBN'] 
ratedBooks = u1_td['ISBN']

print(allBooks.shape)

remainingBooks = pd.concat([allBooks, ratedBooks]).drop_duplicates(keep=False)

print(remainingBooks.shape)
print(remainingBooks.describe())
rBooks = pd.DataFrame(columns = ['userID', 'ISBN', 'ratings'])

rBooks['ISBN'] = remainingBooks
rBooks['userID'] = 8
rBooks['ratings'] = 0

print(rBooks.head())
print(rBooks.describe())

testset = [tuple(x) for x in rBooks.values]
print(len(testset))

(271360,)
(271353,)
count         271353
unique        271353
top       0517141434
freq               1
Name: ISBN, dtype: object
   userID        ISBN  ratings
0       8  0195153448        0
2       8  0060973129        0
3       8  0374157065        0
4       8  0393045218        0
5       8  0399135782        0
         userID   ratings
count  271353.0  271353.0
mean        8.0       0.0
std         0.0       0.0
min         8.0       0.0
25%         8.0       0.0
50%         8.0       0.0
75%         8.0       0.0
max         8.0       0.0
271353


In [20]:
predictions = algo.test(testset)
print(len(predictions))
print(predictions[0:50])

271353
[Prediction(uid=8, iid='0195153448', r_ui=0, est=6.900719298510548, details={'was_impossible': False}), Prediction(uid=8, iid='0060973129', r_ui=0, est=6.914778548430258, details={'was_impossible': False}), Prediction(uid=8, iid='0374157065', r_ui=0, est=6.866552743346413, details={'was_impossible': False}), Prediction(uid=8, iid='0393045218', r_ui=0, est=6.900719298510548, details={'was_impossible': False}), Prediction(uid=8, iid='0399135782', r_ui=0, est=7.105390900247622, details={'was_impossible': False}), Prediction(uid=8, iid='0425176428', r_ui=0, est=6.818065771053425, details={'was_impossible': False}), Prediction(uid=8, iid='0671870432', r_ui=0, est=6.818725597461178, details={'was_impossible': False}), Prediction(uid=8, iid='0679425608', r_ui=0, est=6.900719298510548, details={'was_impossible': False}), Prediction(uid=8, iid='0771074670', r_ui=0, est=6.900719298510548, details={'was_impossible': False}), Prediction(uid=8, iid='080652121X', r_ui=0, est=6.900719298510548

In [32]:
rated = data_df.loc[(data_df['userID'] == 8)]
sRated = rated.sort_values('bookRating', ascending=False)
risbn = [x for x in sRated['ISBN'].values]
print("Top rated books by this user:")
rlen = min(10,len(risbn))
fav_authors = []
for i in range(rlen):
    isbn = risbn[i]
    book = books.loc[(books['ISBN'] == isbn)]
    print(book['Book-Title'].values)
    print(book['Book-Author'].values)
    fav_authors.append(book['Book-Author'].values[0])
    print('-'*60)

Top rated books by this user:
['Goodbye to the Buttermilk Sky']
['Julia Oliver']
------------------------------------------------------------
['The Witchfinder (Amos Walker Mystery Series)']
['Loren D. Estleman']
------------------------------------------------------------
['More Cunning Than Man: A Social History of Rats and Man']
['Robert Hendrickson']
------------------------------------------------------------
['Clara Callan']
['Richard Bruce Wright']
------------------------------------------------------------
["Where You'll Find Me: And Other Stories"]
['Ann Beattie']
------------------------------------------------------------
['The Middle Stories']
['Sheila Heti']
------------------------------------------------------------
['Jane Doe']
['R. J. Kaiser']
------------------------------------------------------------


In [34]:
fav_authors = list(set(fav_authors))

In [23]:
order = sorted(predictions, key=lambda x: x.est, reverse=True)

print("Top reccomended books for this user:")
for i in range(10):
    isbn = order[i].iid
    book = books.loc[(books['ISBN'] == isbn)]
    print(book['Book-Title'].values)
    imlst = book['im'].values
    display(Image(url= imlst[0]))

Top reccomended books for this user:
['Harry Potter and the Prisoner of Azkaban (Book 3)']


['The Return of the King (The Lord of the Rings, Part 3)']


['Where the Sidewalk Ends : Poems and Drawings']


['Harry Potter and the Chamber of Secrets (Book 2)']


['Anne Frank: The Diary of a Young Girl']


['I Know This Much Is True']


["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"]


['The Cat in the Hat']


['Odd Thomas']


['Harry Potter and the Chamber of Secrets (Book 2)']


In [41]:
print(order[0:10])

[Prediction(uid=8, iid='0439136369', r_ui=0, est=8.339846197813394, details={'was_impossible': False}), Prediction(uid=8, iid='0345339738', r_ui=0, est=8.261896981947682, details={'was_impossible': False}), Prediction(uid=8, iid='0060256672', r_ui=0, est=8.112457331198378, details={'was_impossible': False}), Prediction(uid=8, iid='0439064872', r_ui=0, est=8.109038201640523, details={'was_impossible': False}), Prediction(uid=8, iid='0553296981', r_ui=0, est=8.091230714275829, details={'was_impossible': False}), Prediction(uid=8, iid='0060987561', r_ui=0, est=8.07894865292647, details={'was_impossible': False}), Prediction(uid=8, iid='059035342X', r_ui=0, est=8.07848843030184, details={'was_impossible': False}), Prediction(uid=8, iid='039480001X', r_ui=0, est=8.065659074742094, details={'was_impossible': False}), Prediction(uid=8, iid='0553802496', r_ui=0, est=8.06349363579023, details={'was_impossible': False}), Prediction(uid=8, iid='0439064864', r_ui=0, est=8.056157876728578, details=

In [38]:
books_of_fav_author = books.loc[books['Book-Author'].isin(fav_authors)]
print(books_of_fav_author)

              ISBN                                         Book-Title  \
1       0002005018                                       Clara Callan   
9       074322678X            Where You'll Find Me: And Other Stories   
12      0887841740                                 The Middle Stories   
13      1552041778                                           Jane Doe   
15      1567407781       The Witchfinder (Amos Walker Mystery Series)   
16      1575663937  More Cunning Than Man: A Social History of Rat...   
17      1881320189                      Goodbye to the Buttermilk Sky   
19575   0679731946            Picturing Will (Vintage Contemporaries)   
23312   0446605964   Never Street (Amos Walker Mysteries (Paperback))   
24623   1558021256                                     The Black Moon   
35997   067162220X             Where You'll Find Me and Other Stories   
38153   0394744187      Love Always: A Novel (Vintage Contemporaries)   
43073   0743479130  Sweet Women Lie : An Amos Walke

In [70]:
recommended_6_books = []
counter = 0
for i in range(len(predictions)):
    isbn = order[i].iid
    book_1 = books_of_fav_author.loc[(books_of_fav_author['ISBN'] == isbn)]

    if counter == 6:
       break 
    if not book_1.empty:
        counter += 1
        recommended_6_books.append(book_1['Book-Title'].values[0])
print(recommended_6_books)

['The Master Executioner', 'The Hider', 'Sugartown (Amos Walker Mysteries (Paperback))', 'Squeeze Play: A Novel', 'Never Street (Amos Walker Mysteries (Paperback))', 'The Black Moon']


In [71]:
recommended_4_books = []
counter = 0
for i in range(len(predictions)):
    isbn = order[i].iid
    book_2 = books_of_fav_author.loc[(books_of_fav_author['ISBN'] == isbn)]

    if counter == 4:
       break 
    if book_2.empty:
        counter += 1
        recommended_4_books.append(books.loc[(books['ISBN'] == isbn)].values[0][1])
print(recommended_4_books)

['Harry Potter and the Prisoner of Azkaban (Book 3)', 'The Return of the King (The Lord of the Rings, Part 3)', 'Where the Sidewalk Ends : Poems and Drawings', 'Harry Potter and the Chamber of Secrets (Book 2)']


In [74]:
recommneded_ten_books = recommended_6_books + recommended_4_books
print("Final recommendation:")
recommneded_ten_books

Final recommendation:


['The Master Executioner',
 'The Hider',
 'Sugartown (Amos Walker Mysteries (Paperback))',
 'Squeeze Play: A Novel',
 'Never Street (Amos Walker Mysteries (Paperback))',
 'The Black Moon',
 'Harry Potter and the Prisoner of Azkaban (Book 3)',
 'The Return of the King (The Lord of the Rings, Part 3)',
 'Where the Sidewalk Ends : Poems and Drawings',
 'Harry Potter and the Chamber of Secrets (Book 2)']