In [1]:
import operator
import numpy as np
import pandas as pd
import csv
import surprise
import time
import sklearn.preprocessing as prepro
from surprise import SVD
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
import collections
from surprise.model_selection import cross_validate

In [4]:
# Load the ratings file (';'-separated, latin-1) and build two row lists:
# the ratings of the 1000 most-rated books and the ratings of the 1500 most
# active users, both laid out as [userID, ISBN, bookRating].
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0; on newer pandas use `on_bad_lines='skip'` instead.
items = []  # kept from the original cell; not read anywhere in this section
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding="latin-1")
ratings.columns = ['userID', 'ISBN', 'bookRating']
ratings_not_zero = ratings.loc[ratings['bookRating'] != 0]
print(ratings_not_zero.shape)

users = ratings['userID'].values.tolist()
books = ratings['ISBN'].values.tolist()

# Popularity is measured as the number of rating rows per key; the counts are
# taken on the unfiltered `ratings`, so rows with bookRating == 0 are included.
books_rating_count = pd.DataFrame(ratings.groupby('ISBN')['bookRating'].count())
frequent_books = books_rating_count.sort_values('bookRating', ascending=False).head(1000)
users_rating_count = pd.DataFrame(ratings.groupby('userID')['bookRating'].count())
frequent_users = users_rating_count.sort_values('bookRating', ascending=False).head(1500)

# Turn each "top" index into a one-column frame and inner-join it with the
# ratings to keep only the rows that belong to a frequent book / user.
most_rated_books = pd.DataFrame(frequent_books.index, index=np.arange(len(frequent_books.index)), columns = ['ISBN'])
most_rated_books_ratings = pd.merge(most_rated_books, ratings, on='ISBN')

most_rated_users = pd.DataFrame(frequent_users.index, index=np.arange(len(frequent_users.index)), columns = ['userID'])
most_rated_users_ratings = pd.merge(most_rated_users, ratings, on='userID')

# The book-side merge yields columns (ISBN, userID, bookRating). Reorder them
# before converting to lists so every row is [userID, ISBN, bookRating], the
# same layout as the user-side rows (this replaces a per-row Python swap loop).
most_rated_books_ratings = most_rated_books_ratings[['userID', 'ISBN', 'bookRating']].values.tolist()
print("Most rated books ratings: ",len(most_rated_books_ratings))

most_rated_users_ratings = most_rated_users_ratings.values.tolist()
print("Most rated users ratings: ",len(most_rated_users_ratings))

(433671, 3)
Most rated books ratings:  162767
Most rated users ratings:  620489


In [5]:
# Keep only the rating rows that appear in BOTH lists, i.e. ratings given by a
# frequent user to a frequent book. Rows are compared as whole
# (userID, ISBN, bookRating) tuples.
book_side_rows = {tuple(row) for row in most_rated_books_ratings}
user_side_rows = {tuple(row) for row in most_rated_users_ratings}
common_rows = [list(row) for row in book_side_rows & user_side_rows]
print("Intersection list: ",len(common_rows))

intersection = pd.DataFrame(common_rows, columns = ['userID','ISBN','bookRating'])
print("Intersection dataframe: ",intersection.shape)
print(intersection.head())
print(intersection.describe())

Intersection list:  73101
Intersection dataframe:  (73101, 3)
   userID        ISBN  bookRating
0   67930  0440221315          10
1  234623  0553280368           0
2  216466  0553292722           0
3   47316  0671004530          10
4   43246  0375703764           0
              userID    bookRating
count   73101.000000  73101.000000
mean   139970.253731      2.146811
std     80687.243061      3.639671
min       254.000000      0.000000
25%     69405.000000      0.000000
50%    138844.000000      0.000000
75%    211426.000000      5.000000
max    278418.000000     10.000000


In [6]:
# Training rows: every intersection row whose rating is not 0.
has_rating = intersection['bookRating'].ne(0)
nData = intersection[has_rating]
nData.head()

Unnamed: 0,userID,ISBN,bookRating
0,67930,440221315,10
3,47316,671004530,10
9,125203,786817070,5
13,230522,440940001,10
20,267635,446360589,8


In [8]:
# Wrap the non-zero ratings in a Surprise dataset and train on all of it
# (no hold-out split). Columns are passed in user, item, rating order.
labels = ['userID', 'ISBN', 'bookRating']
reader = Reader(rating_scale=(1, 10))
training_frame = nData.loc[:, labels]
dataTrain = Dataset.load_from_df(training_frame, reader)
trainset = dataTrain.build_full_trainset()

In [21]:
# Train an SVD model on the full training set.
# A fixed seed is passed because the model is randomly initialised; without
# it the estimates written back into `ratings` later change on every run.
# 42 is an arbitrary choice — any fixed integer works.
svd = surprise.SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1ab17178080>

In [22]:
# Rows with bookRating == 0 are the ones to be predicted; Surprise's test()
# takes them as (user, item, rating) tuples.
zero_rated = intersection['bookRating'] == 0
testData = intersection.loc[zero_rated]
print("Test data:")
print(testData.head())
testset = list(map(tuple, testData.values))

Test data:
   userID        ISBN  bookRating
1  234623  0553280368           0
2  216466  0553292722           0
4   43246  0375703764           0
5  127233  0374199698           0
6  245827  0449912558           0


In [26]:
# Estimate a rating for every zero-rated (user, book) pair, then show the
# count and two sample predictions (positions 0 and 2).
predictions_svd = svd.test(testset)
print(len(predictions_svd))
for position in (0, 2):
    print(predictions_svd[position])

53347
user: 234623     item: 0553280368 r_ui = 0.00   est = 8.13   {'was_impossible': False}
user: 43246      item: 0375703764 r_ui = 0.00   est = 8.78   {'was_impossible': False}


In [29]:
# Sanity check: find the row in `ratings` that the second prediction refers
# to. Index 0 of a prediction is the user id and index 1 the item id; the
# item id is cast to str to compare against the ISBN column.
sample_prediction = predictions_svd[1]
ratings.loc[(ratings['userID'] == sample_prediction[0]) & (ratings['ISBN'] == str(sample_prediction[1]))]

Unnamed: 0,userID,ISBN,bookRating
895204,216466,553292722,0


In [None]:
# Write every SVD estimate back into `ratings` for its (userID, ISBN) pair.
# One left merge replaces the original loop, which rebuilt a boolean mask over
# the whole `ratings` frame once per prediction.
if predictions_svd:
    # Prediction layout used here: [0] user id, [1] item id, [3] estimate.
    estimates = pd.DataFrame(
        [(pred[0], str(pred[1]), pred[3]) for pred in predictions_svd],
        columns=['userID', 'ISBN', 'est'],
    )
    # Should a pair occur more than once, keep the last estimate: that is the
    # value a sequential overwrite would have left in place.
    estimates = estimates.drop_duplicates(subset=['userID', 'ISBN'], keep='last')

    # how='left' with unique right-hand keys returns exactly one row per
    # `ratings` row, in the same order, so the result lines up by position.
    matched = ratings[['userID', 'ISBN']].merge(estimates, on=['userID', 'ISBN'], how='left')
    new_values = matched['est'].to_numpy(dtype='float64')
    has_estimate = ~np.isnan(new_values)

    if has_estimate.any():
        # Estimates are floats, so the rating column becomes float64; rows
        # without an estimate keep their existing value.
        ratings['bookRating'] = np.where(
            has_estimate, new_values, ratings['bookRating'].to_numpy()
        )

In [38]:
# Persist the current `ratings` frame as a ';'-separated CSV, without the
# DataFrame index column.
ratings.to_csv('denser_ratings_1.csv', sep=';', index=False)

In [39]:
# Read the exported file back and count its non-zero ratings, so the check
# reflects what was actually written to disk. The original filtered the
# in-memory `ratings` frame here and never looked at the reloaded data.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0; on newer pandas use `on_bad_lines='skip'` instead.
denser_ratings = pd.read_csv('denser_ratings_1.csv', sep=';', error_bad_lines=False, encoding="latin-1")
denser_ratings.columns = ['userID', 'ISBN', 'bookRating']
denser_ratings_not_zero = denser_ratings.loc[denser_ratings['bookRating'] != 0]
print(denser_ratings_not_zero.shape)

(487018, 3)
