In [1]:
import numpy as np
import pandas as pd
import surprise as surp

In [2]:
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', 100)
#pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Taking a look at our user rating database

In [3]:
df_user_rating = pd.read_csv('ratings.csv')

In [4]:
df_user_rating.head(10)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3
5,2,26,4
6,2,315,3
7,2,33,4
8,2,301,5
9,2,2686,5


In [5]:
df_user_rating.shape

(5976479, 3)

In [6]:
df_user_rating.user_id.nunique()

53424

In [7]:
df_user_rating.book_id.nunique()

10000

In [8]:
df_user_rating.describe()

Unnamed: 0,user_id,book_id,rating
count,5976479.0,5976479.0,5976479.0
mean,26224.46,2006.48,3.92
std,15413.23,2468.5,0.99
min,1.0,1.0,1.0
25%,12813.0,198.0,3.0
50%,25938.0,885.0,4.0
75%,39509.0,2973.0,5.0
max,53424.0,10000.0,5.0


In [9]:
df_user_rating.isna().sum()

user_id    0
book_id    0
rating     0
dtype: int64

In [10]:
df_user_rating.duplicated().sum()

0

In [13]:
df_user_rating_groupby_user = (df_user_rating.groupby(['user_id'], as_index=False).agg(number_of_rated_books = ('rating', 'count')))
df_user_rating_groupby_user.describe()

Unnamed: 0,user_id,number_of_rated_books
count,53424.0,53424.0
mean,26712.5,111.87
std,15422.32,26.07
min,1.0,19.0
25%,13356.75,96.0
50%,26712.5,111.0
75%,40068.25,128.0
max,53424.0,200.0


In [14]:
df_users_wishlist = pd.read_csv('to_read.csv')

In [15]:
df_users_wishlist.isna().sum()

user_id    0
book_id    0
dtype: int64

In [17]:
df_users_wishlist.duplicated().sum()

0

In [18]:
df_users_wishlist.user_id.nunique()

48871

In [19]:
df_user_wishlist_groupby_user = (df_users_wishlist.groupby(['user_id'], as_index=False).agg(number_books_to_read = ('book_id', 'count')))
df_user_wishlist_groupby_user.describe()

Unnamed: 0,user_id,number_books_to_read
count,48871.0,48871.0
mean,26750.21,18.68
std,15265.93,16.09
min,1.0,1.0
25%,13670.5,6.0
50%,26690.0,14.0
75%,39870.5,27.0
max,53424.0,117.0


In [20]:
df_top25_reader_wishlist = (df_user_wishlist_groupby_user.query('number_books_to_read > 26')).copy()
df_top25_reader_wishlist

Unnamed: 0,user_id,number_books_to_read
1,2,32
11,13,28
21,29,39
22,30,28
30,39,54
...,...,...
48855,53407,38
48856,53408,34
48860,53413,34
48861,53414,32


In [24]:
df_top2000_readers = df_user_rating_groupby_user.sort_values(by='number_of_rated_books', ascending=False).head(2000)
df_user_rating_top2000 = (df_user_rating[df_user_rating['user_id'].isin((df_top2000_readers.user_id).to_list())]).copy()

In [25]:
df_user_rating_top2000.shape

(342557, 3)

In [26]:
df_wishlist_top2000_readers = (df_users_wishlist[df_users_wishlist['user_id'].isin((df_top2000_readers.user_id).to_list())]).copy()
df_wishlist_top2000_readers.shape

(13085, 2)

In [29]:
(df_wishlist_top2000_readers.groupby(['user_id'], as_index=False).agg(number_books_to_read = ('book_id', 'count'))).query('number_books_to_read > 9')


Unnamed: 0,user_id,number_books_to_read
4,230,20
12,513,12
14,589,10
15,725,14
20,1019,15
...,...,...
1625,52334,13
1627,52556,13
1628,52668,12
1632,52875,13


In [30]:
df_top25_reader = (df_user_rating_groupby_user.query('number_of_rated_books > 127')).copy()
df_user_rating_top25 = (df_user_rating[df_user_rating['user_id'].isin((df_top25_reader.user_id).to_list())]).copy()

# Setting up a first test run KNNmeans

In [32]:
reader = surp.Reader(rating_scale=(1,5)) #rating scale used in the user rating system

In [33]:
df_data_top25 = surp.Dataset.load_from_df(df_user_rating_top25[['user_id', 'book_id', 'rating']], reader)

In [34]:
algo_knnMeans = surp.KNNWithMeans

In [35]:
param_grid = {'k': [10, 20, 30, 40],
              'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                              'min_support': [3,4,5],
                              'user_based': [True]}
              }

In [36]:
gs = surp.model_selection.GridSearchCV(algo_knnMeans, param_grid, measures=["rmse", "mae"], cv=5)

In [None]:
gs.fit(df_data_top25)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done comp