In [1]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
from surprise import Dataset, KNNBasic, evaluate, Reader, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split

In [2]:
df = pd.read_csv('data/cleaned_ratings.csv', usecols=['user_id', 'book_id', 'rating'])
print(df.shape)
df.head()

(4847141, 3)


Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,1,109,3
2,1,131,4
3,1,102,5
4,1,95,4


In [3]:
# # use a smaller sample df
# small_df = df.sample(1000)
# small_df.head(5)
df1 = df.iloc[:30000,:]
print(df1.shape)
df1.head()

(30000, 3)


Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,1,109,3
2,1,131,4
3,1,102,5
4,1,95,4


In [4]:
reader = Reader(rating_scale=(1,5)) # rating scale 1 to 5
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader) #user id, item id and ratings (in that order)

In [5]:
sim_options = {'name': 'cosine', 'user-based': False}
knn = KNNBasic(sim_options=sim_options)

In [6]:
# trainingSet, testSet = train_test_split(data, test_size=.99)
trainingSet = data.build_full_trainset()
knn.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7f450c3d7be0>

In [7]:
testSet = trainingSet.build_anti_testset()
predictions = knn.test(testSet)
predictions

[Prediction(uid=1, iid=118, r_ui=3.7554666666666665, est=3.7000106033786904, details={'was_impossible': False, 'actual_k': 40}),
 Prediction(uid=1, iid=7341, r_ui=3.7554666666666665, est=3.594599879122625, details={'was_impossible': False, 'actual_k': 5}),
 Prediction(uid=1, iid=236, r_ui=3.7554666666666665, est=4.10879181139066, details={'was_impossible': False, 'actual_k': 27}),
 Prediction(uid=1, iid=4059, r_ui=3.7554666666666665, est=3.004768672315713, details={'was_impossible': False, 'actual_k': 2}),
 Prediction(uid=1, iid=301, r_ui=3.7554666666666665, est=3.4609711507624947, details={'was_impossible': False, 'actual_k': 40}),
 Prediction(uid=1, iid=664, r_ui=3.7554666666666665, est=3.737318880996742, details={'was_impossible': False, 'actual_k': 19}),
 Prediction(uid=1, iid=8519, r_ui=3.7554666666666665, est=3.6086516640815094, details={'was_impossible': False, 'actual_k': 5}),
 Prediction(uid=1, iid=2139, r_ui=3.7554666666666665, est=4.495138564387396, details={'was_impossible'

In [8]:
# get top 3 recommendations
from collections import defaultdict

def get_top_rec(predictions, top=3):
    top_recs = defaultdict(list)
    for uid, iid, r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:top]
        
    return top_recs

In [9]:
top_rec = get_top_rec(predictions) # get top rec
top_rec

defaultdict(list,
            {1: [(2166, 5), (9296, 5), (8464, 5)],
             2: [(8464, 5), (1237, 5), (2813, 5)],
             3: [(8464, 5), (1237, 5), (1744, 5)],
             4: [(7233, 5), (2166, 5), (9296, 5)],
             5: [(7233, 5), (2235, 5), (5347, 5)],
             6: [(7233, 5), (2166, 5), (9296, 5)],
             7: [(7233, 5), (2166, 5), (9296, 5)],
             8: [(2139, 5), (7233, 5), (2166, 5)],
             9: [(7233, 5), (2166, 5), (9296, 5)],
             10: [(7233, 5), (2166, 5), (9296, 5)],
             11: [(2166, 5), (9296, 5), (8464, 5)],
             12: [(2139, 5), (7233, 5), (2166, 5)],
             13: [(7233, 5), (8464, 5), (1237, 5)],
             14: [(7233, 5), (8464, 5), (1237, 5)],
             15: [(7233, 5), (2166, 5), (9296, 5)],
             16: [(2139, 5), (7233, 5), (2166, 5)],
             17: [(2139, 5), (7233, 5), (2166, 5)],
             18: [(2139, 5), (2166, 5), (9296, 5)],
             19: [(7233, 5), (2166, 5), (9296, 5)],
   

In [10]:
df_books = pd.read_csv('data/cleaned_books_data.csv', usecols=['book_id', 'original_title'])
df_books.shape

(5764, 2)

In [11]:
df_books.set_index('book_id', inplace=True)
df_books.head()

Unnamed: 0_level_0,original_title
book_id,Unnamed: 1_level_1
1,The Hunger Games
2,Harry Potter and the Philosopher's Stone
3,Twilight
4,To Kill a Mockingbird
5,The Great Gatsby


In [12]:
def name(toprec):
    names = {}
    for rec in range(3):
        names[rec] = df_books['original_title'].loc[toprec[rec][0]]
    return names

In [14]:
print(name(top_rec[2])) # recommend to user number 3

{0: 'Coming Home', 1: 'The Shell Seekers', 2: 'You'}
