Dataset is sourced from Kaggle https://www.kaggle.com/datasets/arashnic/book-recommendation-dataset

In [None]:
conda install -c conda-forge scikit-surprise

In [None]:
# allows us to access dataframes
import pandas as pd
import scipy as sp
import operator

# specialized class for handling sparse matrices (compressed)
from scipy.sparse import csr_matrix, csc_matrix

# Library for visualizing charts
import matplotlib.pylab as plt


# Gives us access to the unsupervised algorithm knn 
from sklearn.neighbors import NearestNeighbors

# Surprise is an easy-to-use Python scikit for recommender systems.
from surprise import SVD #SVD algorithm
from surprise import KNNBasic # Knn algorithm
from surprise import Dataset # Utility for loading datasets
from surprise import Reader # Allows surprise to interpret the ratings
from surprise.model_selection import cross_validate # cross validation utility

EDA


In [None]:
# Load the ratings file and drop exact duplicate rows in a single chained step
ratings_df = pd.read_csv("Ratings.csv").drop_duplicates()
ratings_df.dtypes

In [None]:
ratings_df.isnull().sum() #determining null values

In [None]:
ratings_df.head()

The ratings data does not have any null values.

In [None]:
# Load the users file and drop exact duplicate rows in a single chained step
users_df = pd.read_csv("Users.csv").drop_duplicates()
users_df.dtypes

In [None]:
users_df.isnull().sum() #determining null value

In [None]:
users_df.head()

Most of the users do not have an Age value in the data.

In [None]:
# Load the books file and drop exact duplicate rows in a single chained step
books_df = pd.read_csv("Books.csv").drop_duplicates()
books_df.dtypes

In [None]:
books_df.isnull().sum() #determining null values

In [None]:
books_df.head()

There are 2 books that have a null value for the Author.

In [None]:
# Count the distinct users and distinct books (ISBNs) that appear in the ratings

# dropna=False keeps the count identical to len(series.unique()), which counts NaN as a value
num_users = ratings_df['User-ID'].nunique(dropna=False)
num_books = ratings_df['ISBN'].nunique(dropna=False)

summary_template = 'There are {} unique users and {} unique books in this data set'
print(summary_template.format(num_users, num_books))

In [None]:
# Distribution of the number of ratings given by each User-ID

ratings_per_user = ratings_df.groupby('User-ID')[['ISBN']].count()
ratings_per_user.hist()
plt.title("Count of Ratings per User Id")



Most of the books do not have a rating.

In [None]:
ratings_df[['User-ID','ISBN']].groupby(['User-ID']).count().describe()

In [None]:
# Distribution of the number of user ratings received by each ISBN

ratings_per_isbn = ratings_df.groupby('ISBN')[['User-ID']].count()
ratings_per_isbn.hist()
plt.title("Count of User Ratings per ISBN")

In [None]:
ratings_df[['User-ID','ISBN']].groupby(['ISBN']).count().describe()

In [None]:
# Bar chart of how often each rating score occurs

rating_score_counts = ratings_df['Book-Rating'].value_counts()
rating_score_counts.plot(kind='bar', title="Count of Rating Score")

Preprocessing

Based on the EDA, most of the books do not have a rating and not many users give ratings. That being said, we will remove unpopular books and users who rarely rate books in order to reduce the dimensionality of the data set.

In [None]:
# Keep only "popular" books: ISBNs that received at least `popularity_thres` ratings.
popularity_thres = 10

# Number of ratings per ISBN (the frame is indexed by ISBN after the groupby)
book_rating_cnt_df = ratings_df[['Book-Rating','ISBN']].groupby(['ISBN']).count()
popular_book_df = book_rating_cnt_df[book_rating_cnt_df['Book-Rating']>=popularity_thres]

# Ratings restricted to the popular ISBNs
filtered_pop_book_df = ratings_df[ratings_df.ISBN.isin(popular_book_df.index.values)]
print('shape of original ratings data: ', ratings_df.shape)
# Fixed: this line previously referenced the misspelled name `filtered_pop_books_df` (NameError)
print('shape of ratings data after dropping unpopular books: ', filtered_pop_book_df.shape)

In [None]:
# Keep only "active" users: User-IDs with at least `ratings_thres` ratings in ratings_df.
ratings_thres = 10

# Number of ratings per user (the frame is indexed by User-ID after the groupby)
active_users_cnt_df = ratings_df[['User-ID','ISBN']].groupby(['User-ID']).count()
active_users_df = active_users_cnt_df[active_users_cnt_df['ISBN']>=ratings_thres]

# Fixed: the mask must test the 'User-ID' column. The previous version tested the frame's
# row index against the active user ids, which selected rows by row label, not by user.
is_active_user = filtered_pop_book_df['User-ID'].isin(active_users_df.index)
filtered_popular_active_df = filtered_pop_book_df[is_active_user]

# Fixed label: this shape is the popular-book subset, not the original ratings data
print('shape of ratings data after dropping unpopular books: ', filtered_pop_book_df.shape)
print('shape of ratings data after dropping both unpopular books and inactive users: ', filtered_popular_active_df.shape)

In [None]:
filtered_popular_active_df.head()

In [None]:
# Attach the book details to every remaining rating before building the book-user pivot table.
# A left join on ISBN keeps every rating row even when the ISBN has no entry in books_df.

filtered_book_ratings_merged_df = filtered_popular_active_df.merge(
    books_df,
    on='ISBN',
    how='left',
)
filtered_book_ratings_merged_df.head()

In [None]:
filtered_book_ratings_merged_df.shape

In [None]:
filtered_book_ratings_merged_df

In [None]:
# Pivot to one row per ISBN and one column per User-ID; (book, user) pairs without a rating become 0
book_by_user_ratings = filtered_book_ratings_merged_df.pivot(
    index='ISBN',
    columns='User-ID',
    values='Book-Rating',
)
filtered_book_ratings_df = book_by_user_ratings.fillna(0)
filtered_book_ratings_df.head()

In [None]:
# Compressed sparse row copy of the pivot table (same row order as filtered_book_ratings_df)
book_ratings_mat = csr_matrix(filtered_book_ratings_df.to_numpy())
book_ratings_mat

Model Training

In [None]:
# Brute-force nearest-neighbour search using cosine distance, on all available cores
knn_params = dict(n_neighbors=20, metric='cosine', algorithm='brute', n_jobs=-1)
model_knn = NearestNeighbors(**knn_params)

# fit on the sparse book-by-user rating matrix
model_knn.fit(book_ratings_mat)

In [None]:
# Try the recommender with a sample title fragment
fav_book = "Myth"
print('You have input book:', fav_book)

# rows whose Book-Title contains the fragment; rows with a missing title never match
title_hits = filtered_book_ratings_merged_df['Book-Title'].str.contains(fav_book, na = False)
book_matches_df = filtered_book_ratings_merged_df[title_hits]
book_matches_df

In [None]:
print("Let's start making recommendations!...")

# Fixed: a boolean-mask selection is never None, so "no matches" must be detected with .empty
if book_matches_df.empty:
    print("No Matching books :(")
else:
    top_recommendations = 10

    # Rows of book_ratings_mat follow the row order of the pivot table
    # (filtered_book_ratings_df), whose index holds the ISBNs. Fixed: positions are now
    # resolved against that index instead of the row labels of the merged ratings frame,
    # which do not correspond to matrix rows.
    isbn_index = filtered_book_ratings_df.index
    query_isbn = book_matches_df.iloc[0]['ISBN']
    book_idx = isbn_index.get_loc(query_isbn)

    # ISBN -> title lookup built once instead of re-filtering the frame per neighbour
    title_by_isbn = (
        filtered_book_ratings_merged_df
        .drop_duplicates(subset='ISBN')
        .set_index('ISBN')['Book-Title']
    )

    # Ask for one extra neighbour because the query book is returned as its own neighbour,
    # but never for more neighbours than there are rows in the matrix.
    n_neighbors = min(top_recommendations + 1, book_ratings_mat.shape[0])
    distances, indices = model_knn.kneighbors(book_ratings_mat[book_idx], n_neighbors=n_neighbors)

    # Collect the neighbours, skipping the query book itself
    recommend_frame = []
    for row_pos, distance in zip(indices.ravel().tolist(), distances.ravel().tolist()):
        if row_pos == book_idx:
            continue
        neighbour_isbn = isbn_index[row_pos]
        recommend_frame.append({
            # fall back to the ISBN when the merged frame has no entry for it
            'Book-Title': title_by_isbn.get(neighbour_isbn, neighbour_isbn),
            'Distance': distance,
        })
    recommend_frame = recommend_frame[:top_recommendations]

    # Place the results in a dataframe, closest first, ranked from 1. The index is sized
    # from the actual number of results so fewer than 10 neighbours no longer raises.
    df = pd.DataFrame(recommend_frame, columns=['Book-Title', 'Distance']).sort_values("Distance")
    df.index = range(1, len(df) + 1)
    display(df)

Top N Predictors and Hit Rate

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n highest estimates.

    predictions is an iterable of 5-element records
    (user id, ISBN, true rating, estimated rating, details).
    Returns a defaultdict mapping user id -> list of (ISBN, estimate) pairs,
    ordered from highest to lowest estimate and at most n long.
    """
    per_user = defaultdict(list)

    # Bucket every (ISBN, estimate) pair under its user id.
    for user_id, isbn, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((isbn, estimate))

    # Rank each user's pairs by estimate (pair[1]), best first, and truncate to n.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

In [None]:
from surprise.model_selection import LeaveOneOut


# Rebuild the models with Surprise on the filtered, merged ratings.
# The columns must correspond to user id, ISBN and ratings (in that order).
surprise_ratings_df = filtered_book_ratings_merged_df[['User-ID', 'ISBN', 'Book-Rating']]

# A reader is still needed but only the rating_scale param is required.
# The scale is taken from the data instead of being hard-coded to (1, 10): nothing above
# filters 'Book-Rating', so a hard-coded range could disagree with the values actually
# present (e.g. a rating of 0, if the data contains one).
rating_scale = (surprise_ratings_df['Book-Rating'].min(), surprise_ratings_df['Book-Rating'].max())
reader = Reader(rating_scale=rating_scale)

data = Dataset.load_from_df(surprise_ratings_df, reader)

# Build a single "leave one out" train/test split for evaluating top-N recommenders
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
LOOCVTrain, LOOCVTest = next(LOOCV.split(data))

# Save the left-out test entries
leftoutpredictions = LOOCVTest

# Anti-test-set built from the training set, used below as the input for predictions
LOOCVAntiTestSet = LOOCVTrain.build_anti_testset()

# generate two models; SVD is seeded so that re-running the notebook reproduces its results
knn = KNNBasic()
svd = SVD(random_state=1)

# fit using the generated training set
knn.fit(LOOCVTrain)
svd.fit(LOOCVTrain)

# generate predictions for every entry of the anti-test-set
knn_predictions = knn.test(LOOCVAntiTestSet)
svd_predictions = svd.test(LOOCVAntiTestSet)

In [None]:
# Keep the ten highest-estimated ISBNs per user from each model's predictions
knn_top_n = get_top_n(knn_predictions, n=10)
svd_top_n = get_top_n(svd_predictions, n=10)

# Print the recommended ISBNs for each user, one model after the other
for heading, top_n_by_user in (
    ("top n recommendations for knn", knn_top_n),
    ("top n recommendations for svd", svd_top_n),
):
    print("")
    print(heading)
    for uid, user_ratings in top_n_by_user.items():
        print(uid, [iid for (iid, _) in user_ratings])

In [None]:
# utility function to get the book title based on the ISBN
def get_title_from_id(idx):
    """Return the 'Book-Title' of the first merged-ratings row whose ISBN equals idx."""
    isbn_rows = filtered_book_ratings_merged_df['ISBN'] == idx
    matching_titles = filtered_book_ratings_merged_df.loc[isbn_rows, 'Book-Title']
    return matching_titles.values[0]

In [None]:
# Print the recommended book titles for each user, one model after the other
for heading, top_n_by_user in (
    ("top n recommendations for knn", knn_top_n),
    ("top n recommendations for svd", svd_top_n),
):
    print("")
    print(heading)
    for uid, user_ratings in top_n_by_user.items():
        print(uid, [get_title_from_id(iid) for (iid, _) in user_ratings])

In [None]:
# HIT RATE - commonly used for top-N recommendation systems, where we are not really
# predicting against a known rating.
# For every left-out (user, book) entry, check whether the left-out book appears in the
# top-N list that was predicted for that user.
# The higher the better; a very low or zero value suggests more data is needed.

def hitrate(topNpredictions, leftoutpredictions, verbose=False):
    """Score each left-out entry against the user's predicted top-N list.

    Parameters
    ----------
    topNpredictions : mapping
        user id -> list of (ISBN, estimated rating) pairs. It is only read:
        users missing from the mapping are treated as having an empty list,
        and no entry is inserted for them (relevant for defaultdict inputs).
    leftoutpredictions : iterable
        Left-out entries; element [0] is read as the user id and element [1]
        as the left-out ISBN.
    verbose : bool, default False
        When True, print the user id, the left-out ISBN and the predicted
        ISBNs for every entry (debugging aid).

    Returns
    -------
    pandas.DataFrame
        One row per left-out entry with columns "uid" and "userhitrate".
        "userhitrate" is 1 / len(top-N list) when the left-out ISBN is in the
        list and 0 otherwise (also 0 when the list is empty). The columns are
        present even when there are no left-out entries.
    """
    userHitRates = []  # one record per left-out entry

    for leftout in leftoutpredictions:
        uid = leftout[0]
        leftout_isbn = leftout[1]

        # .get() instead of [] so that a defaultdict is not grown with empty lists
        predicted_isbns = [predISBN for predISBN, predRating in topNpredictions.get(uid, [])]

        if verbose:
            print("uid: ", uid)
            print("left out ISBN: ", leftout_isbn)
            print(predicted_isbns)

        hits = 1 if leftout_isbn in predicted_isbns else 0
        total = len(predicted_isbns)
        # NOTE(review): dividing by the list length makes a hit count as 1/N rather than 1.
        # Kept unchanged for compatibility - confirm whether a plain 0/1 hit was intended.
        userHitRate = hits / total if total != 0 else 0  # handle division by zero

        userHitRates.append({"uid": uid, "userhitrate": userHitRate})

    return pd.DataFrame(userHitRates, columns=["uid", "userhitrate"])

In [None]:
# Inspect the kNN top-N list of user 3. .get() is used because knn_top_n is a defaultdict:
# indexing it with [] would insert an empty entry if the user is not present.
knn_top_n.get(3, [])

In [None]:
# Preview the first few (user id, top-N list) pairs instead of dumping every user
list(knn_top_n.items())[:5]

In [None]:
# Hit rate for our models: the share of left-out entries whose book appears in the user's
# top-N list. Fixed: the previous version stored the whole per-user DataFrame returned by
# hitrate() inside a single table cell instead of one number per model.
knn_user_hits = hitrate(knn_top_n, leftoutpredictions)['userhitrate'] > 0
svd_user_hits = hitrate(svd_top_n, leftoutpredictions)['userhitrate'] > 0

hitrate_df = pd.DataFrame(
    {'knn': [knn_user_hits.mean()], 'svd': [svd_user_hits.mean()]},
    index=['hitrate'],
)
hitrate_df


In [None]:
# RMSE comparison of both models using 3-fold cross-validation.
# NOTE(review): `metric_df` was referenced here but never defined anywhere in this notebook,
# so the cell raised NameError on a fresh kernel. It is rebuilt from the row labels used
# below (three folds, mean, std) and the otherwise unused cross_validate import - confirm
# that this matches the originally intended metrics.
cv_folds = 3


def _rmse_column(algo):
    """Return the per-fold test RMSE of algo on `data`, followed by their mean and std."""
    fold_rmse = cross_validate(algo, data, measures=['RMSE'], cv=cv_folds, verbose=False)['test_rmse']
    return list(fold_rmse) + [fold_rmse.mean(), fold_rmse.std()]


metric_df = pd.DataFrame(
    {'knn': _rmse_column(KNNBasic()), 'svd': _rmse_column(SVD())},
    index=['Fold 1','Fold 2','Fold 3', 'Mean','Std'],
)
final_metric_df = metric_df
print("Metric Comparison (RSME)")
display(final_metric_df)

In [None]:
print("SVD hitrate metrics")
hitrate(svd_top_n, leftoutpredictions)

In [None]:
print("KNN hitrate metrics")
hitrate(knn_top_n, leftoutpredictions)