# Project 4 - Books Recommendation using SVD
Collaborative based filtering->Item based

In [1]:
# !pip install scikit-surprise

# Import Dependencies

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, Reader, SVD, BaselineOnly, PredictionImpossible
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
import random
import numpy as np
import statistics as st
from scipy.sparse.linalg import svds
import math 

# Explore the data and Prepare for Train 

In [3]:
#Creating dataframes from csv files to read the data
books_df_original = pd.read_csv('./Resources/Books.csv')
ratings_df_original = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [4]:
# remove duplicated books records if any by looking at ISBN
books_df=books_df_original.copy()
books_df=books_df.drop_duplicates(subset=['ISBN'])

In [5]:
duplicated_titles=books_df[books_df.duplicated(subset=['Book-Title'],keep=False)].sort_values(by='Book-Title')
duplicated_titles.head()
# so far we leave those titles as is to not lost ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
75637,1565920465,!%@ (A Nutshell handbook),Donnalyn Frey,1994,O'Reilly,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...
156341,1565920317,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...
140618,792276833,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2000,National Geographic,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...
158204,792277295,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2001,National Geographic,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...
10438,451168089,'Salem's Lot,Stephen King,1990,Signet Book,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...


In [6]:
# update the datatype of a 'Year-Of-Publication' field to numeric one
books_df['Year-Of-Publication']=pd.to_numeric(books_df['Year-Of-Publication'],errors='coerce')
# Filter out data with no publication year
books_df = books_df[books_df['Year-Of-Publication'] > 0]
books_df['Year-Of-Publication']=books_df['Year-Of-Publication'].astype(int)
# and check the result
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 266739 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 266739 non-null  object
 1   Book-Title           266739 non-null  object
 2   Book-Author          266737 non-null  object
 3   Year-Of-Publication  266739 non-null  int64 
 4   Publisher            266737 non-null  object
 5   Image-URL-S          266739 non-null  object
 6   Image-URL-M          266739 non-null  object
 7   Image-URL-L          266739 non-null  object
dtypes: int64(1), object(7)
memory usage: 18.3+ MB


In [7]:
ratings_df=ratings_df_original.copy()
# update the datatype of a 'Book-Rating' field to numeric one
ratings_df['Book-Rating']=pd.to_numeric(ratings_df['Book-Rating'],errors='coerce')
# and check the result
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


### Change ISBN with Titles
Merge ratings with books data in order to change isbn with title and leave only those ratings data for which we have title info


In [8]:
ratings_df=pd.merge(books_df,ratings_df,on='ISBN', how = 'inner')
ratings_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0
...,...,...,...,...,...,...,...,...,...,...
1017118,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,276463,7
1017119,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,276579,4
1017120,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,276680,0
1017121,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,276680,0


In [9]:
ratings_df=ratings_df.dropna()
ratings_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0
...,...,...,...,...,...,...,...,...,...,...
1017118,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,276463,7
1017119,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,276579,4
1017120,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,276680,0
1017121,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,276680,0


In [10]:
# delete unnecessary columns
ratings_df_adj=ratings_df.drop(['ISBN','Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
ratings_df_adj

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1017118,There's a Bat in Bunk Five,276463,7
1017119,From One to One Hundred,276579,4
1017120,Lily Dale : The True Story of the Town that Ta...,276680,0
1017121,Republic (World's Classics),276680,0


In [11]:
# # Filter out data with zero ratings
# ratings_df_adj = ratings_df_adj[ratings_df_adj['Book-Rating'] != 0]

In [12]:
# As an alternative to the above update 0 scores with weighted averages

In [13]:
def average_weighted(row):
    min_th=25 #min number of rating received by the book
    neutral_score=5
    avg_w = ((row['avg_book_rating'] * row['count_book_rating']) + 
      (min_th * neutral_score))/(row['count_book_rating'] + min_th)
    return avg_w

In [14]:
# find average score per each book (only take non-zero into account)
avg_ratings_scored = ratings_df[ratings_df['Book-Rating'] > 0].groupby('Book-Title')['Book-Rating'].mean()
# count of non-zero rating given per book
count_ratings_scored = ratings_df[ratings_df['Book-Rating'] > 0].groupby('Book-Title')['Book-Rating'].count()
# create dataframe with above data (average and count) per book
average_weighted_df=pd.DataFrame(avg_ratings_scored).rename(columns={'Book-Rating':'avg_book_rating'})
count_ratings_scored_df=pd.DataFrame(count_ratings_scored).rename(columns={'Book-Rating':'count_book_rating'})
average_weighted_df=pd.merge(average_weighted_df,count_ratings_scored_df,  on='Book-Title', how='inner')
average_weighted_df=average_weighted_df.sort_values(by='count_book_rating', ascending=False)
# find average weighted per book
average_weighted_df['avg_weighted']=average_weighted_df.apply(average_weighted, axis=1)
# update zero rating values with average weighted
ratings_df_adj=ratings_df.copy()
ratings_df_adj.loc[ratings_df_adj['Book-Rating'] == 0, 'Book-Rating'] = ratings_df_adj.loc[ratings_df_adj['Book-Rating'] == 0].index.map(average_weighted_df['avg_weighted'])
# Filter out data with n/a rating score after mapping, as there could be books with only 0 scores
ratings_df_adj=ratings_df_adj.dropna(subset=['Book-Rating'])

In [15]:
# check if there are duplicated records when same user rated book(s) with same title several times
ratings_df_adj[ratings_df_adj.duplicated(subset=['Book-Title', 'User-ID'],keep=False)].sort_values(by=['Book-Title','User-ID'])

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
93704,0451524934,1984,George Orwell,1990,Signet Book,http://images.amazon.com/images/P/0451524934.0...,http://images.amazon.com/images/P/0451524934.0...,http://images.amazon.com/images/P/0451524934.0...,112083,9.0
237681,0451519841,1984,George Orwell,1980,New Amer Library,http://images.amazon.com/images/P/0451519841.0...,http://images.amazon.com/images/P/0451519841.0...,http://images.amazon.com/images/P/0451519841.0...,112083,9.0
85516,0446610038,1st to Die: A Novel,James Patterson,2002,Warner Vision,http://images.amazon.com/images/P/0446610038.0...,http://images.amazon.com/images/P/0446610038.0...,http://images.amazon.com/images/P/0446610038.0...,11676,10.0
148791,0316666009,1st to Die: A Novel,James Patterson,2001,Little Brown and Company,http://images.amazon.com/images/P/0316666009.0...,http://images.amazon.com/images/P/0316666009.0...,http://images.amazon.com/images/P/0316666009.0...,11676,8.0
85683,0446610038,1st to Die: A Novel,James Patterson,2002,Warner Vision,http://images.amazon.com/images/P/0446610038.0...,http://images.amazon.com/images/P/0446610038.0...,http://images.amazon.com/images/P/0446610038.0...,143175,10.0
...,...,...,...,...,...,...,...,...,...,...
789024,1569317674,Zoids Chaotic Century (Zoids: Chaotic Century ...,Michiro Ueyama,2002,Viz Comics,http://images.amazon.com/images/P/1569317674.0...,http://images.amazon.com/images/P/1569317674.0...,http://images.amazon.com/images/P/1569317674.0...,63714,10.0
789027,1569317666,Zoids Chaotic Century (Zoids: Chaotic Century ...,Michiro Ueyama,2002,Viz Comics,http://images.amazon.com/images/P/1569317666.0...,http://images.amazon.com/images/P/1569317666.0...,http://images.amazon.com/images/P/1569317666.0...,63714,10.0
789029,1569317658,Zoids Chaotic Century (Zoids: Chaotic Century ...,Michiro Ueyama,2002,Viz Comics,http://images.amazon.com/images/P/1569317658.0...,http://images.amazon.com/images/P/1569317658.0...,http://images.amazon.com/images/P/1569317658.0...,63714,10.0
404430,0440203856,Zoya,Danielle Steel,1989,Dell,http://images.amazon.com/images/P/0440203856.0...,http://images.amazon.com/images/P/0440203856.0...,http://images.amazon.com/images/P/0440203856.0...,62272,9.0


In [16]:
# Use avg rate per duplicates set
ratings_df_adj=ratings_df_adj.groupby(['Book-Title','User-ID'])['Book-Rating'].mean().reset_index()
ratings_df_adj

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,A Light in the Storm: The Civil War Diary of ...,96448,9.0
1,"Ask Lily (Young Women of Faith: Lily Series, ...",269557,8.0
2,Dark Justice,98391,10.0
3,Earth Prayers From around the World: 365 Pray...,26544,9.0
4,Earth Prayers From around the World: 365 Pray...,69120,10.0
...,...,...,...
377056,Ã?Â?rger mit Produkt X. Roman.,133567,8.0
377057,Ã?Â?rger mit Produkt X. Roman.,225343,7.0
377058,Ã?Â?sterlich leben.,256636,7.0
377059,Ã?Â?stlich der Berge.,90839,8.0


### Only leave statistically signifacant data

In [17]:
# Define whar are the number of rates per book and books rated by user we treat as statistically significant
min_books_rated_by_user=5
min_rates_received_by_book=5

In [18]:
groupped_r_users=ratings_df_adj.groupby('User-ID')['Book-Rating'].count()
groupped_r_users[:5]

User-ID
8     7
9     1
12    1
14    3
16    1
Name: Book-Rating, dtype: int64

In [19]:
groupped_r_books=ratings_df_adj.groupby('Book-Title')['User-ID'].count()
groupped_r_books[:5]

Book-Title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    1
 Ask Lily (Young Women of Faith: Lily Series, Book 5)                                                         1
 Dark Justice                                                                                                 1
 Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth              7
 Final Fantasy Anthology: Official Strategy Guide (Brady Games)                                               2
Name: User-ID, dtype: int64

In [20]:
#select only those books which were rated more than min_rates_received_by_book
titles_with_acceptable_rates_count=list(groupped_r_books[groupped_r_books>min_rates_received_by_book].index)
titles_with_acceptable_rates_count[:5]

[' Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth',
 '!Yo!',
 "'Salem's Lot",
 '01-01-00: The Novel of the Millennium',
 '10 Lb. Penalty']

In [21]:
#select only those users (user_id) who rated more than min_books_rated_by_user books
user_ids_with_acceptable_books_count_rated=list(groupped_r_users[groupped_r_users>min_books_rated_by_user].index)
user_ids_with_acceptable_books_count_rated[:5]

[8, 99, 114, 242, 243]

In [22]:
# filter rating-user data to have only books/users of interest (which have highest rates count and rated highest number of books respectively)
rating_final_df=ratings_df_adj[ratings_df_adj['Book-Title'].isin(titles_with_acceptable_rates_count)&ratings_df_adj['User-ID'].isin(user_ids_with_acceptable_books_count_rated)]
rating_final_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
3,Earth Prayers From around the World: 365 Pray...,26544,9.0
5,Earth Prayers From around the World: 365 Pray...,121592,7.0
6,Earth Prayers From around the World: 365 Pray...,179730,1.0
7,Earth Prayers From around the World: 365 Pray...,179744,6.0
8,Earth Prayers From around the World: 365 Pray...,205980,10.0
...,...,...,...
376999,stardust,274393,8.0
377020,why I'm like this : True Stories,36609,6.0
377021,why I'm like this : True Stories,98904,10.0
377022,why I'm like this : True Stories,105317,8.0


# Pure SVD model

### Train/Test split

In [23]:
# split data in such a ways that we have data for all the users in both train and test sets
user_list = rating_final_df['User-ID'].unique() #list of all users
test_set = pd.DataFrame(columns=rating_final_df.columns) # reserve df for a train set
train_set = pd.DataFrame(columns=rating_final_df.columns) # reserve df for a test set
test_ratio = 0.1 # we would have 10% test data
for user in user_list:
    # for each user take their book/rating data 
    user_data_all = rating_final_df[rating_final_df['User-ID'] == user]
    n = len(user_data_all)
    user_data_all = user_data_all.reset_index()
    user_data_all.drop('index', axis=1, inplace=True)
    # split user data into train and test 
    test_size = int(test_ratio*n)
    
    # randomly select roughtly 10% of rows for test set per user using random_state=1, so that result is reproducible
    test = user_data_all.sample(n=test_size, random_state=1)  

    # rows not selected for test set, assigned to train one
    train = user_data_all.drop(test.index)

    test_set = pd.concat([test_set, test], ignore_index=True)
    train_set = pd.concat([train_set, train], ignore_index=True)

# Recommendation generation

In [24]:
def build_prediction_matrix(rating_input_df, latent_factors=70):
## Build the model  
    # Pivot to obtain a matrix that stores original ratings given by users for books and fill sparse values with 0-s
    df_books_ratigs_user=rating_input_df.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)
    # Normilize the data, using mean normalization.
    data_original = df_books_ratigs_user.to_numpy() # vectorize the data
    ratings_mean = np.mean(data_original, axis = 1) # find a mean per each vector (user)
    normalized_data = data_original - ratings_mean.reshape(-1, 1) #subtract mean for each user from their ratings, which centers the ratings around 0 for each user
    # Decompose the normilized matrix into 3, with k = latent_factors (70 default) largest singular values in sigma
    U, sigma, Vt = svds(normalized_data, k = latent_factors)
    # Convert vector to a diagonal matrix
    sigma = np.diag(sigma)
    # Compose matrix with predictions, reversing data normalization
    all_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + ratings_mean.reshape(-1, 1)
    # convert numpy array into dataframe
    all_predictions_df = pd.DataFrame(all_predicted_ratings, columns=df_books_ratigs_user.columns)
    # add a colmns with user id, so that we can filter data by it
    all_predictions_df['user_id'] = df_books_ratigs_user.index
    return all_predictions_df

In [25]:
def recommend_books_for_user_svd (user_id, all_predictions_df, ratings_df, books_df, recommendations_count=5):
## Find prediction for a specific user
    # find the books (titles) that were rated and presumably read by a user
    rated_titles=[i for i in ratings_df.loc[ratings_df['User-ID']==user_id,'Book-Title']]
    titles_to_exclude=rated_titles
    # find all the titles within the matrix
    all_titles=all_predictions_df.columns[:-1]
    # separate those titles that were not read
    titles_input_to_recommend=[i for i in all_titles if i not in titles_to_exclude]
    # find predictions for a user
    user_predictions_all=all_predictions_df.loc[all_predictions_df['user_id']==user_id]
    # sort predictions and select top recommendations_count
    col_name=user_predictions_all.loc[user_predictions_all['user_id']==user_id].index[0]
    user_recommendation=user_predictions_all.T
    user_recommendation=user_recommendation.loc[titles_input_to_recommend].sort_values(by=col_name, ascending=False)
    top_recommendations=user_recommendation[:recommendations_count].rename(columns={col_name:'estimated rate'})
    # populate books with full info, selecting those books with the most recent year of publication
    recommendations_full_info=pd.merge(top_recommendations, books_df, left_on='Book-Title',right_on='Book-Title', how='left')
    dict_years=dict(recommendations_full_info.groupby('Book-Title')['Year-Of-Publication'].max())
    for i, row in recommendations_full_info.iterrows():
        if row['Year-Of-Publication']!=dict_years[row['Book-Title']]:
            recommendations_full_info.loc[i,'Year-Of-Publication']=0
    recommendations_full_info=recommendations_full_info[recommendations_full_info['Year-Of-Publication'] != 0]
    recommendations_full_info=recommendations_full_info.drop_duplicates(subset=['Book-Title'])
    return recommendations_full_info


In [26]:
all_predictions_df=build_prediction_matrix(rating_final_df)

## Check Accuracy

In [27]:
# define formula for calculation of rmse
def rmse(true, pred):
    error = true - pred
    mean_square_error=sum([i*i for i in error])/len(error)
    return math.sqrt(mean_square_error)

In [28]:
# to test the performance over a different number of laatent factors
k_set = [8, 20, 50, 100, 150]
rmse_scores={}
for l_f in k_set: 
    # Build the prediction matrix using the train_set
    all_predictions_df = build_prediction_matrix(train_set, l_f)

    # reserve a list for predicted ratings
    pred = []
    for i, row in test_set.iterrows():
        user_id = row['User-ID']
        book_title = row['Book-Title']  
        if user_id in all_predictions_df['user_id'].values and book_title in all_predictions_df.columns[:-1]:
            pred_rating = all_predictions_df.loc[all_predictions_df['user_id'] == user_id, book_title].values[0]
        else:
            # If the book or user is not in the train_set, use a default prediction
            # which is the average of all ratings in the training set as a simple approach
            pred_rating = train_set['Book-Rating'].mean()   
        pred.append(pred_rating)
    
    # Calculate RMSE for the current number of features
    current_rmse = rmse(test_set['Book-Rating'], pred)
    rmse_scores[l_f]=current_rmse
rmse_scores

{8: 7.810468546894521,
 20: 7.7739527506547965,
 50: 7.740889009963508,
 100: 7.7470206957312575,
 150: 7.768680395006912}

# SVD Funk
Use of surprise module with built-in SVD algorithm (popularized by Simon Funk during the Netflix Prize):

class surprise.prediction_algorithms.matrix_factorization.SVD(n_factors=100, n_epochs=20, biased=True, init_mean=0, init_std_dev=0.1, lr_all=0.005, reg_all=0.02, lr_bu=None, lr_bi=None, lr_pu=None, lr_qi=None, reg_bu=None, reg_bi=None, reg_pu=None, reg_qi=None, random_state=None, verbose=False)

### Prepare data for parsing by 'surprise algorithms'

In [29]:
# Even having dataframe we need to create a 'reader' with the 'rating_scale' parameter to let know that our dataset has ratings from 1 to 10
# https://surprise.readthedocs.io/en/stable/getting_started.html#load-custom
reader = Reader(rating_scale=(1,10))

# and create respective surprise.dataset object, so that our data are in a proper format for the recommendation algorithms
data_surprise_o = Dataset.load_from_df(rating_final_df[['User-ID','Book-Title','Book-Rating']], reader)

In [30]:
# Split our data into train and test in ratio 8:2, using random_state = 42 so that we receive reproducable output
random_state = 42

# Reference: https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=train_test_split#train-test-split-and-the-fit-method

train_set, test_set = train_test_split(data_surprise_o, test_size=0.2, shuffle=True, random_state=random_state)

# check train and test data
train_data = list(train_set.all_ratings())

display('------Train Set------')
print(f'Size of train_set: {len(train_data)}')
print(f'Number of users in train_set: {train_set.n_users}')
print(f'Number of books in train_set: {train_set.n_items}')
display(f'Few elements of train_set (user, book, rating): {train_data[:15]}')
display('------Test Set------')
print(f'Size of test_set: {len(test_set)}')
display(f'Few elements of test_set (user, book, rating): {test_set[:15]}')

'------Train Set------'

Size of train_set: 112488
Number of users in train_set: 10199
Number of books in train_set: 10619


'Few elements of train_set (user, book, rating): [(0, 0, 8.0), (0, 80, 7.0), (0, 837, 10.0), (0, 1074, 9.0), (0, 3718, 8.0), (0, 4298, 9.0), (0, 259, 7.0), (0, 78, 10.0), (0, 543, 9.0), (0, 6729, 10.0), (0, 1097, 10.0), (0, 3635, 9.0), (0, 1484, 9.0), (0, 5195, 8.0), (0, 1574, 9.0)]'

'------Test Set------'

Size of test_set: 28122


'Few elements of test_set (user, book, rating): [(92405, "Night\'s Landing (Mira)", 8.0), (265889, \'Out of Control\', 8.0), (27313, \'A Walk to Remember\', 9.0), (23571, \'The Two Towers (The Lord of the Rings, Part 2)\', 10.0), (170724, \'Lone Eagle\', 5.0), (95359, \'AGE OF INNOCENCE (MOVIE TIE-IN)\', 10.0), (158295, \'Prime Witness\', 6.0), (263078, \'Wild Animus\', 2.0), (27140, \'DAUGHTER OF TIME\', 10.0), (276925, \'La Sombra del Viento\', 10.0), (234953, \'Primary Colors: A Novel of Politics\', 7.0), (204802, \'Round Robin: An Elm Creek Quilts Novel (Elm Creek Quilters Novels)\', 8.0), (126814, \'The Cat Who Blew the Whistle\', 7.0), (145109, \'The Drawing of the Three (The Dark Tower, Book 2)\', 9.0), (98741, "Suzanne\'s Diary for Nicholas", 7.0)]'

###  Training: cross validate the model
Make preliminary estimation of error in predictions using RMSE metric for SVD Funk model and the whole set of data split into 5 folds to verify there is no overfitting

In [31]:
# reference: https://surprise.readthedocs.io/en/stable/getting_started.html?highlight=cross_validate#use-cross-validation-iterators

# define number of subsets the dataset will be partitioned for cross validations
folds_n=5

# define a cross-validation iterator
kf = KFold(n_splits=folds_n,random_state=42)
algo = [BaselineOnly(),SVD()]
for i in algo:
    mean_rmse=[]
    for trainset, testset in kf.split(data_surprise_o):
        # train and test algorithm.
        i.fit(trainset)
        predictions = i.test(testset)
        # Compute and print Root Mean Squared Error
        mean_rmse.append(accuracy.rmse(predictions))
    # Find an average rmse for all the folds
    mean_rmse=st.mean(mean_rmse)
    display(f'Average Root Mean Square Error (RMSE) for {i.__class__.__name__} is {round(mean_rmse,2)}')

Estimating biases using als...
RMSE: 1.5961
Estimating biases using als...
RMSE: 1.5709
Estimating biases using als...
RMSE: 1.5745
Estimating biases using als...
RMSE: 1.5783
Estimating biases using als...
RMSE: 1.5620


'Average Root Mean Square Error (RMSE) for BaselineOnly is 1.58'

RMSE: 1.5918
RMSE: 1.5715
RMSE: 1.5688
RMSE: 1.5747
RMSE: 1.5572


'Average Root Mean Square Error (RMSE) for SVD is 1.57'

### Training: identify best parameters¶

In [32]:
# reference: https://medium.com/p/61c269402919
# within the dictionary param_grid set ranges for parameters to try out, where 
#  - key: parameter name, 
#  - value:  list of parameter values to try
param_grid = {'n_factors':[10,20,50,100,150], 'n_epochs':[10,20,30,50,100], 'lr_all':[0.002,0.005,0.01,0.3],'reg_all':[0.02, 0.1, 0.2]}
# n_factors - number of latent factors (Default is 100)
# n_epochs - number of iterations (Default is 20)
# lr_all - step size for the gradient descent optimization (learning rate, Default is 0.005)
# reg_all -  regularization term for all parameter, used to prevent overfitting (Default is 0.02)

# Select parameters for our SVD algorithm by cross validation and looking for rmse metric
# GridSearchCV() calculates a score for each combination of hyperparameters on a k-fold cross validated dataset 
# and returns the set of parameters that minimises the mean score across folds
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)

# train the model for all the combinations of parameters
gs.fit(data_surprise_o)

# select the set of parameters that produce lowest value for rmse metric
params = gs.best_params['rmse']
params


{'n_factors': 10, 'n_epochs': 100, 'lr_all': 0.002, 'reg_all': 0.2}

### Training: Build models

In [33]:
# Build SVD with default parameters
svd_default = SVD()
# Build SVD, using hyperparameters recevied in the result of hypertuning
svd_best_parameters = SVD(n_factors=params['n_factors'], n_epochs=params['n_epochs'],lr_all=params['lr_all'], reg_all=params['reg_all'])

### Training: Fit models

In [34]:
# Train the models on trainset
svd_best_parameters_model = svd_best_parameters.fit(train_set)
svd_default_model = svd_default.fit(train_set)

## Recommendation generation

In [35]:
def recommend_books_for_user_svd_funk (user_id, model, ratings_df_adj, books_df, recommendations_count=5):
    # find all the books (titles)
    all_titles=ratings_df_adj['Book-Title'].unique()
    # find the books (titles) that were rated and presumably read by a user
    rated_titles=[i for i in ratings_df_adj.loc[ratings_df_adj['User-ID']==user_id,'Book-Title']]
    # find the books (titles) that were not rated and presumably not read by a user
    titles_input_to_recommend=[i for i in all_titles if i not in rated_titles]
    # find predictions for a user
        # reference: https://surprise.readthedocs.io/en/stable/algobase.html?highlight=predict
        # uid – (Raw) id of the user. 
        # iid – (Raw) id of the item.
        # verbose (bool) – Whether to print details of the prediction. Default is False.
    predictions=[model.predict(uid=user_id, iid=i) for i in titles_input_to_recommend]
    # get ratings estimate for books by the user
    ratings=[i.est for i in predictions]
    # convert predicted estimates by the user for not read books into df
    pred_dict={
        'Book-Title':titles_input_to_recommend,
        'Estimated_Rate':ratings}
    predictions_book=pd.DataFrame(pred_dict).sort_values('Estimated_Rate',ascending = False)
    top_recommendations=predictions_book.head(recommendations_count)
    # populate books with full info, selecting those books with the most recent year of publication
    recommendations_full_info=pd.merge(top_recommendations, books_df, left_on='Book-Title',right_on='Book-Title', how='left')
    dict_years=dict(recommendations_full_info.groupby('Book-Title')['Year-Of-Publication'].max())
    for i, row in recommendations_full_info.iterrows():
        if row['Year-Of-Publication']!=dict_years[row['Book-Title']]:
            recommendations_full_info.loc[i,'Year-Of-Publication']=0
    recommendations_full_info=recommendations_full_info[recommendations_full_info['Year-Of-Publication'] != 0]
    recommendations_full_info=recommendations_full_info.drop_duplicates(subset=['Book-Title'])
    return recommendations_full_info

## Check Accuracy

In [36]:
# Check the accuracy for both default vs tuned method
print(f'''Accuracy, based on Root Mean Square Error, of a Tuned Model: 
{accuracy.rmse(svd_best_parameters_model.test(test_set), verbose=False)}''')

print(f'''Accuracy, based on Root Mean Square Error, of a Model with Default Parameters:
{accuracy.rmse(svd_default_model.test(test_set), verbose=False)}''')

Accuracy, based on Root Mean Square Error, of a Tuned Model: 
1.539861890078707
Accuracy, based on Root Mean Square Error, of a Model with Default Parameters:
1.5585629279261572


In [37]:
print(f'''Accuracy, based on Mean Absolute Error, of a Model with Default Parameters:
{accuracy.mae(svd_default_model.test(test_set), verbose=False)}''')

Accuracy, based on Mean Absolute Error, of a Model with Default Parameters:
1.19936666813739


# Check predictions for specific user

In [38]:
# find original user ratings
u_data=ratings_df[ratings_df['User-ID']==252676].sort_values(by='Book-Rating', ascending=False)
u_data[['User-ID','Book-Title','Book-Rating','Book-Author','Year-Of-Publication','Publisher']]

Unnamed: 0,User-ID,Book-Title,Book-Rating,Book-Author,Year-Of-Publication,Publisher
1016791,252676,The Twelve Days of Christmas,10,Anne Geddes,1997,Cedco Publishing Company
1016789,252676,"The Flavors of Bon Appetit 1997 (Bon Appetit ,...",8,Editors of Bon Appetit,1997,Pantheon Books
533707,252676,The Importance of Being Earnest: A Trivial Nov...,8,Charles Osborne,2000,St Martins Pr
1016788,252676,Lilith's Cave: Jewish Tales of the Supernatural,7,Howard Schwartz,1988,Harpercollins
83181,252676,The Brethren,7,John Grisham,2000,Island
957473,252676,The Devil's Cat,6,William W. Johnstone,1987,Zebra Books
182440,252676,Hideaway,5,Dean R. Koontz,1992,Berkley Publishing Group
42777,252676,Skipping Christmas,0,JOHN GRISHAM,2001,Doubleday
740036,252676,Vespers,0,Jeff Rovin,1999,St. Martin's Press
1016790,252676,"The Best of Gourmet 1997 (Best of Gourmet, 1997)",0,The Editors of Gourmet,1997,Random House Trade


In [39]:
# preview recommendations by pure SVD
recommend_books_for_user_svd(252676, all_predictions_df, ratings_df, books_df)

Unnamed: 0,Book-Title,estimated rate,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
1,The Partner,0.831829,0440224764,John Grisham,1998,Dell Publishing Company,http://images.amazon.com/images/P/0440224764.0...,http://images.amazon.com/images/P/0440224764.0...,http://images.amazon.com/images/P/0440224764.0...
5,The Poisonwood Bible,0.423609,0060512822,Barbara Kingsolver,2003,HarperTorch,http://images.amazon.com/images/P/0060512822.0...,http://images.amazon.com/images/P/0060512822.0...,http://images.amazon.com/images/P/0060512822.0...
10,The Rainmaker,0.403332,0099271273,John Grisham,1998,Arrow,http://images.amazon.com/images/P/0099271273.0...,http://images.amazon.com/images/P/0099271273.0...,http://images.amazon.com/images/P/0099271273.0...
17,Sphere,0.373383,0553702327,Michael Crichton,2001,Random House Audio Publishing Group,http://images.amazon.com/images/P/0553702327.0...,http://images.amazon.com/images/P/0553702327.0...,http://images.amazon.com/images/P/0553702327.0...
19,Bag of Bones,0.329327,067102423X,Stephen King,1999,Pocket,http://images.amazon.com/images/P/067102423X.0...,http://images.amazon.com/images/P/067102423X.0...,http://images.amazon.com/images/P/067102423X.0...


In [40]:
# preview recommendations by Funk SVD, default
recommend_books_for_user_svd_funk(252676, svd_default_model, ratings_df_adj, books_df)

Unnamed: 0,Book-Title,Estimated_Rate,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,Harry Potter and the Chamber of Secrets Postca...,9.096722,439425220,J. K. Rowling,2002,Scholastic,http://images.amazon.com/images/P/0439425220.0...,http://images.amazon.com/images/P/0439425220.0...,http://images.amazon.com/images/P/0439425220.0...
1,"My Sister's Keeper : A Novel (Picoult, Jodi)",8.946838,743454529,Jodi Picoult,2004,Atria,http://images.amazon.com/images/P/0743454529.0...,http://images.amazon.com/images/P/0743454529.0...,http://images.amazon.com/images/P/0743454529.0...
2,Dilbert: A Book of Postcards,8.933334,836213319,Scott Adams,1996,Andrews McMeel Pub,http://images.amazon.com/images/P/0836213319.0...,http://images.amazon.com/images/P/0836213319.0...,http://images.amazon.com/images/P/0836213319.0...
4,Harry Potter and the Prisoner of Azkaban (Book 3),8.915847,439136369,J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/0439136369.0...,http://images.amazon.com/images/P/0439136369.0...,http://images.amazon.com/images/P/0439136369.0...
6,Weirdos From Another Planet!,8.874482,836218620,Bill Watterson,1990,Andrews McMeel Publishing,http://images.amazon.com/images/P/0836218620.0...,http://images.amazon.com/images/P/0836218620.0...,http://images.amazon.com/images/P/0836218620.0...


In [41]:
# preview recommendations by Funk SVD, tuned
recommend_books_for_user_svd_funk(252676, svd_best_parameters_model, ratings_df_adj, books_df)

Unnamed: 0,Book-Title,Estimated_Rate,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,Dilbert: A Book of Postcards,8.957098,0836213319,Scott Adams,1996,Andrews McMeel Pub,http://images.amazon.com/images/P/0836213319.0...,http://images.amazon.com/images/P/0836213319.0...,http://images.amazon.com/images/P/0836213319.0...
1,Harry Potter and the Chamber of Secrets Postca...,8.844976,0439425220,J. K. Rowling,2002,Scholastic,http://images.amazon.com/images/P/0439425220.0...,http://images.amazon.com/images/P/0439425220.0...,http://images.amazon.com/images/P/0439425220.0...
2,"My Sister's Keeper : A Novel (Picoult, Jodi)",8.782288,0743454529,Jodi Picoult,2004,Atria,http://images.amazon.com/images/P/0743454529.0...,http://images.amazon.com/images/P/0743454529.0...,http://images.amazon.com/images/P/0743454529.0...
3,Scientific Progress Goes 'Boink': A Calvin an...,8.673051,0836218787,Bill Watterson,1991,Andrews McMeel Publishing,http://images.amazon.com/images/P/0836218787.0...,http://images.amazon.com/images/P/0836218787.0...,http://images.amazon.com/images/P/0836218787.0...
4,The Time Traveler's Wife,8.65804,193156146X,Audrey Niffenegger,2003,MacAdam/Cage Publishing,http://images.amazon.com/images/P/193156146X.0...,http://images.amazon.com/images/P/193156146X.0...,http://images.amazon.com/images/P/193156146X.0...
