# Project 4 - Books Recommendation using SVD
Collaborative based filtering->Item based

In [1]:
# !pip install scikit-surprise

# Import Dependencies

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, Reader, SVD, BaselineOnly, PredictionImpossible
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
import random
import numpy as np
import statistics as st
from scipy.sparse.linalg import svds

# Explore the data

In [3]:
#Creating dataframes from csv files to read the data
books_df_original = pd.read_csv('./Resources/Books.csv')
ratings_df_original = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [4]:
# remove duplicated books records if any by looking at ISBN
books_df=books_df_original.copy()
books_df=books_df.drop_duplicates(subset=['ISBN'])

In [5]:
duplicated_titles=books_df[books_df.duplicated(subset=['Book-Title'],keep=False)].sort_values(by='Book-Title')
duplicated_titles.head()
# so far we leave those titles as is to not lost ratings

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
75637,1565920465,!%@ (A Nutshell handbook),Donnalyn Frey,1994,O'Reilly,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...
156341,1565920317,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...
140618,792276833,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2000,National Geographic,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...
158204,792277295,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2001,National Geographic,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...
10438,451168089,'Salem's Lot,Stephen King,1990,Signet Book,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...


In [6]:
# update the datatype of a 'Year-Of-Publication' field to numeric one
books_df['Year-Of-Publication']=pd.to_numeric(books_df['Year-Of-Publication'],errors='coerce')
# Filter out data with no publication year
books_df = books_df[books_df['Year-Of-Publication'] > 0]
books_df['Year-Of-Publication']=books_df['Year-Of-Publication'].astype(int)
# and check the result
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 266739 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 266739 non-null  object
 1   Book-Title           266739 non-null  object
 2   Book-Author          266737 non-null  object
 3   Year-Of-Publication  266739 non-null  int64 
 4   Publisher            266737 non-null  object
 5   Image-URL-S          266739 non-null  object
 6   Image-URL-M          266739 non-null  object
 7   Image-URL-L          266739 non-null  object
dtypes: int64(1), object(7)
memory usage: 18.3+ MB


In [7]:
ratings_df=ratings_df_original.copy()
# update the datatype of a 'Book-Rating' field to numeric one
ratings_df['Book-Rating']=pd.to_numeric(ratings_df['Book-Rating'],errors='coerce')
# and check the result
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


# Training: data preparation

### Change ISBN with Titles
Merge ratings with books data in order to change isbn with title and leave only those ratings data for which we have title info


In [8]:
ratings_df=pd.merge(books_df,ratings_df,on='ISBN', how = 'inner')
ratings_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0
...,...,...,...,...,...,...,...,...,...,...
1017118,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,276463,7
1017119,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,276579,4
1017120,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,276680,0
1017121,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,276680,0


In [9]:
# delete unnecessary columns
ratings_df=ratings_df.drop(['ISBN','Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1017118,There's a Bat in Bunk Five,276463,7
1017119,From One to One Hundred,276579,4
1017120,Lily Dale : The True Story of the Town that Ta...,276680,0
1017121,Republic (World's Classics),276680,0


In [10]:
ratings_df=ratings_df.dropna()
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1017118,There's a Bat in Bunk Five,276463,7
1017119,From One to One Hundred,276579,4
1017120,Lily Dale : The True Story of the Town that Ta...,276680,0
1017121,Republic (World's Classics),276680,0


In [11]:
# Filter out data with zero ratings
ratings_df = ratings_df[ratings_df['Book-Rating'] != 0]

In [12]:
# check if there are duplicated records when same user rated book(s) with same title several times
ratings_df[ratings_df.duplicated(subset=['Book-Title', 'User-ID'],keep=False)].sort_values(by=['Book-Title','User-ID'])

Unnamed: 0,Book-Title,User-ID,Book-Rating
93704,1984,112083,9
237681,1984,112083,9
85516,1st to Die: A Novel,11676,10
148791,1st to Die: A Novel,11676,8
85683,1st to Die: A Novel,143175,10
...,...,...,...
789024,Zoids Chaotic Century (Zoids: Chaotic Century ...,63714,10
789027,Zoids Chaotic Century (Zoids: Chaotic Century ...,63714,10
789029,Zoids Chaotic Century (Zoids: Chaotic Century ...,63714,10
404430,Zoya,62272,9


In [13]:
# Use avg rate per duplicates set
ratings_df=ratings_df.groupby(['Book-Title','User-ID'])['Book-Rating'].mean().reset_index()
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,A Light in the Storm: The Civil War Diary of ...,96448,9.0
1,"Ask Lily (Young Women of Faith: Lily Series, ...",269557,8.0
2,Dark Justice,98391,10.0
3,Earth Prayers From around the World: 365 Pray...,26544,9.0
4,Earth Prayers From around the World: 365 Pray...,69120,10.0
...,...,...,...
377060,Ã?Â?rger mit Produkt X. Roman.,133567,8.0
377061,Ã?Â?rger mit Produkt X. Roman.,225343,7.0
377062,Ã?Â?sterlich leben.,256636,7.0
377063,Ã?Â?stlich der Berge.,90839,8.0


### Only leave statistically signifacant data

In [14]:
# Define whar are the number of rates per book and books rated by user we treat as statistically significant
min_books_rated_by_user=5
min_rates_received_by_book=5

In [15]:
groupped_r_users=ratings_df.groupby('User-ID')['Book-Rating'].count()
groupped_r_users[:5]

User-ID
8     7
9     1
12    1
14    3
16    1
Name: Book-Rating, dtype: int64

In [16]:
groupped_r_books=ratings_df.groupby('Book-Title')['User-ID'].count()
groupped_r_books[:5]

Book-Title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    1
 Ask Lily (Young Women of Faith: Lily Series, Book 5)                                                         1
 Dark Justice                                                                                                 1
 Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth              7
 Final Fantasy Anthology: Official Strategy Guide (Brady Games)                                               2
Name: User-ID, dtype: int64

In [17]:
#select only those books which were rated more than min_rates_received_by_book
titles_with_acceptable_rates_count=list(groupped_r_books[groupped_r_books>min_rates_received_by_book].index)
titles_with_acceptable_rates_count[:5]

[' Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth',
 '!Yo!',
 "'Salem's Lot",
 '01-01-00: The Novel of the Millennium',
 '10 Lb. Penalty']

In [18]:
#select only those users (user_id) who rated more than min_books_rated_by_user books
user_ids_with_acceptable_books_count_rated=list(groupped_r_users[groupped_r_users>min_books_rated_by_user].index)
user_ids_with_acceptable_books_count_rated[:5]

[8, 99, 114, 242, 243]

In [19]:
# filter rating-user data to have only books/users of interest (which have highest rates count and rated highest number of books respectively)
rating_input_df=ratings_df[ratings_df['Book-Title'].isin(titles_with_acceptable_rates_count)&ratings_df['User-ID'].isin(user_ids_with_acceptable_books_count_rated)]
rating_input_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
3,Earth Prayers From around the World: 365 Pray...,26544,9.0
5,Earth Prayers From around the World: 365 Pray...,121592,7.0
6,Earth Prayers From around the World: 365 Pray...,179730,1.0
7,Earth Prayers From around the World: 365 Pray...,179744,6.0
8,Earth Prayers From around the World: 365 Pray...,205980,10.0
...,...,...,...
377003,stardust,274393,8.0
377024,why I'm like this : True Stories,36609,6.0
377025,why I'm like this : True Stories,98904,10.0
377026,why I'm like this : True Stories,105317,8.0


### Prepare data for parsing 

In [20]:
# Pivot to obtain a matrix that stores original ratings given by users for books and fill sparse values with 0-s
df_books_ratigs_user=rating_input_df.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)
df_books_ratigs_user

Book-Title,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",!Yo!,'Salem's Lot,01-01-00: The Novel of the Millennium,10 Lb. Penalty,"10,000 dreams interpreted: A dictionary of dreams",100 Best-Loved Poems (Dover Thrift Editions),100 Selected Poems by E. E. Cummings,1001 Things Everyone Should Know About Science,1001 Ways to Be Romantic,...,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""","\The Happy Prince\"" and Other Stories (Penguin Popular Classics)""","\What Do You Care What Other People Think?\"": Further Adventures of a Curious Character""",e,iI Paradiso Degli Orchi,murder@maggody.com : An Arly Hanks Mystery (Arly Hanks Mysteries (Paperback)),one hundred years of solitude,stardust,why I'm like this : True Stories
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Normilize the data, using mean normalization.
data_original = df_books_ratigs_user.to_numpy() # vectorize the data
ratings_mean = np.mean(data_original, axis = 1) # find a mean per each vector (user)
normalized_data = data_original - ratings_mean.reshape(-1, 1) #subtract mean for each user from their ratings, which centers the ratings around 0 for each user
normalized_data

array([[-0.00047032, -0.00047032, -0.00047032, ..., -0.00047032,
        -0.00047032, -0.00047032],
       [-0.00338632, -0.00338632, -0.00338632, ..., -0.00338632,
        -0.00338632, -0.00338632],
       [-0.00545574, -0.00545574, -0.00545574, ..., -0.00545574,
        -0.00545574, -0.00545574],
       ...,
       [-0.00921832, -0.00921832, -0.00921832, ..., -0.00921832,
        -0.00921832, -0.00921832],
       [-0.00216348, -0.00216348, -0.00216348, ..., -0.00216348,
        -0.00216348, -0.00216348],
       [-0.00206942, -0.00206942, -0.00206942, ..., -0.00206942,
        -0.00206942, -0.00206942]])

In [22]:
# Decompose the normilized matrix into 3, with k = 20 largest singular values in sigma
U, sigma, Vt = svds(normalized_data, k = 20)

In [23]:
# Convert vector to a diagonal matrix
sigma = np.diag(sigma)

In [24]:
# Compose matrix with predictions, reversing data normalization
all_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + ratings_mean.reshape(-1, 1)
all_predicted_ratings

array([[ 2.76749234e-04,  2.56617050e-04, -3.18250750e-04, ...,
         2.26621276e-04,  2.05652825e-03,  2.92103638e-04],
       [ 3.06283311e-03,  2.38460258e-03, -3.22931267e-03, ...,
         1.97707114e-03,  2.02013096e-03,  2.46152357e-03],
       [-1.95257061e-03,  2.18898890e-03, -1.63343589e-02, ...,
         8.18736062e-03,  2.37669488e-02, -1.57228824e-03],
       ...,
       [-5.72646548e-03, -1.05212318e-02, -1.57776521e-02, ...,
         1.13633683e-03, -3.01041351e-03, -1.04920651e-02],
       [ 3.68661371e-04, -2.77627166e-05, -1.59292365e-03, ...,
         5.89577267e-04,  6.57352682e-03, -3.58964724e-04],
       [ 5.71722288e-04,  5.83140687e-04, -4.79874319e-03, ...,
        -5.44191998e-04, -3.99197633e-03, -2.00798000e-03]])

In [25]:
# convert numpy array into dataframe
all_predictions_df = pd.DataFrame(all_predicted_ratings, columns=df_books_ratigs_user.columns)
all_predictions_df

Book-Title,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",!Yo!,'Salem's Lot,01-01-00: The Novel of the Millennium,10 Lb. Penalty,"10,000 dreams interpreted: A dictionary of dreams",100 Best-Loved Poems (Dover Thrift Editions),100 Selected Poems by E. E. Cummings,1001 Things Everyone Should Know About Science,1001 Ways to Be Romantic,...,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""","\The Happy Prince\"" and Other Stories (Penguin Popular Classics)""","\What Do You Care What Other People Think?\"": Further Adventures of a Curious Character""",e,iI Paradiso Degli Orchi,murder@maggody.com : An Arly Hanks Mystery (Arly Hanks Mysteries (Paperback)),one hundred years of solitude,stardust,why I'm like this : True Stories
0,0.000277,0.000257,-0.000318,0.001866,-0.000346,0.000258,0.000156,0.000162,-0.000324,0.000216,...,0.000501,0.001994,0.000309,0.000296,-0.000329,0.000295,0.000314,0.000227,0.002057,0.000292
1,0.003063,0.002385,-0.003229,0.001885,-0.001978,0.001394,0.001731,0.001224,0.000148,0.002689,...,0.018959,-0.000470,0.002694,0.002049,-0.001057,0.002400,0.002802,0.001977,0.002020,0.002462
2,-0.001953,0.002189,-0.016334,0.007484,-0.027836,-0.004844,-0.001611,-0.012436,0.038168,0.002725,...,0.051049,0.000770,0.004305,-0.001946,0.041396,-0.000579,0.000191,0.008187,0.023767,-0.001572
3,-0.002240,-0.000589,0.007707,0.000620,-0.004459,-0.003750,0.004729,0.007175,-0.010210,0.000157,...,-0.026378,0.022362,-0.002185,-0.001368,-0.013065,-0.002317,-0.004911,0.004673,0.024862,-0.003616
4,0.004814,0.000572,-0.034184,-0.003507,-0.034418,-0.004867,-0.011252,-0.001586,0.030669,-0.003335,...,0.252192,-0.013348,-0.005676,-0.003683,0.035654,-0.003713,0.005424,-0.009533,-0.032461,0.002700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10342,0.001030,0.001929,0.003451,0.003027,-0.049792,0.010983,-0.022899,-0.000859,0.096796,0.026624,...,0.134053,-0.018559,-0.004138,-0.001490,0.087351,0.000297,0.003820,-0.001500,-0.088214,0.055816
10343,0.001905,0.002045,-0.000378,0.005795,-0.000449,0.001837,0.002831,0.001621,0.000263,0.001299,...,0.025783,0.005504,0.002392,0.002310,-0.000743,0.002223,0.002234,0.000897,0.002908,0.001821
10344,-0.005726,-0.010521,-0.015778,-0.010964,0.014783,-0.003116,-0.007943,0.010750,0.043023,-0.004163,...,0.007296,-0.007664,-0.011065,-0.007550,0.061719,-0.007760,-0.005632,0.001136,-0.003010,-0.010492
10345,0.000369,-0.000028,-0.001593,0.005435,0.001273,-0.000064,0.000979,0.000445,-0.001272,0.000344,...,0.007966,0.005704,0.000250,0.000405,-0.000589,0.000291,0.000316,0.000590,0.006574,-0.000359


In [26]:
# add a colmns with user id, so that we can filter data by it
all_predictions_df['user_id'] = df_books_ratigs_user.index
all_predictions_df

Book-Title,"Earth Prayers From around the World: 365 Prayers, Poems, and Invocations for Honoring the Earth",!Yo!,'Salem's Lot,01-01-00: The Novel of the Millennium,10 Lb. Penalty,"10,000 dreams interpreted: A dictionary of dreams",100 Best-Loved Poems (Dover Thrift Editions),100 Selected Poems by E. E. Cummings,1001 Things Everyone Should Know About Science,1001 Ways to Be Romantic,...,"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""","\The Happy Prince\"" and Other Stories (Penguin Popular Classics)""","\What Do You Care What Other People Think?\"": Further Adventures of a Curious Character""",e,iI Paradiso Degli Orchi,murder@maggody.com : An Arly Hanks Mystery (Arly Hanks Mysteries (Paperback)),one hundred years of solitude,stardust,why I'm like this : True Stories,user_id
0,0.000277,0.000257,-0.000318,0.001866,-0.000346,0.000258,0.000156,0.000162,-0.000324,0.000216,...,0.001994,0.000309,0.000296,-0.000329,0.000295,0.000314,0.000227,0.002057,0.000292,8
1,0.003063,0.002385,-0.003229,0.001885,-0.001978,0.001394,0.001731,0.001224,0.000148,0.002689,...,-0.000470,0.002694,0.002049,-0.001057,0.002400,0.002802,0.001977,0.002020,0.002462,99
2,-0.001953,0.002189,-0.016334,0.007484,-0.027836,-0.004844,-0.001611,-0.012436,0.038168,0.002725,...,0.000770,0.004305,-0.001946,0.041396,-0.000579,0.000191,0.008187,0.023767,-0.001572,114
3,-0.002240,-0.000589,0.007707,0.000620,-0.004459,-0.003750,0.004729,0.007175,-0.010210,0.000157,...,0.022362,-0.002185,-0.001368,-0.013065,-0.002317,-0.004911,0.004673,0.024862,-0.003616,242
4,0.004814,0.000572,-0.034184,-0.003507,-0.034418,-0.004867,-0.011252,-0.001586,0.030669,-0.003335,...,-0.013348,-0.005676,-0.003683,0.035654,-0.003713,0.005424,-0.009533,-0.032461,0.002700,243
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10342,0.001030,0.001929,0.003451,0.003027,-0.049792,0.010983,-0.022899,-0.000859,0.096796,0.026624,...,-0.018559,-0.004138,-0.001490,0.087351,0.000297,0.003820,-0.001500,-0.088214,0.055816,278633
10343,0.001905,0.002045,-0.000378,0.005795,-0.000449,0.001837,0.002831,0.001621,0.000263,0.001299,...,0.005504,0.002392,0.002310,-0.000743,0.002223,0.002234,0.000897,0.002908,0.001821,278694
10344,-0.005726,-0.010521,-0.015778,-0.010964,0.014783,-0.003116,-0.007943,0.010750,0.043023,-0.004163,...,-0.007664,-0.011065,-0.007550,0.061719,-0.007760,-0.005632,0.001136,-0.003010,-0.010492,278843
10345,0.000369,-0.000028,-0.001593,0.005435,0.001273,-0.000064,0.000979,0.000445,-0.001272,0.000344,...,0.005704,0.000250,0.000405,-0.000589,0.000291,0.000316,0.000590,0.006574,-0.000359,278851


# Recommendation generation

In [27]:
def recommend_books_for_user(user_id, recommendations_count, all_predictions_df, ratings_df, books_df):
    # find the books (titles) that were rated and presumably read by a user
    rated_titles=[i for i in ratings_df.loc[ratings_df['User-ID']==user_id,'Book-Title']]
    titles_to_exclude=rated_titles
    titles_to_exclude.append(all_predictions_df.columns[-1])
    # find all the titles within the matrix
    all_titles=all_predictions_df.columns
    # separate those titles that were not read
    titles_input_to_recommend=[i for i in all_titles if i not in titles_to_exclude]
    # find predictions for a user
    user_predictions_all=all_predictions_df.loc[all_predictions_df['user_id']==user_id]
    # sort predictions and select top recommendations_count
    col_name=user_predictions_all.loc[user_predictions_all['user_id']==user_id].index[0]
    user_recommendation=user_predictions_all.T
    user_recommendation=user_recommendation.loc[titles_input_to_recommend].sort_values(by=col_name, ascending=False)
    top_recommendations=user_recommendation[:recommendations_count].rename(columns={col_name:'estimated rate'})
    # populate books with full info, selecting those books with the most recent year of publication
    recommendations_full_info=pd.merge(top_recommendations, books_df, left_on='Book-Title',right_on='Book-Title', how='left')
    dict_years=dict(recommendations_full_info.groupby('Book-Title')['Year-Of-Publication'].max())
    for i, row in recommendations_full_info.iterrows():
        if row['Year-Of-Publication']!=dict_years[row['Book-Title']]:
            recommendations_full_info.loc[i,'Year-Of-Publication']=0
    recommendations_full_info=recommendations_full_info[recommendations_full_info['Year-Of-Publication'] != 0]
    recommendations_full_info=recommendations_full_info.drop_duplicates(subset=['Book-Title'])
    return recommendations_full_info


In [28]:
recommend_books_for_user(6251, 5, all_predictions_df, ratings_df, books_df)

Unnamed: 0,Book-Title,estimated rate,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
2,Harry Potter and the Chamber of Secrets (Book 2),6.761486,0439420105,J. K. Rowling,2002,Scholastic,http://images.amazon.com/images/P/0439420105.0...,http://images.amazon.com/images/P/0439420105.0...,http://images.amazon.com/images/P/0439420105.0...
4,Harry Potter and the Sorcerer's Stone (Book 1),3.763791,043936213X,J. K. Rowling,2001,Scholastic,http://images.amazon.com/images/P/043936213X.0...,http://images.amazon.com/images/P/043936213X.0...,http://images.amazon.com/images/P/043936213X.0...
7,Harry Potter and the Sorcerer's Stone (Harry P...,3.3014,059035342X,J. K. Rowling,1999,Arthur A. Levine Books,http://images.amazon.com/images/P/059035342X.0...,http://images.amazon.com/images/P/059035342X.0...,http://images.amazon.com/images/P/059035342X.0...
9,High Five (A Stephanie Plum Novel),2.7022,0312971346,Janet Evanovich,2000,St. Martin's Paperbacks,http://images.amazon.com/images/P/0312971346.0...,http://images.amazon.com/images/P/0312971346.0...,http://images.amazon.com/images/P/0312971346.0...
11,Two for the Dough,2.606972,0671001795,Janet Evanovich,1996,Pocket,http://images.amazon.com/images/P/0671001795.0...,http://images.amazon.com/images/P/0671001795.0...,http://images.amazon.com/images/P/0671001795.0...


# Check Accuracy