# Project 4 - Books Recommendation using SVD
Collaborative based filtering->Item based

In [1]:
# !pip install scikit-surprise

# Import Dependencies

In [28]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from surprise import accuracy, Dataset, Reader, SVD, BaselineOnly, PredictionImpossible
from surprise.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
import random
import numpy as np
import statistics as st
from scipy.sparse.linalg import svds

# Explore the data

In [3]:
#Creating dataframes from csv files to read the data
books_df_original = pd.read_csv('./Resources/Books.csv')
users_df = pd.read_csv('./Resources/Users.csv')
ratings_df_original = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [4]:
# Filter out data with no publication year
books_df = books_df_original[books_df_original['Year-Of-Publication'] != 0]

In [5]:
# remove duplicated books records if any by looking at ISBN
books_df=books_df.drop_duplicates(subset=['ISBN'])

In [6]:
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [7]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 267790 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 267790 non-null  object
 1   Book-Title           267790 non-null  object
 2   Book-Author          267788 non-null  object
 3   Year-Of-Publication  267790 non-null  object
 4   Publisher            267788 non-null  object
 5   Image-URL-S          267790 non-null  object
 6   Image-URL-M          267790 non-null  object
 7   Image-URL-L          267787 non-null  object
dtypes: object(8)
memory usage: 18.4+ MB


In [8]:
duplicated_titles=books_df[books_df.duplicated(subset=['Book-Title'],keep=False)].sort_values(by='Book-Title')
duplicated_titles.head()
# ????????how to handle those duplicates????????

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
75637,1565920465,!%@ (A Nutshell handbook),Donnalyn Frey,1994,O'Reilly,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...,http://images.amazon.com/images/P/1565920465.0...
156341,1565920317,!%@ (A Nutshell handbook),Donnalyn Frey,1993,O'Reilly,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...,http://images.amazon.com/images/P/1565920317.0...
140618,792276833,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2000,National Geographic,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...,http://images.amazon.com/images/P/0792276833.0...
158204,792277295,'A Hell of a Place to Lose a Cow': An American...,Tim Brookes,2001,National Geographic,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...,http://images.amazon.com/images/P/0792277295.0...
10438,451168089,'Salem's Lot,Stephen King,1990,Signet Book,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...,http://images.amazon.com/images/P/0451168089.0...


In [9]:
ratings_df=ratings_df_original.copy()
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [10]:
# update the datatype of a 'Book-Rating' field to numeric one
ratings_df['Book-Rating']=pd.to_numeric(ratings_df['Book-Rating'],errors='coerce')
# and check the result
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


# Training: data preparation

### Change ISBN with Titles
Merge ratings with books data in order to change isbn with title and leave only those ratings data for which we have title info


In [11]:
ratings_df

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [12]:
ratings_df=pd.merge(books_df,ratings_df,on='ISBN', how = 'inner')
ratings_df

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0
...,...,...,...,...,...,...,...,...,...,...
1018387,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,276463,7
1018388,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,276579,4
1018389,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,276680,0
1018390,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,276680,0


In [13]:
ratings_df=ratings_df.drop(['ISBN','Book-Author','Year-Of-Publication','Publisher','Image-URL-S','Image-URL-M','Image-URL-L'], axis=1)
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1018387,There's a Bat in Bunk Five,276463,7
1018388,From One to One Hundred,276579,4
1018389,Lily Dale : The True Story of the Town that Ta...,276680,0
1018390,Republic (World's Classics),276680,0


In [14]:
ratings_df=ratings_df.dropna()
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1018387,There's a Bat in Bunk Five,276463,7
1018388,From One to One Hundred,276579,4
1018389,Lily Dale : The True Story of the Town that Ta...,276680,0
1018390,Republic (World's Classics),276680,0


In [15]:
# check if there are duplicated records when same user rated book(s) with same title several times
ratings_df[ratings_df.duplicated(subset=['Book-Title', 'User-ID'],keep=False)].sort_values(by=['Book-Title','User-ID'])

Unnamed: 0,Book-Title,User-ID,Book-Rating
494566,10 Lb. Penalty,94923,0
605017,10 Lb. Penalty,94923,0
494570,10 Lb. Penalty,128835,0
605021,10 Lb. Penalty,128835,9
494575,10 Lb. Penalty,198711,0
...,...,...,...
278376,"\O\"" Is for Outlaw""",155147,0
106623,"\O\"" Is for Outlaw""",158295,0
278379,"\O\"" Is for Outlaw""",158295,6
106648,"\O\"" Is for Outlaw""",196077,0


In [16]:
# drop duplicated records when same user could rate book(s) with same title
ratings_df=ratings_df.drop_duplicates()
ratings_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
0,Classical Mythology,2,0
1,Clara Callan,8,5
2,Clara Callan,11400,0
3,Clara Callan,11676,8
4,Clara Callan,41385,0
...,...,...,...
1018387,There's a Bat in Bunk Five,276463,7
1018388,From One to One Hundred,276579,4
1018389,Lily Dale : The True Story of the Town that Ta...,276680,0
1018390,Republic (World's Classics),276680,0


In [17]:
#??????????
# should we may have avg rate here per duplicates set????
#??????????

### Only leave statistically signifacant data

In [18]:
# Define whar are the number of rates per book and books rated by user we treat as statistically significant
min_books_rated_by_user=50
min_rates_received_by_book=25

In [19]:
groupped_r_users=ratings_df.groupby('User-ID')['Book-Rating'].count()
groupped_r_users[:5]

User-ID
2      1
8     17
9      3
10     1
12     1
Name: Book-Rating, dtype: int64

In [20]:
groupped_r_books=ratings_df.groupby('Book-Title')['User-ID'].count()
groupped_r_books[:5]

Book-Title
 A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)    4
 Always Have Popsicles                                                                                        1
 Apple Magic (The Collector's series)                                                                         1
 Ask Lily (Young Women of Faith: Lily Series, Book 5)                                                         1
 Beyond IBM: Leadership Marketing and Finance for the 1990s                                                   1
Name: User-ID, dtype: int64

In [21]:
#select only those books which were rated more than min_rates_received_by_book
titles_with_acceptable_rates_count=list(groupped_r_books[groupped_r_books>min_rates_received_by_book].index)
titles_with_acceptable_rates_count[:5]

["'Salem's Lot",
 '10 Lb. Penalty',
 '101 Dalmatians',
 '14,000 Things to Be Happy About',
 '16 Lighthouse Road']

In [22]:
#select only those users (user_id) who rated more than min_books_rated_by_user books
user_ids_with_acceptable_books_count_rated=list(groupped_r_users[groupped_r_users>min_books_rated_by_user].index)
user_ids_with_acceptable_books_count_rated[:5]

[243, 254, 507, 638, 643]

In [23]:
# filter rating-user data to have only books/users of interest (which have highest rates count and rated highest number of books respectively)
rating_input_df=ratings_df[ratings_df['Book-Title'].isin(titles_with_acceptable_rates_count)&ratings_df['User-ID'].isin(user_ids_with_acceptable_books_count_rated)]
rating_input_df

Unnamed: 0,Book-Title,User-ID,Book-Rating
31,The Kitchen God's Wife,11676,9
32,The Kitchen God's Wife,29526,9
33,The Kitchen God's Wife,36836,0
34,The Kitchen God's Wife,46398,9
38,The Kitchen God's Wife,113270,0
...,...,...,...
1017769,Angel Falls,244688,0
1018040,Naked Prey,250405,0
1018092,The Thin Woman,259260,0
1018240,"The Two Towers (The Lord of the Rings, Part 2)",259901,10


### Prepare data for parsing 

In [45]:
# Pivot to summarise and count
df_books_ratigs_user=rating_input_df.pivot_table(index='User-ID', columns='Book-Title', values='Book-Rating').fillna(0)
df_books_ratigs_user

Book-Title,'Salem's Lot,10 Lb. Penalty,101 Dalmatians,"14,000 Things to Be Happy About",16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,...,Zia,Zodiac: The Eco-Thriller,Zombies of the Gene Pool,Zoya,ZwÃ?Â¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Demean the data
mtrx = df_books_ratigs_user.to_numpy()
ratings_mean = np.mean(mtrx, axis = 1)
normalized_mtrx = mtrx - ratings_mean.reshape(-1, 1)

In [26]:
normalized_mtrx

array([[-0.02290493, -0.02290493, -0.02290493, ..., -0.02290493,
        -0.02290493, -0.02290493],
       [-0.0354481 , -0.0354481 , -0.0354481 , ..., -0.0354481 ,
        -0.0354481 , -0.0354481 ],
       [-0.02726777, -0.02726777, -0.02726777, ..., -0.02726777,
        -0.02726777, -0.02726777],
       ...,
       [-0.02417742, -0.02417742, -0.02417742, ..., -0.02417742,
        -0.02417742, -0.02417742],
       [-0.03326668, -0.03326668, -0.03326668, ..., -0.03326668,
        -0.03326668, -0.03326668],
       [-0.01781494, -0.01781494, -0.01781494, ..., -0.01781494,
        -0.01781494, -0.01781494]])

In [29]:
# Singular value decomposition
U, sigma, Vt = svds(normalized_mtrx, k = 50)

In [30]:
# Convert to diagonal form
sigma = np.diag(sigma)

In [31]:
sigma

array([[101.02923368,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        , 101.78194169,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        , 102.33660645, ...,   0.        ,
          0.        ,   0.        ],
       ...,
       [  0.        ,   0.        ,   0.        , ..., 216.03192427,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
        238.4299931 ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        , 294.45675038]])

In [46]:
# Make predictions for any user
all_predicted_ratings = np.dot(np.dot(U, sigma), Vt) + ratings_mean.reshape(-1, 1)
all_predicted_ratings

array([[-0.14424077, -0.00259141, -0.11721127, ...,  0.06123577,
         0.00253428, -0.03018032],
       [-0.09990855,  0.25106506,  0.00071853, ...,  0.13603517,
        -0.15587935,  0.31360657],
       [ 0.01801908,  0.01312671, -0.03609767, ...,  0.01617085,
        -0.08357854,  0.13767384],
       ...,
       [-0.00979758, -0.03800137,  0.06907432, ..., -0.04828494,
        -0.05670352,  0.01426735],
       [-0.05353365, -0.11429539, -0.03932996, ..., -0.14275768,
         0.10207638, -0.15368997],
       [ 0.06173385,  0.00898167, -0.08561376, ...,  0.00669095,
         0.15228131,  0.05863718]])

In [47]:
preds_df = pd.DataFrame(all_predicted_ratings, columns=df_books_ratigs_user.columns)
preds_df

Book-Title,'Salem's Lot,10 Lb. Penalty,101 Dalmatians,"14,000 Things to Be Happy About",16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,...,Zia,Zodiac: The Eco-Thriller,Zombies of the Gene Pool,Zoya,ZwÃ?Â¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,stardust
0,-0.144241,-0.002591,-0.117211,-0.007607,0.012945,-0.242277,0.265008,-0.047088,-0.033509,-0.059662,...,-0.021048,-0.012775,0.011803,-0.080186,-0.004130,-0.006264,0.115745,0.061236,0.002534,-0.030180
1,-0.099909,0.251065,0.000719,-0.082823,-0.072575,1.137276,-0.057849,0.090232,-0.019175,0.100515,...,-0.066169,-0.038016,-0.036261,-0.138319,-0.027700,-0.166138,0.534403,0.136035,-0.155879,0.313607
2,0.018019,0.013127,-0.036098,-0.036381,0.029571,-0.166750,0.371524,0.015332,-0.046320,0.019409,...,-0.023027,-0.030866,0.029620,-0.091450,0.011667,-0.004121,0.040088,0.016171,-0.083579,0.137674
3,-0.094840,-0.201020,-0.083896,0.081388,0.067674,0.237455,1.193102,-0.055528,0.320374,-0.063345,...,0.047124,-0.263175,-0.052005,-0.003858,-0.062884,0.383126,-0.737605,0.156780,-0.196635,-0.228960
4,-0.006001,0.012014,0.023936,0.018510,-0.001548,-0.056937,-0.000937,0.005532,-0.002567,0.012163,...,0.021089,0.012961,0.011113,-0.005669,0.013244,0.012148,0.037577,0.005636,0.014572,0.012939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2948,-0.016842,0.016580,0.093734,0.018437,0.034791,0.081165,0.144104,-0.011882,0.034531,-0.002066,...,0.027538,0.009644,-0.003951,0.021809,0.017925,0.006397,0.065912,0.023159,-0.018942,-0.025009
2949,0.009217,0.021066,-0.018854,0.011058,0.109338,-0.004169,-0.062109,0.041287,0.042265,-0.013872,...,0.007935,0.007086,0.022516,-0.003510,0.012895,0.002142,-0.029461,-0.001985,0.026248,-0.014855
2950,-0.009798,-0.038001,0.069074,-0.017535,0.070475,-0.034533,0.298314,-0.087773,0.146207,0.004156,...,-0.022475,-0.063280,0.035762,0.019498,-0.031090,0.101748,0.475979,-0.048285,-0.056704,0.014267
2951,-0.053534,-0.114295,-0.039330,-0.004223,-0.095233,-0.143451,0.065309,0.098749,0.175882,-0.031390,...,0.041965,-0.007244,-0.054857,0.159924,-0.004602,0.091621,-0.265028,-0.142758,0.102076,-0.153690


In [48]:
preds_df['user_id'] = df_books_ratigs_user.index
preds_df

Book-Title,'Salem's Lot,10 Lb. Penalty,101 Dalmatians,"14,000 Things to Be Happy About",16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,...,Zodiac: The Eco-Thriller,Zombies of the Gene Pool,Zoya,ZwÃ?Â¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,stardust,user_id
0,-0.144241,-0.002591,-0.117211,-0.007607,0.012945,-0.242277,0.265008,-0.047088,-0.033509,-0.059662,...,-0.012775,0.011803,-0.080186,-0.004130,-0.006264,0.115745,0.061236,0.002534,-0.030180,243
1,-0.099909,0.251065,0.000719,-0.082823,-0.072575,1.137276,-0.057849,0.090232,-0.019175,0.100515,...,-0.038016,-0.036261,-0.138319,-0.027700,-0.166138,0.534403,0.136035,-0.155879,0.313607,254
2,0.018019,0.013127,-0.036098,-0.036381,0.029571,-0.166750,0.371524,0.015332,-0.046320,0.019409,...,-0.030866,0.029620,-0.091450,0.011667,-0.004121,0.040088,0.016171,-0.083579,0.137674,507
3,-0.094840,-0.201020,-0.083896,0.081388,0.067674,0.237455,1.193102,-0.055528,0.320374,-0.063345,...,-0.263175,-0.052005,-0.003858,-0.062884,0.383126,-0.737605,0.156780,-0.196635,-0.228960,638
4,-0.006001,0.012014,0.023936,0.018510,-0.001548,-0.056937,-0.000937,0.005532,-0.002567,0.012163,...,0.012961,0.011113,-0.005669,0.013244,0.012148,0.037577,0.005636,0.014572,0.012939,643
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2948,-0.016842,0.016580,0.093734,0.018437,0.034791,0.081165,0.144104,-0.011882,0.034531,-0.002066,...,0.009644,-0.003951,0.021809,0.017925,0.006397,0.065912,0.023159,-0.018942,-0.025009,278188
2949,0.009217,0.021066,-0.018854,0.011058,0.109338,-0.004169,-0.062109,0.041287,0.042265,-0.013872,...,0.007086,0.022516,-0.003510,0.012895,0.002142,-0.029461,-0.001985,0.026248,-0.014855,278418
2950,-0.009798,-0.038001,0.069074,-0.017535,0.070475,-0.034533,0.298314,-0.087773,0.146207,0.004156,...,-0.063280,0.035762,0.019498,-0.031090,0.101748,0.475979,-0.048285,-0.056704,0.014267,278582
2951,-0.053534,-0.114295,-0.039330,-0.004223,-0.095233,-0.143451,0.065309,0.098749,0.175882,-0.031390,...,-0.007244,-0.054857,0.159924,-0.004602,0.091621,-0.265028,-0.142758,0.102076,-0.153690,278633


# Recommendation generation

In [51]:
# Define the number of books to recommend by algorithm
recommendations_count=5

### Find list of books (titles) among which recommendations will be selected

In [50]:
# let us select dummy user
# to update into function later!!!!!!!!!!!!!!
user_id=278418

# find all the books (titles) in ratings_df. 
# Note, this only include those books which both: 
# (1) were rated at least min_rates_received_by_book times by users, who rated at least min_books_rated_by_user
# and (2) we have titles for those books
# ??????????We may need to remove (1) condition and look for books, which were rated at least once??????????
all_titles=ratings_df['Book-Title'].unique()
all_titles

# find the books (titles) that were rated and presumably read by a user
rated_titles=[i for i in ratings_df.loc[ratings_df['User-ID']==user_id,'Book-Title']]
rated_titles

# find the books (titles) that were not rated and presumably not read by a user
titles_input_to_recommend=[i for i in all_titles if i not in rated_titles]
titles_input_to_recommend[:5]

['Classical Mythology',
 'Clara Callan',
 'Decision in Normandy',
 'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It',
 'The Mummies of Urumchi']

In [52]:
# user's predictons
user_predictions = preds_df[preds_df['user_id']==user_id]
user_predictions[:5]

Book-Title,'Salem's Lot,10 Lb. Penalty,101 Dalmatians,"14,000 Things to Be Happy About",16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,...,Zodiac: The Eco-Thriller,Zombies of the Gene Pool,Zoya,ZwÃ?Â¶lf.,"\ Lamb to the Slaughter and Other Stories (Penguin 60s S.)""","\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",e,stardust,user_id
2949,0.009217,0.021066,-0.018854,0.011058,0.109338,-0.004169,-0.062109,0.041287,0.042265,-0.013872,...,0.007086,0.022516,-0.00351,0.012895,0.002142,-0.029461,-0.001985,0.026248,-0.014855,278418


In [54]:
user_predictions_t=user_predictions.T
user_predictions_t.sort_values(2949,ascending = False)

Unnamed: 0_level_0,2949
Book-Title,Unnamed: 1_level_1
user_id,278418.000000
Dance upon the Air (Three Sisters Island Trilogy),0.665423
Face the Fire (Three Sisters Island Trilogy),0.583523
Heaven and Earth (Three Sisters Island Trilogy),0.544245
Heart of the Sea (Irish Trilogy),0.520371
...,...
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)),-0.191411
Empire Falls,-0.194931
"Tuesdays with Morrie: An Old Man, a Young Man, and Life's Greatest Lesson",-0.202966
She's Come Undone (Oprah's Book Club (Paperback)),-0.212456


In [58]:
user_predictions_f=user_predictions_t[user_predictions_t.index.isin(titles_input_to_recommend)]
pred=user_predictions_f.index[:recommendations_count]
pred

Index([''Salem's Lot', '10 Lb. Penalty', '14,000 Things to Be Happy About',
       '16 Lighthouse Road', '1984'],
      dtype='object', name='Book-Title')

In [59]:
print(f'for a user {user_id}, who read the following book(s):') #check ratings as well
display(rated_titles)
print('The recommendation is following:')
display(pred)

for a user 278418, who read the following book(s):


['Beloved (Plume Contemporary Fiction)',
 'Mary-Kate &amp; Ashley Switching Goals (Mary-Kate and Ashley Starring in)',
 'To Kill a Mockingbird',
 'Purity in Death',
 'This Year It Will Be Different: And Other Stories',
 'The Street Lawyer',
 "Tess of the D'Urbervilles (Wordsworth Classics)",
 'Slow Waltz in Cedar Bend',
 'The Cat Who Came to Breakfast (Cat Who... (Hardcover))',
 'Petals on the River',
 'Pet Sematary',
 'A Man in Full',
 'A Painted House',
 'The Hunt for Red October',
 'The Little Prince',
 'Moonlight Becomes You',
 'The Firm',
 'Murder at the Kennedy Center (Capital Crime Mysteries)',
 'Frankenstein (Illustrated Classics Series)',
 'The First Wives Club Movie Tie In',
 "River's End",
 'Night Sins',
 'Prince Caspian',
 'The Vampire Lestat (Vampire Chronicles, Book II)',
 "Her Mother's Daughter",
 'Vanished',
 '101 Dalmatians',
 'My First Book about Space (Golden Look-Look Books (Paperback))',
 'Along Came a Spider (Alex Cross Novels)',
 "Where the Heart Is (Oprah's Book

The recommendation is following:


Index([''Salem's Lot', '10 Lb. Penalty', '14,000 Things to Be Happy About',
       '16 Lighthouse Road', '1984'],
      dtype='object', name='Book-Title')

# Check Accuracy

In [None]:
# Check the accuracy for both default vs tuned method
# display(f'Accuracy of a Tuned Model: {accuracy.rmse(svd_best_parameters_model.test(test_set), verbose=False)}')
display(f'Accuracy of a Model with Default Parameters: {accuracy.rmse(svd_default_model.test(test_set), verbose=False)}')