In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import GridSearchCV
import pandas as pd

In [2]:
df = pd.read_csv('Ratings.csv', low_memory=False)
books = pd.read_csv('Books.csv', low_memory=False)

df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
books = books[['ISBN', 'Book-Title']]
books.head()

Unnamed: 0,ISBN,Book-Title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [5]:
df = pd.merge(df, books, on='ISBN')
df

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0,Flesh Tones: A Novel
1,2313,034545104X,5,Flesh Tones: A Novel
2,6543,034545104X,0,Flesh Tones: A Novel
3,8680,034545104X,5,Flesh Tones: A Novel
4,10314,034545104X,9,Flesh Tones: A Novel
...,...,...,...,...
1031131,276688,0517145553,0,Mostly Harmless
1031132,276688,1575660792,7,Gray Matter
1031133,276690,0590907301,0,Triplet Trouble and the Class Trip (Triplet Tr...
1031134,276704,0679752714,0,A Desert of Pure Feeling (Vintage Contemporaries)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1031136 entries, 0 to 1031135
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1031136 non-null  int64 
 1   ISBN         1031136 non-null  object
 2   Book-Rating  1031136 non-null  int64 
 3   Book-Title   1031136 non-null  object
dtypes: int64(2), object(2)
memory usage: 39.3+ MB


In [7]:
df = df[df['Book-Rating'] != 0]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383842 entries, 1 to 1031135
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      383842 non-null  int64 
 1   ISBN         383842 non-null  object
 2   Book-Rating  383842 non-null  int64 
 3   Book-Title   383842 non-null  object
dtypes: int64(2), object(2)
memory usage: 14.6+ MB


In [8]:
counts = df['User-ID'].value_counts()
df = df[df['User-ID'].isin(counts[counts > 10].index)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 255819 entries, 1 to 1031132
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   User-ID      255819 non-null  int64 
 1   ISBN         255819 non-null  object
 2   Book-Rating  255819 non-null  int64 
 3   Book-Title   255819 non-null  object
dtypes: int64(2), object(2)
memory usage: 9.8+ MB


In [9]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df['UserID']=le.fit_transform(df['User-ID'])
df['BookID']=le.fit_transform(df['ISBN'])
df.drop('User-ID',axis=1,inplace=True)
df

Unnamed: 0,ISBN,Book-Rating,Book-Title,UserID,BookID
1,034545104X,5,Flesh Tones: A Novel,46,21549
3,034545104X,5,Flesh Tones: A Novel,165,21549
4,034545104X,9,Flesh Tones: A Novel,199,21549
14,034545104X,8,Flesh Tones: A Novel,1658,21549
18,034545104X,9,Flesh Tones: A Novel,2119,21549
...,...,...,...,...,...
1031120,0140290788,9,Time Out Chicago (Time Out Chicago),5943,8527
1031121,0609806491,7,The Zen of Proposal Writing: An Expert's Stres...,5943,59641
1031123,1931333246,10,Chorus,5943,110132
1031127,0312264186,8,You Can't Catch Death: A Daughter's Memoir,5944,14956


In [10]:
reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(df[['UserID', 'BookID', 'Book-Rating']], reader=reader)

In [11]:
param_grid = {
    "n_epochs": [5, 10],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, n_jobs=-1, joblib_verbose=1)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


1.6003425657226797
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.3min finished


In [12]:
svd = SVD(n_epochs = 10, lr_all=0.005, reg_all=0.4)

In [13]:
user = df['UserID'].sample()
user

103538    1001
Name: UserID, dtype: int64

In [14]:
rated = df[df['UserID'].isin(user)]
rated

Unnamed: 0,ISBN,Book-Rating,Book-Title,UserID,BookID
24563,0812571118,5,Hope of Earth (Geodyssey),1001,83444
43354,0804109729,5,Russka : The Novel of Russia,1001,80199
72861,080410526X,5,All I Really Need to Know,1001,80129
102506,0345354613,5,Eaters of the Dead,1001,20147
102618,0345378482,5,The Andromeda Strain,1001,20444
103538,034540288X,5,The Lost World,1001,20762
103825,0345417623,5,Timeline,1001,20928
131096,0380729342,5,Tales from Watership Down,1001,29252
188131,0451525221,5,Scarlet Letter,1001,47664
219207,0345370775,5,Jurassic Park,1001,20334


In [15]:
recommendations = df[~df['UserID'].isin(user)]
recommendations

Unnamed: 0,ISBN,Book-Rating,Book-Title,UserID,BookID
1,034545104X,5,Flesh Tones: A Novel,46,21549
3,034545104X,5,Flesh Tones: A Novel,165,21549
4,034545104X,9,Flesh Tones: A Novel,199,21549
14,034545104X,8,Flesh Tones: A Novel,1658,21549
18,034545104X,9,Flesh Tones: A Novel,2119,21549
...,...,...,...,...,...
1031120,0140290788,9,Time Out Chicago (Time Out Chicago),5943,8527
1031121,0609806491,7,The Zen of Proposal Writing: An Expert's Stres...,5943,59641
1031123,1931333246,10,Chorus,5943,110132
1031127,0312264186,8,You Can't Catch Death: A Daughter's Memoir,5944,14956


In [16]:
recommendations = recommendations.drop_duplicates(subset=['BookID'])
recommendations

Unnamed: 0,ISBN,Book-Rating,Book-Title,UserID,BookID
1,034545104X,5,Flesh Tones: A Novel,46,21549
66,0446520802,10,The Notebook,134,42516
181,2080674722,7,Les Particules Elementaires,4121,111065
189,038550120X,10,A Painted House,221,32359
370,0425115801,10,Lightning,45,36898
...,...,...,...,...,...
1031120,0140290788,9,Time Out Chicago (Time Out Chicago),5943,8527
1031121,0609806491,7,The Zen of Proposal Writing: An Expert's Stres...,5943,59641
1031123,1931333246,10,Chorus,5943,110132
1031127,0312264186,8,You Can't Catch Death: A Daughter's Memoir,5944,14956


In [17]:
recommendations = recommendations[['BookID', 'Book-Title']]
recommendations = recommendations.reset_index()
recommendations

Unnamed: 0,index,BookID,Book-Title
0,1,21549,Flesh Tones: A Novel
1,66,42516,The Notebook
2,181,111065,Les Particules Elementaires
3,189,32359,A Painted House
4,370,36898,Lightning
...,...,...,...
119468,1031120,8527,Time Out Chicago (Time Out Chicago)
119469,1031121,59641,The Zen of Proposal Writing: An Expert's Stres...
119470,1031123,110132,Chorus
119471,1031127,14956,You Can't Catch Death: A Daughter's Memoir


In [18]:
recommendations = recommendations.drop(['index'], axis=1)
recommendations

Unnamed: 0,BookID,Book-Title
0,21549,Flesh Tones: A Novel
1,42516,The Notebook
2,111065,Les Particules Elementaires
3,32359,A Painted House
4,36898,Lightning
...,...,...
119468,8527,Time Out Chicago (Time Out Chicago)
119469,59641,The Zen of Proposal Writing: An Expert's Stres...
119470,110132,Chorus
119471,14956,You Can't Catch Death: A Daughter's Memoir


In [19]:
userid = user.values[0]

In [20]:
trainset = data.build_full_trainset()
svd.fit(trainset)
recommendations['Predicted-Rating'] = recommendations['BookID'].apply(lambda x: svd.predict(userid, x).est)
recommendations = recommendations.sort_values('Predicted-Rating', ascending=False)
recommendations.head(10)

Unnamed: 0,BookID,Book-Title,Predicted-Rating
25872,38769,Harry Potter and the Chamber of Secrets Postca...,7.05771
22197,73132,"My Sister's Keeper : A Novel (Picoult, Jodi)",7.018592
6579,19948,"The Return of the King (The Lord of the Rings,...",7.015286
12666,7643,84 Charing Cross Road,6.961999
5220,19947,"The Two Towers (The Lord of the Rings, Part 2)",6.951351
4844,59813,"The Two Towers (The Lord of the Rings, Part 2)",6.9436
25341,2144,The Giving Tree,6.912093
48803,86366,Dilbert: A Book of Postcards,6.910684
4796,38487,Harry Potter and the Goblet of Fire (Book 4),6.898763
7890,86425,Weirdos From Another Planet!,6.896178
