# Modeling Recommender System with Surprise

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('fast')

from src.utilities import *
import surprise
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import BaselineOnly, SVD, NMF, KNNBasic, KNNBaseline, KNNWithMeans, NormalPredictor
from surprise.model_selection import cross_validate, train_test_split
from surprise import accuracy
from collections import defaultdict

In [5]:
ratings_df = csv('https://markg110.s3-us-west-1.amazonaws.com/data/BX-Book-Ratings.csv')
books_df = csv('https://markg110.s3-us-west-1.amazonaws.com/data/BX-Books.csv')
users_df = csv('https://markg110.s3-us-west-1.amazonaws.com/data/BX-Users.csv')

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  if self.run_code(code, result):


In [6]:
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [7]:
users_df.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [9]:
print(books_df.shape)
print('')
books_df.head()

(271360, 8)



Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [10]:
books_new = books_df.copy()
books_new.dropna(inplace=True)
columns = ['Book-Author', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']
books_df.drop(columns=columns, inplace=True)
books_df.rename(columns={'Year-Of-Publication':'Publication-Year'},inplace=True)
books_ratings_df = pd.merge(ratings_df, books_df, on='ISBN')
print(books_ratings_df.shape)
print('===========================================================')
books_ratings_df.head()

(1031136, 5)


Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Publication-Year
0,276725,034545104X,0,Flesh Tones: A Novel,2002
1,2313,034545104X,5,Flesh Tones: A Novel,2002
2,6543,034545104X,0,Flesh Tones: A Novel,2002
3,8680,034545104X,5,Flesh Tones: A Novel,2002
4,10314,034545104X,9,Flesh Tones: A Novel,2002


In [12]:
# Removing irrelevant data and casting publication year to int64
books_ratings_df = books_ratings_df[books_ratings_df['Publication-Year'] != 'Gallimard']
books_ratings_df = books_ratings_df[books_ratings_df['Publication-Year'] != 'DK Publishing Inc']
books_ratings_df['Publication-Year'] = books_ratings_df['Publication-Year'].apply(lambda x: int(x))

In [16]:
# Isolating most rated books which were published between 1975 and 2002
books_ratings_df = books_ratings_df[books_ratings_df['Publication-Year'] >= 1975]
books_ratings_df = books_ratings_df[books_ratings_df['Publication-Year'] <= 2002]
books_ratings_df.shape

(902959, 5)

In [17]:
min_book_ratings = 50 # Books: At least 50 ratings received
min_user_ratings = 50 # Users: At least 50 ratings given

In [19]:
# Trimming down dataset
filter_books = books_ratings_df['ISBN'].value_counts() > min_book_ratings # Isolating books with more than 50 ratings received
filter_books = filter_books[filter_books].index.tolist() # Isolating ISBNs
filter_users = books_ratings_df['User-ID'].value_counts() > min_user_ratings # Isolating users with more than 50 ratings given
filter_users = filter_users[filter_users].index.tolist() # Isolating User-IDs

In [23]:
# Filtering dataframe based on minimum number of ratings by ISBNs and User-IDs
filtered_df = books_ratings_df[(books_ratings_df['ISBN'].isin(filter_books)) & (books_ratings_df['User-ID'].isin(filter_users))]

## Modeling: <code>filtered_df</code>

In [32]:
columns=['Book-Title', 'Publication-Year']
filtered_df.drop(columns=columns, inplace=True)
print(filtered_df.shape)
print('')
filtered_df.head()

(117267, 3)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,User-ID,ISBN,Book-Rating
2,6543,034545104X,0
4,10314,034545104X,9
5,23768,034545104X,0
7,28523,034545104X,0
10,56157,034545104X,0


In [33]:
# Instantiate Surprise classes
reader = Reader(rating_scale=(0,10))
data = Dataset.load_from_df(filtered_df, reader)

In [34]:
# Algorithm iterator for cross_validation
algorithm = [BaselineOnly(), SVD(), KNNBasic(), KNNBaseline(), KNNWithMeans(), NormalPredictor()]

In [35]:
comparison_list = []

for algo in algorithm:
    # Perform cross validation with RMSE as evaluation metric
    results = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)
    
    # Append to comparison list
    get_result = pd.DataFrame.from_dict(results).mean(axis=0)
    get_result = get_result.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    comparison_list.append(get_result)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [36]:
# Generating cross validation results
# We choose BaselineOnly algorithm as it yields the lowest RMSE
surprise_results = pd.DataFrame(comparison_list).set_index('Algorithm').sort_values('test_rmse')
surprise_results

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,3.329897,0.133036,0.125911
KNNWithMeans,3.41999,0.504006,1.885585
KNNBaseline,3.427019,0.585988,2.249084
SVD,3.512723,3.93887,0.164002
KNNBasic,3.644096,0.466363,1.781149
NormalPredictor,4.665204,0.109652,0.168673


In [37]:
# Configuring baselines using alternating least squares (ALS) 
# and stochastic gradient descent (SGD) for comparison
print('Using SGD: Stochastic Gradient Descent')
bsl_options1 = {'method': 'sgd',
               'learning_rate': 0.00005}

algo1 = BaselineOnly(bsl_options=bsl_options1) # Note algo1: SGD
print('==========================================')

Using SGD: Stochastic Gradient Descent


In [38]:
print('Using ALS: Alternating Least Squares')
bsl_options = {'method':'als', 'n_epochs':5, 'regu_u': 12, 'reg_i':5}

algo = BaselineOnly(bsl_options=bsl_options) # Note algo: ALS

Using ALS: Alternating Least Squares


In [41]:
print('Cross validate SGD')
print('==========================================')
cross_validate(algo1, data, measures=['RMSE'], cv=5, verbose=False)

Cross validate SGD
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...
Estimating biases using sgd...


{'test_rmse': array([3.61970504, 3.62817462, 3.61813693, 3.62072998, 3.6407386 ]),
 'fit_time': (0.2858119010925293,
  0.28494977951049805,
  0.28933119773864746,
  0.2842090129852295,
  0.2846250534057617),
 'test_time': (0.08953022956848145,
  0.2155609130859375,
  0.08902454376220703,
  0.2149209976196289,
  0.09046268463134766)}

In [42]:
# Alternating Least Squares returns lower array of test RMSE
print('Cross validate ALS')
print('==========================================')
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)

Cross validate ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([3.31228886, 3.3554559 , 3.33619559, 3.35197517, 3.32144036]),
 'fit_time': (0.1008291244506836,
  0.10027408599853516,
  0.10046601295471191,
  0.09932065010070801,
  0.10054135322570801),
 'test_time': (0.21363592147827148,
  0.08981657028198242,
  0.21383142471313477,
  0.08935928344726562,
  0.09006977081298828)}

In [46]:
# Train, test, split to generate ratings predictions of each book
trainset, testset = train_test_split(data, test_size=0.20)
algo = BaselineOnly(bsl_options=bsl_options)
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 3.3504


3.3503985653366954

In [None]:
df_pred = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
df_pred['Iu']=df_pred['uid'].apply(get_Iu)
df_pred['Ui']=df_pred['iid'].apply(get_Ui)
df_pred['err']=abs(df_pred.est - df_pred.rui)

print('Shape of df_pred:', df_pred.shape)
print('==========================================')
print('')

df_pred.head()