# Investigation into Suprise module for building recommendations

Using Blog post http://blog.fastforwardlabs.com/2018/01/22/exploring-recommendation-systems.html

and surprise module https://github.com/NicolasHug/Surprise

with documentation https://surprise.readthedocs.io/en/stable/index.html

In [1]:
import numpy as np
import pandas as pd
import re
import os
from pandas.plotting import scatter_matrix

get_ipython().magic(u'env OMP_NUM_THREADS=2')

from IPython.display import display, HTML

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn

import surprise
import sklearn
import sklearn.model_selection

#get_ipython().magic(u'matplotlib')
#get_ipython().magic(u'matplotlib inline')

# Set the ransom seed used for the whole program to allow reprocibility
np.random.seed(3214412)

DEBUG = True # If true, pull a sample of the dataset for development 

env: OMP_NUM_THREADS=2


In [2]:
raw_df = pd.read_csv('ratings.csv')
raw_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [3]:
data_df = pd.DataFrame.copy(raw_df)
if DEBUG:
    data_df = data_df.sample(n=100000)
else:
    del raw_df  # Save on some memory

In [4]:
print("Count of users: {}".format(data_df.userId.nunique()))
print("Count of movies: {}".format(data_df.movieId.nunique()))
print("Count of ratings: {}".format(data_df.rating.nunique()))
print("Ratings: {}".format(', '.join(map(str, data_df.rating.sort_values().unique()))))

Count of users: 58706
Count of movies: 9487
Count of ratings: 10
Ratings: 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0


# Initial Recommendations
If the user saw the movie, chalk it up as a 1 and recommend from these 1/0 values 

In [5]:
recs_df = pd.DataFrame.copy(data_df[['userId', 'movieId', 'rating']])
recs_df.head()

Unnamed: 0,userId,movieId,rating
231381,2484,2001,3.0
3262121,33940,1409,2.5
15935743,165778,743,4.0
24673730,256471,1036,5.0
18393560,190907,2080,3.5


In [6]:
reader = surprise.Reader(rating_scale=(1, 5))
dataset_full = surprise.Dataset.load_from_df(recs_df, reader)

In [7]:
from surprise.accuracy import rmse
from surprise.model_selection import cross_validate

# algo = surprise.KNNBasic(min_k=3)  # Memory issues when using KNNBasic

algo = surprise.SVD()
algo_cv = cross_validate(algo, dataset_full, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9852  0.9821  0.9943  0.9779  0.9694  0.9818  0.0082  
MAE (testset)     0.7636  0.7643  0.7700  0.7574  0.7510  0.7613  0.0065  
Fit time          5.72    5.73    6.28    6.21    6.55    6.10    0.33    
Test time         0.23    0.27    0.24    0.19    0.19    0.23    0.03    


# Ok, so what's the best number of factors for SVD?

In [8]:
# https://surprise.readthedocs.io/en/stable/getting_started.html#tuning-algorithm-parameters
from surprise.model_selection import GridSearchCV

gs = GridSearchCV(surprise.SVD, param_grid=dict(n_factors=[10, 50, 100, 150, 200]))

gs.fit(dataset_full)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

0.977030099437821
{'n_factors': 10}


# Great, I built a recommender, now what?
How do I actually use a constructed recommender?

In [20]:
from surprise import accuracy
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(dataset_full, test_size=.25)

algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

predictions[:3]

RMSE: 0.9844


[Prediction(uid=120796, iid=4920, r_ui=3.0, est=3.7257065761153885, details={'was_impossible': False}),
 Prediction(uid=208752, iid=3101, r_ui=4.0, est=3.5839663805202786, details={'was_impossible': False}),
 Prediction(uid=101544, iid=477, r_ui=2.0, est=3.6369113726094993, details={'was_impossible': False})]

In [21]:
# Predict a previously seen user/movie pair
print(algo.predict(recs_df.iloc[0].userId, recs_df.iloc[0].movieId))
print(recs_df.iloc[0])

user: 2484.0     item: 2001.0     r_ui = None   est = 3.09   {'was_impossible': False}
userId     2484.0
movieId    2001.0
rating        3.0
Name: 231381, dtype: float64


In [22]:
# Predict an unseen user/movie pair
print("User 2484 has only rated a single movie\n", recs_df[recs_df.userId == 2484])

npred = algo.predict(2484, recs_df[recs_df.movieId != 2001].iloc[0].movieId)

print("Prediction for another movie\n", npred)

User 2484 has only rated a single movie
         userId  movieId  rating
231381    2484     2001     3.0
Prediction for another movie
 user: 2484       item: 1409.0     r_ui = None   est = 3.37   {'was_impossible': False}


In [23]:
%%timeit
mid = recs_df[recs_df.movieId != 2001].sample(n=1).iloc[0].movieId
algo.predict(2484, mid)

4.08 ms ± 12.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
%%timeit
# Calculate the scores for all items for user 2484
# Output the top 10
# I can't find a way to do this in bulk.  I think the predict function is it, but it's fairly quick.
ruid = recs_df.sample(n=1).iloc[0].userId  # Pull a random customerid for the predictions
[pred.iid for pred in sorted([algo.predict(ruid, mid) for mid in recs_df.movieId.unique()], 
                             key=lambda x: x.est, reverse=True)[0:10]]

79.5 ms ± 527 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
