In [16]:
%%markdown
# References
* [Exploring Recommendation Systems](http://blog.fastforwardlabs.com/2018/01/22/exploring-recommendation-systems.html), by Shioulin, 22 January 2018
* [Surprise project on GitHub](https://github.com/NicolasHug/Surprise)
* [Surprise Web site](http://surpriselib.com)
* [MovieLens data set](http://surprise.readthedocs.io/en/stable/dataset.html#dataset)
                        

# References
* [Exploring Recommendation Systems](http://blog.fastforwardlabs.com/2018/01/22/exploring-recommendation-systems.html), by Shioulin, 22 January 2018
* [Surprise project on GitHub](https://github.com/NicolasHug/Surprise)
* [Surprise Web site](http://surpriselib.com)
* [Movie-Lens data set](http://surprise.readthedocs.io/en/stable/dataset.html#dataset)
                        

In [17]:
import pandas as pd

from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import NormalPredictor
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

In [15]:
%%markdown
# MovieLens Dataset

# Movie-Lens Dataset

In [8]:
# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Hold out 25% of the ratings as the test set.
# random_state is fixed so the same ratings land in the train/test split on
# every run; without it the RMSE reported further down changes each time the
# kernel is restarted.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [9]:
# Use the famous SVD algorithm.
# SVD is initialised from a random number generator; seeding it through
# random_state keeps the fitted model, and every score derived from it,
# repeatable across runs.
algo = SVD(random_state=42)

In [10]:
# Fit on the training split first, then predict a rating for every entry of
# the held-out test split.
algo.fit(trainset)
predictions = algo.test(testset)

# Score the predictions: rmse() prints the "RMSE: ..." line and returns the
# value, which the notebook displays as the cell result.
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9418


0.9417547514595387

In [11]:
# 5-fold cross-validation of the same algorithm on the full dataset.
# verbose=True prints the per-fold RMSE/MAE table with fit and test times;
# the returned dict of raw measurements is shown as the cell result.
cross_validate(
    algo,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9384  0.9332  0.9377  0.9333  0.9351  0.9355  0.0022  
MAE (testset)     0.7405  0.7348  0.7377  0.7389  0.7351  0.7374  0.0022  
Fit time          4.84    4.71    4.62    4.65    4.60    4.68    0.08    
Test time         0.25    0.41    0.18    0.19    0.20    0.25    0.09    


{'fit_time': (4.838788032531738,
  4.7063822746276855,
  4.621519565582275,
  4.647674798965454,
  4.60384464263916),
 'test_mae': array([0.74048213, 0.73483854, 0.73766246, 0.73892974, 0.73514339]),
 'test_rmse': array([0.93840196, 0.93319571, 0.93765372, 0.93332851, 0.93513885]),
 'test_time': (0.25264811515808105,
  0.4097118377685547,
  0.17980575561523438,
  0.1945943832397461,
  0.1971583366394043)}

In [18]:
# Candidate hyper-parameter values; every combination is tried.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6],
}

# Score each SVD configuration by RMSE and MAE using 3-fold cross-validation.
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)

# First line: best RMSE score.
# Second line: the parameter combination that achieved it.
print(gs.best_score['rmse'], gs.best_params['rmse'], sep='\n')

0.8521315869223091
{'reg_all': 0.4, 'lr_all': 0.002, 'n_epochs': 5}


In [14]:
%%markdown
# Custom Datasets

# Custom Datasets

In [13]:
# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                'userID': [9, 32, 2, 45, 'user_foo'],
                'rating': [3, 2, 4, 3, 1]}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
# The result is bound to its own name rather than `data`, so the ml-100k
# dataset loaded earlier in the notebook is not silently replaced by this
# five-row toy dataset when cells are re-run out of order.
custom_data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), custom_data, cv=2)


{'fit_time': (0.0006020069122314453, 9.202957153320312e-05),
 'test_mae': array([1.04467587, 1.8052166 ]),
 'test_rmse': array([1.29904915, 1.83048978]),
 'test_time': (0.00040435791015625, 6.103515625e-05)}