##### Supriselib algos benchmark example
* Following outputs 5 folds validation results on ml-1m dataset with each of the inbuilt algos provided by surpriselib package.
* [SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline, CoClustering, BaselineOnly, NormalPredictor] are some which come included in package

In [1]:
import time
import datetime
import random

import numpy as np
import six
from tabulate import tabulate

from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering


In [5]:
stable = 'http://surprise.readthedocs.io/en/stable/'
LINK = {'SVD': '[{}]({})'.format('SVD',
                                 stable +
                                 'matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD'),
        'SVDpp': '[{}]({})'.format('SVD++',
                                   stable +
                                   'matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp'),
        'NMF': '[{}]({})'.format('NMF',
                                 stable +
                                 'matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF'),
        'SlopeOne': '[{}]({})'.format('Slope One',
                                      stable +
                                      'slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne'),
        'KNNBasic': '[{}]({})'.format('k-NN',
                                      stable +
                                      'knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic'),
        'KNNWithMeans': '[{}]({})'.format('Centered k-NN',
                                          stable +
                                          'knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans'),
        'KNNBaseline': '[{}]({})'.format('k-NN Baseline',
                                         stable +
                                         'knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline'),
        'CoClustering': '[{}]({})'.format('Co-Clustering',
                                          stable +
                                          'co_clustering.html#surprise.prediction_algorithms.co_clustering.CoClustering'),
        'BaselineOnly': '[{}]({})'.format('Baseline',
                                          stable +
                                          'basic_algorithms.html#surprise.prediction_algorithms.baseline_only.BaselineOnly'),
        'NormalPredictor': '[{}]({})'.format('Random',
                                             stable +
                                             'basic_algorithms.html#surprise.prediction_algorithms.random_pred.NormalPredictor'),
        'ml-100k': '[{}]({})'.format('Movielens 100k',
                                     'http://grouplens.org/datasets/movielens/100k'),
        'ml-1m': '[{}]({})'.format('Movielens 1M',
                                   'http://grouplens.org/datasets/movielens/1m'),
        }

In [6]:
LINK

{'SVD': '[SVD](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVD)',
 'SVDpp': '[SVD++](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp)',
 'NMF': '[NMF](http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.NMF)',
 'SlopeOne': '[Slope One](http://surprise.readthedocs.io/en/stable/slope_one.html#surprise.prediction_algorithms.slope_one.SlopeOne)',
 'KNNBasic': '[k-NN](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBasic)',
 'KNNWithMeans': '[Centered k-NN](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNWithMeans)',
 'KNNBaseline': '[k-NN Baseline](http://surprise.readthedocs.io/en/stable/knn_inspired.html#surprise.prediction_algorithms.knns.KNNBaseline)',
 'CoClustering': '[C

In [7]:
np.random.seed(0)
random.seed(0)

dataset = 'ml-1m'
data = Dataset.load_builtin(dataset)
kf = KFold(random_state=0)  # folds will be the same for all algorithms.

Dataset ml-1m could not be found. Do you want to download it? [Y/n] y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-1m.zip...
Done! Dataset ml-1m has been saved to C:\Users\might/.surprise_data/ml-1m


In [None]:
classes = (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline,
           CoClustering, BaselineOnly, NormalPredictor)
table = []
for klass in classes:
    start = time.time()
    out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = [LINK[dataset],
          'RMSE',
          'MAE',
          'Time'
          ]
print(tabulate(table, header, tablefmt="pipe"))

* #### Manully splitting & evaluating ml-100k dataset in train & validation set

In [38]:
#SVD
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

from surprise.model_selection import train_test_split

In [14]:
algo = SVD()
data= Dataset.load_builtin('ml-100k')

trainset, testset = train_test_split(data, test_size=.25)

In [22]:
import time
tick = time.time()
algo.fit(trainset)
print(time.time()-tick)

3.5105831623077393


In [30]:
pred = algo.test(testset)
len(testset)

25000

In [31]:
testset[1500:1505]

[('782', '1615', 3.0),
 ('376', '154', 4.0),
 ('892', '31', 4.0),
 ('442', '482', 3.0),
 ('94', '223', 5.0)]

In [41]:
a,b,c,d,e = pred[0]

In [55]:
[print(eval(i)) for i in list('abcd')]

94
544
3.0
3.656863038461044


[None, None, None, None]

In [32]:
pred[1500:1505]#prediction output format

[Prediction(uid='782', iid='1615', r_ui=3.0, est=2.8040956371694645, details={'was_impossible': False}),
 Prediction(uid='376', iid='154', r_ui=4.0, est=3.9792781983007997, details={'was_impossible': False}),
 Prediction(uid='892', iid='31', r_ui=4.0, est=3.8741317430704663, details={'was_impossible': False}),
 Prediction(uid='442', iid='482', r_ui=3.0, est=4.036310960447297, details={'was_impossible': False}),
 Prediction(uid='94', iid='223', r_ui=5.0, est=4.64151396134745, details={'was_impossible': False})]

In [60]:
from surprise import accuracy
accuracy.rmse(pred)

RMSE: 0.9404


0.9403683813991757

In [4]:
import time


tick = time.time()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)#5 folds CV with SVD
print(time.time()-tick)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9333  0.9369  0.9416  0.9319  0.9376  0.9362  0.0034  
MAE (testset)     0.7376  0.7377  0.7447  0.7323  0.7410  0.7387  0.0041  
Fit time          3.81    3.84    3.78    3.75    3.95    3.83    0.07    
Test time         0.22    0.13    0.11    0.15    0.13    0.15    0.04    
20.4947509765625


In [2]:
#########

In [11]:
### Parsing reader module
import os
from collections import namedtuple

BuiltinDataset = namedtuple('BuiltinDataset',['url', 'path', 'rating_scale', 'reader_params'])

In [19]:
### from .builtin_datasets import BUILTIN_DATASETS
def get_dataset_dir():
    '''Return folder where downloaded datasets and other data are stored.
    Default folder is ~/.surprise_data/, but it can also be set by the
    environment variable ``SURPRISE_DATA_FOLDER``.
    '''

    folder = os.environ.get('SURPRISE_DATA_FOLDER', os.path.expanduser('~') +
                            '/.surprise_data/')
    if not os.path.exists(folder):
        os.makedirs(folder)

    return folder


BUILTIN_DATASETS = {
    'ml-100k':
        BuiltinDataset(
            url='http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            path=os.path.join(get_dataset_dir(), 'ml-100k/ml-100k/u.data'),
            rating_scale=(1, 5),
            reader_params=dict(line_format='user item rating timestamp',
                               sep='\t')
        ),
    'ml-1m':
        BuiltinDataset(
            url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
            path=os.path.join(get_dataset_dir(), 'ml-1m/ml-1m/ratings.dat'),
            rating_scale=(1, 5),
            reader_params=dict(line_format='user item rating timestamp',
                               sep='::')
        )}

In [22]:
dset= BUILTIN_DATASETS['ml-100k']

In [36]:
dset

BuiltinDataset(url='http://files.grouplens.org/datasets/movielens/ml-100k.zip', path='C:\\Users\\might/.surprise_data/ml-100k/ml-100k/u.data', rating_scale=(1, 5), reader_params={'line_format': 'user item rating timestamp', 'sep': '\t'})

* #### Importing inbuilt datasets: ml-100k as custom files

In [1]:
from surprise import Reader
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import os

In [4]:
file_path = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/u.data')
rdr = Reader(line_format='user item rating', sep ='\t', rating_scale=(1,5))#line_format='')

In [2]:
rdr = Reader(line_format='user item rating', sep ='\t', rating_scale=(1,5))#line_format='')

In [17]:
data =Dataset.load_from_file(file_path, rdr)
trainset,testset= train_test_split(data, test_size=0.2)

In [24]:
algo =SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x17061c50908>

In [27]:
from surprise import accuracy

pred= algo.test(testset)
accuracy.rmse(pred)

RMSE: 0.9360


0.9360411069215001

* ###### Cross validating results on ml-100k with SVD, KNN & Baseline

In [35]:
algo =SVD()
cross_validate(algo, data, measures=['rmse','mae'], verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9415  0.9267  0.9353  0.9373  0.9347  0.9351  0.0048  
MAE (testset)     0.7413  0.7319  0.7336  0.7391  0.7380  0.7368  0.0035  
Fit time          3.81    3.85    3.80    3.70    3.75    3.78    0.05    
Test time         0.11    0.17    0.11    0.11    0.10    0.12    0.02    


{'test_rmse': array([0.94146352, 0.9266794 , 0.93533813, 0.93731226, 0.93474275]),
 'test_mae': array([0.74134185, 0.73192685, 0.73357895, 0.7391303 , 0.73803207]),
 'fit_time': (3.812755823135376,
  3.847705125808716,
  3.800828218460083,
  3.7040839195251465,
  3.751995086669922),
 'test_time': (0.1077427864074707,
  0.16854619979858398,
  0.1076807975769043,
  0.1076819896697998,
  0.10372114181518555)}

In [31]:
from surprise import BaselineOnly
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9461  0.9376  0.9461  0.9515  0.9365  0.9436  0.0057  
MAE (testset)     0.7512  0.7440  0.7491  0.7549  0.7410  0.7481  0.0050  
Fit time          0.13    0.14    0.15    0.15    0.15    0.15    0.01    
Test time         0.07    0.07    0.13    0.07    0.08    0.09    0.02    


{'test_rmse': array([0.94611425, 0.93764737, 0.94611765, 0.95154629, 0.93645949]),
 'test_mae': array([0.7512452 , 0.74402131, 0.7490589 , 0.75492983, 0.74103763]),
 'fit_time': (0.129669189453125,
  0.14261937141418457,
  0.15358757972717285,
  0.15261530876159668,
  0.1466541290283203),
 'test_time': (0.07179951667785645,
  0.0747981071472168,
  0.129608154296875,
  0.07380938529968262,
  0.07580113410949707)}

In [36]:
from surprise import KNNBasic
algo = KNNBasic()
cross_validate(algo, data, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9721  0.9782  0.9780  0.9789  0.9843  0.9783  0.0039  
MAE (testset)     0.7669  0.7742  0.7705  0.7725  0.7790  0.7726  0.0040  
Fit time          0.47    0.41    0.42    0.41    0.51    0.45    0.04    
Test time         2.63    2.65    2.64    2.48    2.70    2.62    0.07    


{'test_rmse': array([0.97213513, 0.9782337 , 0.97800681, 0.97893964, 0.98431272]),
 'test_mae': array([0.76689422, 0.77424671, 0.77052573, 0.77248953, 0.7790203 ]),
 'fit_time': (0.4747741222381592,
  0.4148576259613037,
  0.42087316513061523,
  0.40686678886413574,
  0.5145902633666992),
 'test_time': (2.6319382190704346,
  2.6509103775024414,
  2.635981321334839,
  2.482391119003296,
  2.699810028076172)}

* ##### With jester dataset

In [6]:
data_j = Dataset.load_builtin('jester')#loading jester dataset

In [7]:
jtrain, jtes= train_test_split(data_j, test_size=0.3)

In [45]:
jtes[:10]#contains the s [User ID] [Item ID] [Rating]

[('35517', '96', 13.469),
 ('62147', '145', 18.25),
 ('47657', '122', 15.125),
 ('46926', '92', 17.75),
 ('39564', '96', 5.281),
 ('42263', '64', 10.938),
 ('37125', '68', 14.844),
 ('40446', '119', 19.561999999999998),
 ('32408', '59', 12.031),
 ('36269', '70', 13.469)]

In [8]:
algo1 = SVD()
cross_validate(algo1, data_j, measures=['rmse','mae'], verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    4.5002  4.5010  4.5044  4.5022  4.5072  4.5030  0.0025  
MAE (testset)     3.3685  3.3747  3.3752  3.3749  3.3799  3.3746  0.0036  
Fit time          65.35   65.76   65.70   66.48   66.91   66.04   0.57    
Test time         3.64    3.80    3.23    3.64    3.47    3.56    0.19    


{'test_rmse': array([4.50022424, 4.5009501 , 4.50444094, 4.50224849, 4.50720804]),
 'test_mae': array([3.36848129, 3.37474824, 3.37515326, 3.37485802, 3.37985276]),
 'fit_time': (65.34619045257568,
  65.75805640220642,
  65.70223569869995,
  66.48274183273315,
  66.9066846370697),
 'test_time': (3.6365621089935303,
  3.799867630004883,
  3.233344554901123,
  3.6382975578308105,
  3.4737374782562256)}

In [9]:
from surprise import KNNBasic
algo2 = KNNBasic()

In [None]:
cross_validate(algo2, data_j, measures= ['rmse','mae'], verbose=True)