# import

In [None]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

# wczytywanie danych 

In [18]:
from movie_lens_data import get_movie_lens

ml_ver = '10m'
def printver():
    print(f'movie lens version: {ml_ver}\n')

ml_ratings_df = get_movie_lens(ml_ver)

reader = Reader(rating_scale=(1,5))
ml_ratings_ds = Dataset.load_from_df(
    ml_ratings_df[['userId', 'movieId', 'rating']],
    reader)

print(ml_ratings_df.head(5))

   userId  movieId  rating  timestamp
0       1      122     5.0  838985046
1       1      185     5.0  838983525
2       1      231     5.0  838983392
3       1      292     5.0  838983421
4       1      316     5.0  838983392


# proste testy czasowe

SVD najszybsze, następnie KNN i dużo wolniejsze SVD++.

In [60]:
# cross validation, RMSE, MAE
printver()
pred_algs = [SVD(), SVDpp(), KNNBasic()]
for alg in pred_algs:
    cross_validate(alg, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)
    print('')

movie lens version: 100k

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9509  0.9395  0.9460  0.9455  0.0047  
MAE (testset)     0.7514  0.7427  0.7456  0.7466  0.0036  
Fit time          4.04    4.11    3.63    3.93    0.21    
Test time         0.34    0.19    0.19    0.24    0.07    

Evaluating RMSE, MAE of algorithm SVDpp on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.9295  0.9220  0.9273  0.9263  0.0032  
MAE (testset)     0.7288  0.7242  0.7290  0.7273  0.0022  
Fit time          121.17  147.60  129.08  132.62  11.08   
Test time         4.83    5.15    5.37    5.12    0.22    

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split

In [23]:
printver()
train_data, test_data = train_test_split(ml_ratings_ds, test_size=10000)
pred_algs = [SVD(), SVDpp(), KNNBasic()]
for alg in pred_algs:
    print(type(alg).__name__)
    alg.fit(train_data)
    %timeit -n 1 -r 1 pred = [alg.predict(*t) for t in test_data]

movie lens version: 10m

SVD
53 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
SVDpp
4.89 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
719 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [27]:
from collections import defaultdict

def get_top_n(predictions, n=10):
# https://surprise.readthedocs.io/en/stable/FAQ.html#top-n-recommendations-py
    
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

def get_preds(alg, test_data, n):
    preds = alg.test(test_data)
    return get_top_n(preds, 10)
    
printver()
train_data, test_data = train_test_split(ml_ratings_ds, test_size=10000)
pred_algs = [SVD(), SVDpp(), KNNBasic()]
for alg in pred_algs:
    print(type(alg).__name__)
    alg.fit(train_data)
    %timeit -n 1 -r 1 top_n = get_preds(alg, test_data, 10)

movie lens version: 10m

SVD
152 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
SVDpp
5.05 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
KNNBasic
Computing the msd similarity matrix...
Done computing similarity matrix.
853 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
