In [3]:
import sys
import surprise

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
)
from recommenders.models.surprise.surprise_utils import (
    predict,
    compute_ranking_predictions,
)
from recommenders.utils.notebook_utils import store_metadata


# Record the interpreter and Surprise versions so the results below can be
# tied to a specific environment.
print(f"System version: {sys.version}")
print(f"Surprise version: {surprise.__version__}")


ModuleNotFoundError: No module named 'surprise'

In [17]:
# MovieLens release to download; accepted sizes: "100k", "1m", "10m", "20m".
MOVIELENS_DATA_SIZE = "1m"

# Length of each user's recommendation list (the k used by the @k metrics).
TOP_K = 10


### Load data

In [18]:
# Fetch the configured MovieLens release as a DataFrame, keeping the
# user, item and rating columns under the names used by the rest of the notebook.
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=["userID", "itemID", "rating"],
)

# Peek at the first rows as a quick sanity check of the loaded columns.
data.head()



00%|████████████████████████████████████████████████████████████████████████████| 5.78k/5.78k [00:02<00:00, 2.52kKB/s]

Unnamed: 0,userID,itemID,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


### Train the SVD Model

In [23]:
# Random 80/20 split of the ratings into train and test.
# The seed is passed explicitly so the split is visibly reproducible; 42 is the
# splitter's documented default, so the resulting split is unchanged.
train, test = python_random_split(data, ratio=0.80, seed=42)

In [24]:
# The reader is only needed to tell Surprise the rating scale.
# Derive the scale from the training ratings instead of naming a built-in
# dataset ("ml-1m"): the hard-coded name silently goes stale when
# MOVIELENS_DATA_SIZE is changed. For the 1M data this yields the same [1, 5]
# scale, provided both extremes occur in `train`.
# See https://github.com/NicolasHug/Surprise/blob/master/surprise/dataset.py
train_set = surprise.Dataset.load_from_df(
    train,
    reader=surprise.Reader(
        rating_scale=(float(train["rating"].min()), float(train["rating"].max()))
    ),
).build_full_trainset()
train_set


<surprise.trainset.Trainset at 0x2e6cae72430>

In [25]:
# SVD with 200 latent factors, 30 training epochs and a fixed random_state;
# verbose=True prints one progress line per epoch.
svd = surprise.SVD(
    n_factors=200,
    n_epochs=30,
    random_state=0,
    verbose=True,
)

# Wrap the fit in a Timer so the training cost is reported with the results.
with Timer() as train_time:
    svd.fit(train_set)

print(f"Took {train_time.interval} seconds for training.")


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 51.69159130000003 seconds for training.


### Prediction

In [26]:
# Predict ratings for the held-out test DataFrame; the result carries
# userID, itemID and prediction columns.
predictions = predict(
    svd,
    test,
    usercol="userID",
    itemcol="itemID",
)
predictions.head()


Unnamed: 0,userID,itemID,prediction
0,5412,2683,3.016058
1,5440,904,4.897222
2,368,3717,3.684309
3,425,1721,3.443993
4,4942,3697,2.215397


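Compute predictions for ranking, removing the movies each user has already rated in the training set so that they cannot appear in the top-k recommendations.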

In [27]:
# Build the predictions used for the ranking metrics. remove_seen=True asks the
# helper to leave out pairs already present in `train`.
# This is the slow step of the notebook (~950 s in the recorded run).
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd,
        train,
        usercol="userID",
        itemcol="itemID",
        remove_seen=True,
    )

print(f"Took {test_time.interval} seconds for prediction.")


Took 953.4770515999999 seconds for prediction.


In [28]:
# Preview the ranking predictions (userID, itemID, prediction columns).
all_predictions.head()


Unnamed: 0,userID,itemID,prediction
1,1,2,3.131044
2,1,3,3.50777
3,1,4,2.997232
4,1,5,2.860436
5,1,6,3.786617


In [29]:
# Rating metric: RMSE computed from the test ratings and the test-set predictions.
eval_rmse = rmse(test, predictions)

# Ranking metrics at k=TOP_K, computed from the test set and the full
# ranking predictions.
eval_precision = precision_at_k(
    test,
    all_predictions,
    col_prediction="prediction",
    k=TOP_K,
)
eval_recall = recall_at_k(
    test,
    all_predictions,
    col_prediction="prediction",
    k=TOP_K,
)

# Report the rating metric, a separator, then the ranking metrics, one per line.
print("RMSE:\t\t%f" % eval_rmse)
print("----")
print("Precision@K:\t%f" % eval_precision)
print("Recall@K:\t%f" % eval_recall)


RMSE:		0.888161
----
Precision@K:	0.073981
Recall@K:	0.025564
