In [1]:
import sys
import surprise

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
)
from recommenders.models.surprise.surprise_utils import (
    predict,
    compute_ranking_predictions,
)
from recommenders.utils.notebook_utils import store_metadata


# Report the interpreter and Surprise versions this run was produced with.
print(
    f"System version: {sys.version}",
    f"Surprise version: {surprise.__version__}",
    sep="\n",
)


System version: 3.9.20 (main, Oct  3 2024, 07:38:01) [MSC v.1929 64 bit (AMD64)]
Surprise version: 1.1.4


In [2]:
# Number of top-ranked items per user; passed as `k` to the ranking metrics
# (MAP, NDCG, precision and recall) in the evaluation cell.
TOP_K = 10

# MovieLens dataset variant to load: "100k", "1m", "10m", or "20m".
MOVIELENS_DATA_SIZE = "100k"


### 3.1 Load data

In [3]:
# Load the selected MovieLens variant, naming the columns the way the
# splitter, predictor and metrics below expect them.
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=["userID", "itemID", "rating"],
)

# Preview the first rows.
data.head()



00%|███████████████████████████████████████████████████████████████████████████████████| 4.81k/4.81k [00:00<00:00, 8.22kKB/s]

Unnamed: 0,userID,itemID,rating
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0


### 3.2 Train the SVD Model

In [12]:
# Randomly hold out 20% of the ratings for evaluation; train on the other 80%.
train, test = python_random_split(data, ratio=0.80)


In [13]:
# The Reader is only used here to tell Surprise the rating scale, which it uses
# to clip predictions. Derive the scale from the training ratings instead of the
# built-in "ml-100k" reader, so the bounds stay correct when MOVIELENS_DATA_SIZE
# selects a variant with a different scale (the 10m/20m sets use 0.5-5 half-star
# ratings). For the 100k data this still yields (1, 5).
# https://github.com/NicolasHug/Surprise/blob/master/surprise/dataset.py
train_set = surprise.Dataset.load_from_df(
    train,
    reader=surprise.Reader(
        rating_scale=(float(train["rating"].min()), float(train["rating"].max()))
    ),
).build_full_trainset()
train_set


<surprise.trainset.Trainset at 0x1782d02c0d0>

In [14]:
# SVD matrix-factorization model; the fixed random_state keeps runs repeatable
# and verbose=True logs each training epoch.
svd = surprise.SVD(
    n_factors=200,
    n_epochs=30,
    random_state=0,
    verbose=True,
)

# Time only the fit call so train_time reflects the training cost alone.
with Timer() as train_time:
    svd.fit(train_set)

print(f"Took {train_time.interval} seconds for training.")


Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 1.073982700000002 seconds for training.


### 3.3 Prediction

In [15]:
# Predict a rating for every (user, item) pair in the held-out test set.
predictions = predict(
    svd,
    test,
    usercol="userID",
    itemcol="itemID",
)

# Preview the first rows.
predictions.head()


Unnamed: 0,userID,itemID,prediction
0,877,381,3.455602
1,815,602,3.85471
2,94,431,3.224675
3,416,875,2.80787
4,500,182,3.909425


### 3.4 Remove rated movies in the top k recommendations

In [16]:
# Produce the predictions used for the ranking metrics. remove_seen=True asks
# the helper to drop pairs already present in the training data. The call is
# timed so the prediction cost can be reported.
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd,
        train,
        usercol="userID",
        itemcol="itemID",
        remove_seen=True,
    )

print(f"Took {test_time.interval} seconds for prediction.")


Took 10.344942899999992 seconds for prediction.


In [17]:
# Preview the first five ranking predictions.
all_predictions.head(n=5)


Unnamed: 0,userID,itemID,prediction
0,1,1,3.782591
3,1,4,3.86701
5,1,6,3.283456
7,1,8,4.183098
19,1,20,3.450803


In [19]:
# Rating metrics: compare the predicted ratings with the held-out test ratings.
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

# Ranking metrics: score each user's top TOP_K ranking predictions against the
# test set.
eval_map = map_at_k(
    test, all_predictions, col_prediction="prediction", k=TOP_K
)
eval_ndcg = ndcg_at_k(
    test, all_predictions, col_prediction="prediction", k=TOP_K
)
eval_precision = precision_at_k(
    test, all_predictions, col_prediction="prediction", k=TOP_K
)
eval_recall = recall_at_k(
    test, all_predictions, col_prediction="prediction", k=TOP_K
)

# Only a subset of the metrics is printed; the rest stay available in their
# eval_* variables.
print("RMSE:\t\t%f" % eval_rmse)
print("----")
print("Precision@K:\t%f" % eval_precision)
print("Recall@K:\t%f" % eval_recall)


RMSE:		0.945502
----
Precision@K:	0.079574
Recall@K:	0.034264


In [20]:
# Record results for tests - ignore this cell
# The mapping keeps the original recording order (dicts preserve insertion order).
recorded_results = {
    "rmse": eval_rmse,
    "mae": eval_mae,
    "rsquared": eval_rsquared,
    "exp_var": eval_exp_var,
    "map": eval_map,
    "ndcg": eval_ndcg,
    "precision": eval_precision,
    "recall": eval_recall,
    "train_time": train_time.interval,
    "test_time": test_time.interval,
}
for result_name, result_value in recorded_results.items():
    store_metadata(result_name, result_value)
