In [13]:
import os
import sys
import cornac
import pandas as pd

from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.utils.notebook_utils import store_metadata

# Record the interpreter and Cornac versions so the run can be reproduced.
print(
    f"System version: {sys.version}",
    f"Cornac version: {cornac.__version__}",
    sep="\n",
)

System version: 3.10.13 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:15:57) [MSC v.1916 64 bit (AMD64)]
Cornac version: 1.18


In [14]:
# --- Data selection ---
# MovieLens variant; valid sizes are 100k, 1m, 10m, or 20m.
MOVIELENS_DATA_SIZE = '100k'

# --- Recommendation settings ---
# How many top-ranked items to recommend.
TOP_K = 10

# --- BPR hyperparameters ---
NUM_FACTORS = 200  # forwarded to BPR as `k`
NUM_EPOCHS = 100  # forwarded to BPR as `max_iter`

In [15]:
# Import only the loader this notebook uses. A wildcard import would hide where
# `get_df` comes from and could silently shadow names imported earlier.
from get_data import get_df

# Dataset tag handed to the loader; it is also the filename prefix used when
# the trained model is pickled at the end of the notebook.
tag = "bfs"
data = get_df(tag)

# Largest user ID present in the loaded data.
max(data["userID"])


9936

In [16]:
# Number of distinct item IDs in the loaded data.
len(pd.unique(data["itemID"]))

492

In [17]:
# Sanity check: is user ID 9937 present among the loaded user IDs?
9937 in data["userID"].to_numpy()


False

In [18]:
# Random split of the loaded interactions with a split ratio of 0.75.
train, test = python_random_split(data, ratio=0.75)

In [19]:
# Feed the rows of the training split (as plain tuples, without the index)
# into Cornac's dataset builder.
# NOTE(review): from_uir presumably expects (user, item, rating) column order — confirm for `train`.
uir_rows = train.itertuples(index=False)
train_set = cornac.data.Dataset.from_uir(uir_rows, seed=SEED)

# Report the user and item counts exposed by the Cornac dataset.
print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 128
Number of items: 445


In [20]:
# Collect the BPR hyperparameters in one place, then build the model from them.
bpr_params = {
    "k": NUM_FACTORS,
    "max_iter": NUM_EPOCHS,
    "learning_rate": 0.01,
    "lambda_reg": 0.001,
    "verbose": True,
    "seed": SEED,
}
bpr = cornac.models.BPR(**bpr_params)

In [21]:
# Train the BPR model on the Cornac training set and report the wall-clock time.
with Timer() as fit_timer:
    bpr.fit(train_set)
print("Took {} seconds for training.".format(fit_timer))

100%|██████████| 100/100 [00:00<00:00, 258.57it/s, correct=86.01%, skipped=0.64%]

Optimization finished!
Took 0.3904 seconds for training.





In [22]:
# Hand-written interactions for user 0, later passed to predict_ranking as the
# frame of already-seen pairs. The rows get their own name so the DataFrame
# held in `data` is not overwritten; rebinding `data` to a list would break
# any re-run of the train/test split cell.
validation_rows = [
    [0, 1260, 5],
    [0, 2606, 5],
    [0, 2206, 5],
]
validation = pd.DataFrame(validation_rows, columns=["userID", "itemID", "rating"])
validation

Unnamed: 0,userID,itemID,rating
0,0,1260,5
1,0,2606,5
2,0,2206,5


In [23]:
# Score user/item pairs with the trained model, passing `validation` as the
# frame of already-seen pairs (remove_seen=True), and time the call.
with Timer() as predict_timer:
    all_predictions = predict_ranking(
        bpr,
        validation,
        usercol='userID',
        itemcol='itemID',
        remove_seen=True,
    )
print("Took {} seconds for prediction.".format(predict_timer))

Took 0.0644 seconds for prediction.


In [32]:
# Display all predictions, highest score first.
all_predictions.sort_values(by="prediction", ascending=False)

Unnamed: 0,userID,itemID,prediction
34279,5473,1260,3.019011
56084,9922,1260,3.018673
24044,3764,1260,3.007591
31609,4989,1260,3.007119
11139,1167,1260,3.005979
...,...,...,...
47644,8157,1580,-1.293866
47918,8157,17197,-1.301057
48062,8157,31720,-1.328656
47917,8157,17154,-1.336376


In [30]:
# Order by score (best first), then keep only the first row per itemID,
# i.e. the highest-scoring prediction for each item.
ranked_predictions = all_predictions.sort_values("prediction", ascending=False)
prediction = ranked_predictions.drop_duplicates(subset="itemID")

In [42]:
# Score at the 95th percentile of the per-item predictions.
threshold = prediction["prediction"].quantile(0.95)

# NOTE: despite its name, `top_20` keeps every item scoring strictly above the
# 95th-percentile threshold, so it is not guaranteed to hold exactly 20 rows.
above_threshold = prediction["prediction"] > threshold
top_20 = prediction[above_threshold]
len(top_20)

23

In [41]:
# Number of distinct item IDs that received at least one prediction.
len(pd.unique(all_predictions["itemID"]))


445

In [43]:
import pickle

# Persist the trained BPR model next to the notebook as "<tag>_bpr.pkl".
model_path = f"{tag}_bpr.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(bpr, model_file)