<h1>Neural Collaborative Filtering</h1>

In [2]:
import os
import sys

import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)

# Report the interpreter and key library versions so results can be reproduced.
version_report = (
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Tensorflow version: {}", tf.__version__),
)
for template, version in version_report:
    print(template.format(version))

System version: 3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:48:25) 
[Clang 14.0.6 ]
Pandas version: 1.5.3
Tensorflow version: 2.11.0


In [3]:
# ---- Notebook configuration: tune these values and re-run ----

# Length of the recommendation list used by the ranking metrics (the "k" in @k)
TOP_K = 10

# Which MovieLens variant to download; one of: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "100k"

# Training hyperparameters handed to the NCF model
EPOCHS, BATCH_SIZE = 50, 256

# Seed shared by the NCF dataset and the NCF model
SEED = 42

<h1>Download the MovieLens dataset</h1>

In [4]:
# Load the selected MovieLens ratings into a pandas DataFrame, naming the
# columns explicitly so later cells can refer to them.
ratings_columns = ["userID", "itemID", "rating", "timestamp"]
df = movielens.load_pandas_df(size=MOVIELENS_DATA_SIZE, header=ratings_columns)

INFO:recommenders.datasets.download_utils:Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:00<00:00, 16.4kKB/s]


<h1>Split the data using the Python chronological splitter provided in utilities</h1>

In [5]:
# Chronological train/test split of the ratings with a split ratio of 0.75.
train, test = python_chrono_split(df, ratio=0.75)

In [6]:
# Keep only the test interactions whose user AND item both appear in train.
known_users = train["userID"].unique()
known_items = train["itemID"].unique()
test = test[test["userID"].isin(known_users) & test["itemID"].isin(known_items)]

In [7]:
# Persist the splits as CSV files; NCFDataset below is built from file paths.
train_file = "../movielens/train.csv"
test_file = "../movielens/test.csv"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (a fresh checkout may not have it yet).
for csv_path in (train_file, test_file):
    os.makedirs(os.path.dirname(csv_path), exist_ok=True)

train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)

In [8]:
# Build the NCF dataset from the two CSV splits written above, seeded for
# reproducibility.
data = NCFDataset(
    train_file=train_file,
    test_file=test_file,
    seed=SEED,
)

INFO:recommenders.models.ncf.dataset:Indexing ../movielens/train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ../movielens/test.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ../movielens/test_full.csv ...


<h1>Train the NCF model on the training data, and get the top-k recommendations for our testing data</h1>

- NCF accepts implicit feedback and generates a propensity score, on a scale of 0 to 1, for each item to be recommended to a user
- A recommended item list can then be generated based on these scores

In [9]:
# Collect the model settings in one place, then instantiate the NCF model.
# The user/item counts come from the dataset; epochs, batch size and seed
# come from the configuration cell.
ncf_params = {
    "n_users": data.n_users,
    "n_items": data.n_items,
    "model_type": "NeuMF",
    "n_factors": 4,
    "layer_sizes": [16, 8, 4],
    "n_epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": 1e-3,
    "verbose": 10,
    "seed": SEED,
}
model = NCF(**ncf_params)

2023-01-30 03:05:54.768728: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:357] MLIR V1 optimization pass is not enabled
2023-01-30 03:05:54.773124: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [10]:
# Fit the model on the NCF dataset, timing the whole training run.
with Timer() as train_time:
    model.fit(data)

train_report = "Took {} seconds for training."
print(train_report.format(train_time))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [1.88s]: train_loss = 0.261580 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [1.87s]: train_loss = 0.246016 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [1.86s]: train_loss = 0.238357 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [1.89s]: train_loss = 0.233181 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [1.91s]: train_loss = 0.228597 


Took 95.3382 seconds for training.


In [11]:
with Timer() as test_time:
    # Score every (user, item) combination built from the users and items
    # that occur in the training split.
    candidate_items = list(train.itemID.unique())
    users, items, preds = [], [], []
    for user_id in train.userID.unique():
        repeated_user = [user_id] * len(candidate_items)
        users += repeated_user
        items += candidate_items
        preds += list(model.predict(repeated_user, candidate_items, is_list=True))

    scored_pairs = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})

    # Outer-join with train: rows whose rating is null are pairs that do not
    # occur in train, i.e. the only ones we want to recommend from.
    merged = pd.merge(train, scored_pairs, on=["userID", "itemID"], how="outer")
    not_in_train = merged.rating.isnull()
    all_predictions = merged[not_in_train].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time))

Took 4.1750 seconds for prediction.


<h1>Evaluate how well NCF performs</h1>

In [12]:
# All four ranking metrics share the same prediction column and cut-off.
ranking_kwargs = {"col_prediction": 'prediction', "k": TOP_K}

eval_map = map_at_k(test, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions, **ranking_kwargs)

# One metric per output line.
report_lines = [
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]
print(*report_lines, sep='\n')

MAP:	0.050350
NDCG:	0.202260
Precision@K:	0.182078
Recall@K:	0.101317


In [16]:
if is_jupyter():
    # Record results with papermill for tests
    # NOTE(review): every recording call below is commented out, so this cell
    # currently only imports papermill and records nothing; `pm` is unused.
    # The disabled calls reference `sb`, which is not imported anywhere in the
    # visible notebook -- it must be brought into scope before re-enabling them.
    import papermill as pm
    #sb.glue("map", eval_map)
    #sb.glue("ndcg", eval_ndcg)
    #sb.glue("precision", eval_precision)
    #sb.glue("recall", eval_recall)
    #sb.glue("train_time", train_time.interval)
    #sb.glue("test_time", test_time.interval)