# NCF (Neural Collaborative Filtering)
Reference notebook: https://github.com/microsoft/recommenders/blob/main/examples/00_quick_start/ncf_movielens.ipynb

## Prepare

In [1]:
%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)

# Report the interpreter and library versions used for this run.
version_report = [
    "System version: {}".format(sys.version),
    "Pandas version: {}".format(pd.__version__),
    "Tensorflow version: {}".format(tf.__version__),
]
for report_line in version_report:
    print(report_line)

System version: 3.7.0 (default, Oct  9 2018, 10:31:47) 
[GCC 7.3.0]
Pandas version: 1.1.5
Tensorflow version: 1.15.5


In [2]:
import fun

## Set default parameters

In [3]:
# Top k items to recommend; passed as `top_k` to fun.create_topk_topall and fun.get_metrics.
TOP_K = 20

# Select MovieLens data size: 100k, 1m, 10m, or 20m
# (passed as `size` to movielens.load_pandas_df).
MOVIELENS_DATA_SIZE = '100k'

# Model parameters
# EPOCHS is passed to NCF as `n_epochs`, BATCH_SIZE as `batch_size`.
EPOCHS = 50
BATCH_SIZE = 256

# Seed handed to both NCFDataset and NCF (`seed=SEED`).
SEED = 42

## Load all Data

In [4]:
# Column names of the interaction data; used both when loading the raw file
# and when selecting the model input columns below.
interaction_columns = ['UserId', 'MovieId', 'Rating', 'Timestamp']

# Load MovieLens ratings together with the title and genres columns.
data_full = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=list(interaction_columns),
    title_col='title',
    genres_col='genres',
)

# Spark copy of the full data, consumed by the project helpers in `fun`.
movielens_schema = fun.get_movielens_schema()
data_full_spark = fun.movielens_to_spark(data_full, schema=movielens_schema)
feature_data = fun.create_feature_data(data_full_spark)

# Interaction-only pandas frame (no title/genres) used for the NCF split.
data = data_full[interaction_columns]

# Spark train/test split plus the user-item frame that is scored later on.
train_df_spark, test_df_spark, user_item = fun.split_spark(data_full_spark)
user_item_pd = fun.movielens_to_pandas(user_item)

100%|██████████| 4.81k/4.81k [00:03<00:00, 1.27kKB/s]


Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+------+-------+------+---------+------------+------+
|UserId|MovieId|Rating|Timestamp|       title|genres|
+------+-------+------+---------+------------+------+
|   196|    242|   3.0|881250949|Kolya (1996)|Comedy|
|    63|    242|   3.0|875747190|Kolya (1996)|Comedy|
|   226|    242|   5.0|883888671|Kolya (1996)|Comedy|
|   154|    242|   3.0|879138235|Kolya (1996)|Comedy|
|   306|    242|   5.0|876503793|Kolya (1996)|Comedy|
+------+-------+------+---------+------------+------+
only showing top 5 rows



## Train NCF model

In [5]:
# Column-name arguments shared by the splitter and the NCF dataset wrapper.
split_columns = dict(col_user="UserId", col_item="MovieId", col_timestamp="Timestamp")

# Split the interactions with python_chrono_split using a 0.75 ratio.
# NOTE(review): the model is trained on this pandas split, whereas the metrics
# further down are computed against train_df_spark / test_df_spark returned by
# fun.split_spark -- confirm that both splits contain the same rows.
train, test = python_chrono_split(data, 0.75, **split_columns)

data_ncf = NCFDataset(
    train=train,
    test=test,
    seed=SEED,
    col_rating="Rating",
    **split_columns
)

# Hyper-parameters of the NeuMF model; epochs, batch size and seed come from
# the configuration constants.
ncf_params = {
    "n_users": data_ncf.n_users,
    "n_items": data_ncf.n_items,
    "model_type": "NeuMF",
    "n_factors": 4,
    "layer_sizes": [16, 8, 4],
    "n_epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": 1e-3,
    "verbose": 10,
    "seed": SEED,
}
ncf = NCF(**ncf_params)

In [6]:
# Fit the model on the NCF dataset while timing the call.
with Timer() as train_time:
    ncf.fit(data_ncf)

training_summary = "Took {} seconds for training.".format(train_time)
print(training_summary)

Took 240.1703 seconds for training.


## Predict (old way, not used)

In [13]:
with Timer() as test_time:
    # Score every combination of training user and training item, one user per
    # batched predict call.
    candidate_items = list(train.MovieId.unique())
    users, items, preds = [], [], []
    for user_id in train.UserId.unique():
        repeated_user = [user_id] * len(candidate_items)
        users += repeated_user
        items += candidate_items
        preds += list(ncf.predict(repeated_user, candidate_items, is_list=True))

    all_predictions = pd.DataFrame(data={"UserId": users, "MovieId": items, "prediction": preds})

    # Outer-join with train and keep only the rows whose Rating is null
    # (presumably the pairs that do not occur in train), then drop Rating.
    merged = pd.merge(train, all_predictions, on=["UserId", "MovieId"], how="outer")
    unrated_mask = merged.Rating.isnull()
    all_predictions = merged[unrated_mask].drop('Rating', axis=1)

print("Took {} seconds for prediction.".format(test_time))

print(type(all_predictions))

Took 2.7722 seconds for prediction.
<class 'pandas.core.frame.DataFrame'>


In [15]:
# Keep only the identifier and score columns, then display the rows with the
# highest predicted scores.
prediction_columns = ['UserId', 'MovieId', 'prediction']
all_predictions = all_predictions[prediction_columns]
all_predictions.sort_values(by='prediction', ascending=False).head()

Unnamed: 0,UserId,MovieId,prediction
904251,549,117,0.998169
809225,486,100,0.997878
1156102,714,121,0.997813
1234399,765,100,0.997769
337729,173,288,0.997684


## Predict (new way, used)

In [7]:
# Flatten the (user, item) pairs to score into two parallel Python lists.
users, items = (user_item_pd[column].tolist() for column in ('UserId', 'MovieId'))
# Container for the predicted scores of those pairs.
preds = []

In [8]:
# Score all (user, item) pairs using the batched form of ncf.predict
# (is_list=True) instead of one call per pair.
#
# `preds` is rebuilt from scratch here: appending to a list created in another
# cell made this cell non-idempotent (a re-run doubled the list and the
# DataFrame construction below then failed on mismatched lengths).
PREDICT_CHUNK_SIZE = 100_000  # pairs per predict call, bounds the size of each batch

preds = []
for chunk_start in range(0, len(users), PREDICT_CHUNK_SIZE):
    chunk_stop = chunk_start + PREDICT_CHUNK_SIZE
    chunk_scores = ncf.predict(
        users[chunk_start:chunk_stop],
        items[chunk_start:chunk_stop],
        is_list=True,
    )
    # Convert to plain Python floats, matching what the single-pair call returned.
    preds.extend(float(score) for score in chunk_scores)

predictions = pd.DataFrame(data={"UserId": users, "MovieId": items, "prediction": preds})

In [9]:
# Convert the pandas predictions into a Spark DataFrame using the project's
# predictions schema.
predictions_schema = fun.get_predictions_schema()
pred_spark = fun.movielens_to_spark(predictions, schema=predictions_schema)

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- prediction: float (nullable = true)

+------+-------+-----------+
|UserId|MovieId| prediction|
+------+-------+-----------+
|   148|    496|  0.9423152|
|   148|    471| 0.19843729|
|   148|    463| 0.21158259|
|   148|    148|0.076068595|
|   148|   1342|3.901091E-6|
+------+-------+-----------+
only showing top 5 rows



## Get top K

In [10]:
# Build the top-k and the full recommendation frames from the Spark
# predictions and the Spark training split, then preview the top-k frame.
top_k, top_all = fun.create_topk_topall(
    pred_df_spark=pred_spark,
    train_df_spark=train_df_spark,
    top_k=TOP_K,
)
top_k.show()

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|   148|    496| 0.9423152|
|   148|     69| 0.9419968|
|   148|    132|0.92173594|
|   148|    216|0.92139715|
|   148|    143|0.91003084|
|   148|    423|0.90460455|
|   148|    186| 0.8999268|
|   148|     28|  0.899551|
|   148|    211| 0.8938162|
|   148|    210|0.88862014|
|   148|    202| 0.8843819|
|   148|    197| 0.8829703|
|   148|     97| 0.8820187|
|   148|    208|0.86807096|
|   148|    483| 0.8656971|
|   148|     79| 0.8640293|
|   148|    435|0.85855025|
|   148|    655|0.85649014|
|   148|    153| 0.8561597|
|   148|    427| 0.8498135|
+------+-------+----------+
only showing top 20 rows



## Evaluate

In [16]:
# Prints the Python type of `train`.
# NOTE(review): looks like a leftover debugging check; its execution count (16)
# is out of sequence with the neighbouring cells -- consider removing it.
print(type(train))

<class 'pandas.core.frame.DataFrame'>


In [11]:
# Compute the diversity (collaborative and content), ranking and rating metrics
# from the Spark splits, the recommendation frames and the item feature data.
# NOTE(review): the predictions displayed in this notebook lie in the 0-1 range
# (e.g. 0.94) while the Rating values are on a different scale (e.g. 3.0, 5.0).
# That mismatch likely explains the RMSE of about 3.16 and the negative
# R Squared reported below -- confirm whether the rating metrics are meaningful.
diversity_collaborative, diversity_content, ranking, rating = fun.get_metrics(
    train_df_spark,
    test_df_spark,
    top_k,
    top_all,
    feature_data,
    top_k=TOP_K,
)

In [12]:
# Assemble the four metric groups into a single results table and display it.
met_res = fun.display_metrics(
    diversity_collaborative,
    diversity_content,
    ranking,
    rating,
)
met_res

Unnamed: 0,Metric,Score,Range,Criteria
0,Collaborative Diversity,0.67494,"[0,1]",The closer to 1 the better
1,Collaborative Serendipity,0.732897,"[0,1]",The closer to 1 the better
2,Collaborative Novelty,8.842217,>=0,Inverse popularity. The higher the better
3,Content Diversity,0.84588,"[0,1]",The closer to 1 the better
4,Content Serendipity,0.862206,"[0,1]",The closer to 1 the better
5,Content Novelty,8.842217,>=0,Inverse popularity. The higher the better
6,RMSE,3.157404,>0,The smaller the better
7,MAE,2.960488,>=0,The smaller the better
8,R Squared,-6.935766,<=1,The closer to 1 the better
9,Precision@k,0.328959,"[0,1]",The closer to 1 the better. Grows with k


## Save metrics

In [17]:
# Write the metrics table to a CSV file (relative path), without the index column.
# NOTE(review): "20k" in the file name presumably refers to TOP_K = 20 rather
# than the dataset size ('100k') -- confirm and consider a less ambiguous name.
met_res.to_csv("ncf_20k_results.csv", index = False)