In [1]:
import os
import datetime
import tensorflow as tf
import lightfm
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
import pandas as pd
import numpy as np
from scipy import sparse
from tensorboard import notebook



In [2]:
print(f"Tensorflow version: {tf.__version__}")
print(f"LightFM version: {lightfm.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Numpy version: {np.__version__}")

Tensorflow version: 2.3.0
LightFM version: 1.15
Pandas version: 1.1.1
Numpy version: 1.18.5


In [3]:
TOP_K = 5

In [4]:
data = fetch_movielens(min_rating=3.0)

print("Interaction matrix:")
print(data['train'].toarray()[:10,:10])

Interaction matrix:
[[5 3 4 3 3 5 4 0 5 3]
 [4 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [4 0 0 0 0 0 0 4 4 0]
 [0 0 0 5 0 0 5 5 5 4]
 [0 0 0 0 0 0 3 0 0 0]
 [0 0 0 0 0 0 4 0 0 0]
 [4 0 0 4 0 0 0 0 4 0]]


In [5]:
for dataset in ['test', 'train']:
    data[dataset] = (data[dataset].toarray() > 0).astype('int8')
    
# Make the ratings binary
print("Interaction matrix:")
print(data['train'][:10,:10])

print("\nRatings:")
unique_ratings = np.unique(data['train'])
print(unique_ratings)

Interaction matrix:
[[1 1 1 1 1 1 1 0 1 1]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 1 1 0]
 [0 0 0 1 0 0 1 1 1 1]
 [0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 1 0 0 0]
 [1 0 0 1 0 0 0 0 1 0]]

Ratings:
[0 1]


In [6]:
from typing import List


def wide_to_long(wide: np.array, possible_ratings: List) -> np.array:
    
    def _get_ratings(arr: np.array, rating: int) -> np.array:
        idx = np.where(arr == rating)
        return np.vstack((idx[0],idx[1], np.ones(idx[0].size, dtype='int8') * rating)).T
    
    long_arrays = []
    for r in possible_ratings:
        long_arrays.append(_get_ratings(wide, r))
    
    return np.vstack(long_arrays)

In [7]:
long_train = wide_to_long(data['train'], unique_ratings)
df_train = pd.DataFrame(long_train, columns = ['user_id', 'item_id', 'interaction'])

In [8]:
print("All interactions:")
df_train.head()

All interactions:


Unnamed: 0,user_id,item_id,interaction
0,0,7,0
1,0,10,0
2,0,19,0
3,0,20,0
4,0,26,0


In [9]:
print("\nOnly positive interactions:")
df_train[df_train['interaction'] > 0].head()


Only positive interactions:


Unnamed: 0,user_id,item_id,interaction
1511499,0,0,1
1511500,0,1,1
1511501,0,2,1
1511502,0,3,1
1511503,0,4,1


In [152]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Embedding,
    Input,
    Dense,
    Multiply,
    Flatten,
    Concatenate,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2


def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 4,
    latent_dim_mlp: int = 16,
    reg_mf: int = 0,
    reg_mlp: int = 0.01,
    dense_layers: List[int] = [8, 4], # try different things here - maybe try the architecture from the paper?
    reg_layers: List[int] = [0.1, 0.1], # best results was without the NN part really
    activation_dense: str = "relu"
) -> keras.Model:

    # input layer
    user = Input(shape=(), dtype="int32", name='user_id')
    item = Input(shape=(), dtype="int32", name='item_id')

    # embedding layers
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )

    # MF vector
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],
            activity_regularizer=l2(reg_layers[i]),
            activation=activation_dense,
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, activation="sigmoid", kernel_initializer="lecun_uniform", name="interaction"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item], outputs=[output],
    )

    return model

In [153]:
from tensorflow.keras.optimizers import Adam


n_users, n_items = data['train'].shape
ncf_model = create_ncf(n_users, n_items)

ncf_model.compile(optimizer=Adam(lr=0.001),
                  loss='binary_crossentropy',
                  metrics=[
                      tf.keras.metrics.TruePositives(name='tp'),
                      tf.keras.metrics.FalsePositives(name='fp'),
#                       tf.keras.metrics.TrueNegatives(name='tn'),
#                       tf.keras.metrics.FalseNegatives(name='fn'), 
#                       tf.keras.metrics.BinaryAccuracy(name='accuracy'),
#                       tf.keras.metrics.Precision(name='precision'),
#                       tf.keras.metrics.Recall(name='recall'),
                      tf.keras.metrics.AUC(name='auc')
                  ])
ncf_model._name = 'neural_collaborative_filtering'
ncf_model.summary()

Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
item_id (InputLayer)            [(None,)]            0                                            
__________________________________________________________________________________________________
mlp_user_embedding (Embedding)  (None, 16)           15088       user_id[0][0]                    
__________________________________________________________________________________________________
mlp_item_embedding (Embedding)  (None, 16)           26912       item_id[0][0]                    
_____________________________________________________________________

In [154]:
def make_tf_dataset(df: pd.DataFrame, targets: List[str], val_split: float = 0.1, batch_size: int = 512, seed = 42):
    """Make TensorFlow dataset from Pandas DataFrame.
    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of columns names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size for training
    :param seed: random seed for shuffling the data - setting to `None` will not shuffle the data"""
    
    n_val = round(df.shape[0]*val_split)
    if seed:
        x = df.sample(frac=1, random_state=seed).to_dict('series')  # shuffle all the rows
    else:
        x = df.to_dict('series')
    y = dict()
    for t in targets:
        y[t] = x.pop(t)
    ds = tf.data.Dataset.from_tensor_slices((x, y))
    
    ds_val = (
        ds
        .take(n_val)
        .batch(batch_size)
    )
    ds_train = (
        ds
        .skip(n_val)
        .batch(batch_size)
    )
    return ds_train, ds_val

In [155]:
# create train and validation datasets
ds_train, ds_val = make_tf_dataset(df_train, ['interaction'])

In [156]:
%%time
# define logs and callbacks
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=0)

train_hist = ncf_model.fit(ds_train, validation_data=ds_val, epochs=20, callbacks=[tensorboard_callback, early_stopping_callback], verbose=1)

Epoch 1/20


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: user 4min 18s, sys: 1min 14s, total: 5min 32s
Wall time: 2min 24s


In [157]:
long_test = wide_to_long(data['train'], unique_ratings)
df_test = pd.DataFrame(long_test, columns = ['user_id', 'item_id', 'interaction'])
ds_test, _ = make_tf_dataset(df_test, ['interaction'], val_split=0, seed=None)

In [158]:
%%time
ncf_predictions = ncf_model.predict(ds_test)
df_test['ncf_predictions'] = ncf_predictions

CPU times: user 3.72 s, sys: 205 ms, total: 3.92 s
Wall time: 3.59 s


In [159]:
df_test.head()

Unnamed: 0,user_id,item_id,interaction,ncf_predictions
0,0,7,0,0.480493
1,0,10,0,0.753207
2,0,19,0,0.174637
3,0,20,0,0.074821
4,0,26,0,0.145832


In [160]:
# sanity checks
std = df_test.describe().loc['std', 'ncf_predictions']
if std < 0.01:
    raise ValueError("Model predictions have standard deviation of less than 1e-2.")

In [161]:
data['ncf_predictions'] = df_test.pivot(index='user_id', columns='item_id', values='ncf_predictions').values
print("Neural collaborative filtering predictions")
print(data['ncf_predictions'][:10,:4])

Neural collaborative filtering predictions
[[7.00398564e-01 3.53813112e-01 3.48921120e-01 7.65521049e-01]
 [1.64715350e-01 2.69353390e-03 2.31661499e-02 3.32486629e-03]
 [3.44626009e-02 1.21375837e-04 1.87519193e-03 1.86479319e-05]
 [9.94561911e-02 1.72623992e-03 1.26564503e-03 7.40855932e-04]
 [5.68827629e-01 2.66197205e-01 4.13460135e-02 3.17071855e-01]
 [3.41444194e-01 4.38211262e-02 1.19111538e-02 4.49252069e-01]
 [6.45004809e-01 6.24527216e-01 1.02570385e-01 8.42683196e-01]
 [6.03474379e-01 1.06243074e-01 1.44939423e-02 1.69369459e-01]
 [1.39996946e-01 1.65140629e-03 1.47922874e-05 2.34645605e-03]
 [4.31202054e-01 1.01638705e-01 2.72692144e-02 4.99637365e-01]]


In [162]:
precision_ncf = tf.keras.metrics.Precision(top_k=TOP_K)
recall_ncf = tf.keras.metrics.Recall(top_k=TOP_K)

precision_ncf.update_state(data['test'], data['ncf_predictions'])
recall_ncf.update_state(data['test'], data['ncf_predictions'])
print(f"At K = {TOP_K}, we have a precision of {precision_ncf.result().numpy():.5f} and a recall of {recall_ncf.result().numpy():.5f}")

At K = 5, we have a precision of 0.10901 and a recall of 0.06512


In [21]:
raise ValueError

ValueError: 

In [50]:
%%time
# LightFM model
norm = lambda x: (x - np.min(x))/np.ptp(x)
lightfm_model = LightFM(loss='warp')
lightfm_model.fit(sparse.coo_matrix(data['train']), epochs=20, num_threads=2)

lightfm_predictions = lightfm_model.predict(df_test['user_id'].values, df_test['item_id'].values)
df_test['lightfm_predictions'] = lightfm_predictions
wide_predictions = df_test.pivot(index='user_id', columns='item_id', values='lightfm_predictions').values
data['lightfm_predictions'] = norm(wide_predictions)

# compute the metrics
precision_lightfm = tf.keras.metrics.Precision(top_k=TOP_K)
recall_lightfm = tf.keras.metrics.Recall(top_k=TOP_K)
precision_lightfm.update_state(data['test'], data['lightfm_predictions'])
recall_lightfm.update_state(data['test'], data['lightfm_predictions'])
print(f"At K = {TOP_K}, we have a precision of {precision_lightfm.result().numpy():.5f} and a recall of {recall_lightfm.result().numpy():.5f}")

At K = 5, we have a precision of 0.10562 and a recall of 0.06309
CPU times: user 1.44 s, sys: 259 ms, total: 1.7 s
Wall time: 1.31 s
