In [1]:
from typing import Dict, Text
import os
import numpy as np
import tensorflow as tf
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

import warnings
warnings.filterwarnings("ignore")
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pickle

2023-03-23 19:10:31.396572: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-23 19:10:31.546069: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-23 19:10:32.096046: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-23 19:10:32.096120: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

In [2]:
def save_pickle(model, filename):
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
def load_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In this notebook, we will primarily focus on building a simple matrix factorization retrieval model powered by neuralCF. 

In [3]:
# Load final data from preprocessing_2_feature_engineering
data = load_pickle('data_2.pickle')

In [4]:
data.columns

Index(['userId', 'age', 'gender', 'occupation', 'zipcode_bucket', 'movieId',
       'imdb_id', 'title', 'movie_genre_1', 'movie_genre_2', 'IMDb_rating',
       'plot embedding', 'release_year', 'rating', 'user_avg_rating',
       'user_std_rating', 'user_rating_count', 'movie_avg_rating',
       'movie_std_rating', 'movie_rating_count', 'user_fav_genre',
       'user_fav_movieId', 'timestamp'],
      dtype='object')

In [5]:
# Load ratings and movies
ratings =  data[['userId', 'imdb_id']]
movies = data[['imdb_id']]
ratings['userId'] = ratings.userId.astype(str)
ratings['imdb_id'] = ratings.imdb_id.astype(str)
movies['imdb_id'] = movies.imdb_id.astype(str)

In [6]:
# Load into tf datasets
movies_ds = tf.data.Dataset.from_tensor_slices(dict(movies))
ratings_ds = tf.data.Dataset.from_tensor_slices(dict(ratings))

2023-03-23 19:10:34.353680: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-23 19:10:34.353723: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-23 19:10:34.353766: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:163] no NVIDIA GPU device is present: /dev/nvidia0 does not exist
2023-03-23 19:10:34.354183: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
# get training and testing dataset
# shuffle and batch
shuffled = ratings_ds.shuffle(100000, seed=32)

train = shuffled.take(80000)
test = shuffled.skip(80000).take(20000)

cached_train = train.batch(128).cache()
cached_test = test.batch(128).cache()

In [8]:
# Build userId string lookup layer
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
user_ids_vocabulary.adapt(ratings_ds.map(lambda x: x["userId"]))

In [9]:
# Build movieId string lookup layer
movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None)
movie_ids_vocabulary.adapt(movies_ds.map(lambda x: x["imdb_id"]))

In [10]:
# Define MovieLensModel model
class MovieLensModel(tfrs.Model):

    def __init__(
                self,
                user_model: tf.keras.Model,
                movie_model: tf.keras.Model,
                task: tfrs.tasks.Retrieval):
        super().__init__()

        self.user_model = user_model
        self.movie_model = movie_model
        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings = self.user_model(features["userId"])
        movie_embeddings = self.movie_model(features["imdb_id"])

        return self.task(user_embeddings, movie_embeddings)

In [11]:
# define user and movie models
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)


])
movie_model = tf.keras.Sequential([
    movie_ids_vocabulary,
    tf.keras.layers.Embedding(movie_ids_vocabulary.vocabulary_size(), 64)
])


In [12]:
# Build a neural collaborative filtering model
user_model = tf.keras.Sequential([
    user_ids_vocabulary,
    tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation="relu")

])
movie_model = tf.keras.Sequential([
    movie_ids_vocabulary,
    tf.keras.layers.Embedding(movie_ids_vocabulary.vocabulary_size(), 64),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(16, activation="relu")
])


In [14]:
# define metrics and task
metrics = tfrs.metrics.FactorizedTopK(
  candidates=movies_ds.map(lambda x: x["imdb_id"]).batch(128).map(movie_model),
  ks = (1, 5),
)

task = tfrs.tasks.Retrieval(num_hard_negatives = 2, remove_accidental_hits = False)

In [15]:
# compile and train
model = MovieLensModel(user_model, movie_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# Train for 3 epochs.
model_history = model.fit(cached_train, epochs=1)



In [16]:
model.evaluate(cached_test, return_dict=True)



{'loss': 103.26954650878906,
 'regularization_loss': 0,
 'total_loss': 103.26954650878906}