# Setup

## Import modules

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
import pprint
import tempfile

In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
from tensorflow.keras import layers

2024-09-05 23:37:51.189961: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-05 23:37:51.217510: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-05 23:37:51.217540: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-05 23:37:51.218652: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-05 23:37:51.223754: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-05 23:37:51.224399: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [4]:
from typing import Dict, Text

In [5]:
from datetime import datetime
import re

## Import data

Documentation for datasets: https://www.tensorflow.org/datasets/catalog/movielens

In [95]:
# Ratings data.
ratings = tfds.load("movielens/100k-ratings", split="train")
# Features of all the available movies.
movies = tfds.load("movielens/100k-movies", split="train")

In [None]:
type(ratings)

### Transform tfds to dataframe for exploratory data analysis

In [None]:
ratings_df = tfds.as_dataframe(ratings)
movies_df = tfds.as_dataframe(movies)

In [None]:
ratings_df.info()

In [None]:
movies_df.info()

### Create mapping of relevant features from the tfds for the model

In [96]:
ratings = ratings.map(lambda x: {
    "movie_title": x["movie_title"],
    "user_id": x["user_id"],
    "user_rating": float(x["user_rating"])
})
movies = movies.map(lambda x: x["movie_title"])

In [8]:
type(ratings)

tensorflow.python.data.ops.map_op._MapDataset

# The model

## Shuffle, train, test

In [9]:
print('Total Data: {}'.format(len(ratings)))

tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = ratings.take(800_000)
test = ratings.skip(800_000).take(200_000)

Total Data: 100000


## Create batches

In [10]:
movie_titles = movies.batch(1_024)
user_ids = ratings.batch(1_024).map(lambda x: x["user_id"])

## Get unique user_ids and movie_titles

... to later convert each user_id and movie_title to a unique integer index for the model

In [11]:
unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [12]:
print('Unique Movies: {}'.format(len(unique_movie_titles)))
print('Unique users: {}'.format(len(unique_user_ids)))

Unique Movies: 1664
Unique users: 943


## The model

In [34]:
class MovieModel(tfrs.models.Model):

    def __init__(self, movies, unique_movie_titles, unique_user_ids, rating_weight: float, retrieval_weight: float, seed=42) -> None:
        # We take the loss weights in the constructor: this allows us to instantiate
        # several model objects with different loss weights.

        super().__init__()

        embedding_dimension = 32

        # User and movie models.
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
            vocabulary=unique_movie_titles, mask_token=None),
          tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
        ])
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
          tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
          tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
        ])

        # A small model to take in user and movie embeddings and predict ratings.
        # We can make this as complicated as we want as long as we output a scalar
        # as our prediction.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(1),
        ])

        # The tasks.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_id"])
        # And pick out the movie features and pass them into the movie model.
        movie_embeddings = self.movie_model(features["movie_title"])

        return (
            user_embeddings,
            movie_embeddings,
            # We apply the multi-layered rating model to a concatentation of
            # user and movie embeddings.
            self.rating_model(
                tf.concat([user_embeddings, movie_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

        ratings = features.pop("user_rating")

        user_embeddings, movie_embeddings, rating_predictions = self(features)

        # We compute the loss for each task.
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        # And combine them using the loss weights.
        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 47)

In [None]:
class MovieModel(tfrs.models.Model):

    def __init__(self, movies, unique_movie_titles, unique_user_ids, rating_weight: float, retrieval_weight: float, seed=42) -> None:
        super().__init__()

        embedding_dimension = 64

        # Seeded initializers for embeddings to ensure determinism
        embedding_initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        # User and movie models.
        self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_movie_titles, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_movie_titles) + 1,
                embedding_dimension,
                embeddings_initializer=embedding_initializer  # Seeded initializer
            )
        ])
        self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_user_ids, mask_token=None),
            tf.keras.layers.Embedding(
                len(unique_user_ids) + 1,
                embedding_dimension,
                embeddings_initializer=embedding_initializer  # Seeded initializer
            )
        ])

        # A small model to take in user and movie embeddings and predict ratings.
        self.rating_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation="relu", kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed)),
            tf.keras.layers.Dense(128, activation="relu", kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed)),
            tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed)),
        ])

        # The tasks.
        self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )
        self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=movies.batch(128).map(self.movie_model)
            )
        )

        # The loss weights.
        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features: dict) -> tf.Tensor:
        # Extract user and movie features and pass them to the respective models.
        user_embeddings = self.user_model(features["user_id"])
        movie_embeddings = self.movie_model(features["movie_title"])

        # Concatenate user and movie embeddings and pass through the rating model.
        return (
            user_embeddings,
            movie_embeddings,
            self.rating_model(
                tf.concat([user_embeddings, movie_embeddings], axis=1)
            ),
        )

    def compute_loss(self, features: dict, training=False) -> tf.Tensor:
        ratings = features.pop("user_ratings")

        user_embeddings, movie_embeddings, rating_predictions = self(features)

        # Compute loss for both tasks (ranking and retrieval).
        rating_loss = self.rating_task(
            labels=ratings,
            predictions=rating_predictions,
        )
        retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

        # Combine the losses using the specified weights.
        return (self.rating_weight * rating_loss
                + self.retrieval_weight * retrieval_loss)

In [82]:
class MovieModel(tfrs.models.Model):

  def __init__(self, rating_weight: float, retrieval_weight: float) -> None:
    # We take the loss weights in the constructor: this allows us to instantiate
    # several model objects with different loss weights.

    super().__init__()

    embedding_dimension = 64

    # User and movie models.
    self.movie_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
    ])
    self.user_model: tf.keras.layers.Layer = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # A small model to take in user and movie embeddings and predict ratings.
    # We can make this as complicated as we want as long as we output a scalar
    # as our prediction.
    self.rating_model = tf.keras.Sequential([
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(1),
    ])

    # The tasks.
    self.rating_task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    self.retrieval_task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=movies.batch(128).map(self.movie_model)
        )
    )

    # The loss weights.
    self.rating_weight = rating_weight
    self.retrieval_weight = retrieval_weight

  def call(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features["user_id"])
    # And pick out the movie features and pass them into the movie model.
    movie_embeddings = self.movie_model(features["movie_title"])
    
    return (
        user_embeddings,
        movie_embeddings,
        # We apply the multi-layered rating model to a concatentation of
        # user and movie embeddings.
        self.rating_model(
            tf.concat([user_embeddings, movie_embeddings], axis=1)
        ),
    )

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

    ratings = features.pop("user_ratings")

    user_embeddings, movie_embeddings, rating_predictions = self(features)

    # We compute the loss for each task.
    rating_loss = self.rating_task(
        labels=ratings,
        predictions=rating_predictions,
    )
    retrieval_loss = self.retrieval_task(user_embeddings, movie_embeddings)

    # And combine them using the loss weights.
    return (self.rating_weight * rating_loss
            + self.retrieval_weight * retrieval_loss)

## Fitting and evaluating

In [83]:
model = MovieModel(rating_weight=1.0, retrieval_weight=1.0)

In [84]:
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.05))

## Shuffle, batch and cache the training and evaluation data

In [85]:
cached_train = train.batch(1_024).cache()
cached_test = test.batch(1_024).cache()



## Train the model

def train_model():
    history = model.fit(cached_train, epochs=3)
    return model, history

train_model()

In [86]:
model.fit(cached_train, epochs=1)



<keras.src.callbacks.History at 0x7f7634200a60>

## Evaluate the model on the test set

In [None]:
metrics = model.evaluate(cached_test, return_dict=True)

print(f"\nRetrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}")

## Predict

In [87]:
predict_movie(13, top_n=5)

<class 'tensorflow_recommenders.layers.factorized_top_k.BruteForce'>








INFO:tensorflow:Assets written to: /tmp/tmpba3typxi/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpba3typxi/model/assets


Top 5 recommendations for user 13:

1. Ice Storm, The (1997)
2. Ice Storm, The (1997)
3. Money Talks (1997)
4. Money Talks (1997)
5. Desperate Measures (1998)


In [88]:
user_ratings = pd.DataFrame({
    'movie': ['Movie 1', 'Movie 2', 'Movie 3', 'Movie 4'],
    'rating': [5, 4, 5, 2]
})

In [45]:
movie_to_id = {
    'Movie 1': 1,
    'Movie 2': 2,
    'Movie 3': 3,
    'Movie 4': 4
}

In [46]:
user_ratings['movie_id'] = user_ratings['movie'].map(movie_to_id)

In [47]:
user_movie_ids = user_ratings['movie_id'].values

In [48]:
predicted_movies = model(user_movie_ids)

TypeError: Exception encountered when calling layer 'movie_model_1' (type MovieModel).

Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got 'user_id'

Call arguments received by layer 'movie_model_1' (type MovieModel):
  • features=tf.Tensor(shape=(4,), dtype=int64)

In [56]:
with tempfile.TemporaryDirectory() as tmp:
  path = os.path.join(tmp, "model")

In [63]:
def predict_movie(user, top_n=3):
    # Create a model that takes in raw query features, and
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
    print(type(index))
    # recommends movies out of the entire movies dataset.
    index.index_from_dataset(
      tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
    )

    # Get recommendations.
    _, titles = index(tf.constant([str(user)]))
    
    tf.saved_model.save(index, path)
    
    print('Top {} recommendations for user {}:\n'.format(top_n, user))
    for i, title in enumerate(titles[0, :top_n].numpy()):
        print('{}. {}'.format(i+1, title.decode("utf-8")))

def predict_rating(user, movie):
    trained_movie_embeddings, trained_user_embeddings, predicted_rating = model({
          "userId": np.array([str(user)]),
          "original_title": np.array([movie])
      })
    print("Predicted rating for {}: {}".format(movie, predicted_rating.numpy()[0][0]))

In [64]:
predict_movie(user=123)

<class 'tensorflow_recommenders.layers.factorized_top_k.BruteForce'>








INFO:tensorflow:Assets written to: /tmp/tmpba3typxi/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpba3typxi/model/assets


Top 3 recommendations for user 123:

1. It Happened One Night (1934)
2. Secrets & Lies (1996)
3. Philadelphia Story, The (1940)


In [65]:
# Load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load(path)

In [66]:
# Pass a user id in, get top predicted movie titles back.
scores, titles = loaded(["42"])

print(f"Recommendations: {titles[0][:3]}")

Recommendations: [b'Rudy (1993)' b'Client, The (1994)'
 b'Bridges of Madison County, The (1995)']


In [81]:
model

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x7f76343a9720>

# this below somewhat worked so worth to investigate further!!!!!

In [97]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

# Example user ratings
user_ratings = pd.DataFrame({
    'movie': ['Movie 1', 'Movie 2', 'Movie 3', 'Movie 4'],
    'rating': [5, 4, 5, 2]
})

# Load the saved model (the user and movie models)
loaded_model = model

# Recreate the BruteForce index using the loaded user and movie models
index = tfrs.layers.factorized_top_k.BruteForce(loaded_model.user_model)

# Replace with your full movie dataset (ensure it contains enough movies)
movies = movies  # At least 5 movies

# Recreate the index
index.index_from_dataset(
  tf.data.Dataset.from_tensor_slices(movies).batch(100).map(lambda x: (x, loaded_model.movie_model(x)))
)

# Generate recommendations for a user (dummy or real user ID)
user_id = tf.constant(["dummy_user"])  # Replace with actual user ID if available

# Adjust the number of recommendations to match the number of available movies
_, recommended_movie_titles = index(user_id, k=5)  # Adjust k to the available number of movies

# Output the recommended movie titles
print("Recommended movies:", recommended_movie_titles)


ValueError: Slicing dataset elements is not supported for rank 0.

In [75]:
import tensorflow as tf

# Example user ratings
user_ratings = pd.DataFrame({
    'movie': ['Movie 1', 'Movie 2', 'Movie 3', 'Movie 4'],
    'rating': [5, 4, 5, 2]
})

# Load your trained TFRS model
model = loaded

# Define a function to create a weighted user profile
def create_user_profile(model, movie_titles, ratings):
    weighted_embeddings = []
    
    # Loop through each movie and rating
    for movie_title, rating in zip(movie_titles, ratings):
        # Get the movie's embedding by passing the movie title through the model
        movie_embedding = model(tf.constant([movie_title]))  # Adjust to match your model's interface
        
        # Weight the embedding by the rating
        weighted_embedding = movie_embedding * rating
        weighted_embeddings.append(weighted_embedding)
    
    # Combine weighted embeddings (average them for simplicity)
    user_profile_embedding = tf.reduce_mean(weighted_embeddings, axis=0)
    
    return user_profile_embedding

# Create a user profile based on rated movies
user_profile = create_user_profile(model, user_ratings['movie'].values, user_ratings['rating'].values)

# Output the user profile embedding
print("User profile embedding:", user_profile)

# Now use this user profile to get recommendations
# You can pass this user profile to your model to get recommendations (depends on the structure of your model)


InvalidArgumentError: cannot compute Pack as input #1(zero-based) was expected to be a float tensor but is a string tensor [Op:Pack] name: 0

In [70]:
import tensorflow as tf

# Load your trained TFRS model
model = loaded

# Create input features (user_id only)
input_features = tf.constant(["dummy_user"])  # Dummy user ID for a new user

# Pass the input features into the model to get recommendations
predicted_movies = model(input_features)

# Output the recommendations
print("Recommended movies:", predicted_movies)


Recommended movies: (<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[0.17887518, 0.15814601, 0.15369767, 0.13824794, 0.13824794,
        0.13281173, 0.12472201, 0.12226161, 0.12219059, 0.11474586]],
      dtype=float32)>, <tf.Tensor: shape=(1, 10), dtype=string, numpy=
array([[b'Bride of Frankenstein (1935)', b'Howling, The (1981)',
        b'American Werewolf in London, An (1981)',
        b'Body Snatchers (1993)', b'Body Snatchers (1993)',
        b'Tales From the Crypt Presents: Demon Knight (1995)',
        b'Carrie (1976)', b'Mark of Zorro, The (1940)',
        b'Omen, The (1976)',
        b'Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)']],
      dtype=object)>)


In [71]:
input_features = tf.constant(["Movie 1"])  # Example of passing a movie title
predicted_movies = model(input_features)
print("Recommended movies similar to 'Movie 1':", predicted_movies)


Recommended movies similar to 'Movie 1': (<tf.Tensor: shape=(1, 10), dtype=float32, numpy=
array([[0.17887518, 0.15814601, 0.15369767, 0.13824794, 0.13824794,
        0.13281173, 0.12472201, 0.12226161, 0.12219059, 0.11474586]],
      dtype=float32)>, <tf.Tensor: shape=(1, 10), dtype=string, numpy=
array([[b'Bride of Frankenstein (1935)', b'Howling, The (1981)',
        b'American Werewolf in London, An (1981)',
        b'Body Snatchers (1993)', b'Body Snatchers (1993)',
        b'Tales From the Crypt Presents: Demon Knight (1995)',
        b'Carrie (1976)', b'Mark of Zorro, The (1940)',
        b'Omen, The (1976)',
        b'Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)']],
      dtype=object)>)


In [None]:
import os
os.environ['TF_DETERMINISTIC_OPS'] = '1' # Set deterministic ops for TensorFlow