# Model Build

<font color='red'>NOTE</font>
 
* Notebook based on [this](https://blog.paperspace.com/movie-recommender-tensorflow/) tutorial. 
* see also [context feature implementation](https://www.tensorflow.org/recommenders/examples/context_features) and the corresponding [youtube video](https://www.youtube.com/watch?v=RWlLaWMD30M&t=1s)

* Recommender runs, however only one recommendation for a given user_id, don't know why. 

To do:
* Implement context features to take advantage of all the given variables
<br>--> if we are able to do this, we might have something that would be viable for production (apparently this query-model-thing with consideration of context is the real-world approach), that would help for answer of Question 1 and 2 of the project description

In [1]:
import numpy as np
import pandas as pd
import time

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

from numpy import count_nonzero

from typing import Dict, Text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
# filter for is listened == 1

train_listened = train_df[train_df['is_listened'] == 1]
train_listened.reset_index(inplace=True, drop=True)

# only first 100'000 records
train_listened_small = train_listened.loc[:99999]
train_listened_small[['user_id', 'media_id']] = train_listened_small[['user_id', 'media_id']].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_listened_small[['user_id', 'media_id']] = train_listened_small[['user_id', 'media_id']].astype(str)


In [4]:
# convert to tfds datset

deezer_ratings = tf.data.Dataset.from_tensor_slices(dict(train_listened_small)).\
    map(lambda x: {
    'user_id': x['user_id'], 
    'is_listened': x['is_listened'], 
    'media_id': x['media_id'], 
    'timestamp': x['ts_listen']})

# get sample for overview
for x in deezer_ratings.take(5).as_numpy_iterator():
    print(x)

# assert correct object type
print(deezer_ratings)

Metal device set to: Apple M1
Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
{'user_id': b'16547', 'is_listened': 1, 'media_id': b'250467', 'timestamp': 1480544735}
{'user_id': b'7665', 'is_listened': 1, 'media_id': b'305197', 'timestamp': 1479563953}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335', 'timestamp': 1478368974}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335', 'timestamp': 1478382544}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335', 'timestamp': 1478338409}
<MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'is_listened': TensorSpec(shape=(), dtype=tf.int64, name=None), 'media_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None)}>


2023-03-12 17:34:20.660957: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-12 17:34:20.661003: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-03-12 17:34:20.742453: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [5]:
tf.random.set_seed(42)
shuffled = deezer_ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(2_000)

In [6]:
# check data
# for x in test.take(5).as_numpy_iterator():
#   print(x)

In [7]:
songs = deezer_ratings.map(lambda x: x["media_id"])
user = deezer_ratings.map(lambda x: x["user_id"])

song_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["media_id"])
user_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_song_ids = np.unique(np.concatenate(list(song_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# context variable: unix timestamps
# timestamps = np.concatenate(list(deezer_ratings.map(lambda x: x["timestamp"]).batch(100)))
# max_timestamp = timestamps.max()
# min_timestamp = timestamps.min()

# timestamp_buckets = np.linspace(
#     min_timestamp, max_timestamp, num=1000,
# )

In [8]:
# define user and item models

embedding_dimension = 32

# Compute embeddings for users.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

# Compute embeddings for movies.
song_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
    vocabulary=unique_song_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_song_ids) + 1, embedding_dimension)
])

# timestamp model: concatenate with user_model!
# timestamp_embedding = tf.keras.Sequential([
#     tf.keras.layers.Discretization(timestamp_buckets.tolist()),
#     tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
# ])
# normalized_timestamp = tf.keras.layers.Normalization(
#     axis=None
# )

# get top k recommendations
metrics = tfrs.metrics.FactorizedTopK(
  candidates=song_ids.map(song_model)
)
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [9]:
class DeezerRecModel(tfrs.Model):

    def __init__(self, user_model, song_model):
        super().__init__()
        self.song_model = tf.keras.Model = song_model
        self.user_model: tf.keras.Model = user_model

        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings = self.user_model(features["user_id"])
        positive_song_embeddings = self.song_model(features["media_id"])
        return self.task(user_embeddings, positive_song_embeddings)

In [10]:
model = DeezerRecModel(user_model, song_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
model.fit(cached_train, epochs=3, )

Epoch 1/3


TypeError: in user code:

    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/tensorflow_recommenders/models/base.py", line 76, in train_step
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 1140, in apply_gradients
        return super().apply_gradients(grads_and_vars, name=name)
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 621, in apply_gradients
        self.build(trainable_variables)
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/optimizers/optimizer_experimental/adagrad.py", line 102, in build
        initial_value=initializer(shape=var.shape, dtype=var.dtype),
    File "/Users/oliviergisiger/.local/share/virtualenvs/deezer_recommendation-BCCDOVtY/lib/python3.10/site-packages/keras/initializers/initializers_v2.py", line 265, in __call__
        return tf.constant(self.value, dtype=_get_dtype(dtype), shape=shape)

    TypeError: Cannot convert 0.1 to EagerTensor of dtype int32


In [11]:
model.evaluate(cached_test, return_dict=True)

2023-03-12 17:34:36.019191: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-12 17:34:36.511381: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.
2023-03-12 17:34:36.530529: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




{'factorized_top_k/top_1_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_5_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_10_categorical_accuracy': 0.0005000000237487257,
 'factorized_top_k/top_50_categorical_accuracy': 0.34300002455711365,
 'factorized_top_k/top_100_categorical_accuracy': 0.34700000286102295,
 'loss': 15202.0205078125,
 'regularization_loss': 0,
 'total_loss': 15202.0205078125}

In [13]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
  tf.data.Dataset.zip((songs.batch(1000), songs.batch(1000).map(model.song_model)))
)

_, titles = index(tf.constant(["1387"]))
print(f"Song id Recommendations for user: {titles[0, :3]}")

AttributeError: 'BruteForce' object has no attribute 'index_from_dataset'

In [None]:
tfrs.layers.factorized_top_k.BruteForce.