# Model Build

<font color='red'>NOTE</font>
 
* Notebook based on [this](https://blog.paperspace.com/movie-recommender-tensorflow/) tutorial. 
* see also [context feature implementation](https://www.tensorflow.org/recommenders/examples/context_features) and the corresponding [youtube video](https://www.youtube.com/watch?v=RWlLaWMD30M&t=1s)

* Recommender runs, however only one recommendation for a given user_id, don't know why. 

To do:
* Implement context features to take advantage of all the given variables
<br>--> if we are able to do this, we might have something that would be viable for production (apparently this query-model-thing with consideration of context is the real-world approach), that would help for answer of Question 1 and 2 of the project description

In [172]:
import numpy as np
import pandas as pd
import time

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

from numpy import count_nonzero

from typing import Dict, Text

In [173]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [175]:
# filter for is listened == 1

train_listened = train_df[train_df['is_listened'] == 1]
train_listened.reset_index(inplace=True, drop=True)

# only first 100'000 records
train_listened_small = train_listened.loc[:99999]
train_listened_small[['user_id', 'media_id']] = train_listened_small[['user_id', 'media_id']].astype(str)


7558834 100000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_listened_small[['user_id', 'media_id']] = train_listened_small[['user_id', 'media_id']].astype(str)


In [176]:
# convert to tfds datset

deezer_ratings = tf.data.Dataset.from_tensor_slices(dict(train_listened_small)).\
    map(lambda x: {
    'user_id': x['user_id'], 
    'is_listened': x['is_listened'], 
    'media_id': x['media_id'], 
    'timestamp': x['ts_listen']})

# get sample for overview
for x in deezer_ratings.take(5).as_numpy_iterator():
  print(x)

# assert correct object type
print(deezer_ratings)

{'user_id': b'16547', 'is_listened': 1, 'media_id': b'250467', 'timestamp': 1480544735}
{'user_id': b'7665', 'is_listened': 1, 'media_id': b'305197', 'timestamp': 1479563953}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335', 'timestamp': 1478368974}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335', 'timestamp': 1478382544}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335', 'timestamp': 1478338409}
<MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'is_listened': TensorSpec(shape=(), dtype=tf.int64, name=None), 'media_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None)}>


In [177]:
tf.random.set_seed(42)
shuffled = deezer_ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(2_000)

In [178]:
# check data
# for x in test.take(5).as_numpy_iterator():
#   print(x)

{'user_id': b'1387', 'is_listened': 1, 'media_id': b'132123630', 'timestamp': 1479172239}
{'user_id': b'572', 'is_listened': 1, 'media_id': b'130105294', 'timestamp': 1478282323}
{'user_id': b'45', 'is_listened': 1, 'media_id': b'127539479', 'timestamp': 1479252882}
{'user_id': b'2794', 'is_listened': 1, 'media_id': b'15417669', 'timestamp': 1478874840}
{'user_id': b'15944', 'is_listened': 1, 'media_id': b'130105294', 'timestamp': 1480624462}


In [179]:
songs = deezer_ratings.map(lambda x: x["media_id"])
user = deezer_ratings.map(lambda x: x["user_id"])

song_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["media_id"])
user_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_song_ids = np.unique(np.concatenate(list(song_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# context variable: unix timestamps
# timestamps = np.concatenate(list(deezer_ratings.map(lambda x: x["timestamp"]).batch(100)))
# max_timestamp = timestamps.max()
# min_timestamp = timestamps.min()

# timestamp_buckets = np.linspace(
#     min_timestamp, max_timestamp, num=1000,
# )

In [182]:
# define user and item models

embedding_dimension = 32

# Compute embeddings for users.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

# Compute embeddings for movies.
song_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
    vocabulary=unique_song_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_song_ids) + 1, embedding_dimension)
])

# timestamp model: concatenate with user_model!
# timestamp_embedding = tf.keras.Sequential([
#     tf.keras.layers.Discretization(timestamp_buckets.tolist()),
#     tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
# ])
# normalized_timestamp = tf.keras.layers.Normalization(
#     axis=None
# )

# get top k recommendations
metrics = tfrs.metrics.FactorizedTopK(
  candidates=song_ids.map(song_model)
)
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [184]:
class DeezerRecModel(tfrs.Model):

  def __init__(self, user_model, song_model):
    super().__init__()
    self.song_model = tf.keras.Model = song_model
    self.user_model: tf.keras.Model = user_model

    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    user_embeddings = self.user_model(features["user_id"])
    positive_song_embeddings = self.song_model(features["media_id"])
    return self.task(user_embeddings, positive_song_embeddings)

In [186]:
model = DeezerRecModel(user_model, song_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x29c260a00>

In [93]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.13650000095367432,
 'factorized_top_k/top_5_categorical_accuracy': 0.13650000095367432,
 'factorized_top_k/top_10_categorical_accuracy': 0.14000000059604645,
 'factorized_top_k/top_50_categorical_accuracy': 0.16599999368190765,
 'factorized_top_k/top_100_categorical_accuracy': 0.19200000166893005,
 'loss': 11566.048828125,
 'regularization_loss': 0,
 'total_loss': 11566.048828125}

In [187]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
  tf.data.Dataset.zip((songs.batch(1000), songs.batch(1000).map(model.song_model)))
)

_, titles = index(tf.constant(["1387"]))
print(f"Song id Recommendations for user: {titles[0, :3]}")

Song id Recommendations for user: [b'132123630' b'132123630' b'132123630']
