# Model Build

<font color='red'>NOTE</font>
 


In [1]:
import numpy as np
import pandas as pd
import time

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

from numpy import count_nonzero

from typing import Dict, Text

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [3]:
# filter for is listened == 1

train_listened = train_df[train_df['is_listened'] == 1]
train_listened.reset_index(inplace=True, drop=True)

# only first 100'000 records
train_listened_small = train_listened.loc[:99999]
train_listened_small[['user_id', 'media_id']] = train_listened_small[['user_id', 'media_id']].astype(str)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_listened_small[['user_id', 'media_id']] = train_listened_small[['user_id', 'media_id']].astype(str)


In [4]:
# convert to tfds datset

deezer_ratings = tf.data.Dataset.from_tensor_slices(dict(train_listened_small)).\
    map(lambda x: {
    'user_id': x['user_id'], 
    'is_listened': x['is_listened'], 
    'media_id': x['media_id'], 
    'timestamp': x['ts_listen']})

# get sample for overview
# for x in deezer_ratings.take(5).as_numpy_iterator():
#   print(x)

# assert correct object type
print(deezer_ratings)

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089
<MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'is_listened': TensorSpec(shape=(), dtype=tf.int64, name=None), 'media_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None)}>


In [5]:
tf.random.set_seed(42)
shuffled = deezer_ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(2_000)

In [6]:
songs = deezer_ratings.map(lambda x: x["media_id"])
user = deezer_ratings.map(lambda x: x["user_id"])

song_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["media_id"])
user_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_song_ids = np.unique(np.concatenate(list(song_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))


2023-03-17 10:02:58.908752: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [7]:
# define user and item models

embedding_dimension = 32

# Compute embeddings for users.
user_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
    vocabulary=unique_user_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

# Compute embeddings for movies.
song_model = tf.keras.Sequential([
    tf.keras.layers.StringLookup(
    vocabulary=unique_song_ids, mask_token=None),
    tf.keras.layers.Embedding(len(unique_song_ids) + 1, embedding_dimension)
])

# get top k recommendations
metrics = tfrs.metrics.FactorizedTopK(
  candidates=song_ids.map(song_model)
)
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [8]:
class DeezerRecModel(tfrs.Model):

  def __init__(self, user_model, song_model):
    super().__init__()
    self.song_model = tf.keras.Model = song_model
    self.user_model: tf.keras.Model = user_model

    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    user_embeddings = self.user_model(features["user_id"])
    positive_song_embeddings = self.song_model(features["media_id"])
    return self.task(user_embeddings, positive_song_embeddings)

In [9]:
model = DeezerRecModel(user_model, song_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
model.fit(cached_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2d09776a0>

In [10]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.11349999904632568,
 'factorized_top_k/top_5_categorical_accuracy': 0.12700000405311584,
 'factorized_top_k/top_10_categorical_accuracy': 0.14650000631809235,
 'factorized_top_k/top_50_categorical_accuracy': 0.20200000703334808,
 'factorized_top_k/top_100_categorical_accuracy': 0.23600000143051147,
 'loss': 11889.4794921875,
 'regularization_loss': 0,
 'total_loss': 11889.4794921875}

In [11]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
  tf.data.Dataset.zip((songs.batch(1000), songs.batch(1000).map(model.song_model)))
)

_, titles = index(tf.constant(["1387"]))
print(f"Song id Recommendations for user: {titles[0, :3]}")

Song id Recommendations for user: [b'14747630' b'14747630' b'14747630']
