# Model Build

<font color='red'>NOTE</font>
 
* Notebook based on [this](https://www.tensorflow.org/recommenders/examples/basic_ranking) tutorial. 
* tf model not yet adapted for implicit rating, uses dense output layer atm; change to softmax. 

In [16]:
import numpy as np
import pandas as pd
import time

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds

from numpy import count_nonzero

from typing import Dict, Text

In [17]:
def get_user_item_matrix(df, show_head=False, get_info=True): 
    """
    Get the user-item frequency matrix, 
    and print infos about sparsity.
    """

    st = time.time()
    # get media x user table
    tab = df.groupby(['media_id', 'user_id']).size()
    # replace Nan
    iu_mat = tab.unstack().fillna(0)
    
    if show_head:
        print(iu_mat.head(5))
    
    # convert to nparray & compute sparsity
    iu_mat = iu_mat.to_numpy()
    sparsity = 1.0 - (count_nonzero(iu_mat) / float(iu_mat.size))
    et = time.time()

    if get_info:
        print(f'\n\nNon-zero values: {count_nonzero(iu_mat)} \nSize of matrix:',
               f'{iu_mat.size}\nMatrix sparsity: {sparsity}', 
               f'\n\nExecution time: {round((et-st)/60, 4)}min')
    return iu_mat

In [18]:
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

In [19]:
# get slice for processing speed
train_small_df = train_df[:10000]

# convert to string for lookup later
train_small_df[['user_id', 'media_id']] = train_small_df[['user_id', 'media_id']].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_small_df[['user_id', 'media_id']] = train_small_df[['user_id', 'media_id']].astype(str)


In [20]:
train_small_df.head(5)

Unnamed: 0,genre_id,ts_listen,media_id,album_id,context_type,release_date,platform_name,platform_family,media_duration,listen_type,user_gender,user_id,artist_id,user_age,is_listened
0,25471,1480597215,222606,41774,12,20040704,1,0,223,0,0,9241,55164,29,0
1,25571,1480544735,250467,43941,0,20060301,2,1,171,0,0,16547,55830,30,1
2,16,1479563953,305197,48078,1,20140714,2,1,149,1,1,7665,2704,29,1
3,7,1480152098,900502,71521,0,20001030,0,0,240,0,1,1580,938,30,0
4,7,1478368974,542335,71718,0,20080215,0,0,150,0,1,1812,2939,24,1


In [21]:
# lookup user-item matrix:
small_iu = get_user_item_matrix(train_small_df, True)

user_id     1   10  100  1000  10000  1001  10014  10015  10026  10027  ...  \
media_id                                                                ...   
10093142  0.0  0.0  0.0   0.0    0.0   0.0    0.0    0.0    0.0    0.0  ...   
10093143  0.0  0.0  0.0   0.0    0.0   0.0    0.0    0.0    0.0    0.0  ...   
10093144  0.0  0.0  0.0   0.0    0.0   0.0    0.0    0.0    0.0    0.0  ...   
10093145  0.0  0.0  0.0   0.0    0.0   0.0    0.0    0.0    0.0    0.0  ...   
10093147  0.0  0.0  0.0   0.0    0.0   0.0    0.0    0.0    0.0    0.0  ...   

user_id   9927  993  994  996  9965  9967  9975  998  999  9991  
media_id                                                         
10093142   0.0  0.0  0.0  0.0   0.0   0.0   0.0  0.0  0.0   0.0  
10093143   0.0  0.0  0.0  0.0   0.0   0.0   0.0  0.0  0.0   0.0  
10093144   0.0  0.0  0.0  0.0   0.0   0.0   0.0  0.0  0.0   0.0  
10093145   0.0  0.0  0.0  0.0   0.0   0.0   0.0  0.0  0.0   0.0  
10093147   0.0  0.0  0.0  0.0   0.0   0.0   0.0  0

In [22]:
# convert to tfds datset

deezer_ratings = tf.data.Dataset.from_tensor_slices(dict(train_small_df)).\
    map(lambda x: {
    'user_id': x['user_id'], 
    'is_listened': x['is_listened'], 
    'media_id': x['media_id']})

# get sample for overview
for x in deezer_ratings.take(5).as_numpy_iterator():
  print(x)

# assert correct object type
print(deezer_ratings)

{'user_id': b'9241', 'is_listened': 0, 'media_id': b'222606'}
{'user_id': b'16547', 'is_listened': 1, 'media_id': b'250467'}
{'user_id': b'7665', 'is_listened': 1, 'media_id': b'305197'}
{'user_id': b'1580', 'is_listened': 0, 'media_id': b'900502'}
{'user_id': b'1812', 'is_listened': 1, 'media_id': b'542335'}
<MapDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'is_listened': TensorSpec(shape=(), dtype=tf.int64, name=None), 'media_id': TensorSpec(shape=(), dtype=tf.string, name=None)}>


In [23]:
tf.random.set_seed(42)
shuffled = deezer_ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [24]:
song_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["media_id"])
user_ids = deezer_ratings.batch(1_000_000).map(lambda x: x["user_id"])

unique_song_ids = np.unique(np.concatenate(list(song_ids)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [44]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.song_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_song_ids, mask_token=None),
      tf.keras.layers.Embedding(len(unique_song_ids) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    user_id, song_ids = inputs

    user_embedding = self.user_embeddings(user_id)
    song_embedding = self.song_embeddings(song_ids)

    return self.ratings(tf.concat([user_embedding, song_embedding], axis=1))

In [None]:
# example of prediction return of model
tf.print(RankingModel()((["100"], ["222606"])))

In [33]:
task = tfrs.tasks.Ranking(
  loss = tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

In [40]:
class DeezerModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["user_id"], features["media_id"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("is_listened")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [41]:
model = DeezerModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [42]:
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2a4175330>

In [None]:
model.evaluate(cached_test, return_dict=True)# does not work yet; float inf values because of 0's in dataset

In [None]:
test_ratings = {}
test_movie_titles = ["M*A*S*H (1970)", "Dances with Wolves (1990)", "Speed (1994)"]
for movie_title in test_movie_titles:
  test_ratings[movie_title] = model({
      "user_id": np.array(["42"]),
      "movie_title": np.array([movie_title])
  })

print("Ratings:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

In [None]:
#tf.saved_model.save(model, "export")