# Libraries

In [13]:
import pprint

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
# import tensorflow_datasets

In [14]:
%%capture
# The capture magic above swallows whatever this cell writes to stdout/stderr.

# Load the "train" split of the MovieLens 100k ratings dataset through TFDS.
ratings = tfds.load("movielens/100k-ratings", split="train")

In [15]:
# Number of rating examples in the loaded split.
len(ratings)

100000

In [16]:
# Pretty-print one raw example so the available feature keys and their
# value types are visible. (`pprint` is imported with the other libraries
# in the first cell instead of mid-notebook.)
for example in ratings.take(1).as_numpy_iterator():
    pprint.pprint(example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7], dtype=int64),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


# Preprocessing

## movie related

In [17]:
# Project every rating record down to just its movie title.
mds = ratings.map(lambda record: record["movie_title"])

# Peek at the first two titles; as_numpy_iterator() yields them as raw bytes.
for title in mds.take(2).as_numpy_iterator():
    print(title)

b"One Flew Over the Cuckoo's Nest (1975)"
b'Strictly Ballroom (1992)'


In [18]:
# Dataset of movie titles, used below to learn the lookup vocabulary.
movie_titles = ratings.map(lambda record: record["movie_title"])

# String -> integer-index lookup whose vocabulary is learned from the titles.
movie_title_lookup = tf.keras.layers.StringLookup()
movie_title_lookup.adapt(movie_titles)

In [19]:
# Iterate the dataset directly (no as_numpy_iterator()), so the element is
# shown as a scalar string tf.Tensor rather than raw bytes.
titles = movie_titles.take(1)
for title in titles:
    pprint.pprint(title)

<tf.Tensor: shape=(), dtype=string, numpy=b"One Flew Over the Cuckoo's Nest (1975)">


In [20]:
# Vocabulary size after adapt(). Printed explicitly because a notebook only
# auto-displays the last expression of a cell; as a bare statement the value
# was computed and then thrown away.
print(movie_title_lookup.vocabulary_size())

# Map two known titles to their integer vocabulary indices.
movie_title_lookup([
    "One Flew Over the Cuckoo's Nest (1975)",
    "Strictly Ballroom (1992)",
])

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 58, 319], dtype=int64)>

In [21]:
# Embedding table with one 32-dimensional row per title-vocabulary index.
movie_title_embedding = tf.keras.layers.Embedding(
    movie_title_lookup.vocabulary_size(), 32
)

# Full movie pipeline: title string -> vocabulary index -> dense vector.
movie_preprocessor = tf.keras.Sequential(
    [movie_title_lookup, movie_title_embedding]
)

## user related

In [23]:
# Re-inspect one raw example to see which user-side features are available.
next(ratings.take(1).as_numpy_iterator())

{'bucketized_user_age': 45.0,
 'movie_genres': array([7], dtype=int64),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}

In [24]:
# Project every rating record down to the id of the user who made it.
user_ids = ratings.map(lambda record: record["user_id"])

In [25]:
# Learn the user-id vocabulary from the ids that occur in the ratings.
user_ids_string_lookup = tf.keras.layers.StringLookup()
user_ids_string_lookup.adapt(user_ids)

# Number of entries in the adapted vocabulary; sizes the embedding table below.
user_ids_vocab_size = user_ids_string_lookup.vocabulary_size()

In [26]:
# Embedding table with one 32-dimensional row per user-id vocabulary index.
user_id_embedding_model = tf.keras.layers.Embedding(user_ids_vocab_size, 32)

In [27]:
# Full user pipeline: user-id string -> vocabulary index -> dense vector.
#
# Sequential takes its layers as ONE list argument. Passing the two layers as
# separate positional arguments would hand the embedding layer to the `name`
# parameter, leaving it out of the model entirely.
user_preprocessor = tf.keras.Sequential([
    user_ids_string_lookup,
    user_id_embedding_model
])

## time related

In [28]:
# Discretization of the timestamp feature: find the observed timestamp range
# and cut it into evenly spaced bucket boundaries.

rating_timestamps = ratings.map(lambda x: x["timestamp"])

# Seed each reduction with the opposite int64 extreme so the result is always
# an observed value. A hand-picked seed such as 1e9 would silently become the
# "minimum" for any data whose timestamps are all later than that.
max_timestamp = rating_timestamps.reduce(
    tf.constant(tf.int64.min, dtype=tf.int64), tf.maximum).numpy()
min_timestamp = rating_timestamps.reduce(
    tf.constant(tf.int64.max, dtype=tf.int64), tf.minimum).numpy()

# 1000 boundaries spanning [min_timestamp, max_timestamp] inclusive.
timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000)

In [29]:
# First few bucket boundaries, as a quick sanity check.
timestamp_buckets[:5]

array([8.74724710e+08, 8.74743291e+08, 8.74761871e+08, 8.74780452e+08,
       8.74799032e+08])

In [30]:
# Timestamp pipeline: raw timestamp -> bucket index -> 32-dimensional vector.
timestamp_embedding_model = tf.keras.Sequential([
    # Assign each timestamp to a bucket delimited by the boundaries above.
    tf.keras.layers.Discretization(bin_boundaries=timestamp_buckets.tolist()),
    # The table has one more row than there are boundaries.
    tf.keras.layers.Embedding(
        input_dim=len(timestamp_buckets) + 1,
        output_dim=32,
    ),
])

In [31]:
# Run the timestamp of the first rating (as a batch of one) through the model.
for timestamp in (
    ratings.take(1)
    .map(lambda record: record["timestamp"])
    .batch(1)
    .as_numpy_iterator()
):
    print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}.")

Timestamp embedding: [[ 0.00126406  0.03683684  0.04848229  0.04291603 -0.01407162  0.00389475
  -0.03409994  0.04169731 -0.01688946 -0.04251561 -0.04795588 -0.02990823
  -0.04887525  0.03449798  0.00819398 -0.04634598 -0.0013225  -0.04571677
   0.01601881 -0.04693482  0.04869023  0.01688388 -0.02021176  0.01360455
  -0.01053651  0.00043181  0.03451455  0.04925514  0.03139712  0.02429155
  -0.00485607 -0.02512664]].


In [32]:
# Another look at a raw example before moving on to the text features.
next(ratings.take(1).as_numpy_iterator())

{'bucketized_user_age': 45.0,
 'movie_genres': array([7], dtype=int64),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}

## text (context) related

In [33]:
# Tokenizer for movie titles; its vocabulary is learned from every title in
# the ratings data. No max_tokens is passed, so the layer's default applies.
text_vectorizer = tf.keras.layers.TextVectorization()
text_vectorizer.adapt(ratings.map(lambda record: record["movie_title"]))

In [34]:
# Text pipeline: title -> token ids -> per-token vectors -> one averaged vector.
text_preprocessor = tf.keras.Sequential()
text_preprocessor.add(text_vectorizer)
# mask_zero=True marks token id 0 as padding for the layers that follow.
text_preprocessor.add(
    tf.keras.layers.Embedding(
        text_vectorizer.vocabulary_size(), 32, mask_zero=True
    )
)
text_preprocessor.add(tf.keras.layers.GlobalAveragePooling1D())

In [35]:
# Invoke the model object itself rather than its `.call` method: Keras models
# are meant to be run through `__call__`, which does the framework bookkeeping
# before delegating to `call`. The title is wrapped in a string tensor so the
# model receives a proper batch of one.
text_preprocessor(tf.constant(["One Flew Over the Cuckoo's Nest (1975)"]))



<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[-0.00773971,  0.00423782,  0.01144901, -0.00092731,  0.00727667,
        -0.00044186,  0.00015625, -0.00067975, -0.01298766, -0.01011722,
         0.00130498, -0.00895921, -0.01889254, -0.00460414,  0.00879441,
        -0.01392043,  0.01082589, -0.0020494 , -0.00398778,  0.01969817,
         0.00359111,  0.0233259 ,  0.00240958,  0.0034159 ,  0.00574002,
        -0.00778672, -0.01579991, -0.00436174, -0.00384612, -0.00775677,
         0.03031283,  0.00636865]], dtype=float32)>

# Preprocessing models

In [36]:
# Keep one single-example batch around as a fixture for trying out the models.
for row in ratings.batch(1).take(1):
  # After the loop, test_row holds that one batch (a dict of batched tensors).
  test_row = row

In [37]:
# The batched title feature: a string tensor of shape (1,).
test_row['movie_title']

<tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>

In [38]:
class MovieModel(tf.keras.Model):
    """Embed a movie from its title in two ways and concatenate the results.

    * ``embed_title`` looks the whole title up as a single vocabulary entry
      and maps it to a 32-dimensional vector.
    * ``embed_text`` tokenizes the title, embeds every token and averages
      the token vectors into one 32-dimensional vector.

    Both branches reuse the already-adapted notebook-level layers
    ``movie_title_lookup`` and ``text_vectorizer``.
    """

    def __init__(self):
        """Build the two title-embedding branches."""
        super().__init__()

        # Lower bound on the number of rows in the token embedding table.
        self.max_tokens = 10000

        self.embed_title = tf.keras.Sequential([
                            movie_title_lookup,
                            tf.keras.layers.Embedding(
                                input_dim = movie_title_lookup.vocabulary_size(),
                                output_dim = 32
                            )
                        ])
        # self.embed_timestamp = timestamp_embedding_model

        # `text_vectorizer` was created without a token cap, so nothing
        # guarantees its vocabulary fits in `max_tokens` rows. Size the table
        # to whichever is larger so no token id can fall outside it, while
        # keeping the original table shape for smaller vocabularies.
        text_embedding_rows = max(
            self.max_tokens, text_vectorizer.vocabulary_size()
        )
        self.embed_text = tf.keras.Sequential([
                            text_vectorizer,
                            tf.keras.layers.Embedding(
                                input_dim = text_embedding_rows,
                                output_dim = 32,
                                # Token id 0 is treated as padding and masked.
                                mask_zero = True
                                ),
                            tf.keras.layers.GlobalAveragePooling1D()
                            ])

    def call(self, inputs):
        """Return the concatenated title embeddings.

        Args:
            inputs: mapping with a ``'movie_title'`` entry holding a batch
                of title strings.

        Returns:
            The whole-title embedding followed by the averaged token
            embedding, joined along axis 1 (32 + 32 features per example).
        """
        return tf.concat([
            self.embed_title(inputs['movie_title']),
            self.embed_text(inputs['movie_title'])
        ], axis = 1)



In [39]:
# User-id vocabulary used by UserModel.
# NOTE(review): this is adapted on the same ids as `user_ids_string_lookup`
# from the preprocessing section; the two look interchangeable - confirm
# before merging them.
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(ratings.map(lambda record: record["user_id"]))

In [40]:
class UserModel(tf.keras.Model):
    """Embed the user side of a rating: user id plus bucketized timestamp.

    ``call`` expects a mapping with ``'user_id'`` and ``'timestamp'`` entries
    and returns the two 32-dimensional embeddings joined along axis 1.
    """

    def __init__(self):
        """Build the id branch and attach the shared timestamp branch."""
        super().__init__()

        # Not read anywhere inside this class.
        self.max_tokens = 10000

        # User-id string -> vocabulary index -> 32-dimensional vector.
        id_embedding = tf.keras.layers.Embedding(
            user_id_lookup.vocabulary_size(), 32
        )
        self.embed_ids = tf.keras.Sequential([user_id_lookup, id_embedding])

        # Reuses the notebook-level timestamp model object, so every
        # UserModel instance refers to the same layers.
        self.embed_timestamp = timestamp_embedding_model

    def call(self, inputs):
        """Return the id embedding followed by the timestamp embedding."""
        id_vectors = self.embed_ids(inputs['user_id'])
        time_vectors = self.embed_timestamp(inputs['timestamp'])
        return tf.concat([id_vectors, time_vectors], axis=1)

In [41]:
# Smoke test: run the single-example batch through a fresh UserModel.
user_model = UserModel()
user_model(test_row)

<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
array([[ 0.01343979, -0.03909282, -0.03369359, -0.02829543, -0.04917816,
        -0.00542297,  0.00409679,  0.01189666, -0.04363866, -0.02073841,
        -0.03234669,  0.03293184, -0.02384801,  0.02178412,  0.03004453,
         0.04003431, -0.01710523,  0.01142223, -0.04225413, -0.02827203,
         0.01185542,  0.04060663,  0.03091243, -0.03868673,  0.03280857,
         0.03623761, -0.00457498,  0.01279357,  0.04344759, -0.03850254,
        -0.02423432,  0.0118003 ,  0.00126406,  0.03683684,  0.04848229,
         0.04291603, -0.01407162,  0.00389475, -0.03409994,  0.04169731,
        -0.01688946, -0.04251561, -0.04795588, -0.02990823, -0.04887525,
         0.03449798,  0.00819398, -0.04634598, -0.0013225 , -0.04571677,
         0.01601881, -0.04693482,  0.04869023,  0.01688388, -0.02021176,
         0.01360455, -0.01053651,  0.00043181,  0.03451455,  0.04925514,
         0.03139712,  0.02429155, -0.00485607, -0.02512664]],
      dtype=f

In [42]:
# Show the whole batched example; the features carry a leading batch dimension of 1.
test_row

{'bucketized_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([45.], dtype=float32)>,
 'movie_genres': <tf.Tensor: shape=(1, 1), dtype=int64, numpy=array([[7]], dtype=int64)>,
 'movie_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'357'], dtype=object)>,
 'movie_title': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b"One Flew Over the Cuckoo's Nest (1975)"], dtype=object)>,
 'raw_user_age': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([46.], dtype=float32)>,
 'timestamp': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([879024327], dtype=int64)>,
 'user_gender': <tf.Tensor: shape=(1,), dtype=bool, numpy=array([ True])>,
 'user_id': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'138'], dtype=object)>,
 'user_occupation_label': <tf.Tensor: shape=(1,), dtype=int64, numpy=array([4], dtype=int64)>,
 'user_occupation_text': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'doctor'], dtype=object)>,
 'user_rating': <tf.Tensor: shape=(1,), dtype=floa

In [43]:
# Smoke test: instantiate MovieModel and run the single-example batch through it.
movie_model = MovieModel()
movie_model(test_row)

<tf.Tensor: shape=(1, 64), dtype=float32, numpy=
array([[ 2.90772356e-02, -1.46816149e-02,  2.29792334e-02,
        -4.17889580e-02, -2.68987902e-02,  2.65689231e-02,
        -2.35139485e-02, -2.10843328e-02,  4.26002629e-02,
         1.79305561e-02, -1.98737625e-02,  4.62428592e-02,
        -4.41723242e-02, -1.50525197e-02, -2.12696679e-02,
         4.75692265e-02,  2.04403065e-02, -1.98970195e-02,
         2.87093855e-02, -4.69027162e-02,  2.15325393e-02,
         4.09050472e-02,  8.25768709e-03,  8.46757740e-03,
        -3.33635099e-02,  1.50557421e-02, -3.53715792e-02,
         4.17643301e-02,  2.83649303e-02,  4.23302166e-02,
        -1.62136778e-02, -2.10096966e-02, -1.38198007e-02,
        -9.93992295e-03,  1.03034498e-02, -2.41529681e-02,
        -5.29148895e-03,  5.00304904e-03,  1.69599298e-02,
         1.45412385e-02,  1.45061854e-02, -1.67479757e-02,
        -2.66707479e-03,  2.67520873e-03, -5.07435109e-03,
        -2.52201571e-03,  1.48678198e-03, -2.49160291e-03,
       