<a href="https://colab.research.google.com/github/mmaghajani/recommender-with-tf-sample/blob/main/recomm_movielens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## MovieLens Recommender System

In [1]:
!pip install -q --upgrade tensorflow-datasets

[K     |████████████████████████████████| 3.9MB 27.7MB/s 
[?25h

In [2]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [3]:
# Fetch the 'train' split of the MovieLens 100k ratings.
# `with_info=True` makes the call return a (dataset, DatasetInfo) pair.
ratings_dataset, ratings_datasets_info = tfds.load(
    'movielens/100k-ratings', split='train', with_info=True
)

[1mDownloading and preparing dataset 4.70 MiB (download: 4.70 MiB, generated: 32.41 MiB, total: 37.10 MiB) to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…






HBox(children=(FloatProgress(value=0.0, description='Generating splits...', max=1.0, style=ProgressStyle(descr…

HBox(children=(FloatProgress(value=0.0, description='Generating train examples...', max=100000.0, style=Progre…

HBox(children=(FloatProgress(value=0.0, description='Shuffling movielens-train.tfrecord...', max=100000.0, sty…

[1mDataset movielens downloaded and prepared to /root/tensorflow_datasets/movielens/100k-ratings/0.1.0. Subsequent calls will reuse this data.[0m


In [4]:
# Sanity check: later cells call tf.data.Dataset methods (take/map/shuffle/skip)
# on this object, so fail early if the loader returned something else.
assert isinstance(ratings_dataset, tf.data.Dataset)

In [5]:
# Number of examples in the loaded split (displayed as the cell's result).
len(ratings_dataset)

100000

In [6]:
# Keep a 5-example view of the raw data for inspection in the next cells.
ratings_dataset_head = ratings_dataset.take(5)

# Print each example as a plain dict of NumPy / Python values.
for example in ratings_dataset_head.as_numpy_iterator():
  print(example)

{'bucketized_user_age': 45.0, 'movie_genres': array([7]), 'movie_id': b'357', 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)", 'raw_user_age': 46.0, 'timestamp': 879024327, 'user_gender': True, 'user_id': b'138', 'user_occupation_label': 4, 'user_occupation_text': b'doctor', 'user_rating': 4.0, 'user_zip_code': b'53211'}
{'bucketized_user_age': 25.0, 'movie_genres': array([ 4, 14]), 'movie_id': b'709', 'movie_title': b'Strictly Ballroom (1992)', 'raw_user_age': 32.0, 'timestamp': 875654590, 'user_gender': True, 'user_id': b'92', 'user_occupation_label': 5, 'user_occupation_text': b'entertainment', 'user_rating': 2.0, 'user_zip_code': b'80525'}
{'bucketized_user_age': 18.0, 'movie_genres': array([4]), 'movie_id': b'412', 'movie_title': b'Very Brady Sequel, A (1996)', 'raw_user_age': 24.0, 'timestamp': 882075110, 'user_gender': True, 'user_id': b'301', 'user_occupation_label': 17, 'user_occupation_text': b'student', 'user_rating': 4.0, 'user_zip_code': b'55439'}
{'bucketized_use

In [7]:
# Confirm that the head view contains exactly the 5 examples taken above.
len(ratings_dataset_head)

5

In [8]:
# Render the head as a DataFrame; the DatasetInfo is passed along so that
# label features are shown with their names (e.g. genre ids with genre text).
tfds.as_dataframe(ratings_dataset_head, ratings_datasets_info)

Unnamed: 0,bucketized_user_age,movie_genres,movie_id,movie_title,raw_user_age,timestamp,user_gender,user_id,user_occupation_label,user_occupation_text,user_rating,user_zip_code
0,45.0,7 (Drama),b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",46.0,879024327,True,b'138',4 (doctor/health care),b'doctor',4.0,b'53211'
1,25.0,4 (Comedy) 14 (Romance),b'709',b'Strictly Ballroom (1992)',32.0,875654590,True,b'92',5 (entertainment),b'entertainment',2.0,b'80525'
2,18.0,4 (Comedy),b'412',"b'Very Brady Sequel, A (1996)'",24.0,882075110,True,b'301',17 (student),b'student',4.0,b'55439'
3,50.0,5 (Crime) 7 (Drama),b'56',b'Pulp Fiction (1994)',50.0,883326919,True,b'60',4 (doctor/health care),b'healthcare',4.0,b'06472'
4,50.0,10 (Horror) 16 (Thriller),b'895',b'Scream 2 (1997)',55.0,891409199,True,b'197',18 (technician/engineer),b'technician',3.0,b'75094'


### Feature Selection

In [9]:
# Keep only the features used in the rest of the notebook; every other
# column of the raw examples is dropped.
ratings_dataset = ratings_dataset.map(
    lambda rating: {
        feature: rating[feature]
        for feature in (
            'user_id',
            'movie_id',
            'movie_title',
            'user_rating',
            'timestamp',
        )
    }
)

# Show the first five reduced examples.
tfds.as_dataframe(ratings_dataset.take(5), ratings_datasets_info)

Unnamed: 0,movie_id,movie_title,timestamp,user_id,user_rating
0,b'357',"b""One Flew Over the Cuckoo's Nest (1975)""",879024327,b'138',4.0
1,b'709',b'Strictly Ballroom (1992)',875654590,b'92',2.0
2,b'412',"b'Very Brady Sequel, A (1996)'",882075110,b'301',4.0
3,b'56',b'Pulp Fiction (1994)',883326919,b'60',4.0
4,b'895',b'Scream 2 (1997)',891409199,b'197',3.0


### What is Dataset Metadata?

In [10]:
# Display the DatasetInfo object returned by tfds.load (name, version,
# description, ... as shown in the output below).
ratings_datasets_info

tfds.core.DatasetInfo(
    name='movielens',
    full_name='movielens/100k-ratings/0.1.0',
    description="""
    This dataset contains a set of movie ratings from the MovieLens website, a movie
    recommendation service. This dataset was collected and maintained by [GroupLens]
    (https://grouplens.org/), a research group at the University of Minnesota. There
    are 5 versions included: "25m", "latest-small", "100k", "1m", "20m". In all
    datasets, the movies data and ratings data are joined on "movieId". The 25m
    dataset, latest-small dataset, and 20m dataset contain only movie data and
    rating data. The 1m dataset and 100k dataset contain demographic data in
    addition to movie and rating data.
    
    - "25m": This is the latest stable version of the MovieLens dataset. It is
    recommended for research purposes.
    - "latest-small": This is a small subset of the latest version of the MovieLens
    dataset. It is changed and updated over time by GroupLens.
    - "10

In [11]:
# Size of the 'train' split as recorded in the dataset metadata.
ratings_datasets_info.splits['train'].num_examples

100000

### Train/Test Split

In [12]:
# Fix both the global and the op-level seed so the shuffle is repeatable.
tf.random.set_seed(42)

# The buffer holds all 100,000 ratings, so the shuffle is a full permutation.
# reshuffle_each_iteration=False keeps the order identical on every pass,
# which the take/skip train/test split below depends on.
ratings_dataset_shuffled = ratings_dataset.shuffle(
    100_000, seed=42, reshuffle_each_iteration=False
)

In [13]:
# 80/20 split: the first 80,000 shuffled ratings form the training set and
# everything after them forms the test set.
ratings_trainset, ratings_testset = (
    ratings_dataset_shuffled.take(80_000),
    ratings_dataset_shuffled.skip(80_000),
)

In [14]:
# Check the sizes of the two partitions (train, test).
len(ratings_trainset), len(ratings_testset)

(80000, 20000)

### End-to-End Preprocessing

In [15]:
# Timestamp normalizer. The timestamps are scalars, hence axis=None.
timestamp_normalization_layer = tf.keras.layers.experimental.preprocessing.Normalization(axis=None)

# Fit the statistics on the training split only.
timestamp_normalization_layer.adapt(
    ratings_trainset.map(lambda example: example['timestamp'])
)

In [16]:
# Show three raw timestamps next to their normalized values.
for example in ratings_trainset.take(3).as_numpy_iterator():
  raw_timestamp = example['timestamp']
  print(raw_timestamp)
  print(timestamp_normalization_layer(raw_timestamp))

885409515
tf.Tensor(0.3526018, shape=(), dtype=float32)
883388887
tf.Tensor(-0.026022714, shape=(), dtype=float32)
891249586
tf.Tensor(1.4468869, shape=(), dtype=float32)


In [17]:
# Maps raw user-id strings to integer indices. No mask token is reserved.
user_id_lookup_layer = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)

# Build the vocabulary from the user ids seen in the training split.
user_id_lookup_layer.adapt(
    ratings_trainset.map(lambda example: example['user_id'])
)

In [18]:
# Inspect the learned user-id vocabulary.
print(user_id_lookup_layer.get_vocabulary())

['[UNK]', '405', '655', '13', '450', '276', '303', '416', '537', '234', '393', '181', '429', '279', '682', '846', '308', '378', '293', '94', '92', '7', '222', '435', '417', '201', '880', '561', '592', '796', '59', '896', '406', '758', '334', '551', '474', '130', '889', '804', '642', '268', '727', '363', '650', '194', '269', '151', '916', '387', '648', '399', '145', '749', '524', '291', '90', '864', '311', '747', '457', '299', '85', '385', '286', '374', '497', '805', '716', '327', '271', '653', '301', '883', '833', '18', '95', '389', '328', '532', '178', '506', '894', '437', '184', '881', '870', '533', '280', '339', '314', '1', '666', '472', '788', '707', '504', '798', '313', '886', '244', '62', '606', '500', '373', '343', '454', '345', '932', '782', '109', '892', '711', '588', '354', '487', '207', '774', '790', '660', '43', '622', '618', '407', '6', '87', '868', '535', '305', '425', '495', '456', '144', '102', '919', '843', '643', '49', '854', '409', '312', '851', '807', '346', '256', 

In [29]:
# Look up one id that is not in the vocabulary ('-2') and two that are.
print(user_id_lookup_layer(['-2', '13', '655']))

tf.Tensor([0 3 2], shape=(3,), dtype=int64)


In [30]:
user_id_embedding_dim = 32

# Size the embedding table from the adapted vocabulary itself.
# `len(get_vocabulary())` counts every index the lookup layer can emit,
# including the '[UNK]' slot. It avoids the deprecated `vocab_size()` method,
# which is missing from newer Keras releases, and matches how the movie-title
# embedding is sized.
user_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(user_id_lookup_layer.get_vocabulary()),
    output_dim=user_id_embedding_dim,
)

# Raw user-id string -> integer index -> dense embedding vector.
user_id_model = tf.keras.Sequential([
    user_id_lookup_layer,
    user_id_embedding_layer,
])

In [31]:
# Pass a single string tensor rather than a Python list. Keras interprets a
# list as a structure of several inputs, which makes the Sequential model emit
# a "Consider rewriting this model with the Functional API" warning.
user_id_model(tf.constant(['-2', '13']))

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(2, 32), dtype=float32, numpy=
array([[ 0.02413679,  0.01285462, -0.04826153, -0.01568551,  0.00106376,
        -0.01222459, -0.0426784 , -0.04786297, -0.02128229, -0.00289384,
         0.01936141, -0.04267867,  0.04325121, -0.02915695,  0.02010583,
        -0.00414361,  0.03596262,  0.04293433, -0.02970809,  0.02686551,
         0.01001602, -0.02296026,  0.03818062, -0.04463496, -0.00772554,
         0.03903778,  0.02887033, -0.03983442, -0.03059117, -0.02210329,
        -0.01048737, -0.03776479],
       [ 0.0374391 , -0.03322836,  0.01596291,  0.03011217, -0.04667386,
         0.00679704, -0.02052331,  0.04518041, -0.007684  ,  0.03294318,
        -0.02833353, -0.0223081 , -0.02255479,  0.02441252,  0.04237778,
         0.03737256,  0.03173491,  0.04524806,  0.04349098, -0.01546743,
         0.0326384 , -0.0242365 ,  0.0047522 , -0.01451031, -0.00412635,
         0.04911378, -0.00730715,  0.01182103,  0.01676439,  0.03054024,
         0.03585732, -0.00702628]], dtyp

In [32]:
# Maps raw movie-id strings to integer indices. No mask token is reserved.
movie_id_lookup_layer = tf.keras.layers.experimental.preprocessing.StringLookup(mask_token=None)

# Build the vocabulary from the movie ids seen in the training split.
movie_id_lookup_layer.adapt(
    ratings_trainset.map(lambda example: example['movie_id'])
)

In [33]:
# Inspect the learned movie-id vocabulary.
print(movie_id_lookup_layer.get_vocabulary())

['[UNK]', '50', '181', '100', '286', '288', '258', '294', '1', '174', '300', '121', '127', '7', '98', '56', '172', '117', '237', '222', '313', '204', '405', '210', '173', '168', '748', '151', '79', '257', '69', '195', '302', '423', '269', '118', '9', '276', '15', '22', '328', '202', '96', '64', '318', '234', '216', '176', '275', '183', '25', '111', '89', '28', '191', '12', '357', '82', '135', '238', '196', '742', '97', '268', '289', '186', '153', '125', '70', '132', '323', '11', '185', '333', '245', '483', '228', '197', '546', '194', '475', '144', '655', '568', '182', '496', '180', '179', '273', '301', '265', '161', '282', '211', '95', '8', '71', '678', '471', '322', '143', '215', '187', '427', '235', '588', '271', '250', '88', '508', '597', '435', '603', '4', '403', '385', '208', '298', '284', '272', '175', '134', '474', '307', '200', '515', '527', '479', '230', '147', '393', '209', '340', '99', '83', '23', '13', '58', '566', '274', '419', '14', '124', '326', '24', '732', '327', '229'

In [34]:
movie_id_embedding_dim = 32

# Size the embedding table from the adapted vocabulary itself.
# `len(get_vocabulary())` counts every index the lookup layer can emit,
# including the '[UNK]' slot. It avoids the deprecated `vocab_size()` method,
# which is missing from newer Keras releases, and matches how the movie-title
# embedding is sized.
movie_id_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(movie_id_lookup_layer.get_vocabulary()),
    output_dim=movie_id_embedding_dim,
)

# Raw movie-id string -> integer index -> dense embedding vector.
movie_id_model = tf.keras.Sequential([
    movie_id_lookup_layer,
    movie_id_embedding_layer,
])

In [35]:
# Pass a single string tensor rather than a Python list. Keras interprets a
# list as a structure of several inputs, which makes the Sequential model emit
# a "Consider rewriting this model with the Functional API" warning.
movie_id_model(tf.constant(['181']))

Consider rewriting this model with the Functional API.


Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.0463539 , -0.0261884 , -0.03654646,  0.01966048, -0.00632324,
        -0.0135553 ,  0.01554319,  0.01717571, -0.0217539 ,  0.0139563 ,
        -0.03811926, -0.00494232, -0.01253927,  0.02892852, -0.00147589,
         0.04769996,  0.04756613,  0.02968651, -0.00017743, -0.04505055,
         0.04428105, -0.0088385 , -0.02552181, -0.00331017,  0.0196426 ,
         0.02212788,  0.0072131 , -0.00881346, -0.02435054, -0.01322163,
        -0.01882978, -0.00503979]], dtype=float32)>

In [36]:
# Converts a movie title into a sequence of integer token ids, using the
# layer's default settings.
movie_title_vectorization_layer = tf.keras.layers.experimental.preprocessing.TextVectorization()

# Build the token vocabulary from the titles in the training split.
movie_title_vectorization_layer.adapt(
    ratings_trainset.map(lambda example: example['movie_title'])
)

In [37]:
# Inspect the learned title-token vocabulary.
print(movie_title_vectorization_layer.get_vocabulary())

['', '[UNK]', 'the', '1996', '1997', '1995', '1994', 'of', '1993', 'and', 'a', 'in', '1989', '1992', 'star', 'to', '1990', '1991', '1986', 'day', '1982', 'man', 'trek', '1981', '1984', 'for', '1987', '1980', 'dead', '1979', 'with', '1974', 'one', 'liar', '1988', 'lost', '1985', '2', 'my', '1998', 'first', 'contact', '1971', '1977', 'monty', '1983', 'love', 'on', 'last', 'men', 'back', 'die', 'seven', 'you', '1975', 'hard', 'life', 'i', '1963', 'ii', 'chocolate', 'air', 'return', 'la', 'story', 'home', 'godfather', 'terminator', '1962', 'scream', 'about', 'wars', '1939', 'bride', '1967', '1940', '1954', 'it', 'good', 'batman', 'when', '1957', 'time', 'night', 'jedi', 'alien', 'by', 'lies', '1941', 'fear', 'fargo', 'english', 'red', 'patient', 'mrs', '1951', 'full', 'kiss', 'blade', 'under', 'toy', 'do', '1972', 'king', 'mr', 'kill', 'dogs', '1959', '1968', 'raiders', 'devils', 'ark', 'rock', 'park', 'force', 'american', 'fire', '1970', 'jurassic', 'independence', 'id4', '1958', 'black',

In [38]:
# Vectorize one title to see the integer token ids it is mapped to.
print(movie_title_vectorization_layer("One Flew Over the Cuckoo's Nest (1975)"))

tf.Tensor([ 32 263 161   2 264 261  54], shape=(7,), dtype=int64)


In [39]:
movie_title_dim = 32

# One embedding row per token in the adapted title vocabulary.
# Index 0 of that vocabulary is the empty padding token that TextVectorization
# appends to shorter titles in a batch. mask_zero=True marks those positions as
# masked, so a downstream mask-aware layer such as GlobalAveragePooling1D
# averages only the real tokens instead of diluting titles with padding.
movie_title_embedding_layer = tf.keras.layers.Embedding(
    input_dim=len(movie_title_vectorization_layer.get_vocabulary()),
    output_dim=movie_title_dim,
    mask_zero=True,
)

In [40]:
# Title string -> token ids -> per-token embeddings -> one averaged vector.
movie_title_model = tf.keras.Sequential()
movie_title_model.add(movie_title_vectorization_layer)
movie_title_model.add(movie_title_embedding_layer)
# Collapse the token axis so each title yields a single fixed-size vector.
movie_title_model.add(tf.keras.layers.GlobalAveragePooling1D())

In [44]:
# Run the title model on one single-title batch from the training split.
for title_batch in ratings_trainset.map(lambda example: example['movie_title']).batch(1).take(1):
  print(title_batch)
  print(movie_title_model(title_batch))

tf.Tensor([b'Postman, The (1997)'], shape=(1,), dtype=string)
tf.Tensor(
[[ 0.00978375 -0.00361248 -0.00892127 -0.02573025 -0.01518443  0.00302891
   0.01130522 -0.00996631 -0.01486917  0.00336484  0.0011009  -0.01384857
  -0.02098953  0.00607694 -0.00113188  0.03476429  0.02512205 -0.02074855
  -0.00443245  0.03121509  0.01485846 -0.00761569  0.01439593  0.03156597
  -0.01945089 -0.01195898 -0.00631799  0.0094654  -0.02505939 -0.00632705
  -0.00291794  0.0167402 ]], shape=(1, 32), dtype=float32)


### Query and Candidate Towers (Two-Tower Model)

In [None]:
# Two-tower setup: the query tower embeds users and the candidate tower
# embeds movies. Both names alias the models built above.
query_tower, candidate_tower = user_id_model, movie_id_model