# Recommendation systems with Tensorflow

### Recommender systems composed of 2 stages  
1. Retrieval stage, efficiently weed out all candidates that the user is not interested in  
2. Ranking stage, takes the outputs of the retrieval model and fine-tunes them to select the best possible handful of recommendation

## Retrieval Stage

### Standard Imports

In [6]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import pandas as pd

### Import data

In [74]:
ratings = pd.read_csv('datasets/ratings.csv')
movies = pd.read_csv('datasets/movies.csv')

ratings = pd.merge(ratings, movies, on='movieId')
ratings = ratings.drop(columns='genres')
ratings['userId'] = ratings['userId'].astype(str)

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


In [75]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [76]:


# convert pandas dataframe to tf dataset
ratings_tf = tf.data.Dataset.from_tensor_slices(dict(ratings))
movies_tf = tf.data.Dataset.from_tensor_slices(dict(movies))


In [77]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,1,1,4.0,964982703,Toy Story (1995)
1,5,1,4.0,847434962,Toy Story (1995)
2,7,1,4.5,1106635946,Toy Story (1995)
3,15,1,2.5,1510577970,Toy Story (1995)
4,17,1,4.5,1305696483,Toy Story (1995)


In [78]:
for x in ratings_tf.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'movieId': 1,
 'rating': 4.0,
 'timestamp': 964982703,
 'title': b'Toy Story (1995)',
 'userId': b'1'}


### Data cleaning

In [79]:
# extract only the features that will be used
ratings_tf = ratings_tf.map(lambda x: {
    "movieTitle": x["title"],
    "userId": x["userId"],
})
movies_tf = movies_tf.map(lambda x: x["title"])

### Split into test and train dataset

In [80]:
tf.random.set_seed(42)
shuffled = ratings_tf.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

### Extract unique user ids and unique movie titles

In [81]:
movie_titles = movies_tf.batch(1_000)

user_ids = ratings_tf.batch(1_000_000).map(lambda x: x['userId'])

unique_movie_titles = np.unique(np.concatenate(list(movie_titles)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_user_ids= np.asarray(unique_user_ids).astype('str')
unique_movie_titles[:10]

array([b"'71 (2014)", b"'Hellboy': The Seeds of Creation (2004)",
       b"'Round Midnight (1986)", b"'Salem's Lot (2004)",
       b"'Til There Was You (1997)", b"'Tis the Season for Love (2015)",
       b"'burbs, The (1989)", b"'night Mother (1986)",
       b'(500) Days of Summer (2009)', b'*batteries not included (1987)'],
      dtype=object)

### Declare Models

In [82]:
embedding_dimension = 32

movie_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(vocabulary=unique_movie_titles, mask_token=None),
  tf.keras.layers.Embedding(len(unique_movie_titles) + 1, embedding_dimension)
])

In [83]:
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_user_ids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])

### Declare metrics
To figure out how good our model is we compare affinity scores with the positive pair with all other candidates.  
If score for positive pair is higher than all other candidates, our model is accurate

In [84]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates=movies_tf.batch(128).map(movie_model)
)

#bundles together the loss function and metric computation
task = tfrs.tasks.Retrieval(
    metrics=metrics
)

### Building the full model

In [85]:
class MovieModel(tfrs.Model):

    def __init__(self, user_model, movie_model):
        super().__init__()
        self.movie_model: tf.keras.Model = movie_model
        self.user_model: tf.keras.Model = user_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        user_embeddings = self.user_model(features["userId"])
        positive_movie_embeddings = self.movie_model(features["movieTitle"])
        return self.task(user_embeddings, positive_movie_embeddings)

### Fitting and Evaluating

In [86]:
model = MovieModel(user_model, movie_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1531cc12f40>

### Evaluate model on test set

In [87]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0005499999970197678,
 'factorized_top_k/top_5_categorical_accuracy': 0.004449999891221523,
 'factorized_top_k/top_10_categorical_accuracy': 0.010850000195205212,
 'factorized_top_k/top_50_categorical_accuracy': 0.05829999968409538,
 'factorized_top_k/top_100_categorical_accuracy': 0.1071000024676323,
 'loss': 28280.9296875,
 'regularization_loss': 0,
 'total_loss': 28280.9296875}

### Making Predictions

In [91]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model) #model that takes in raw query input

# recommends movies out of the entire movies dataset
index.index_from_dataset(
  tf.data.Dataset.zip((movies_tf.batch(100), movies_tf.batch(100).map(model.movie_model)))
)

_, titles = index(tf.constant(["12"]))
print(f"Recommendations for user 12: {titles[0, :5]}")

Recommendations for user 12: [b'Heart and Souls (1993)' b'First Wives Club, The (1996)'
 b'Sweet Home Alabama (2002)' b'Guys and Dolls (1955)'
 b'Lady and the Tramp (1955)']
