In [1]:
# Adapted from: https://www.tensorflow.org/recommenders/examples/basic_retrieval

In [2]:
!pip install -q tensorflow-recommenders

### Imports

In [3]:
import pandas as pd

from tensorflow.data import Dataset
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow.keras.optimizers import Adagrad

import tensorflow as tf
import tensorflow_recommenders as tfrs
from tensorflow_recommenders.metrics import FactorizedTopK
from tensorflow_recommenders.tasks import Retrieval

### Data

In [4]:
# Optional alternative (Google Colab): upload a local candy.csv and read it
# from disk instead of fetching it over HTTP. Left disabled.
# from google.colab import files
# files.upload()
# df = pd.read_csv("candy.csv")

In [5]:
# Load the candy review table directly from the BRE GitHub repository.
CANDY_CSV_URL = "https://raw.githubusercontent.com/maxhumber/BRE/master/data/candy.csv"
df = pd.read_csv(CANDY_CSV_URL)

In [6]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,item,user,review
0,Reese's Peanut Butter Cups Miniatures,darlene90,5
1,Reese's Peanut Butter Cups Miniatures,taylordarlene,4
2,Reese's Peanut Butter Cups Miniatures,aliciadennis,5
3,Reese's Peanut Butter Cups Miniatures,ocook,5
4,Reese's Peanut Butter Cups Miniatures,chad38,5


In [7]:
# All rows as a tf.data pipeline: one element per DataFrame row, keyed by column name.
dataset = Dataset.from_tensor_slices(df.to_dict(orient='list'))
# Separate pipeline holding each distinct item name once.
items = Dataset.from_tensor_slices(df['item'].unique())

In [8]:
# Print a single element to check the dict-of-features layout of the dataset.
for row in dataset.take(1).as_numpy_iterator():
    print(row)

{'item': b"Reese's Peanut Butter Cups Miniatures", 'user': b'darlene90', 'review': 5}


### Train/Test Split

In [9]:
# 80/20 split by row count: the training share is rounded down and the
# remainder becomes the test set, so the two sizes always add up to `rows`.
rows = len(df)
n_train = int(rows * 0.8)
n_test = rows - n_train

print(n_train, n_test)

13787 3447


In [10]:
# Fix the seeds and disable per-iteration reshuffling so that `take` and
# `skip` below see the same ordering every time the dataset is iterated.
tf.random.set_seed(42)
shuffled = dataset.shuffle(
    buffer_size=rows,
    seed=42,
    reshuffle_each_iteration=False,
)

# First n_train shuffled rows for training, the following n_test for testing.
train = shuffled.take(n_train)
test = shuffled.skip(n_train).take(n_test)

### Model Definition

In [11]:
# Vocabularies for the string lookup layers: every distinct item and user name.
unique_items = df["item"].unique()
unique_users = df["user"].unique()

In [12]:
# Length of each embedding vector; passed to both the user and the item Embedding layers.
embedding_dimension = 32

In [13]:
# User tower: user-name string -> vocabulary index -> embedding vector.
user_model = Sequential(
    [
        StringLookup(vocabulary=unique_users),
        # Two rows more than the vocabulary, to account for the unknown and mask tokens.
        Embedding(
            input_dim=len(unique_users) + 2,
            output_dim=embedding_dimension,
        ),
    ]
)

In [14]:
# Item tower: item-name string -> vocabulary index -> embedding vector.
item_model = Sequential(
    [
        StringLookup(vocabulary=unique_items),
        # Two rows more than the vocabulary, mirroring the user tower.
        Embedding(
            input_dim=len(unique_items) + 2,
            output_dim=embedding_dimension,
        ),
    ]
)

In [15]:
# Top-K metric whose candidate set is every distinct item, embedded by
# item_model in batches of 128.
metrics = FactorizedTopK(candidates=items.batch(128).map(item_model))

In [16]:
# Retrieval task that reports the top-K metric defined above.
task = Retrieval(metrics=metrics)

In [17]:
# Show one element again: the "user" and "item" keys are the features that
# the model's compute_loss reads.
for row in dataset.take(1).as_numpy_iterator():
    print(row)

{'item': b"Reese's Peanut Butter Cups Miniatures", 'user': b'darlene90', 'review': 5}


In [18]:
class CandyModel(tfrs.Model):
    """Model that embeds users and items separately and hands both to a task.

    Args:
        user_model: Callable applied to the ``"user"`` feature.
        item_model: Callable applied to the ``"item"`` feature.
        task: Callable that receives the user embeddings and the item
            embeddings (in that order) and returns the loss.
    """

    def __init__(self, user_model, item_model, task):
        super().__init__()
        self.item_model = item_model
        self.user_model = user_model
        self.task = task

    def compute_loss(self, features, training=False):
        """Return the task's loss for one batch.

        ``features`` must provide ``"user"`` and ``"item"`` entries. The
        ``training`` flag is accepted but not used by this implementation.
        """
        query_vectors = self.user_model(features["user"])
        # The items in the batch are the ones each user actually reviewed.
        candidate_vectors = self.item_model(features["item"])
        loss = self.task(query_vectors, candidate_vectors)
        return loss

### Model Training

In [19]:
# Assemble the retrieval model from the two towers and the task, then attach
# an Adagrad optimizer with learning rate 0.1.
model = CandyModel(user_model, item_model, task)
model.compile(optimizer=Adagrad(learning_rate=0.1))

In [20]:
# Batch each split (128 rows per batch) and cache the batched datasets.
cached_train = train.batch(128).cache()
cached_test = test.batch(128).cache()

In [21]:
# Train the retrieval model for 3 epochs on the cached training batches.
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f85fd5c8898>

In [22]:
# Evaluate on the held-out test batches; results come back as a dict keyed by metric name.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k': array([0.00464172, 0.0908036 , 0.23585726, 0.70670146, 0.91441834],
       dtype=float32),
 'factorized_top_k/top_100_categorical_accuracy': 0.9144183397293091,
 'factorized_top_k/top_10_categorical_accuracy': 0.2358572632074356,
 'factorized_top_k/top_1_categorical_accuracy': 0.004641717299818993,
 'factorized_top_k/top_50_categorical_accuracy': 0.7067014575004578,
 'factorized_top_k/top_5_categorical_accuracy': 0.0908036008477211,
 'loss': 534.1340942382812,
 'regularization_loss': 0,
 'total_loss': 534.1340942382812}

### Predictions

In [23]:
# Create a model that takes in raw query features, and
index = tfrs.layers.ann.BruteForce(model.user_model)
# recommends candies out of the entire candy dataset.
index.index(items.batch(100).map(model.item_model), items)

In [24]:
# Look up recommendations for one user. The index returns a pair; the first
# element (the match scores, per the TFRS top-K API) is not used afterwards.
# It is bound to a real name rather than `_`, because assigning to `_` in a
# notebook shadows IPython's "last output" variable.
scores, candies = index(tf.constant(["taylordarlene"]))

In [25]:
# Rows this user actually reviewed, for comparison with the recommendations.
df[df['user'] == "taylordarlene"]

Unnamed: 0,item,user,review
1,Reese's Peanut Butter Cups Miniatures,taylordarlene,4
34,Ferrero Rocher Chocolate,taylordarlene,5
266,Twix,taylordarlene,5
665,M&Ms Peanut Chocolate Candy,taylordarlene,5
872,Snickers Chocolate Bar,taylordarlene,5


In [26]:
# Recommended item names for the first (and only) query, as a plain Python list.
candies[0].numpy().tolist()

[b'Ferrero Rocher Chocolate',
 b"Reese's Peanut Butter Cups Miniatures",
 b'M&Ms Peanut Chocolate Candy',
 b'Snickers Chocolate Bar',
 b'Twix',
 b'Lindt Lindor Milk Chocolate Truffles',
 b'M&Ms Milk Chocolate Candy',
 b"Hershey's Cookies 'n' Creme Candy Bar",
 b"Hershey's Kisses Milk Chocolate",
 b'Nestle Toll House Semi Sweet Chocolate Morsels']

### Ranking Models

In [27]:
# Adapted from https://www.tensorflow.org/recommenders/examples/basic_ranking

In [28]:
class RankingModel(tf.keras.Model):
    """Predict a rating for a (user, item) pair.

    Each input string is mapped to an embedding; the user and item embeddings
    are concatenated and passed through a stack of dense layers that ends in
    a single output unit.

    Args:
        unique_users: Vocabulary of user names for the user lookup layer.
        unique_items: Vocabulary of item names for the item lookup layer.
        embedding_dimension: Length of each embedding vector (default 32).
    """

    def __init__(self, unique_users, unique_items, embedding_dimension=32):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        # User tower is built before the item tower, then the rating head.
        self.user_embeddings = self._embedding_tower(
            unique_users, self.embedding_dimension
        )
        self.item_embeddings = self._embedding_tower(
            unique_items, self.embedding_dimension
        )
        self.ratings = Sequential(
            [
                # Learn multiple dense layers
                Dense(256, activation="relu"),
                Dense(64, activation="relu"),
                # Make rating predictions in the final layer
                Dense(1),
            ]
        )

    @staticmethod
    def _embedding_tower(vocabulary, dimension):
        """Build a string-lookup + embedding stack for one vocabulary."""
        return Sequential(
            [
                StringLookup(vocabulary=vocabulary),
                # Two rows more than the vocabulary size.
                Embedding(len(vocabulary) + 2, dimension),
            ]
        )

    def call(self, user, item):
        """Return the dense head's output for the given users and items."""
        user_vectors = self.user_embeddings(user)
        item_vectors = self.item_embeddings(item)
        # Join the two embeddings along axis 1 before the dense layers.
        joined = tf.concat([user_vectors, item_vectors], axis=1)
        return self.ratings(joined)

In [29]:
# Ranking task: mean squared error as the loss, RMSE as the reported metric.
# NOTE: this rebinds `task`, which previously held the retrieval task.
task = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [30]:
class CandyRankingModel(tfrs.Model):
    """Wrap a rating predictor and a task into a trainable model.

    Args:
        ranking_model: Callable taking (user, item) and returning predictions.
        task: Callable accepting ``labels=`` and ``predictions=`` and
            returning the loss.
    """

    def __init__(self, ranking_model, task):
        super().__init__()
        self.ranking_model = ranking_model
        self.task = task

    def compute_loss(self, features, training=False):
        """Return the task's loss for one batch.

        ``features`` must provide ``"user"``, ``"item"`` and ``"review"``
        entries. The ``training`` flag is accepted but not used here.
        """
        users, candies = features["user"], features["item"]
        predicted = self.ranking_model(users, candies)
        # The "review" values are the labels; the task computes the loss and the metrics.
        return self.task(labels=features["review"], predictions=predicted)


In [31]:
# Build the ranking model. NOTE(review): this rebinds `model`, which held the
# retrieval CandyModel until now, and uses whatever `task` is currently bound
# in the notebook (the Ranking task when cells are run in order).
ranking_model = RankingModel(unique_users, unique_items)
model = CandyRankingModel(ranking_model, task)

In [32]:
# Attach an Adagrad optimizer with learning rate 0.1 to the ranking model.
model.compile(optimizer=Adagrad(learning_rate=0.1))

In [33]:
# Train the ranking model for 50 epochs on the cached training batches.
model.fit(cached_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f85fd327d30>

In [34]:
# Evaluate the ranking model on the held-out test batches; results come back
# as a dict keyed by metric name.
model.evaluate(cached_test, return_dict=True)



{'loss': 0.9586119651794434,
 'regularization_loss': 0,
 'root_mean_squared_error': 0.9788199067115784,
 'total_loss': 0.9586119651794434}