# Training TFRS

 - [ ] Fix the data 
     - [ ] Get a reasonable amount of data, make sure there is overlap in train/test 
     - [ ] Set up a flag so we can use all vs. subset of data depending on CPU/GPU
 - [ ] Set up eval procedure 
     - [ ] Metrics 
     - [ ] Coverage/Popularity
     - [ ] Qualitative evaluation of predictions 
 - [ ] Baselines
     - [ ] Most popular 
     - [ ] Domain Knowledge 
     - [ ] kNN
 - [ ] TFRS
     - Simple model 
     - With Context Features
     - Sequential 
     - Memory Efficient
 - [ ] Serving 
     - In memory 
     - TFS
 - [ ] E2E with TFX
 - [ ] Alternatives 
     - [ ] LightFM, Microsoftrecommenders, Transformer recommends
 - [ ] Clean Notebook
     - [ ] References to Papers / Books
     - [ ] Evaluation notes
     - [ ] Shortcomings/Future work 

In [1]:
from typing import Dict, Any, Text

import numpy as np 
import pandas as pd

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_data_validation as tfdv

## **Reading in the Data** 

In [77]:
train_df = pd.read_csv('train.csv', dtype={'user_no': str, 'item_no': str})
test_df = pd.read_csv('test.csv', dtype={'user_no': str, 'item_no': str})

# For evaluation
item_info_df = pd.read_csv('item_info.csv', dtype={'item_no': str})

<div class="alert alert-block alert-info">
<b>TODO:</b> Move all this stuff to EDA notebook so this is a bit more streamlined and we can just 
read in data that is ready-to-go. 
    
<b>NOTE:</b> Gonna cheat here a bit and make an artificial dataset such that all of the users are repeat
    
Create **two** versions of the dataset (abbreviated and full) so that we can run on CPU and GPU
</div>

In [86]:
NUM_USERS = 1000

overlap_users = set(train_df['user_no']) & set(test_df['user_no'].unique())
top_users = train_df[train_df['user_no'].isin(overlap_users)]['user_no'].value_counts()[:NUM_USERS].index

In [91]:
train_df_filtered = train_df.loc[train_df['user_no'].isin(top_users)]
test_df_filtered = test_df.loc[test_df['user_no'].isin(top_users)]
items = train_df_filtered['item_no'].unique()

In [99]:
train_dataset = tf.data.Dataset.from_tensor_slices(dict(train_df_filtered))
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_df_filtered))

items_dataset = tf.data.Dataset.from_tensor_slices(items)

In [100]:
for item in items_dataset.take(3):
    print(item)

tf.Tensor(b'7695529757452122196', shape=(), dtype=string)
tf.Tensor(b'1959675403949859161', shape=(), dtype=string)
tf.Tensor(b'2588296344401354503', shape=(), dtype=string)


In [101]:
for elem in train_dataset.take(3):
    print(elem)

{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'7695529757452122196'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'reima'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'trainers'>}
{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'1959675403949859161'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'gola kids'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'trainers'>}
{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'2588296344401354503'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: sha

In [104]:
unique_users = train_df_filtered['user_no'].unique()
unique_items_training = set(train_df_filtered['item_no'])
unique_items_test = set(test_df_filtered['item_no'])

print(len(unique_users))
print(len(unique_items_training))
print(len(unique_items_test))
print(len(unique_items_test - unique_items_training))

1000
9519
879
494


## Creating the Model

In [138]:
EMBEDDING_DIM = 32
NUM_OOV_INDICES = 1

user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_users, 
      num_oov_indices=NUM_OOV_INDICES),
  tf.keras.layers.Embedding(len(unique_users) + NUM_OOV_INDICES, EMBEDDING_DIM)
])

item_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=items, 
      num_oov_indices=NUM_OOV_INDICES),
  tf.keras.layers.Embedding(len(items) + NUM_OOV_INDICES, EMBEDDING_DIM)
])

metrics = tfrs.metrics.FactorizedTopK(
  candidates=items_dataset.batch(128).map(item_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [118]:
class SimpleTFRSModel(tfrs.Model):

    def __init__(self, user_model, item_model, task):
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.item_model: tf.keras.Model = item_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_no"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_item_embeddings = self.item_model(features["item_no"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_item_embeddings)

---
---

<div class="alert alert-block alert-warning">
<b>The above is just a convenience!</b> The following class is a simplified version of what
is actually going on under-the-hood:

```python 
class NonTFRSModel(tf.keras.Model):
    def __init__(self, user_model, item_model, metrics):
        """
        Note that we don't pass in the task! That's because we define 
        what it is here.
        """
        super().__init__()
        self.user_model = user_model 
        self.item_model = item_model 
        # When we perform retrieval, the default loss is actually just good 
        # old CategoricalCrossentropy :) 
        self._loss = tf.keras.losses.CategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.SUM
        )
        self._factorized_metrics = metrics

    def calc_loss(self, query_embeddings, candidate_embeddings): 
        scores = tf.linalg.matmul(
            query_embeddings, 
            candidate_embeddings, 
            transpose_b=True
        )
        num_queries, num_candidates = scores.shape
        labels = tf.eye(num_queries, num_candidates)
        loss = self._loss(y_true=labels, y_pred=scores)
        self._factorized_metrics.update_state(
            query_embeddings, 
            candidate_embeddings
        )
        return loss
    

    def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        with tf.GradientTape() as tape: 
            user_embeddings = self.user_model(features['user_no'])
            positive_item_embeddings = self.item_model(features['item_no'])
            loss = self.calc_loss(user_embeddings, positive_item_embeddings)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        metrics = {metric.name: metric.result() for metric in self.metrics}
        return metrics 

    def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor: 
        user_embeddings = self.user_model(features['user_no'])
        positive_item_embeddings = self.item_model(features['item_no'])

        loss = self.compute_loss(user_embeddings, positive_item_embeddings)        

        metrics = {metric.name: metric.result() for metric in self.metrics}
        return metrics 
```

We can then instantiate and compile a model like so: 

```python 
simple_model = NonTFRSModel(user_model, item_model, metrics)
# Need to specify run_eagerly=True because we need the shape of the scores 
# in the calc_loss function
simple_model.compile(optimizer=tf.keras.optimizers.Adam(), run_eagerly=True)
```

After that we can just train the model the same as below :)

</div>
---
---

In [120]:
model = SimpleTFRSModel(user_model, item_model, task)
model.compile(optimizer=tf.keras.optimizers.Adam())

In [121]:
train_dataset_interactions = train_dataset.map(lambda x: {
    'user_no': x['user_no'],
    'item_no': x['item_no']
})
test_dataset_interactions = test_dataset.map(lambda x: {
    'user_no': x['user_no'],
    'item_no': x['item_no']
})

cached_train = train_dataset_interactions.shuffle(1_000).batch(1024).cache()
cached_test = test_dataset_interactions.batch(512).cache()

In [122]:
history = model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Evaluation

In [123]:
results = model.evaluate(cached_test, return_dict=True)



## Serving

In [124]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends items out of the entire items dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((items_dataset.batch(100), items_dataset.batch(100).map(model.item_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x14991ad10>

In [127]:
random_user = np.random.choice(train_df_filtered['user_no'].unique())
train_df_filtered.loc[train_df_filtered['user_no'] == random_user]

Unnamed: 0,user_no,item_no,gender_description,brand,product_group
377129,1145355627971110554,-7865088438347131541,girls,wheat,bottoms
377130,1145355627971110554,-4157873162967126783,unisex,kuling,headwear
377131,1145355627971110554,-2142740165482218263,unisex,kuling,headwear
377132,1145355627971110554,3748431471949385807,unisex,kuling,swimwear and coverups
377133,1145355627971110554,-2860174162663871712,unisex,kuling,headwear
377134,1145355627971110554,-2195934864809708124,unisex,wheat,fleeces and midlayers
377135,1145355627971110554,1513658394069720986,unisex,wheat,fleeces and midlayers
377136,1145355627971110554,-6850173515075499791,unisex,wheat,clothing sets
377137,1145355627971110554,-2195934864809708124,unisex,wheat,fleeces and midlayers
377138,1145355627971110554,1513658394069720986,unisex,wheat,fleeces and midlayers


In [129]:
%%time
# Get recommendations.
_, titles = index(tf.constant([random_user]))

CPU times: user 4.58 ms, sys: 6.71 ms, total: 11.3 ms
Wall time: 21 ms


In [132]:
%%time
items_to_exclude = train_df_filtered.loc[train_df_filtered['user_no'] == random_user]['item_no'].unique()
_, titles = index.query_with_exclusions(tf.constant([random_user]), 
                                       tf.constant([items_to_exclude]))

CPU times: user 4.08 ms, sys: 704 µs, total: 4.79 ms
Wall time: 3.78 ms


In [133]:
recommendations = [item.numpy().decode() for item in titles[0]]
item_info_df.loc[item_info_df['item_no'].isin(recommendations)]

Unnamed: 0,item_no,colour,gender_description,brand,product_group,min_age,max_age
8723,-6452537443298138438,cream,unisex,bobo choses,all in ones,1.0,11.0
16614,-5461181132081057096,blue,girls,burberry,dresses,2.0,14.0
21874,-873465860918484678,navy,unisex,kuling,sandals,0.875,6.0
39166,-8429863690086218988,pink,girls,adidas,trainers,4.0,10.0
41496,8659013735764980519,grey,unisex,bugaboo,stroller parts and customisati,,
42154,7480282260445719099,black,boys,nike,trainers,0.375,5.0
52105,1493209376961654965,navy,boys,didriksons,coats and jackets,1.0,9.0
54505,-6447463798668859639,purple,unisex,buddy & hope,stroller accessories,,
56913,-7634805924562764179,black,unisex,reima,trainers,0.875,5.0
58036,-5678741866268285557,blue,unisex,bobo choses,swimwear and coverups,1.0,11.0


---
---
---

## **Baselines**

### **Top Items**

**Let's find the top 100 items in the training dataset and always predict during the test dataset**

In [228]:
NUM_TOP_ITEMS = 100
top_items = train_df_filtered['item_no'].value_counts()[:100].index

In [164]:
top_items_in_test_dataset = test_df_filtered.loc[test_df_filtered['item_no'].isin(top_items)]

print(len(top_items_in_test_dataset))
print(len(test_df_filtered))

51
1000


In [211]:
ks = (1, 5, 10, 50, 100)
metrics = [tf.keras.metrics.Mean() for k in ks]

In [169]:
true_candidates = tf.expand_dims(tf.constant(test_df_filtered['item_no'].values), 1)

In [208]:
retrieved_candidates = tf.expand_dims(top_items, 1)
retrieved_candidates = tf.transpose(tf.repeat(retrieved_candidates, tf.constant(true_candidates.shape[0]), axis=1))

In [213]:
ids_match = tf.cast(tf.math.equal(true_candidates, retrieved_candidates), tf.float32)

In [214]:
ids_match

<tf.Tensor: shape=(1000, 100), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [215]:
for k, metric in zip(ks, metrics):
    # By slicing until :k we assume scores are sorted.
    # Clip to only count multiple matches once.
    match_found = tf.clip_by_value(
        tf.reduce_sum(ids_match[:, :k], axis=1, keepdims=True),
        0.0, 1.0
    )
    metric.update_state(match_found)

In [218]:
for metric in metrics:
    print(metric.result())

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.003, shape=(), dtype=float32)
tf.Tensor(0.007, shape=(), dtype=float32)
tf.Tensor(0.027, shape=(), dtype=float32)
tf.Tensor(0.051, shape=(), dtype=float32)


### **Top Items Domain Knowledge**

Since the test data is in November let's exclude certain product groups

In [223]:
item_info_df.loc[item_info_df['item_no'].isin(top_items)]['product_group'].unique()

array(['jumpers and knitwear', 'coveralls', 'boots', 'trainers',
       'dresses', 'tops', 'clothing sets', 'coats and jackets',
       'stroller accessories', 'fleeces and midlayers', 'winter sets',
       'sandals', 'bottoms', 'gloves and mittens', 'role play',
       'stationary', 'headwear'], dtype=object)

In [231]:
GROUPS_TO_INCLUDE = ['jumpers and knitwear', 'coveralls', 'boots', 'coats and jackets', 'stroller accessories', 
                      'fleeces and midlayers', 'winter sets', 'gloves and mittens', 'headwear']

items_to_consider = item_info_df.loc[item_info_df['product_group'].isin(GROUPS_TO_INCLUDE)]['item_no']

In [233]:
top_items_filtered = train_df_filtered[
    train_df_filtered['item_no'].isin(items_to_consider)]['item_no'].value_counts()[:100].index

In [236]:
len(set(top_items_filtered) - set(top_items))

47

In [237]:
retrieved_candidates = tf.expand_dims(top_items_filtered, 1)
retrieved_candidates = tf.transpose(tf.repeat(retrieved_candidates, tf.constant(true_candidates.shape[0]), axis=1))

In [238]:
ids_match = tf.cast(tf.math.equal(true_candidates, retrieved_candidates), tf.float32)

In [239]:
metrics = [tf.keras.metrics.Mean() for k in ks]
for k, metric in zip(ks, metrics):
    # By slicing until :k we assume scores are sorted.
    # Clip to only count multiple matches once.
    match_found = tf.clip_by_value(
        tf.reduce_sum(ids_match[:, :k], axis=1, keepdims=True),
        0.0, 1.0
    )
    metric.update_state(match_found)

In [240]:
for metric in metrics:
    print(metric.result())

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.004, shape=(), dtype=float32)
tf.Tensor(0.011, shape=(), dtype=float32)
tf.Tensor(0.031, shape=(), dtype=float32)
tf.Tensor(0.061, shape=(), dtype=float32)
