# Training TFRS

 - [ ] Fix the data 
     - [x] Get a reasonable amount of data, make sure there is overlap in train/test 
     - [ ] Set up a flag so we can use all vs. subset of data depending on CPU/GPU
 - [ ] Set up eval procedure - **Clean this up a bit more**
     - [x] Metrics 
     - [ ] Coverage/Popularity
     - [x] Qualitative evaluation of predictions 
 - [x] Baselines - **Done, just need to clean**
     - [x] Most popular 
     - [x] Domain Knowledge 
     - [x] kNN
 - [ ] TFRS
     - [x] Simple model 
     - [ ] With Context Features
     - [ ] Sequential 
     - [ ] Memory Efficient
 - [ ] Serving 
     - [x] In memory 
     - [ ] TFS
 - [ ] E2E with TFX
 - [ ] Alternatives 
     - [ ] LightFM, Microsoftrecommenders, Transformer recommends
 - [ ] Clean Notebook
     - [ ] References to Papers / Books
     - [ ] Evaluation notes
     - [ ] Shortcomings/Future work 
    
After doing with context features, do a more advanced on GPU, and then do E2E with TFX 

In [28]:
from typing import Dict, Any, Text

import numpy as np 
import pandas as pd

import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_data_validation as tfdv

## **Reading in the Data** 

In [29]:
train_df = pd.read_csv('train.csv', dtype={'user_no': str, 'item_no': str})
test_df = pd.read_csv('test.csv', dtype={'user_no': str, 'item_no': str})

# For evaluation
item_info_df = pd.read_csv('item_info.csv', dtype={'item_no': str})

<div class="alert alert-block alert-info">
<b>TODO:</b> Move all this stuff to EDA notebook so this is a bit more streamlined and we can just 
read in data that is ready-to-go. 
    
<b>NOTE:</b> Gonna cheat here a bit and make an artificial dataset such that all of the users are repeat
    
Create **two** versions of the dataset (abbreviated and full) so that we can run on CPU and GPU
</div>

In [41]:
NUM_USERS = 2000

overlap_users = set(train_df['user_no']) & set(test_df['user_no'].unique())
top_users = train_df[train_df['user_no'].isin(overlap_users)]['user_no'].value_counts()[:NUM_USERS].index

In [152]:
train_df_filtered = train_df.loc[train_df['user_no'].isin(top_users)]
test_df_filtered = test_df.loc[test_df['user_no'].isin(top_users)]
items = train_df_filtered['item_no'].unique()

In [43]:
train_dataset = tf.data.Dataset.from_tensor_slices(dict(train_df_filtered))
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_df_filtered))

items_dataset = tf.data.Dataset.from_tensor_slices(items)

In [44]:
for item in items_dataset.take(3):
    print(item)

tf.Tensor(b'7695529757452122196', shape=(), dtype=string)
tf.Tensor(b'1959675403949859161', shape=(), dtype=string)
tf.Tensor(b'2588296344401354503', shape=(), dtype=string)


2022-02-07 10:06:35.070125: W tensorflow/core/data/root_dataset.cc:163] Optimization loop failed: CANCELLED: Operation was cancelled


In [45]:
for elem in train_dataset.take(3):
    print(elem)

{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'7695529757452122196'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'reima'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'trainers'>}
{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'1959675403949859161'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'gola kids'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'trainers'>}
{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'2588296344401354503'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: sha

In [46]:
print(train_df_filtered['user_no'].nunique())
print(train_df_filtered['item_no'].nunique())
print(test_df_filtered['item_no'].nunique())
print(len(set(test_df_filtered['item_no']) - set(train_df_filtered['item_no'])))

2000
15254
2050
976


## Creating the Model

In [47]:
EMBEDDING_DIM = 32
NUM_OOV_INDICES = 1

user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_users, 
      num_oov_indices=NUM_OOV_INDICES),
  tf.keras.layers.Embedding(len(unique_users) + NUM_OOV_INDICES, EMBEDDING_DIM)
])

item_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=items, 
      num_oov_indices=NUM_OOV_INDICES),
  tf.keras.layers.Embedding(len(items) + NUM_OOV_INDICES, EMBEDDING_DIM)
])

metrics = tfrs.metrics.FactorizedTopK(
  candidates=items_dataset.batch(128).map(item_model)
)

task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [48]:
class SimpleTFRSModel(tfrs.Model):

    def __init__(self, user_model, item_model, task):
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.item_model: tf.keras.Model = item_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_no"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_item_embeddings = self.item_model(features["item_no"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_item_embeddings)

---
---

<div class="alert alert-block alert-info">
<b>TODO:</b> Explain what needs to be done to customize this better
</div>

<div class="alert alert-block alert-warning">
<b>The above is just a convenience!</b> The following class is a simplified version of what
is actually going on under-the-hood:

```python 
class NonTFRSModel(tf.keras.Model):
    def __init__(self, user_model, item_model, metrics):
        """
        Note that we don't pass in the task! That's because we define 
        what the task is here.
        """
        super().__init__()
        self.user_model = user_model 
        self.item_model = item_model 
        # When we perform retrieval, the default loss is actually just good 
        # old CategoricalCrossentropy :) 
        self._loss = tf.keras.losses.CategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.SUM
        )
        self._factorized_metrics = metrics

    def calc_loss(self, query_embeddings, candidate_embeddings): 
        scores = tf.linalg.matmul(
            query_embeddings, 
            candidate_embeddings, 
            transpose_b=True
        )
        num_queries, num_candidates = scores.shape
        labels = tf.eye(num_queries, num_candidates)
        loss = self._loss(y_true=labels, y_pred=scores)
        self._factorized_metrics.update_state(
            query_embeddings, 
            candidate_embeddings
        )
        return loss
    

    def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:
        with tf.GradientTape() as tape: 
            user_embeddings = self.user_model(features['user_no'])
            positive_item_embeddings = self.item_model(features['item_no'])
            loss = self.calc_loss(user_embeddings, positive_item_embeddings)

        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        metrics = {metric.name: metric.result() for metric in self.metrics}
        return metrics 

    def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor: 
        user_embeddings = self.user_model(features['user_no'])
        positive_item_embeddings = self.item_model(features['item_no'])

        loss = self.compute_loss(user_embeddings, positive_item_embeddings)        

        metrics = {metric.name: metric.result() for metric in self.metrics}
        return metrics 
```

We can then instantiate and compile a model like so: 

```python 
simple_model = NonTFRSModel(user_model, item_model, metrics)
# Need to specify run_eagerly=True because we need the shape of the scores 
# in the calc_loss function
simple_model.compile(optimizer=tf.keras.optimizers.Adam(), run_eagerly=True)
```

After that we can just train the model the same as below :)

</div>
---
---

In [49]:
model = SimpleTFRSModel(user_model, item_model, task)
model.compile(optimizer=tf.keras.optimizers.Adam())

In [50]:
train_dataset_interactions = train_dataset.map(lambda x: {
    'user_no': x['user_no'],
    'item_no': x['item_no']
})
test_dataset_interactions = test_dataset.map(lambda x: {
    'user_no': x['user_no'],
    'item_no': x['item_no']
})

cached_train = train_dataset_interactions.shuffle(1_000).batch(1024).cache()
cached_test = test_dataset_interactions.batch(512).cache()

In [51]:
history = model.fit(cached_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluation

In [52]:
results = model.evaluate(cached_test, return_dict=True)



## Serving and Qualitative Evaluation

In [53]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends items out of the entire items dataset.
_ = index.index_from_dataset(
        tf.data.Dataset.zip((items_dataset.batch(100), 
                             items_dataset.batch(100).map(model.item_model))))

In [54]:
random_user = np.random.choice(train_df_filtered['user_no'].unique())
train_df_filtered.loc[train_df_filtered['user_no'] == random_user]

Unnamed: 0,user_no,item_no,gender_description,brand,product_group
222819,7609855518106808612,-8540093817314100981,unisex,reima,boots
222820,7609855518106808612,5857246362790420524,unisex,kuling,boots
222821,7609855518106808612,-7422965665689630400,unisex,reima,boots
222822,7609855518106808612,-1377491581056901551,unisex,britax,car seats
222823,7609855518106808612,7497061752484914189,unisex,by nils,sandals
222824,7609855518106808612,-8748692560361981849,unisex,by nils,sandals
222825,7609855518106808612,-8540093817314100981,unisex,reima,boots
222826,7609855518106808612,-511428661644226692,unisex,kuling,trainers
222827,7609855518106808612,-5063842517211058761,unisex,condor,underwear
222828,7609855518106808612,1256069566134698387,unisex,jacadi,underwear


In [58]:
%%time
# Get recommendations.
_, titles = index(tf.constant([random_user]))

CPU times: user 5.88 ms, sys: 8.2 ms, total: 14.1 ms
Wall time: 20.6 ms


In [59]:
%%time
items_to_exclude = train_df_filtered.loc[train_df_filtered['user_no'] == random_user]['item_no'].unique()
_, titles = index.query_with_exclusions(tf.constant([random_user]), 
                                       tf.constant([items_to_exclude]))

CPU times: user 7.8 ms, sys: 7.74 ms, total: 15.5 ms
Wall time: 15.3 ms


In [60]:
recommendations = [item.numpy().decode() for item in titles[0]]
item_info_df.loc[item_info_df['item_no'].isin(recommendations)]

Unnamed: 0,item_no,colour,gender_description,brand,product_group,min_age,max_age
7355,3387337770681619452,green,unisex,liewood,baby feeding,,
28716,2587009692567528436,navy,boys,fila,shorts,8.0,14.0
29348,-7565930523461907101,grey,unisex,miffy,nightwear,0.125,4.0
39722,2402600656258666771,orange,unisex,bobo choses,underwear,0.875,5.0
40703,3237264711712670734,yellow,unisex,done by deer,baby feeding,,
44615,6087446764353347056,beige,unisex,oas,tops,1.0,14.0
47713,-6890138272450000519,black,unisex,kuling,gloves and mittens,0.125,7.0
51589,-8127090748480580060,green,unisex,reima,coveralls,0.125,0.375
52954,-4660690319150656043,brown,unisex,kuling,clothing sets,0.625,8.0
61158,6851777188151737367,pink,girls,molo,all in ones,0.125,2.0


---
---
---

## **Baselines**

### **Top Items**

**Let's find the top 100 items in the training dataset and always predict during the test dataset**

In [61]:
NUM_TOP_ITEMS = 100
top_items = train_df_filtered['item_no'].value_counts()[:100].index

In [62]:
top_items_in_test_dataset = test_df_filtered.loc[test_df_filtered['item_no'].isin(top_items)]

print(len(top_items_in_test_dataset))
print(len(test_df_filtered['item_no'].unique()))
print(len(test_df_filtered))

159
2050
2737


In [63]:
ks = (1, 5, 10, 50, 100)
metrics = [tf.keras.metrics.Mean() for k in ks]

In [64]:
true_candidates = tf.expand_dims(tf.constant(test_df_filtered['item_no'].values), 1)

In [65]:
print(true_candidates)

tf.Tensor(
[[b'1922350135993011210']
 [b'-3862839313591948025']
 [b'450895378801668282']
 ...
 [b'3225720764400126984']
 [b'-3082989563423633366']
 [b'-4879359146474125634']], shape=(2737, 1), dtype=string)


In [66]:
retrieved_candidates = tf.expand_dims(top_items, 1)
retrieved_candidates = tf.transpose(tf.repeat(retrieved_candidates, tf.constant(true_candidates.shape[0]), axis=1))

In [67]:
ids_match = tf.cast(tf.math.equal(true_candidates, retrieved_candidates), tf.float32)

In [68]:
ids_match

<tf.Tensor: shape=(2737, 100), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [69]:
for k, metric in zip(ks, metrics):
    # By slicing until :k we assume scores are sorted.
    # Clip to only count multiple matches once.
    match_found = tf.clip_by_value(
        tf.reduce_sum(ids_match[:, :k], axis=1, keepdims=True),
        0.0, 1.0
    )
    metric.update_state(match_found)

In [70]:
for metric in metrics:
    print(metric.result())

tf.Tensor(0.0021921813, shape=(), dtype=float32)
tf.Tensor(0.004749726, shape=(), dtype=float32)
tf.Tensor(0.013153087, shape=(), dtype=float32)
tf.Tensor(0.033613447, shape=(), dtype=float32)
tf.Tensor(0.058092803, shape=(), dtype=float32)


### **Top Items Domain Knowledge**

Since the test data is in November let's exclude certain product groups

In [71]:
item_info_df.loc[item_info_df['item_no'].isin(top_items)]['product_group'].unique()

array(['jumpers and knitwear', 'shorts', 'coveralls', 'boots', 'trainers',
       'bottoms', 'dresses', 'tops', 'clothing sets', 'coats and jackets',
       'stroller accessories', 'fleeces and midlayers', 'sandals',
       'gloves and mittens', 'role play', 'bicycles and other vehicles',
       'stationary', 'headwear'], dtype=object)

In [72]:
GROUPS_TO_INCLUDE = ['jumpers and knitwear', 'coveralls', 'boots', 'coats and jackets', 'stroller accessories', 
                      'fleeces and midlayers', 'winter sets', 'gloves and mittens', 'headwear']

items_to_consider = item_info_df.loc[item_info_df['product_group'].isin(GROUPS_TO_INCLUDE)]['item_no']

In [73]:
top_items_filtered = train_df_filtered[
    train_df_filtered['item_no'].isin(items_to_consider)]['item_no'].value_counts()[:100].index

In [74]:
len(set(top_items_filtered) - set(top_items))

52

In [75]:
retrieved_candidates = tf.expand_dims(top_items_filtered, 1)
retrieved_candidates = tf.transpose(tf.repeat(retrieved_candidates, tf.constant(true_candidates.shape[0]), axis=1))

In [76]:
ids_match = tf.cast(tf.math.equal(true_candidates, retrieved_candidates), tf.float32)

In [77]:
metrics = [tf.keras.metrics.Mean() for k in ks]
for k, metric in zip(ks, metrics):
    # By slicing until :k we assume scores are sorted.
    # Clip to only count multiple matches once.
    match_found = tf.clip_by_value(
        tf.reduce_sum(ids_match[:, :k], axis=1, keepdims=True),
        0.0, 1.0
    )
    metric.update_state(match_found)

In [78]:
for metric in metrics:
    print(metric.result())

tf.Tensor(0.0021921813, shape=(), dtype=float32)
tf.Tensor(0.010230179, shape=(), dtype=float32)
tf.Tensor(0.013153087, shape=(), dtype=float32)
tf.Tensor(0.03726708, shape=(), dtype=float32)
tf.Tensor(0.06065035, shape=(), dtype=float32)


## Content-Based

In [79]:
top_brands = train_df_filtered['brand'].value_counts()[:100].index
top_groups = train_df_filtered['product_group'].value_counts()[:50].index
train_df_filtered.loc[:, 'brand'] = train_df_filtered['brand'].apply(lambda x: x if x in top_brands else 'niche_brand')
train_df_filtered.loc[:, 'product_group'] = train_df_filtered['product_group'].apply(lambda x: x if x in top_groups else 'niche_group')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [80]:
train_df_filtered

Unnamed: 0,user_no,item_no,gender_description,brand,product_group
774,-4011379598502823212,7695529757452122196,unisex,reima,trainers
775,-4011379598502823212,1959675403949859161,unisex,niche_brand,trainers
776,-4011379598502823212,2588296344401354503,unisex,niche_brand,trainers
777,-4011379598502823212,8947821984744787968,unisex,superfit,boots
778,-4011379598502823212,-4197849053693626824,unisex,niche_brand,gloves and mittens
...,...,...,...,...,...
578359,-3695442683323654294,7192063088183359841,unisex,kuling,coveralls
578360,-3695442683323654294,-4879359146474125634,unisex,kuling,coveralls
578361,-3695442683323654294,-4601135257316693802,unisex,niche_brand,boots
578362,-3695442683323654294,-6074194196112931950,girls,niche_brand,trainers


In [81]:
train_df_one_hot = pd.get_dummies(train_df_filtered[['user_no', 'gender_description', 'brand', 'product_group']], 
                                  columns=['gender_description', 'brand', 'product_group'])
train_df_one_hot

Unnamed: 0,user_no,gender_description_boys,gender_description_girls,gender_description_unisex,brand_a happy brand,brand_adidas,brand_babybjörn,brand_beau loves,brand_bergans,brand_besafe,...,product_group_stroller accessories,product_group_stroller parts and customisati,product_group_strollers,product_group_swimwear and coverups,product_group_tableware,product_group_textile,product_group_tops,product_group_trainers,product_group_underwear,product_group_water toys
774,-4011379598502823212,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
775,-4011379598502823212,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
776,-4011379598502823212,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
777,-4011379598502823212,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
778,-4011379598502823212,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
578359,-3695442683323654294,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
578360,-3695442683323654294,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
578361,-3695442683323654294,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
578362,-3695442683323654294,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [82]:
user_embeddings = train_df_one_hot.groupby('user_no').agg('mean')

user_embeddings

Unnamed: 0_level_0,gender_description_boys,gender_description_girls,gender_description_unisex,brand_a happy brand,brand_adidas,brand_babybjörn,brand_beau loves,brand_bergans,brand_besafe,brand_billieblush,...,product_group_stroller accessories,product_group_stroller parts and customisati,product_group_strollers,product_group_swimwear and coverups,product_group_tableware,product_group_textile,product_group_tops,product_group_trainers,product_group_underwear,product_group_water toys
user_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1004190764919556160,0.000000,0.000000,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.555556,0.111111,0.333333,0.0,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
-1006521943957043595,0.000000,0.055556,0.944444,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.666667,0.000000,0.000000,0.000000
-101493426712742714,0.111111,0.722222,0.166667,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.055556,0.000000,0.111111,0.000000
-1018522587505622351,0.000000,0.111111,0.888889,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.277778,0.000000,0.000000,0.000000
-1025616239793598547,0.000000,0.000000,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.315789,0.0,0.000000,0.000000,0.000000,0.263158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
976469937023655762,0.052632,0.000000,0.947368,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.368421,0.000000
988004909956453914,0.055556,0.611111,0.333333,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.111111,0.000000,0.000000
988471106800508483,0.000000,0.000000,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.111111,0.000000,0.000000
98933627682977975,0.000000,0.000000,1.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.157895,0.0,0.157895,0.000000,0.000000,0.000000


In [84]:
user_embeddings_matrix = np.concatenate((np.zeros((1, 155)), user_embeddings.values))

In [85]:
user_embedding_layer = tf.keras.layers.Embedding(*user_embeddings_matrix.shape, 
                                                 embeddings_initializer=tf.keras.initializers.Constant(user_embeddings_matrix),
                                                 trainable=False)

In [86]:
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=user_embeddings.index, 
      num_oov_indices=NUM_OOV_INDICES),
  user_embedding_layer
])

In [87]:
item_info_df.loc[:, 'brand'] = item_info_df['brand'].apply(lambda x: x if x in top_brands else 'niche_brand')
item_info_df.loc[:, 'product_group'] = item_info_df['product_group'].apply(lambda x: x if x in top_groups else 'niche_group')
item_embeddings = pd.get_dummies(item_info_df[['gender_description', 'brand', 'product_group']], 
                                 columns=['gender_description', 'brand', 'product_group'])

In [88]:
item_embeddings

Unnamed: 0,gender_description_boys,gender_description_girls,gender_description_unisex,brand_a happy brand,brand_adidas,brand_babybjörn,brand_beau loves,brand_bergans,brand_besafe,brand_billieblush,...,product_group_stroller accessories,product_group_stroller parts and customisati,product_group_strollers,product_group_swimwear and coverups,product_group_tableware,product_group_textile,product_group_tops,product_group_trainers,product_group_underwear,product_group_water toys
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61699,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61700,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
61701,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
61702,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [90]:
item_embeddings_matrix = np.concatenate((np.zeros((1, 155)), item_embeddings.values))

item_embedding_layer = tf.keras.layers.Embedding(*item_embeddings_matrix.shape, 
                                                 embeddings_initializer=tf.keras.initializers.Constant(item_embeddings_matrix),
                                                 trainable=False)

In [91]:
item_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=item_info_df['item_no'], 
      num_oov_indices=NUM_OOV_INDICES),
  item_embedding_layer
])

In [92]:
item_model('206890150141030846')

Consider rewriting this model with the Functional API.


<tf.Tensor: shape=(155,), dtype=float32, numpy=
array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.], dtype=float32)>

In [93]:
items_dataset = tf.data.Dataset.from_tensor_slices(item_info_df['item_no'])

In [94]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(user_model)
# recommends items out of the entire items dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((items_dataset.batch(100), items_dataset.batch(100).map(item_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x151448f50>

In [95]:
random_user = np.random.choice(train_df_filtered['user_no'].unique())
train_df_filtered.loc[train_df_filtered['user_no'] == random_user]

Unnamed: 0,user_no,item_no,gender_description,brand,product_group
203364,4718089218791613906,-8127812396037778948,unisex,kuling,headwear
203365,4718089218791613906,3672968112931760574,unisex,kuling,baselayers
203366,4718089218791613906,2107313587614884676,unisex,kuling,baselayers
203367,4718089218791613906,-8800243931651903138,unisex,kuling,niche_group
203368,4718089218791613906,1794872327729997045,unisex,buddy & hope,bedding
203369,4718089218791613906,2708216661078434021,unisex,cybex,stroller accessories
203370,4718089218791613906,-5162775827087250791,unisex,buddy & hope,changing and travel bags
203371,4718089218791613906,-6269194583511743625,unisex,buddy & hope,changing and travel bags
203372,4718089218791613906,-7260987486808914025,unisex,buddy & hope,changing and travel bags
203373,4718089218791613906,-5449728430414457470,unisex,niche_brand,water toys


In [96]:
%%time
items_to_exclude = train_df_filtered.loc[train_df_filtered['user_no'] == random_user]['item_no'].unique()
_, titles = index.query_with_exclusions(tf.constant([random_user]), 
                                       tf.constant([items_to_exclude]))

CPU times: user 219 ms, sys: 59.9 ms, total: 279 ms
Wall time: 400 ms


In [97]:
recommendations = [item.numpy().decode() for item in titles[0]]
item_info_df.loc[item_info_df['item_no'].isin(recommendations)]

Unnamed: 0,item_no,colour,gender_description,brand,product_group,min_age,max_age
819,1943778631151349733,pink,unisex,buddy & hope,baby changing,,
1051,8348016184428027188,grey,unisex,buddy & hope,bedding,,
1414,-4324574363727628345,grey,unisex,buddy & hope,bedding,,
1708,-6809232176306104698,green,unisex,buddy & hope,bedding,,
3849,5459258902060684247,grey,unisex,buddy & hope,bedding,,
5221,9029407477525255005,black,unisex,buddy & hope,changing and travel bags,,
5392,8707946038086595320,black,unisex,buddy & hope,changing and travel bags,,
5631,-8787526849673626106,green,unisex,buddy & hope,bedding,,
5814,2782791252797313639,grey,unisex,buddy & hope,bedding,,
6752,-7824895490178514798,beige,unisex,buddy & hope,bedding,,


**Looks like it 'memorizes' users' tastes more**

In [98]:
test_users_dataset = tf.data.Dataset.from_tensor_slices(test_df_filtered['user_no'])

In [99]:
_, retrieved_items = index(test_df_filtered['user_no'], k=100)

779        6052211932610476899
911       -2849321406370879909
1338      -5023667320973902868
1391       1681580167487634121
                  ...         
431141     8562571093282867354
431331    -3111990773673910124
431332    -3111990773673910124
432066    -6153155530715126273
432338    -3695442683323654294
Name: user_no, Length: 2737, dtype: object
Consider rewriting this model with the Functional API.


In [100]:
ids_match = tf.cast(tf.math.equal(true_candidates, retrieved_items), tf.float32)

In [101]:
metrics = [tf.keras.metrics.Mean() for k in ks]
for k, metric in zip(ks, metrics):
    # By slicing until :k we assume scores are sorted.
    # Clip to only count multiple matches once.
    match_found = tf.clip_by_value(
        tf.reduce_sum(ids_match[:, :k], axis=1, keepdims=True),
        0.0, 1.0
    )
    metric.update_state(match_found)

In [102]:
for metric in metrics:
    print(metric.result())

tf.Tensor(0.0007307271, shape=(), dtype=float32)
tf.Tensor(0.0043843626, shape=(), dtype=float32)
tf.Tensor(0.006941907, shape=(), dtype=float32)
tf.Tensor(0.034344174, shape=(), dtype=float32)
tf.Tensor(0.057362076, shape=(), dtype=float32)


---

## Context Features

Now let's add context features

In [182]:
class UserModel(tf.keras.Model):
    def __init__(self, unique_users, num_oov_indices=1, embedding_dim=32):
        super().__init__()
        
        self.user_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_users, 
                                         num_oov_indices=num_oov_indices),
            tf.keras.layers.Embedding(len(unique_users) + num_oov_indices, embedding_dim)
        ])
        
    def call(self, inputs):
        return self.user_embedding(inputs['user_no'])
    
class ItemModel(tf.keras.Model):
    def __init__(self, 
                 items, 
                 gender_description,
                 top_brands, 
                 top_groups, 
                 num_oov_indices=1, 
                 embedding_dim=16):
        super().__init__()
        
        self.item_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=items, 
                                         num_oov_indices=num_oov_indices),
            tf.keras.layers.Embedding(len(items) + num_oov_indices, 16)
        ])
        
        self.gender_description_lookup = tf.keras.layers.StringLookup(vocabulary=gender_description, 
                                                                      output_mode='one_hot',
                                                                      num_oov_indices=0)
        self.brand_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=top_brands, 
                                         num_oov_indices=num_oov_indices),
            tf.keras.layers.Embedding(len(top_brands) + num_oov_indices, 8)
        ])
        self.product_group_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=top_groups, 
                                         num_oov_indices=num_oov_indices),
            tf.keras.layers.Embedding(len(top_groups) + num_oov_indices, 5)
        ])
        
    def call(self, inputs):
        return tf.concat([
             self.item_embedding(inputs['item_no']),
             self.gender_description_lookup(inputs['gender_description']),
             self.brand_embedding(inputs['brand']),
             self.product_group_embedding(inputs['product_group'])
        ], axis=1)
    
class TFRSContextModel(tfrs.models.Model):
    def __init__(self, 
                 unique_users,
                 items, 
                 gender_description,
                 top_brands, 
                 top_groups):
        super().__init__()
        self.query_model = tf.keras.Sequential([
            UserModel(unique_users), 
            #tf.keras.layers.Dense(32)
        ])
        self.candidate_model = tf.keras.Sequential([
            ItemModel(items, gender_description, top_brands, top_groups),
            #tf.keras.layers.Dense(32)
        ])
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=items_dataset_w_context.batch(128).map(self.candidate_model)
            )
        )
    def compute_loss(self, inputs, training=False):
        query_embeddings = self.query_model({
            'user_no': inputs['user_no']
        })
        candidate_embeddings = self.candidate_model({
            'item_no': inputs['item_no'],
            'gender_description': inputs['gender_description'],
            'brand': inputs['brand'],
            'product_group': inputs['product_group']
        })
        
        return self.task(query_embeddings, candidate_embeddings)

**FIX ITEMS DATASET!!!**

In [183]:
items_df = item_info_df.loc[item_info_df['item_no'].isin(items)][
    ['item_no', 'gender_description', 'brand', 'product_group']]

items_dataset_w_context = tf.data.Dataset.from_tensor_slices(dict(items_df))

In [184]:
model = TFRSContextModel(unique_users, items, gender_description, top_brands, top_groups)

Consider rewriting this model with the Functional API.


In [185]:
model.compile(optimizer=tf.keras.optimizers.Adam())

In [186]:
cached_train = train_dataset.shuffle(1_000).batch(1024).cache()
cached_test = test_dataset.batch(512).cache()

In [187]:
history = model.fit(cached_train, epochs=5)

Epoch 1/5
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [188]:
results = model.evaluate(cached_test, return_dict=True)

Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.


In [191]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
# recommends items out of the entire items dataset.
_ = index.index_from_dataset(
        tf.data.Dataset.zip((items_dataset.batch(100), 
                             items_dataset_w_context.batch(100).map(model.candidate_model))))

Consider rewriting this model with the Functional API.


ValueError: The candidates and identifiers tensors must have the same number of rows (got 15254 candidates rows and 15300 identifier rows). 

In [None]:
for item in train_dataset.take(3).batch(3):
    print(item)

In [136]:
unique_users

array(['-6613028768649161262', '-2029740236817510102',
       '-1177964820845456571', ..., '-6153155530715126273',
       '-3695442683323654294', '-3683116124016444198'], dtype=object)

In [128]:
top_brands = train_df_filtered['brand'].value_counts()[:100].index
top_groups = train_df_filtered['product_group'].value_counts()[:50].index
gender_description = train_df_filtered['gender_description'].unique()
item_model = ItemModel(items, gender_description, top_brands, top_groups)

In [134]:
for item in train_dataset.take(3).batch(3):
    print(item)

{'user_no': <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'-4011379598502823212', b'-4011379598502823212',
       b'-4011379598502823212'], dtype=object)>, 'item_no': <tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'7695529757452122196', b'1959675403949859161',
       b'2588296344401354503'], dtype=object)>, 'gender_description': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'unisex', b'unisex', b'unisex'], dtype=object)>, 'brand': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'reima', b'gola kids', b'new balance'], dtype=object)>, 'product_group': <tf.Tensor: shape=(3,), dtype=string, numpy=array([b'trainers', b'trainers', b'trainers'], dtype=object)>}


In [135]:
item_model(item)

<tf.Tensor: shape=(3, 51), dtype=float32, numpy=
array([[-2.9124392e-02,  4.1626740e-02, -5.9563406e-03, -4.5262661e-02,
         3.5044406e-02,  2.7105514e-02,  2.8516959e-02,  1.8707518e-02,
        -1.9531691e-02,  1.8144991e-02,  4.3524396e-02, -3.2704093e-02,
         3.2634068e-02,  1.4924120e-02,  4.0108468e-02, -2.2517873e-02,
         1.0000000e+00,  0.0000000e+00,  0.0000000e+00, -2.9270316e-02,
         1.5992299e-03, -4.9585924e-03, -1.5651453e-02,  4.0718261e-02,
         3.2859538e-02,  1.7436925e-02,  3.0568887e-02,  4.0927354e-02,
        -2.5297988e-02, -7.2416887e-03,  6.4537749e-03,  2.8787144e-03,
         4.1186240e-02,  4.1040454e-02, -4.3177962e-02, -4.1728616e-03,
        -4.2612206e-02,  3.6856320e-02,  4.7247592e-02, -6.0777664e-03,
        -1.2931935e-03,  3.4099136e-02,  4.5458842e-02, -2.7246797e-02,
         1.1664461e-02, -1.0514237e-02, -4.5344401e-02,  4.8438776e-02,
        -3.2641485e-02, -3.7657093e-02, -4.5715977e-02],
       [ 1.3030544e-03,  2.456

In [108]:
train_df['gender_description'].unique()

array(['boys', 'unisex', 'girls'], dtype=object)

In [110]:
tf.keras.layers.StringLookup?

In [114]:
gender_lookup = tf.keras.layers.StringLookup(vocabulary=train_df['gender_description'].unique(), 
                                             output_mode='one_hot', 
                                             num_oov_indices=0)

In [116]:
gender_lookup(tf.constant(['boys']))

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1., 0., 0.], dtype=float32)>

In [None]:
class MovieModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.title_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary=unique_movie_titles, mask_token=None),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 1, 32)
    ])

    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.title_text_embedding = tf.keras.Sequential([
      self.title_vectorizer,
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.title_vectorizer.adapt(movies)

  def call(self, titles):
    return tf.concat([
        self.title_embedding(titles),
        self.title_text_embedding(titles),
    ], axis=1)

In [104]:
for item in train_dataset.take(1):
    print(item)

{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4011379598502823212'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'7695529757452122196'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'reima'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'trainers'>}
