# Training TFRS

In [1]:
from typing import Dict, Any, Text

import numpy as np 
import pandas as pd

import tensorflow as tf
import tensorflow_recommenders as tfrs

## **Reading in the Data** 

In [2]:
train_df = pd.read_csv('train.csv', dtype={'user_no': str, 'item_no': str})
test_df = pd.read_csv('test.csv', dtype={'user_no': str, 'item_no': str})

In [76]:
train_df_filtered = train_df.loc[train_df['user_no'].isin(train_df['user_no'].value_counts()[:1000].index)]
items = {'item_no': train_df_filtered['item_no'].unique()}

In [83]:
train_dataset = tf.data.Dataset.from_tensor_slices(dict(train_df_filtered))
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_df))

items_dataset = tf.data.Dataset.from_tensor_slices(items)

In [5]:
for elem in train_dataset.take(3):
    print(elem)

{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'9060639138425951676'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-478270421339298398'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'boys'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'aden + anais'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'bedding'>}
{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'9060639138425951676'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-4352133231638554813'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'boys'>, 'brand': <tf.Tensor: shape=(), dtype=string, numpy=b'ralph lauren'>, 'product_group': <tf.Tensor: shape=(), dtype=string, numpy=b'jumpers and knitwear'>}
{'user_no': <tf.Tensor: shape=(), dtype=string, numpy=b'9060639138425951676'>, 'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'3628487599004239534'>, 'gender_description': <tf.Tensor: shape=(), dtype=string, numpy=b'unisex'>, 'brand': 

In [84]:
for item in items_dataset.take(3):
    print(item)

{'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'-5730313128884875484'>}
{'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'8568137491440306643'>}
{'item_no': <tf.Tensor: shape=(), dtype=string, numpy=b'2548468278650164360'>}


In [34]:
unique_users = train_df_filtered['user_no'].unique()
unique_items = train_df_filtered['item_no'].unique()

print(len(unique_users))
print(len(unique_items))

1000
10432


In [35]:
EMBEDDING_DIM = 32
NUM_OOV_INDICES = 1

In [36]:
user_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_users, 
      num_oov_indices=NUM_OOV_INDICES),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unique_users) + NUM_OOV_INDICES, EMBEDDING_DIM)
])

In [86]:
item_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unique_items, 
      num_oov_indices=NUM_OOV_INDICES),
  tf.keras.layers.Embedding(len(unique_items) + NUM_OOV_INDICES, EMBEDDING_DIM)
])

In [87]:
metrics = tfrs.metrics.FactorizedTopK(
  candidates=item_dataset.batch(128).map(item_model)
)

In [88]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [89]:
class SimpleTFRSModel(tfrs.Model):

    def __init__(self, user_model, item_model, task):
        super().__init__()
        self.user_model: tf.keras.Model = user_model
        self.item_model: tf.keras.Model = item_model
        self.task: tf.keras.layers.Layer = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user_no"])
        # And pick out the movie features and pass them into the movie model,
        # getting embeddings back.
        positive_item_embeddings = self.item_model(features["item_no"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_item_embeddings)

In [90]:
model = SimpleTFRSModel(user_model, item_model, task)
model.compile(optimizer=tf.keras.optimizers.Adam())

In [91]:
train_dataset_interactions = train_dataset.map(lambda x: {
    'user_no': x['user_no'],
    'item_no': x['item_no']
})
test_dataset_interactions = test_dataset.map(lambda x: {
    'user_no': x['user_no'],
    'item_no': x['item_no']
})

In [92]:
cached_train = train_dataset_interactions.shuffle(100_000).batch(1024).cache()
cached_test = test_dataset_interactions.batch(512).cache()

In [93]:
history = model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [94]:
# Create a model that takes in raw query features, and
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# recommends movies out of the entire items dataset.
index.index_from_dataset(
  tf.data.Dataset.zip((item_dataset.batch(100), item_dataset.batch(100).map(model.item_model)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x14e71e790>

In [95]:
# Get recommendations.
_, titles = index(tf.constant(["8611951538862997486"]))
print(f"Recommendations for user 8611951538862997486: {titles[0, :3]}")

Recommendations for user 8611951538862997486: [b'8568137491440306643' b'8568137491440306643' b'2269104852390528660']


In [102]:
_, titles = index.query_with_exclusions(tf.constant(["8611951538862997486"]), 
                                       tf.constant([items_to_exclude]))

In [52]:
item_info_df = pd.read_csv('item_info.csv', dtype={'item_no': str})

In [104]:
recommendations = [item.numpy().decode() for item in titles[0]]

In [103]:
titles[0]

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'2269104852390528660', b'-7255110501726227326',
       b'9056398533202444508', b'2200849807137753656',
       b'4728871455199306062', b'4728871455199306062',
       b'4728871455199306062', b'-5111789415165122883',
       b'829182131992270441', b'-8658071339532683589'], dtype=object)>

In [105]:
item_info_df.loc[item_info_df['item_no'].isin(recommendations)]

Unnamed: 0,item_no,colour,gender_description,brand,product_group,min_age,max_age
12875,2269104852390528660,brown,unisex,buddy & hope,stroller accessories,,
14136,829182131992270441,green,unisex,liewood,sandals,3.0,8.0
15787,2200849807137753656,yellow,boys,kuling,swimwear and coverups,2.0,10.0
22065,-5111789415165122883,blue,unisex,cam cam,bedding,,
23800,9056398533202444508,beige,girls,joha,tops,0.875,4.0
28287,4728871455199306062,green,girls,mini a ture,coats and jackets,2.0,8.0
52760,-7255110501726227326,silver,girls,molo,skirts,2.0,14.0
54981,-8658071339532683589,pink,unisex,bobo choses,tops,1.0,11.0


In [72]:
item_info_df.loc[item_info_df['item_no'] == '-4719306461609634737']

Unnamed: 0,item_no,colour,gender_description,brand,product_group,min_age,max_age
27828,-4719306461609634737,pink,unisex,kuling,gloves and mittens,0.125,7.0


In [100]:
items_to_exclude = train_df_filtered.loc[train_df_filtered['user_no'] == '8611951538862997486']['item_no'].unique()

In [101]:
tf.constant([items_to_exclude])

<tf.Tensor: shape=(1, 14), dtype=string, numpy=
array([[b'-5730313128884875484', b'8568137491440306643',
        b'2548468278650164360', b'6475930040942565267',
        b'-247398092582614634', b'3189056047292869834',
        b'9082322682150583002', b'-2704888249198936738',
        b'-3644029383138024689', b'-7476420848659739392',
        b'-4866887541185258299', b'-500451012866804228',
        b'137006744928801487', b'-439802747482401384']], dtype=object)>

In [61]:
train_df_filtered.loc[train_df_filtered['user_no'] == '8611951538862997486']

Unnamed: 0,user_no,item_no,gender_description,brand,product_group
366,8611951538862997486,-5730313128884875484,boys,air jordan,clothing sets
367,8611951538862997486,8568137491440306643,boys,boss,bottoms
368,8611951538862997486,2548468278650164360,unisex,boss,tops
369,8611951538862997486,6475930040942565267,boys,ralph lauren,tops
370,8611951538862997486,-247398092582614634,boys,air jordan,clothing sets
371,8611951538862997486,3189056047292869834,boys,air jordan,all in ones
372,8611951538862997486,9082322682150583002,boys,adidas,clothing sets
373,8611951538862997486,-2704888249198936738,boys,adidas,clothing sets
374,8611951538862997486,-5730313128884875484,boys,air jordan,clothing sets
375,8611951538862997486,8568137491440306643,boys,boss,bottoms


In [46]:
3 + 3

6