# Data Science Festival x ASOS
## Build and Deploy a Recommender System in 3 Hours.

# Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os

# Import training data

In [2]:
# Training and validation tables, read as parquet straight from the ASOS dsf2020 GitHub repository.
train = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_train_with_alphanumeric_dummy_ids.parquet")
valid = pd.read_parquet("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_valid_with_alphanumeric_dummy_ids.parquet")
# Header-less CSVs, flattened into 1-D arrays: user ids cast to str, product ids cast to int.
dummy_users = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_dummy_users_with_alphanumeric_dummy_ids.csv", header=None).values.flatten().astype(str)
products = pd.read_csv("https://raw.githubusercontent.com/ASOS/dsf2020/main/dsf_asos_productIds.csv", header=None).values.flatten().astype(int)

# The briefest intro to tf

Tensors

In [18]:
x=tf.constant([[1,2,3,4,5,6]], dtype=tf.float32)
tf.math.square(x)

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=array([[ 1.,  4.,  9., 16., 25., 36.]], dtype=float32)>

In [22]:
with tf.GradientTape() as tape:
  # x is a tf.constant, which the tape does not track automatically;
  # without an explicit watch, tape.gradient(y, x) returns None.
  tape.watch(x)
  y=tf.math.square(x)


In [23]:
y

<tf.Tensor: shape=(1, 6), dtype=float32, numpy=array([[ 1.,  4.,  9., 16., 25., 36.]], dtype=float32)>

In [24]:
dy=tape.gradient(y,x)

In [25]:
dy

Gradients

Multiply and add tensors

In [28]:
x = tf.constant([[1,2,3]], dtype=tf.float32)
Y = tf.constant([[1,2,3, 4], [1,2,3,4], [1,2,3,4]], dtype=tf.float32)

In [29]:
z = tf.constant([10, 11, 12, 13], dtype=tf.float32)

In [30]:
tf.matmul(x,Y) + z

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[16., 23., 30., 37.]], dtype=float32)>

This operation is very common in deep learning, so it has been abstracted:

In [31]:
dl1 = tf.keras.layers.Dense(4, use_bias = True, weights = [Y,z])
dl1(x)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[16., 23., 30., 37.]], dtype=float32)>

You can choose to apply a function to each value in the output

In [34]:
dl2 = tf.keras.layers.Dense(4, use_bias = True, weights = [Y,z], activation= lambda x:x+1)
dl2(x)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[17., 24., 31., 38.]], dtype=float32)>

We can put different layers together in a sequence:

In [35]:
dl3 = tf.keras.layers.Dense(1, use_bias=False, \
                             weights=[tf.constant([[0], [1], [0], [1]], \
                                                  dtype=tf.float32)])

In [38]:
x_b = dl3(dl2(x))
x_b

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[62.]], dtype=float32)>

We get more flexibility if we subclass `tf.keras.Model`:

In [40]:
class simple_model(tf.keras.Model):
  """Keras model that chains two Dense layers with preset weights.

  The first layer uses the notebook-level tensors `Y` (kernel) and `z` (bias)
  and adds 1 to every output value; the second layer has no bias and a fixed
  0/1 kernel that sums the second and fourth outputs of the first layer.

  NOTE(review): both layers are configured through the `weights=` constructor
  argument; confirm the installed Keras version still accepts it.
  """
  def __init__(self):
    super().__init__()
    first_layer_weights = [Y, z]
    second_layer_kernel = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
    self.dl2 = tf.keras.layers.Dense(
        4,
        use_bias=True,
        weights=first_layer_weights,
        activation=lambda value: value + 1)
    self.dl3 = tf.keras.layers.Dense(
        1,
        use_bias=False,
        weights=[second_layer_kernel])

  def call(self,x):
    # Feed the input through both layers in sequence.
    return self.dl3(self.dl2(x))

In [41]:
en = simple_model()
en(x)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[62.]], dtype=float32)>

So far we have been setting the weights of the dense layers ourselves, but if we don't set them, then the weights are chosen randomly.

In [42]:
dl6 = tf.keras.layers.Dense(4, use_bias=True)
dl6(x)

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=
array([[-3.7222753 , -0.53378916, -0.44897127,  0.8237381 ]],
      dtype=float32)>

In [43]:
dl6.get_weights()

[array([[-0.23246998,  0.52457905, -0.26573527,  0.6066973 ],
        [-0.7453265 ,  0.7768909 , -0.42890045, -0.5480511 ],
        [-0.6663841 , -0.87071663,  0.22485495,  0.43771434]],
       dtype=float32), array([0., 0., 0., 0.], dtype=float32)]

# Define a Recommender Model

The embedding layer gives a list of random numbers for each user and each product.

In [44]:
embed1= tf.keras.layers.Embedding(5,8)

In [45]:
embed1(2)

<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([ 0.00769961,  0.00474096,  0.03710094,  0.00821529,  0.01609159,
       -0.00968228,  0.02942025, -0.02581803], dtype=float32)>

In [46]:
embed1.get_weights()

[array([[ 4.53571789e-02, -7.52891228e-03,  4.00060304e-02,
         -3.24027427e-02, -1.74195170e-02, -1.20781064e-02,
          4.52122800e-02,  2.79351957e-02],
        [ 3.05524729e-02,  6.65962696e-04,  4.91578691e-02,
          4.14819233e-02, -3.22631821e-02, -2.41240263e-02,
         -3.36087123e-02,  3.94546986e-03],
        [ 7.69960880e-03,  4.74096462e-03,  3.71009372e-02,
          8.21528584e-03,  1.60915889e-02, -9.68227535e-03,
          2.94202454e-02, -2.58180257e-02],
        [ 2.36146152e-05, -1.87627226e-03, -4.61183786e-02,
          2.49653123e-02, -1.65243372e-02, -1.48584135e-02,
         -2.90604960e-02, -4.68560308e-03],
        [-4.87407818e-02, -4.37066667e-02, -1.62225142e-02,
         -3.05195209e-02,  4.59719785e-02, -1.92519911e-02,
         -2.63536703e-02, -3.98528464e-02]], dtype=float32)]

Scores can be found using the dot product.

In [47]:
dummy_user_embedding = tf.keras.layers.Embedding(len(dummy_users), 6)
product_user_embedding = tf.keras.layers.Embedding(len(products), 6)

In [48]:
tf.tensordot(dummy_user_embedding(1), product_user_embedding(99),axes=[[0],[0]])

<tf.Tensor: shape=(), dtype=float32, numpy=-0.0019839539>

We can score multiple products at the same time, which is what we need to create a ranking.

In [49]:
example_product=tf.constant([3,77,104,2062])
tf.tensordot(dummy_user_embedding(1),product_user_embedding(example_product), axes=[[0],[1]])

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.00038593, -0.00110699,  0.00023639,  0.00229521], dtype=float32)>

And we can score multiple users for multiple products which we will need to do if we are to train quickly.

But we need to map product ids to embedding ids.

In [51]:
# Static lookup from raw product id (int32) to its position in `products`;
# ids that are not in the table look up to the default value -1.
product_table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(tf.constant(products, dtype=tf.int32), 
                                        range(len(products))), -1)

In [57]:
product_table.lookup(tf.constant([9306139]))

<tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>

Let's put those two things together

In [58]:
class SimpleRecommender(tf.keras.Model):
    """Recommender that scores (user, product) pairs by embedding dot product.

    Raw user ids (strings) and product ids (int32) are translated into
    embedding row indices through static hash tables built from the
    constructor arguments.

    Args:
        dummy_users: sequence of user ids; an id's position in the sequence
            is its row in the user embedding.
        products: sequence of integer product ids; an id's position in the
            sequence is its row in the product embedding.
        len_of_embedding: size of every user/product embedding vector.
    """
    def __init__(self, dummy_users, products, len_of_embedding):
        super(SimpleRecommender, self).__init__()
        self.products = tf.constant(products, dtype=tf.int32)
        self.dummy_users = tf.constant(dummy_users, dtype=tf.string)
        # Ids that are missing from a table look up to the default value -1.
        self.dummy_user_table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(self.dummy_users, range(len(dummy_users))), -1)
        self.product_table = tf.lookup.StaticHashTable(tf.lookup.KeyValueTensorInitializer(self.products, range(len(products))), -1)

        self.user_embedding = tf.keras.layers.Embedding(len(dummy_users), len_of_embedding)
        self.product_embedding = tf.keras.layers.Embedding(len(products), len_of_embedding)

        self.dot = tf.keras.layers.Dot(axes=-1)
        # Number of neighbours returned by call_item_item: 100, or the whole
        # catalogue when it is smaller (top_k needs k <= number of scores).
        self.num_item_recs = min(100, len(products))

    def call(self,inputs):
        """Score products for users.

        Args:
            inputs: pair `[user_ids, product_ids]` of tensors. The example
                call in this notebook passes shapes (batch, 1) and
                (batch, n_products); other shapes are not exercised here.

        Returns:
            Dot product of the user and product embeddings, contracted over
            the last (embedding) axis.
        """
        user = inputs[0]
        products = inputs[1]
        # Must read the attribute assigned in __init__ (`dummy_user_table`).
        user_embedding_index = self.dummy_user_table.lookup(user)
        product_embedding_index = self.product_table.lookup(products)

        user_embedding_values =self.user_embedding(user_embedding_index)
        product_embedding_values =self.product_embedding(product_embedding_index)
        return self.dot([user_embedding_values,product_embedding_values])

    @tf.function
    def call_item_item(self, product):
        """Return the products whose embeddings score highest against `product`.

        Args:
            product: raw product id tensor; assumed to be a scalar int32,
                matching the TensorSpec used when exporting — TODO confirm.

        Returns:
            Tuple `(top_ids, top_scores)`: raw product ids and their
            dot-product scores, best first, with at most 100 entries.
        """
        product_x = self.product_table.lookup(product)
        pe = tf.expand_dims(self.product_embedding(product_x), 0)

        all_pe = tf.expand_dims(self.product_embedding.embeddings, 0)#note this only works if the layer has been built!
        scores = tf.reshape(self.dot([pe, all_pe]), [-1])

        top_scores, top_indices = tf.math.top_k(scores, k=self.num_item_recs)
        # Indices are embedding rows; translate them back to raw product ids.
        top_ids = tf.gather(self.products, top_indices)
        return top_ids, top_scores

In [None]:
srl= SimpleRecommender(dummy_users,products,15)
srl([tf.constant([['pmfkU4BNZhmtLgJQwJ7x'], ['UDRRwOlzlWVbu7H8YCCi']])
    ,tf.constant([[8650774,  9306139,  9961521],[12058614, 12058615, 11927550]])])

In [61]:
products

array([ 8650774,  9306139,  9961521, ..., 12058614, 12058615, 11927550])

# Creating a dataset

First create a tf.data.Dataset from the user purchase pairs.

In [None]:
# Column vectors (one row per purchase) of user ids and product ids.
dummy_user_tensor = tf.constant(train[["dummyUserId"]].values, dtype=tf.string)
product_tensor = tf.constant(train[["productId"]].values, dtype=tf.int32)

# Each dataset element is one (user, product) pair.
dataset = tf.data.Dataset.from_tensor_slices((dummy_user_tensor, product_tensor))
# Peek at a single element. take(1) replaces the loop-and-break, and the
# dedicated loop names avoid overwriting the tensors `x` and `y` that
# earlier cells defined.
for sample_user, sample_product in dataset.take(1):
    print(sample_user)
    print(sample_product)

For each purchase let's sample a number of products that the user did not purchase. Then the model can score each of the products and we will know we are doing a good job if the product with the highest score is the product that the user actually purchased.

We can do this using dataset.map

In [None]:
class Mapper():
    """Callable meant to be passed to `dataset.map` for (user, product) pairs.

    NOTE(review): the constructor keeps the product catalogue and the number
    of negative products, but `__call__` does not use them yet and hands its
    arguments back unchanged.
    """

    def __init__(self, possible_products, num_negative_products):
        # Size of the catalogue.
        catalogue_size = len(possible_products)
        self.num_negative_products = num_negative_products
        self.possible_products_tensor = tf.constant(possible_products, dtype=tf.int32)
        self.num_possible_products = catalogue_size

    def __call__(self, user, product):
        # Identity mapping: the pair is returned exactly as received.
        pair = (user, product)
        return pair

Let's bring the steps together to define a function which creates a dataset 

In [None]:
def get_dataset():
    """Placeholder for the dataset-building function; does nothing and returns None."""
    return None

# Train a model

We need to compile a model, set the loss and create an evaluation metric. Then we need to train the model.

Let's do a manual check on whether the model is any good.

In [None]:
test_product = 11698965

In [None]:
print("Recs for item {}: {}".format(test_product, model.call_item_item(tf.constant(test_product, dtype=tf.int32))))

# Save the model

In [None]:
model_path = "models/recommender/1"

In [None]:
inpute_signature = tf.TensorSpec(shape=(), dtype=tf.int32)

In [None]:
# Export `call_item_item` under an explicit signature name. `model` is the
# name the other cells in this section use; `r1` is not defined in the cells shown here.
signatures = { 'call_item_item': model.call_item_item.get_concrete_function(inpute_signature)}

In [None]:
# Write the model together with its named signature before loading it back.
# Both calls reuse `model_path` so the save and load locations always agree.
tf.saved_model.save(model, model_path, signatures=signatures)
imported_model = tf.saved_model.load(model_path)
list(imported_model.signatures.keys())

In [None]:
# The signature was traced with a scalar int32 TensorSpec (shape=()), so pass a scalar product id.
imported_model.signatures['call_item_item'](tf.constant(14844847, dtype=tf.int32))

In [None]:
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs("dummy/0", exist_ok=True)
# Export without an explicit `signatures` argument.
tf.saved_model.save(model, 'dummy/0')
imported = tf.saved_model.load("dummy/0")
# NOTE(review): the restored object is called directly with a single product id,
# which may not match the inputs `call` was traced with — confirm whether this
# cell is meant to demonstrate that failure.
imported(tf.constant([14844847]))

In [None]:
# exist_ok=True keeps this cell re-runnable once the directory exists.
os.makedirs("dummy/1", exist_ok=True)
# A single function passed as `signatures` is exported under the key 'serving_default'.
tf.saved_model.save(model, 'dummy/1',
                    signatures=model.call_item_item.get_concrete_function(tf.TensorSpec(shape=(), dtype=tf.int32)))
# Reload from the directory just written so the listed keys belong to this export.
imported_model = tf.saved_model.load('dummy/1')
list(imported_model.signatures.keys())

In [None]:
# The exported function takes a scalar int32 product id (TensorSpec shape=()), so pass a scalar.
imported_model.signatures['serving_default'](tf.constant(14844847, dtype=tf.int32))

Zipping the saved model will make it easier to download.

In [None]:
from zipfile import ZIP_DEFLATED, ZipFile
import os

# Bundle every file under "models" into a single archive. ZIP_DEFLATED
# compresses the entries; ZipFile's default (ZIP_STORED) only stores them.
with ZipFile('recommender.zip', 'w', compression=ZIP_DEFLATED) as zipObj:
    # Walk the directory tree and add each file under its relative path.
    for folderName, subfolders, filenames in os.walk("models"):
        for filename in filenames:
            filePath = os.path.join(folderName, filename)
            zipObj.write(filePath)