In [1]:
import time
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
from sklearn.model_selection import train_test_split

In [3]:
# Load the three raw tables. The identifier columns are cast to strings
# because the model's lookup layers consume them as string features.
item_info_df = (
    pd.read_csv("item_info.csv")
    .drop(columns=["min_age", "max_age"])
    .astype({"item_no": str})
)
user_info_df = pd.read_csv("user_info.csv").astype({"user_no": str})
user_item_interaction_df = pd.read_csv("user_item_interactions.csv").astype(
    {"user_no": str, "item_no": str}
)

In [4]:
# Preview the first rows of the item metadata table.
item_info_df.head()

Unnamed: 0,item_no,colour,gender_description,brand,product_group
0,206890150141030846,beige,unisex,bloomingville,furniture
1,7637494654837559066,pink,girls,petit bateau,clothing sets
2,7969520735315050609,beige,unisex,bobo choses,jumpers and knitwear
3,-565751122846696741,white,unisex,piupiuchick,tops
4,6688930722259797984,green,unisex,filibabba,baby feeding


In [5]:
# Preview the first rows of the user metadata table.
user_info_df.head()

Unnamed: 0,user_no,country,aov,avg_markdown
0,8587933979694308845,sweden,479.2,-0.2
1,2740387653650048572,sweden,837.6,-0.2
2,2084988796719274722,sweden,942.4,-0.56
3,-4662401680846085311,sweden,438.4,-0.62
4,-1086148156436227367,sweden,664.8,-0.286667


In [6]:
# Preview the first rows of the user-item interaction log.
user_item_interaction_df.head()

Unnamed: 0,user_no,item_no,date,eventtype
0,9060639138425951676,-478270421339298398,2021-06-24,purchased
1,9060639138425951676,2658388892627023500,2021-11-27,pageView
2,9060639138425951676,504233002877562247,2021-11-26,pageView
3,9060639138425951676,-3413566329152665076,2021-10-21,pageView
4,9060639138425951676,6700254580945881296,2021-10-10,pageView


In [7]:
# Count how many interactions there are of each event type.
user_item_interaction_df["eventtype"].value_counts()

pageView         411685
addToCart        354022
purchased        227165
addToWishlist     18353
Name: eventtype, dtype: int64

# Notebook structure
Overall thought: build a simple but working example that uses only one or two features, so the audience is inspired to experiment on their own with the additional features available in the data. The same reasoning applies to the model complexity: keep it simple here.

Use a lot of the stuff from the TFRS tutorials when it comes to batch-sizes and loss functions to make it "relatable".

## Load data
* What's done above this cell already
## Prepare data
* Choose features
* Preprocess (missing values / standardize  etc)
* Split data train/test
* Create TF Dataset
## Model creation
* Create one Query Model and one Candidate model (use same terminology as in the presentation). Maybe skip the dense layers?
* Create one TFRS model, with appropriate task and loss function using the two tower model architecture.
## Model training
* Would be nice if we had it set up so you could train on either GPU or CPU. So basically a flag for whether you want to use the GPU or not (or check automatically whether a GPU is attached).
* Train for n epochs
* Important to only compute the retrieval metrics when not training (i.e. pass compute_metrics=not training to the task inside the loss function); otherwise the metrics are evaluated on the train data during every epoch, which is painfully slow. So something like (def compute_loss(self, features, training=False) -> tf.Tensor:)

## Model evaluation
* Evaluate on test and train, show difference between Top-100 accuracy
* Show that it overfits easily, just like any NN, so need to apply regularization and early stopping if you want it to generalize well (maybe not implement this)

## Model serving
* Create an index, and predict on one customer, show speed of inference. Batch-predict over all users and show ms/user of inference speed.

In [8]:
# Columns fed to the user (query) tower and the item (candidate) tower.
# The first entry of each list is the identifier; the second is a side feature.
USER_FEATURES = ["user_no", "aov"]
ITEM_FEATURES = ["item_no", "brand"]

In [9]:
def transform_df_to_ds(df: pd.DataFrame, training: bool = False) -> "tf.data.Dataset":
    """Convert a DataFrame into an unbatched ``tf.data.Dataset`` of feature dicts.

    Every dataset element is a dict keyed by column name that holds one row's
    value for each column of ``df``.

    Args:
        df: Frame whose columns become the dataset features.
        training: When True, the dataset is shuffled with a fixed seed and a
            buffer of 1,000,000 elements; ``reshuffle_each_iteration=False``
            keeps the same order on every pass over the data.

    Returns:
        The dataset, shuffled if ``training`` is True. Batching is left to the
        caller.
    """
    # NOTE: this sets TensorFlow's *global* seed as a side effect of the call.
    tf.random.set_seed(42)
    # from_tensor_slices on a {column: values} mapping already yields one
    # {column: value} dict per row, so no additional per-element map is needed.
    ds = tf.data.Dataset.from_tensor_slices(df.to_dict('list'))
    if training:
        return ds.shuffle(1_000_000, seed=42, reshuffle_each_iteration=False)
    return ds

In [10]:
# Candidate dataset: the item features only, unshuffled, in batches of 1024.
items_ds = transform_df_to_ds(
    df=item_info_df[ITEM_FEATURES],
    training=False,
).batch(1024)

In [11]:
# Standardise "aov": subtract the column mean and divide by the column standard deviation.
user_info_df["aov"] = (
    user_info_df["aov"]
    .sub(user_info_df["aov"].mean())
    .div(user_info_df["aov"].std())
)

In [12]:
# Attach features to every (item, user) interaction:
#   * inner join on items - interactions with items missing from item_info_df are dropped;
#   * left join on users  - interactions are kept even when the user has no row in user_info_df.
interaction_with_features_df = (
    user_item_interaction_df[["item_no", "user_no"]]
    .merge(right=item_info_df[ITEM_FEATURES], how="inner", on="item_no")
    .merge(right=user_info_df[USER_FEATURES], how="left", on="user_no")
)

In [14]:
# Impute missing feature values: median for the numeric "aov", the literal
# "unknown" for a missing brand.
# Assign the filled columns back explicitly. Calling fillna(..., inplace=True)
# on a selected column is deprecated in recent pandas and does not modify the
# parent frame once Copy-on-Write is enabled.
interaction_with_features_df["aov"] = interaction_with_features_df["aov"].fillna(
    interaction_with_features_df["aov"].median()
)
interaction_with_features_df["brand"] = interaction_with_features_df["brand"].fillna("unknown")

In [15]:
# Display the merged interaction table (pandas truncates the rendering of long frames).
interaction_with_features_df

Unnamed: 0,item_no,user_no,brand,aov
0,-478270421339298398,9060639138425951676,aden + anais,-0.148118
1,-478270421339298398,9060639138425951676,aden + anais,-0.148118
2,-478270421339298398,9060639138425951676,aden + anais,-0.148118
3,-478270421339298398,4759906564227534049,aden + anais,-0.148118
4,-478270421339298398,-6088219459028908639,aden + anais,0.174368
...,...,...,...,...
1011220,3602204210967538706,523748455264982590,djeco,0.135320
1011221,-8379298112698763661,1176391229201836984,jacadi,-0.300428
1011222,3361370720256307796,-3683116124016444198,jacadi,-0.572064
1011223,-422153658955051132,4666521970382148688,billieblush,-0.148118


In [16]:
# Hold out 20% of the interactions for evaluation. random_state is fixed so the
# split (and therefore the reported metrics) is reproducible across runs.
interaction_with_features_df_train, interaction_with_features_df_test = train_test_split(
    interaction_with_features_df,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Only the training split is shuffled; the held-out split keeps its row order.
train = transform_df_to_ds(df=interaction_with_features_df_train, training=True)
test = transform_df_to_ds(df=interaction_with_features_df_test, training=False)
print(f"Sample sizes in train and val respectively:  {len(train)}, {len(test)}")

Sample sizes in train and val respectively:  808980, 202245


In [18]:
def get_vocabulary(dataset, feature, placeholder="unknown"):
    """Return the distinct values of a column, most frequent first.

    The placeholder used for imputed missing values is left out of the
    vocabulary.

    Args:
        dataset: DataFrame containing the column.
        feature: Name of the column to build the vocabulary from.
        placeholder: Value to exclude from the vocabulary.

    Returns:
        A NumPy array of the distinct values ordered by descending frequency
        (the order produced by ``value_counts``), without ``placeholder``.
    """
    # value_counts sorts by frequency, which is handy if only the most popular
    # values should be kept.
    by_popularity = dataset[feature].value_counts().index.to_list()
    # Filter element-wise: comparing the whole list to a string (list == str)
    # is always False and would leave the placeholder in the vocabulary.
    return np.array([token for token in by_popularity if token != placeholder])

In [22]:
# Create the user model
def create_user_model():
    """Build the query (user) tower as a Keras functional model.

    Inputs are the string feature ``user_no`` and the numeric feature ``aov``.
    The user id is looked up in the vocabulary taken from ``user_info_df``,
    embedded into 71 dimensions and flattened; ``aov`` is appended unchanged,
    which gives a 72-wide output (71 + 1).

    Returns:
        A ``tf.keras.Model`` taking ``[user_no, aov]`` and returning the
        concatenated user representation.
    """
    known_users = get_vocabulary(user_info_df, "user_no")

    # user_no: string id -> integer index -> embedding -> flat vector
    id_in = tf.keras.Input(shape=(1,), dtype="string", name="user_no")
    id_index = tf.keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=known_users,
        mask_token=None,
        name="SL_user_no",
    )(id_in)
    # One row more than the vocabulary size - presumably for the lookup
    # layer's out-of-vocabulary index.
    id_embedded = tf.keras.layers.Embedding(
        len(known_users) + 1,
        71,
        embeddings_regularizer=tf.keras.regularizers.l2(0),
        name="Embedding_user_no",
    )(id_index)
    id_vector = tf.keras.layers.Flatten(name="Flatten_user_no")(id_embedded)

    # aov: numeric feature, used as is
    aov_in = tf.keras.Input(shape=(1,), dtype="float64", name="aov")

    # Dropout and/or a final Dense(32) projection named "user_embedding"
    # could be added on top of the concatenation; both are left out here.
    tower_output = tf.keras.layers.concatenate([id_vector, aov_in])

    return tf.keras.models.Model([id_in, aov_in], tower_output)

In [23]:
# Instantiate the user (query) tower and print its layer summary.
user_model = create_user_model()
user_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_no (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
SL_user_no (StringLookup)       (None, 1)            0           user_no[0][0]                    
__________________________________________________________________________________________________
Embedding_user_no (Embedding)   (None, 1, 71)        1742624     SL_user_no[0][0]                 
__________________________________________________________________________________________________
Flatten_user_no (Flatten)       (None, 71)           0           Embedding_user_no[0][0]          
____________________________________________________________________________________________

In [24]:
# Create the item model
def create_item_model():
    """Build the candidate (item) tower as a Keras functional model.

    Inputs are the string features ``item_no`` and ``brand``. Each one is
    looked up in a vocabulary taken from ``interaction_with_features_df``,
    embedded (64 dimensions for the item id, 8 for the brand) and flattened.
    The two vectors are concatenated into a 72-wide output (64 + 8).

    Returns:
        A ``tf.keras.Model`` taking ``[item_no, brand]`` and returning the
        concatenated item representation.
    """

    def _embed_string_feature(feature, width):
        # Input -> StringLookup -> Embedding -> Flatten for one string column.
        tokens = get_vocabulary(interaction_with_features_df, feature)
        feature_in = tf.keras.Input(shape=(1,), dtype="string", name=feature)
        index = tf.keras.layers.experimental.preprocessing.StringLookup(
            vocabulary=tokens,
            mask_token=None,
            name=f"SL_{feature}",
        )(feature_in)
        # One row more than the vocabulary size - presumably for the lookup
        # layer's out-of-vocabulary index.
        embedded = tf.keras.layers.Embedding(
            len(tokens) + 1,
            width,
            embeddings_regularizer=tf.keras.regularizers.l2(0),
            name=f"Embedding_{feature}",
        )(index)
        flat = tf.keras.layers.Flatten(name=f"Flatten_{feature}")(embedded)
        return feature_in, flat

    item_in, item_vector = _embed_string_feature("item_no", 64)
    # Add brand feature
    brand_in, brand_vector = _embed_string_feature("brand", 8)

    # Dropout and/or a final Dense(32) projection named "item_embedding"
    # could be added on top of the concatenation; both are left out here.
    tower_output = tf.keras.layers.concatenate([item_vector, brand_vector])

    return tf.keras.models.Model([item_in, brand_in], tower_output)

In [25]:
# Instantiate the item (candidate) tower and print its layer summary.
item_model = create_item_model()
item_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
item_no (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
brand (InputLayer)              [(None, 1)]          0                                            
__________________________________________________________________________________________________
SL_item_no (StringLookup)       (None, 1)            0           item_no[0][0]                    
__________________________________________________________________________________________________
SL_brand (StringLookup)         (None, 1)            0           brand[0][0]                      
____________________________________________________________________________________________

In [26]:
class CustomRetrievalModel(tfrs.models.Model):
    """Two-tower retrieval model that pairs a user tower with an item tower."""

    def __init__(self, item_model, user_model, items_ds):
        """Store both towers and set up the retrieval task.

        Args:
            item_model: Candidate tower; also applied to ``items_ds`` to
                produce the candidates for the top-K metrics.
            user_model: Query tower.
            items_ds: Batched dataset of item features.
        """
        super().__init__()
        self.item_model = item_model
        self.user_model = user_model
        candidate_embeddings = items_ds.map(self.item_model)
        top_k_metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)
        self.task: tf.keras.layers.Layer = tfrs.tasks.Retrieval(metrics=top_k_metrics)

    def call(self, features):
        """Run both towers on ``features``; returns ``(user_embeddings, item_embeddings)``."""
        return self.user_model(features), self.item_model(features)

    def compute_loss(self, features, training=False):
        """Return the retrieval task's loss for one batch of features.

        Metrics are only computed when ``training`` is False, so they are
        skipped whenever this method is called with ``training=True``.
        """
        query_vectors, candidate_vectors = self(features)
        return self.task(query_vectors,
                         candidate_vectors,
                         compute_metrics=not training)




In [27]:
N_EPOCHS = 5  # number of epochs passed to model.fit
PATIENCE = 3  # early-stopping patience, in epochs without improvement of the monitored metric
BATCH_SIZE = 4096  # batch size applied to the train and test datasets for fit/evaluate
USE_GPU = False  # when True, training takes the tf.distribute.MirroredStrategy branch

In [28]:
# Stop early when the validation top-100 accuracy has not improved for
# PATIENCE epochs, and roll back to the best weights seen.
callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_factorized_top_k/top_100_categorical_accuracy',
    mode="max",
    patience=PATIENCE,
    restore_best_weights=True,
)

# Combine the two towers into the retrieval model and attach the optimizer.
model = CustomRetrievalModel(item_model=item_model,
                             user_model=user_model,
                             items_ds=items_ds)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

In [29]:
if USE_GPU:
    gpus = tf.config.list_logical_devices('GPU')
    print(f"Using {len(gpus)} GPU's. Devices: {gpus}")
    # MirroredStrategy expects device *names*; fall back to its default device
    # selection when no GPU is listed.
    strategy = tf.distribute.MirroredStrategy(devices=[gpu.name for gpu in gpus] or None)

    # Model weights and optimizer state must be created inside the strategy
    # scope, so the towers and the retrieval model are rebuilt and compiled
    # here. Wrapping only fit() in the scope is not sufficient. From this point
    # on, use model.user_model / model.item_model rather than the towers built
    # in earlier cells.
    with strategy.scope():
        model = CustomRetrievalModel(create_item_model(),
                                     create_user_model(),
                                     items_ds)
        model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))

# fit() picks up the strategy the model was created under, so a single call
# serves both the CPU and the GPU path.
history = model.fit(train.batch(BATCH_SIZE),
                    epochs=N_EPOCHS,
                    validation_data=test.batch(BATCH_SIZE),
                    callbacks=[callback_early_stopping])

Epoch 1/5


  [n for n in tensors.keys() if n not in ref_input_names])
  [n for n in tensors.keys() if n not in ref_input_names])


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
# Evaluate on a training sample of the same size as the test set, then on the
# test set itself, and report the top-100 accuracy for both.
train_eval_ds = train.take(len(test)).batch(BATCH_SIZE)
test_eval_ds = test.batch(BATCH_SIZE)
train_top_100_accuracy = model.evaluate(train_eval_ds, return_dict=True)
val_top_100_accuracy = model.evaluate(test_eval_ds, return_dict=True)
print(f"Train top-100 accuracy:  {train_top_100_accuracy['factorized_top_k/top_100_categorical_accuracy']}")
print(f"Test top-100 accuracy:  {val_top_100_accuracy['factorized_top_k/top_100_categorical_accuracy']}")

Train top-100 accuracy:  0.2574847340583801
Test top-100 accuracy:  0.12093747407197952


In [35]:
# Serving
k=100
brute_force_index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=k)
brute_force_index.index_from_dataset(
    items_ds.map(lambda item: (item["item_no"], model.item_model(item)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f277fe22210>

In [38]:
# Build the query for one user from the features the user tower consumes.
# Passing the remaining user_info_df columns as well would only trigger Keras'
# warning about input keys that match no model input.
user_input = {
    feature: np.array(values)
    for feature, values in user_info_df[USER_FEATURES].head(1).to_dict('list').items()
}
_, recommendations = brute_force_index(user_input)
recommendations[0][:5] # Top 5 items for one user.

  [n for n in tensors.keys() if n not in ref_input_names])


<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'-1862413450221128561', b'2415771766691761607',
       b'-5229418859057656862', b'4841275771541059511',
       b'-8630133537136090944'], dtype=object)>

In [54]:
# A dedicated name keeps the training BATCH_SIZE intact, so re-running the
# training cell after this one does not silently train with a batch of 500.
INFERENCE_BATCH_SIZE = 500
N_USERS_TO_PREDICT_FOR = 20_000

# Only the user tower's inputs are needed for querying the index.
serving_users_df = user_info_df[USER_FEATURES]
# Never report more users than there are rows to score.
n_users = min(N_USERS_TO_PREDICT_FOR, len(serving_users_df))

# perf_counter is a monotonic, high-resolution clock meant for measuring durations.
start_timer = time.perf_counter()
for start in range(0, n_users, INFERENCE_BATCH_SIZE):
    stop = min(start + INFERENCE_BATCH_SIZE, n_users)
    user_input = {
        feature: np.array(values)
        for feature, values in serving_users_df.iloc[start:stop].to_dict('list').items()
    }
    _, recommendations = brute_force_index(user_input)
# Measure once so both reported figures are derived from the same duration.
elapsed = time.perf_counter() - start_timer

print(
    f"Inference for top-{k} item recommendations for {n_users} number of users completed in {elapsed:.2f} seconds."
    f"\nAverage inference time per user is {elapsed/n_users*1000:.2f}ms "
)

Inference for top-100 item recommendations for 20000 number of users completed in 2.57 seconds.
Average inference time per user is 0.13ms 
