# Trying to adapt solution from [TFRS](https://www.tensorflow.org/recommenders/examples/basic_retrieval) tutorial for H&M data.

In [393]:
import os
import pprint
import tempfile
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
tf.get_logger().setLevel('INFO')
from typing import Dict, Text
from sklearn.model_selection import train_test_split

# Short EDA on customer & article metadata

In [394]:
#article data
art_df = pd.read_csv('../../data/processed/articles_filled_priced.csv')

In [395]:
#customer data
cus_df = pd.read_csv('../../data/processed/customers_filled.csv')

- H&M transaction data were subsampled using [custom function](https://github.com/omegatro/IGP_2023/blob/main/modules/preprocessing.py to 5% of the original records (seed = 4312).
- The subsampled data were transformed to represent number of transactions for individual customer-product combinations (number of times each product was bought by each customer). The following code was used:
```
df = df.groupby(['customer_id', 'article_id']).count()
df = df[['customer_id', 'article_id','t_dat']].reset_index()
df.rename(columns={'t_dat':'t_count'}, inplace = True)
df.to_csv('0_05_4312_cus_art_grp_count.csv', header=True, index=False)
```
- The resulting file was uploaded to google drive to work with from colab.

In [396]:
#In this iteration, only ids are used as attributes for simplicity
counts = pd.read_csv('../../data/processed/0_05_4312_cus_art_grp_count.csv')[['customer_id', 'article_id']].sample(250_000)

In [397]:
#Adding product features
counts = counts.merge(art_df, left_on='article_id', right_on='article_id', how='left')

In [398]:
counts.columns

Index(['customer_id', 'article_id', 'product_code', 'prod_name',
       'product_type_no', 'product_type_name', 'product_group_name',
       'graphical_appearance_no', 'graphical_appearance_name',
       'colour_group_code', 'colour_group_name', 'perceived_colour_value_id',
       'perceived_colour_value_name', 'perceived_colour_master_id',
       'perceived_colour_master_name', 'department_no', 'department_name',
       'index_code', 'index_name', 'index_group_no', 'index_group_name',
       'section_no', 'section_name', 'garment_group_no', 'garment_group_name',
       'detail_desc', 'avg_price'],
      dtype='object')

In [399]:
#Adding customer features
counts = counts.merge(cus_df, left_on='customer_id', right_on='customer_id', how='left')

In [416]:
#Generating training and test subsets
train, test = train_test_split(counts, test_size=0.2)
#Getting unique user ids (required to transform those into embeddings by the model)
unq_cids = counts.customer_id.unique()

In [417]:
#Getting product ids (required to transform those into embeddings by the model)
articles = pd.read_csv('../../data/processed/articles_filled_priced.csv')[['article_id', 'prod_name', 'product_group_name', 'product_type_name']]#.sample(sample_size, random_state=1)
unq_articles = articles.article_id.unique().astype(str)
article_ids = articles['article_id'].astype(str).map(lambda x: x.encode('utf-8'))

In [418]:
print(article_ids[:5])

0    b'108775015'
1    b'108775044'
2    b'108775051'
3    b'110065001'
4    b'110065002'
Name: article_id, dtype: object


In [419]:
print(unq_articles[:5])

['108775015' '108775044' '108775051' '110065001' '110065002']


In [420]:
len(unq_cids)

191709

In [421]:
len(unq_articles)

105542

In [422]:
feature_names = ['customer_id', 'article_id', 'prod_name', 'product_group_name', 'product_type_name', 'postal_code']

In [423]:
for n in feature_names:
    train[n] = train[n].astype(str)
train = train[feature_names].to_dict(orient='list')
train = tf.data.Dataset.from_tensor_slices(train)

In [424]:
for n in feature_names:
    test[n] = test[n].astype(str)
test = test[feature_names].to_dict(orient='list')
test = tf.data.Dataset.from_tensor_slices(test)

In [425]:
for x in train.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'534164019',
 'customer_id': b'cd28e04198523f1e0997b5ffd7c5fc2cde37ef8340fb55cf4c1cfeb04db5'
                b'96dd',
 'postal_code': b'346f1dfc9311c793a9f3ca9f5e522c061cc8fd63f84a289b982c70567389'
                b'67e2',
 'prod_name': b'NT Alva 2-pack',
 'product_group_name': b'Garment Upper body',
 'product_type_name': b'Vest top'}


In [426]:
for x in test.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'816908001',
 'customer_id': b'35e15ad1e4d0bc7148bd9f034aeb7a5bb7d1420a463f720c45a0919ac740'
                b'8a5b',
 'postal_code': b'4ece051d46240909b6facbae07145988c16ffc8549c338c29cdd3e94afc5'
                b'4749',
 'prod_name': b'Dress Prairie',
 'product_group_name': b'Garment Full body',
 'product_type_name': b'Dress'}


# TFRS-based modeling experiments

# Converting from pandas df to tensorflow datasets for compatibility

# Adapting the model

In [406]:
embedding_dimension = 64

# Converting features into embedding vectors


In [427]:
customer_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_cids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.\
  tf.keras.layers.Embedding(len(unq_cids) + 1, embedding_dimension)
])

In [428]:
product_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_articles, mask_token=None),
  tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
])

In [330]:
# unq_prod_names = articles.prod_name.unique().astype(str)
# unq_prod_grp_names = articles.product_group_name.unique().astype(str)
# unq_product_type_name = articles.product_type_name.unique().astype(str)
# unq_postal_code = counts.postal_code.unique().astype(str)

# customer_postal_code = tf.keras.Sequential([
#   tf.keras.layers.StringLookup(
#       vocabulary=unq_postal_code, mask_token=None),
#   tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
# ])

# product_model = tf.keras.Sequential([
#   tf.keras.layers.StringLookup(
#       vocabulary=unq_articles, mask_token=None),
#   tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
# ])

# product_name_model = tf.keras.Sequential([
#   tf.keras.layers.StringLookup(
#       vocabulary=unq_prod_names, mask_token=None),
#   tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
# ])

# product_group_model = tf.keras.Sequential([
#   tf.keras.layers.StringLookup(
#       vocabulary=unq_prod_grp_names, mask_token=None),
#   tf.keras.layers.Embedding(len(unq_prod_grp_names) + 1, embedding_dimension)
# ])

# product_type_model = tf.keras.Sequential([
#   tf.keras.layers.StringLookup(
#       vocabulary=unq_product_type_name, mask_token=None),
#   tf.keras.layers.Embedding(len(unq_product_type_name) + 1, embedding_dimension)
# ])

In [429]:
#Generating embeddings from article ids to be used by evaluation layer to calculate the similarity score
article_embeddings = product_model(article_ids)
candidates_dataset = tf.data.Dataset.from_tensor_slices(article_embeddings)

In [430]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates=candidates_dataset.batch(128)  # Batch the candidates for efficiency
)

In [431]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [432]:
class ProductlensModel(tfrs.Model):

  def __init__(self, customer_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.customer_model: tf.keras.Model = customer_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    customer_embeddings = self.customer_model(features["customer_id"])
    # And pick out the product features and pass them into the product model,
    # getting embeddings back.
    positive_product_embeddings = self.product_model(features["article_id"])

    # The task computes the loss and the metrics.
    return self.task(customer_embeddings, positive_product_embeddings, compute_metrics=not training)

In [357]:
# class ProductlensModel(tfrs.Model):

#   def __init__(self, customer_model, product_model, customer_postal_code, product_name_model, product_group_model, product_type_model, embedding_dimension):
#     super().__init__()
#     self.customer_model = customer_model
#     self.product_model = product_model
#     self.customer_postal_code = customer_postal_code
#     self.product_name_model = product_name_model
#     self.product_group_model = product_group_model
#     self.product_type_model = product_type_model

#     # Initialize the retrieval task
#     self.task = tfrs.tasks.Retrieval(
#       # Set the metric to evaluate the model
#       metrics=tfrs.metrics.FactorizedTopK(
#         candidates=candidates_dataset.batch(128)  # Assuming you have a pre-computed embeddings layer for products
#       )
#     )

#   def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
#     # Compute embeddings for customer and product
#     customer_embeddings = self.customer_model(features["customer_id"])
#     product_embeddings = self.product_model(features["article_id"])

#     # Compute embeddings for the customer's postal code
#     postal_code_embeddings = self.customer_postal_code(features["postal_code"])

#     # Combine customer embeddings and postal code embeddings
#     query_embeddings = tf.concat([customer_embeddings, postal_code_embeddings], axis=0)

#     # Compute embeddings for additional product features
#     product_name_embeddings = self.product_name_model(features["prod_name"])
#     product_group_embeddings = self.product_group_model(features["product_group_name"])
#     product_type_embeddings = self.product_type_model(features["product_type_name"])

#     # Combine product embeddings and additional product feature embeddings
#     candidate_embeddings = tf.concat([product_embeddings, product_name_embeddings, product_group_embeddings, product_type_embeddings], axis=0)

#     return self.task(query_embeddings, candidate_embeddings, compute_metrics=not training)

In [437]:
#Hyperparameters
alpha = 1
batch_size = 256
epochs = 1

In [438]:
model = ProductlensModel(customer_model, product_model) #customer_postal_code, product_name_model, product_group_model, product_type_model, embedding_dimension)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=alpha))

In [439]:
cached_train = train.batch(batch_size).cache()
cached_test = test.batch(batch_size).cache()

In [440]:
model.fit(cached_train, epochs=epochs)



<keras.src.callbacks.History at 0x7f9b5f729c60>

In [441]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.3996799886226654,
 'factorized_top_k/top_5_categorical_accuracy': 0.4053800106048584,
 'factorized_top_k/top_10_categorical_accuracy': 0.40799999237060547,
 'factorized_top_k/top_50_categorical_accuracy': 0.41326001286506653,
 'factorized_top_k/top_100_categorical_accuracy': 0.41585999727249146,
 'loss': 810.5859375,
 'regularization_loss': 0,
 'total_loss': 810.5859375}

In [442]:
# If 'test' is a dataset of dictionaries that include 'article_id'
candidates = test.map(lambda x: (x['article_id'], model.product_model(x['article_id'])))

index = tfrs.layers.factorized_top_k.BruteForce(model.customer_model)
index.index_from_dataset(candidates.batch(100))

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f9b6a51a8c0>

In [443]:
# Get recommendations for a specific customer
customer_id = 'abc'
_, titles = index(tf.constant([customer_id]))
print(f"Recommendations for customer {customer_id}: {titles[0].numpy().astype(str).tolist()}")

Recommendations for customer abc: ['426019008', '842605006', '842605006', '861173003', '727704001', '727704001', '693242009', '693242009', '693242009', '446224016']


# Ranking model

In [15]:
#In this iteration, only ids are used as attributes for simplicity
counts = pd.read_csv('../../data/processed/0_05_4312_cus_art_grp_count.csv')[['customer_id', 'article_id','t_count']]
#Generating training and test subsets
train, test = train_test_split(counts, test_size=0.2)
#Getting unique user ids (required to transform those into embeddings by the model)
unq_cids = counts.customer_id.unique()

In [18]:
#Getting product ids (required to transform those into embeddings by the model)
articles = pd.read_csv('../../data/processed/articles_filled_priced.csv')[['article_id']]
unq_articles = articles.article_id.unique().astype(str)
article_ids = articles['article_id'].astype(str).map(lambda x: x.encode('utf-8'))

In [21]:
train['customer_id'] = train.customer_id.astype(str)
train['article_id'] = train.article_id.astype(str)
train = train.to_dict(orient='list')
train = tf.data.Dataset.from_tensor_slices(train)

In [22]:
for x in train.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'714790003',
 'customer_id': b'39bc5e71c3609f8e37bd24099d7793dabbd9fdd73261e443f0a929e79b7e'
                b'658e',
 't_count': 1}


In [23]:
test['customer_id'] = test.customer_id.astype(str)
test['article_id'] = test.article_id.astype(str)
test = test.to_dict(orient='list')
test = tf.data.Dataset.from_tensor_slices(test)

In [24]:
for x in test.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'768759002',
 'customer_id': b'99b8cd105ee382f5c0bdaf672696522a0564eec294e94b11ac8b8d7bd2d1'
                b'3366',
 't_count': 1}


In [25]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.customer_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unq_cids , mask_token=None),
      tf.keras.layers.Embedding(len(unq_cids) + 1, embedding_dimension)
    ])

    # Compute embeddings for product.
    self.product_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unq_articles, mask_token=None),
      tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    cus_id, art_id = inputs

    cus_embedding = self.customer_embeddings(cus_id)
    art_embedding = self.product_embeddings(art_id)

    return self.ratings(tf.concat([cus_embedding, art_embedding], axis=1))

In [26]:
task = tfrs.tasks.Ranking(
  loss = tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

In [27]:
class ProductlensModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["customer_id"], features["article_id"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("t_count")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [43]:
#Hyperparameters
alpha = 0.001
batch_size = 128
epochs = 5

In [44]:
model = ProductlensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=alpha))

In [45]:
cached_train = train.batch(batch_size).cache()
cached_test = test.batch(batch_size).cache()

In [46]:
model.fit(cached_train, epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fabe75dbe20>

In [47]:
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 0.1125449687242508,
 'loss': 0.0002905943838413805,
 'regularization_loss': 0,
 'total_loss': 0.0002905943838413805}

In [48]:
test_counts = {}
sample = counts.sample(1000)
test_product_ids = zip(sample['customer_id'],sample['article_id'], sample['t_count'])
for pid, aid, t_count in test_product_ids:
  test_counts[pid] = (model({
      "customer_id": np.array(['73f7d8cca33b8236879d92a6188f691410a82763dc8bb65b8829568c07f8d9a7']),
      "article_id": np.array([pid])

  }).numpy()[0,0], t_count)

In [49]:
print("t_counts:")
for title, score in sorted(test_counts.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

t_counts:
94665b46e194622ccdbcadc0170f13a2f8ede1ff6d057d43a19b8938c808b662: (1.000554, 10)
e2115fc63d2a66bb51d6859336971a2d54ce64794a932264e7bc8c8242a49efd: (1.000554, 3)
de77258ebc4461f81aaec13984c8eb298e027ce7d13d7c846554ad54ca2cdf58: (1.000554, 2)
98158bad80501d2cde99489e8833f4e233a819bd52466cfb19c442d9a155dea3: (1.000554, 2)
6177112295d53d4cc4ae600703aa8d68e337fee0002e0beaeeacab5a58ed24cf: (1.000554, 2)
00227494dd4e87da02bb1ab4afc38f13f2e11c6517b1bbcdca56f212a9c470ab: (1.000554, 2)
77cdcc11e835b84fdf84df083bc371d326842d4e76cd65bd4a5b90387764ef3f: (1.000554, 2)
7b549ec86c17bdf75085c244cf5983827246f189c149bd78e5e5459529380564: (1.000554, 2)
dc0cadd81f8277ca00e28370652dab8688afbf9509503e48bc40673b2658810c: (1.000554, 2)
e0ae5cd316645b3d815f223678b21343bf585460914b28d4a1200250b2d08481: (1.000554, 2)
c6b096111bf0a6fe256013815014d51a572f380b6563590fc72f96c71eb5efb4: (1.000554, 2)
08e91b986d9162ea6ff8ff9a84d22b155e7ed00f41933bab339600fda8f9e3a9: (1.000554, 2)
ee3bb9bd89df7939ea8794f6744ef

# Including Features

In [134]:
#In this iteration, only ids are used as attributes for simplicity
counts = pd.read_csv('../../data/processed/combined_agg_filled_prices.csv')
articles = pd.read_csv('../../data/processed/articles_filled_priced.csv')
#Generating training and test subsets
train, test = train_test_split(counts, test_size=0.2)
#Getting unique user ids (required to transform those into embeddings by the model)
unq_cids = counts.customer_id.unique()

In [135]:
strings = [
    'customer_id',
    'article_id',
    "product_group_name",
    "product_type_name",
    "prod_name",
    "postal_code"
]

In [136]:
for c in strings:
    train[c] = train[c].astype(str)
    test[c] = test[c].astype(str)

In [137]:
train = train[strings].to_dict(orient='list')
train = tf.data.Dataset.from_tensor_slices(train)

In [138]:
test = test[strings].to_dict(orient='list')
test = tf.data.Dataset.from_tensor_slices(test)

In [139]:
embedding_dimension = 32

unq_cids = counts.customer_id.unique()
unq_articles = articles.article_id.unique().astype(str)
unq_prod_names = articles.prod_name.unique().astype(str)
unq_prod_grp_names = articles.product_group_name.unique().astype(str)
unq_product_type_name = articles.product_type_name.unique().astype(str)
unq_postal_code = counts.postal_code.unique().astype(str)

In [140]:
customer_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_cids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.\
  tf.keras.layers.Embedding(len(unq_cids) + 1, embedding_dimension)
])

In [141]:
customer_postal_code = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_postal_code, mask_token=None),
  tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
])

In [142]:
product_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_articles, mask_token=None),
  tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
])

In [143]:
product_name_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_prod_names, mask_token=None),
  tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
])

In [144]:
product_group_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_prod_grp_names, mask_token=None),
  tf.keras.layers.Embedding(len(unq_prod_grp_names) + 1, embedding_dimension)
])

In [145]:
product_type_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_product_type_name, mask_token=None),
  tf.keras.layers.Embedding(len(unq_product_type_name) + 1, embedding_dimension)
])

In [169]:
class CustomerModel(tf.keras.Model):

  def __init__(self, customer_model, customer_postal_code):
    super().__init__()

    self.customer_embedding = customer_model
    self.pcode_embedding = customer_postal_code

  def call(self, inputs):
    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.customer_embedding(inputs['customer_id']), 
        self.pcode_embedding(inputs['postal_code'])
    ], axis = 1)


In [170]:
class ProductModel(tf.keras.Model):

  def __init__(self, product_model, product_name_model, product_group_model, product_type_model):
    super().__init__()

    self.product_embedding = product_model
    self.prod_name_embedding = product_name_model
    self.prod_group_embedding = product_group_model
    self.prod_type_embedding = product_type_model
    

  def call(self, inputs):
    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.product_embedding(inputs['article_id']), 
        self.prod_name_embedding(inputs['prod_name']),
        self.prod_group_embedding(inputs['prod_name']),
        self.prod_type_embedding(inputs['prod_name']),
    ], axis = 1)

In [182]:
cus_model = CustomerModel(customer_model, customer_postal_code)
result = cus_model.predict(train.map(lambda x:{'customer_id': x['customer_id'], 'postal_code': x['postal_code']}).batch(2).take(1).as_numpy_iterator())
print(f"Computed representations: {result}")

Computed representations: [[-2.9230226e-02 -4.7407057e-02  2.2606660e-02  1.0103084e-02
   2.9501431e-03 -4.6536267e-02  2.5976110e-02 -3.7794016e-02
   4.4813763e-02  3.3072893e-02 -4.0295411e-02  2.7855206e-02
  -1.2080893e-03  1.6354982e-02 -4.1209985e-02  2.5193226e-02
   2.2053126e-02 -9.2621893e-04  2.5041763e-02  4.6579268e-02
  -2.0399535e-02  3.1951789e-02 -1.6342305e-02 -2.7934229e-02
  -3.0946661e-02  1.9952450e-02 -9.5707066e-03 -2.3289716e-02
  -1.3773989e-02 -4.5054182e-03  4.1814335e-03 -4.6160724e-02
  -2.3128927e-02 -3.2665752e-02  4.1474629e-02 -4.3498874e-02
   3.8852919e-02 -1.9758059e-02  4.7733080e-02 -8.4118135e-03
   1.5131485e-02  4.5296792e-02 -3.3617951e-02  4.6024289e-02
  -3.5803009e-02  4.5428287e-02  4.9531329e-02  1.0164451e-02
   4.5777250e-02  3.3685837e-02  3.0546639e-02 -4.3318629e-02
  -2.2270307e-03 -8.1417672e-03  4.9524691e-02  3.6463860e-02
   3.2982659e-02 -3.9024223e-02 -3.9633237e-02  4.2580906e-02
   4.3280497e-03 -2.8862273e-02 -4.1390419e-

2023-10-28 16:54:03.142845: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 9209440773478416177
2023-10-28 16:54:03.142909: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 17071570812926990502


In [183]:
prod_model = ProductModel(product_model, product_name_model, product_group_model, product_type_model)
result = prod_model.predict(train.map(lambda x:{'article_id': x['article_id'], 'prod_name':x['prod_name'], 'product_group_name':x['product_group_name'], 'product_type_name':x['product_type_name']}).batch(2).take(1).as_numpy_iterator())
print(f"Computed representations: {result}")

Computed representations: [[-0.04932239 -0.00554194  0.0097758  -0.04227427 -0.03740822  0.035476
  -0.02660975 -0.02698498 -0.02261945 -0.0479384   0.04392023 -0.03645116
   0.0447383   0.02398094  0.04003687 -0.01344709 -0.0003892   0.03450021
  -0.00163323  0.02711442  0.0347724  -0.00858748 -0.00101597  0.00041706
   0.01922674 -0.03225617  0.03370135 -0.02884432 -0.02716791  0.04245004
  -0.03047761  0.0418885  -0.04689301 -0.01202943  0.00862228  0.03216324
  -0.01812803  0.02805069  0.0239356  -0.01879107  0.01218547  0.04527893
  -0.00835783 -0.0163364  -0.03082787 -0.00798173  0.01657363  0.04806644
   0.03089077  0.03764597  0.04852272  0.0480516  -0.03868947 -0.03421472
   0.00923457  0.01202055 -0.02547543 -0.01344123  0.00899    -0.03201952
  -0.03558545 -0.00438095  0.0261238   0.03221149 -0.01598186  0.00189037
   0.04495383 -0.00028386  0.04367014 -0.02503189 -0.00088694 -0.01454159
   0.00662593 -0.0259274  -0.01507684 -0.00020577 -0.0234944   0.02216207
  -0.02081572 

2023-10-28 16:54:15.815640: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 14381376272857983699
2023-10-28 16:54:15.815706: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 14509443298838281313
2023-10-28 16:54:15.815732: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 4973267201423381505
2023-10-28 16:54:15.815749: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 5276869222262990729
2023-10-28 16:54:15.815767: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 5683344778410076232
2023-10-28 16:54:15.815789: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv item cancelled. Key hash: 14507250806045515716
2023-10-28 16:54:15.815808: I tensorflow/core/framework/local_rendezvous.cc:421] Local rendezvous recv 

In [173]:
#Generating embeddings from article ids to be used by evaluation layer to calculate the similarity score
articles = pd.read_csv('../../data/processed/articles_filled_priced.csv')[['article_id', 'prod_name']]#.sample(sample_size, random_state=1)
article_ids = articles.astype(str).map(lambda x: x.encode('utf-8'))
article_embeddings = prod_model(article_ids)
candidates_dataset = tf.data.Dataset.from_tensor_slices(article_embeddings)

In [174]:
class QueryModel(tf.keras.Model):
  """Model for encoding user queries (customers)."""

  def __init__(self, layer_sizes, customer_model, customer_postal_code):
    """Model for encoding user queries (customers).

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    # We first use the user model for generating embeddings.
    self.embedding_model = CustomerModel(customer_model, customer_postal_code)

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

In [175]:
class CandidateModel(tf.keras.Model):
  """Model for encoding products."""

  def __init__(self, layer_sizes, product_model, product_name_model, product_group_model, product_type_model):
    """Model for encoding products.

    Args:
      layer_sizes:
        A list of integers where the i-th entry represents the number of units
        the i-th layer contains.
    """
    super().__init__()

    self.embedding_model = ProductModel(product_model, product_name_model, product_group_model, product_type_model)

    # Then construct the layers.
    self.dense_layers = tf.keras.Sequential()

    # Use the ReLU activation for all but the last layer.
    for layer_size in layer_sizes[:-1]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

    # No activation for the last layer.
    for layer_size in layer_sizes[-1:]:
      self.dense_layers.add(tf.keras.layers.Dense(layer_size))

  def call(self, inputs):
    feature_embedding = self.embedding_model(inputs)
    return self.dense_layers(feature_embedding)

In [193]:
class ProductlensModel(tfrs.models.Model):

    def __init__(self, layer_sizes, product_model, customer_model, customer_postal_code, product_name_model, product_group_model, product_type_model):
        super().__init__()
        self.query_model = QueryModel(layer_sizes, customer_model, customer_postal_code)
        self.candidate_model = CandidateModel(layer_sizes, product_model, product_name_model, product_group_model, product_type_model)
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=train.batch(128).map(self.candidate_model),
            ),
        )

    def compute_loss(self, features, training=False):
        # We only pass the user id and timestamp features into the query model. This
        # is to ensure that the training inputs would have the same keys as the
        # query inputs. Otherwise the discrepancy in input structure would cause an
        # error when loading the query model after saving it.
        query_embeddings = self.query_model({
            "customer_id": features["customer_id"],
            "postal_code": features["postal_code"]
        })
        product_embeddings = self.candidate_model({
            'article_id':features["article_id"], 
            "prod_name":features["prod_name"],
            'product_group_name':features['product_group_name'], 
            'product_type_name':features['product_type_name']
        })

        return self.task(
            query_embeddings, product_embeddings, compute_metrics=not training)


    def call(self, inputs, training=False):
        # Extract different inputs
        customer_id = inputs['customer_id']
        postal_code = inputs['postal_code']
        article_id = inputs['article_id']
        prod_name = inputs['prod_name']
        product_group_name = inputs['product_group_name']
        product_type_name = inputs['product_type_name']

        # Compute query and candidate embeddings
        query_embeddings = self.query_model({
            "customer_id": customer_id,
            "postal_code": postal_code,
        })
        product_embeddings = self.candidate_model({
            'article_id': article_id,
            'prod_name': prod_name,
            'product_group_name': product_group_name,
            'product_type_name': product_type_name,
        })

        # (You might need to add additional logic here, depending on your specific use case)

        return query_embeddings, product_embeddings

In [232]:
num_epochs = 1
learning_rate = 0.001
batch_size=128

In [233]:
cached_train = train.batch(batch_size).cache()
cached_test = test.batch(batch_size).cache()

In [234]:
model = ProductlensModel([128,64,32], product_model=product_model, customer_model=customer_model, customer_postal_code=customer_postal_code, product_name_model=product_name_model, product_group_model=product_group_model, product_type_model=product_type_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate))

In [229]:
example_batch = next(iter(cached_train))
model(example_batch)

2023-10-28 17:15:35.382971: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


(<tf.Tensor: shape=(128, 32), dtype=float32, numpy=
 array([[ 0.0245174 ,  0.03292342,  0.02845723, ..., -0.00230326,
         -0.05297849, -0.05727484],
        [-0.02250495, -0.04904301, -0.01298611, ...,  0.0128459 ,
         -0.12953466,  0.00859541],
        [-0.00806028,  0.04020363, -0.02803747, ...,  0.02295982,
         -0.07822144, -0.03084722],
        ...,
        [ 0.01989274,  0.00837284, -0.00828524, ...,  0.00248457,
         -0.01894611,  0.00043546],
        [ 0.00468841,  0.02643885,  0.01049398, ...,  0.01359197,
         -0.00958424, -0.01350108],
        [-0.02349425,  0.01075432, -0.03159231, ...,  0.03635402,
         -0.05501918, -0.00261772]], dtype=float32)>,
 <tf.Tensor: shape=(128, 32), dtype=float32, numpy=
 array([[-0.04177791,  0.00132753, -0.02987937, ..., -0.01257239,
          0.00306093,  0.05912175],
        [-0.04212455,  0.0253659 , -0.0036051 , ...,  0.08195554,
          0.05458586, -0.01644351],
        [-0.11145473,  0.03343907,  0.01572179, .

In [230]:
model.summary()

Model: "productlens_model_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 query_model_22 (QueryModel  multiple                  23143136  
 )                                                               
                                                                 
 candidate_model_22 (Candid  multiple                  6786464   
 ateModel)                                                       
                                                                 
 retrieval_20 (Retrieval)    multiple                  0 (unused)
                                                                 
Total params: 29929601 (114.17 MB)
Trainable params: 29929600 (114.17 MB)
Non-trainable params: 1 (4.00 Byte)
_________________________________________________________________


In [235]:
three_layer_history = model.fit(
    cached_train,
    epochs=num_epochs,
    verbose=1)




In [None]:
model.evaluate(cached_test, return_dict=True)