# Trying to adapt solution from [TFRS](https://www.tensorflow.org/recommenders/examples/basic_retrieval) tutorial for H&M data.

In [88]:
!pip install -q tensorflow-recommenders
!pip install -q scann

In [89]:
import os
import pprint
import tempfile
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

from typing import Dict, Text
from sklearn.model_selection import train_test_split

- H&M transaction data were subsampled using [custom function](https://github.com/omegatro/IGP_2023/blob/main/modules/preprocessing.py to 5% of the original records (seed = 4312).
- The subsampled data were transformed to represent number of transactions for individual customer-product combinations (number of times each product was bought by each customer). The following code was used:
```
df = df.groupby(['customer_id', 'article_id']).count()
df = df[['customer_id', 'article_id','t_dat']].reset_index()
df.rename(columns={'t_dat':'t_count'}, inplace = True)
df.to_csv('0_05_4312_cus_art_grp_count.csv', header=True, index=False)
```
- The resulting file was uploaded to google drive to work with from colab.

In [193]:
#In this iteration, only ids are used as attributes for simplicity
sample_size = 10_000
counts = pd.read_csv('/content/drive/MyDrive/0_05_4312_cus_art_grp_count.csv')[['customer_id', 'article_id']].sample(sample_size, random_state=1)
#Generating training and test subsets
train, test = train_test_split(counts, test_size=0.2)
#Getting unique user ids (required to transform those into embeddings by the model)
unq_cids = counts.customer_id.unique()

In [194]:
#Getting product ids (required to transform those into embeddings by the model)
articles = pd.read_csv('/content/drive/MyDrive/articles.csv')[['article_id']].sample(sample_size, random_state=1)
unq_articles = articles.article_id.unique().astype(str)
article_ids = articles['article_id'].astype(str).map(lambda x: x.encode('utf-8'))

In [195]:
print(article_ids[:5])

38159    b'656677021'
69760    b'758050001'
55574    b'711043002'
88565    b'833548001'
79069    b'796535023'
Name: article_id, dtype: object


In [172]:
print(unq_articles[:5])

['656677021' '758050001' '711043002' '833548001' '796535023']


In [173]:
len(unq_cids)

46906

In [174]:
len(unq_articles)

50000

# Converting from pandas df to tensorflow datasets for compatibility

In [196]:
train['customer_id'] = train.customer_id.astype(str)
train['article_id'] = train.article_id.astype(str)
train = train.to_dict(orient='list')
train = tf.data.Dataset.from_tensor_slices(train)

In [197]:
test['customer_id'] = test.customer_id.astype(str)
test['article_id'] = test.article_id.astype(str)
test = test.to_dict(orient='list')
test = tf.data.Dataset.from_tensor_slices(test)

In [177]:
for x in train.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'718743001',
 'customer_id': b'e20942b0dee85a7b49e70cdafa4c14688e6c6b6336821e6652927e9cc1f8'
                b'dbf0'}


In [178]:
for x in test.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'685816041',
 'customer_id': b'a6ef8e2cecaa0d8264fc900edb0e0316ba6d59b64e8f0a72a402eea6cbb3'
                b'31a6'}


# Adapting the model

In [180]:
embedding_dimension = 32

# Converting features into embedding vectors


In [181]:
customer_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_cids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unq_cids) + 1, embedding_dimension)
])

In [182]:
product_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_articles, mask_token=None),
  tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
])

In [183]:
#Generating embeddings from article ids to be used by evaluation layer to calculate the similarity score
article_embeddings = product_model(article_ids)
candidates_dataset = tf.data.Dataset.from_tensor_slices(article_embeddings)

In [184]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates=candidates_dataset.batch(128)  # Batch the candidates for efficiency
)

In [185]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [187]:
class ProductlensModel(tfrs.Model):

  def __init__(self, customer_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.customer_model: tf.keras.Model = customer_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    customer_embeddings = self.customer_model(features["customer_id"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_product_embeddings = self.product_model(features["article_id"])

    # The task computes the loss and the metrics.
    return self.task(customer_embeddings, positive_product_embeddings)

In [188]:
class NoBaseClassProductlensModel(tf.keras.Model):

  def __init__(self, customer_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.customer_model: tf.keras.Model = customer_model
    self.task: tf.keras.layers.Layer = task

  def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

    # Set up a gradient tape to record gradients.
    with tf.GradientTape() as tape:

      # Loss computation.
      customer_embeddings = self.customer_model(features["customer_id"])
      positive_product_embeddings = self.product_model(features["article_id"])
      loss = self.task(customer_embeddings, positive_product_embeddings)

      # Handle regularization losses as well.
      regularization_loss = sum(self.losses)

      total_loss = loss + regularization_loss

    gradients = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss

    return metrics

  def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

    # Loss computation.
    customer_embeddings = self.customer_model(features["customer_id"])
    positive_product_embeddings = self.product_model(features["article_id"])
    loss = self.task(customer_embeddings, positive_product_embeddings)

    # Handle regularization losses as well.
    regularization_loss = sum(self.losses)

    total_loss = loss + regularization_loss

    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss

    return metrics

In [198]:
#Hyperparameters
alpha = 0.1
batch_size = 128
epochs = 1

In [199]:
model = ProductlensModel(customer_model, product_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=alpha))

In [200]:
cached_train = train.batch(batch_size).cache()
cached_test = test.batch(batch_size).cache()

In [201]:
model.fit(cached_train, epochs=epochs)



<keras.src.callbacks.History at 0x79241f67cdf0>

In [202]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.1315000057220459,
 'factorized_top_k/top_5_categorical_accuracy': 0.1459999978542328,
 'factorized_top_k/top_10_categorical_accuracy': 0.15299999713897705,
 'factorized_top_k/top_50_categorical_accuracy': 0.1720000058412552,
 'factorized_top_k/top_100_categorical_accuracy': 0.18000000715255737,
 'loss': 350.40692138671875,
 'regularization_loss': 0,
 'total_loss': 350.40692138671875}

# Ranking model

In [227]:
#In this iteration, only ids are used as attributes for simplicity
counts = pd.read_csv('/content/drive/MyDrive/0_05_4312_cus_art_grp_count.csv')[['customer_id', 'article_id','t_count']]
#Generating training and test subsets
train, test = train_test_split(counts, test_size=0.2)
#Getting unique user ids (required to transform those into embeddings by the model)
unq_cids = counts.customer_id.unique()

In [228]:
#Getting product ids (required to transform those into embeddings by the model)
articles = pd.read_csv('/content/drive/MyDrive/articles.csv')[['article_id']].sample(sample_size, random_state=1)
unq_articles = articles.article_id.unique().astype(str)
article_ids = articles['article_id'].astype(str).map(lambda x: x.encode('utf-8'))

In [229]:
train['customer_id'] = train.customer_id.astype(str)
train['article_id'] = train.article_id.astype(str)
train = train.to_dict(orient='list')
train = tf.data.Dataset.from_tensor_slices(train)

In [255]:
for x in train.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'776740001',
 'customer_id': b'503fbc985779237bbd7ddf8c46194d1c3cc2162f7533d59c9598be578c72'
                b'bae8',
 't_count': 1}


In [230]:
test['customer_id'] = test.customer_id.astype(str)
test['article_id'] = test.article_id.astype(str)
test = test.to_dict(orient='list')
test = tf.data.Dataset.from_tensor_slices(test)

In [256]:
for x in test.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'253448003',
 'customer_id': b'73f7d8cca33b8236879d92a6188f691410a82763dc8bb65b8829568c07f8'
                b'd9a7',
 't_count': 1}


In [231]:
class RankingModel(tf.keras.Model):

  def __init__(self):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.customer_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unq_cids , mask_token=None),
      tf.keras.layers.Embedding(len(unq_cids) + 1, embedding_dimension)
    ])

    # Compute embeddings for product.
    self.product_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unq_articles, mask_token=None),
      tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
    ])

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    cus_id, art_id = inputs

    cus_embedding = self.customer_embeddings(cus_id)
    art_embedding = self.product_embeddings(art_id)

    return self.ratings(tf.concat([cus_embedding, art_embedding], axis=1))

In [232]:
task = tfrs.tasks.Ranking(
  loss = tf.keras.losses.MeanSquaredError(),
  metrics=[tf.keras.metrics.RootMeanSquaredError()]
)

In [233]:
class ProductlensModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.ranking_model: tf.keras.Model = RankingModel()
    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features: Dict[str, tf.Tensor]) -> tf.Tensor:
    return self.ranking_model(
        (features["customer_id"], features["article_id"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("t_count")

    rating_predictions = self(features)

    # The task computes the loss and the metrics.
    return self.task(labels=labels, predictions=rating_predictions)

In [250]:
#Hyperparameters
alpha = 0.5
batch_size = 128
epochs = 1

In [251]:
model = ProductlensModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=alpha))

In [252]:
cached_train = train.batch(batch_size).cache()
cached_test = test.batch(batch_size).cache()

In [253]:
model.fit(cached_train, epochs=epochs)



<keras.src.callbacks.History at 0x79240b07c1c0>

In [254]:
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 0.10878291726112366,
 'loss': 0.035050321370363235,
 'regularization_loss': 0,
 'total_loss': 0.035050321370363235}

In [267]:
test_counts = {}
sample = counts.sample(1000)
test_product_ids = zip(sample['customer_id'],sample['article_id'], sample['t_count'])
for pid, aid, t_count in test_product_ids:
  test_counts[pid] = (model({
      "customer_id": np.array(['73f7d8cca33b8236879d92a6188f691410a82763dc8bb65b8829568c07f8d9a7']),
      "article_id": np.array([pid])

  }).numpy()[0,0], t_count)

In [268]:
print("t_counts:")
for title, score in sorted(test_counts.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

t_counts:
135371be9c58c90de31ac8a480c2cf851eaadd919316f3755a59a10b4428b192: (1.0121313, 2)
09faaaadaac803254c5022faa85d2f837dc074959a2b5e1b266d857bab59f97e: (1.0121313, 2)
d113b3af1e2e63556176ea7c5487051b19402dce749ba244a5f706fc7e20584b: (1.0121313, 2)
99bae865fdd6149e70655cbbf259441af27e2a99bb0d25fc4aa778f58e8d6d78: (1.0121313, 2)
aecf2345b24e7fe4ad9bd35ecad9fa61064c113d04a27ea641b369fe9cc15269: (1.0121313, 2)
828c68f825109981834778837fbf9810811a0aefba41117a54be258149951f8d: (1.0121313, 2)
dc0cadd81f8277ca00e28370652dab8688afbf9509503e48bc40673b2658810c: (1.0121313, 2)
c8f2aa51e091d41db8a731633349eef4a77eb06a41292dc00f9351025cc754fe: (1.0121313, 2)
3fc2c838f058a2a837c344cb1c46faabfae6a60cb9f3b65a295d1b7d99984f4e: (1.0121313, 1)
a37f20661292a3572e1c56c3b1f79b738ff5ab7e8940a59867f1bd81dc1e74c1: (1.0121313, 1)
b2da5cdd895ed5577a59252a148b45487396d2f48ac2354e3c48f7364520a7bb: (1.0121313, 1)
6fd70d387c9931d4f8f4a790d41c8f289e72de1cfe8f7069c505f076e23d3ac7: (1.0121313, 1)
97e24208561457fbf6