# Trying to adapt solution from [TFRS](https://www.tensorflow.org/recommenders/examples/basic_retrieval) tutorial for H&M data.

In [3]:
!pip install -q tensorflow-recommenders
!pip install -q scann

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
import pprint
import tempfile
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs

from typing import Dict, Text
from sklearn.model_selection import train_test_split

- H&M transaction data were subsampled using [custom function](https://github.com/omegatro/IGP_2023/blob/main/modules/preprocessing.py to 5% of the original records (seed = 4312).
- The subsampled data were transformed to represent number of transactions for individual customer-product combinations (number of times each product was bought by each customer). The following code was used:
```
df = df.groupby(['customer_id', 'article_id']).count()
df = df[['customer_id', 'article_id','t_dat']].reset_index()
df.rename(columns={'t_dat':'t_count'}, inplace = True)
df.to_csv('0_05_4312_cus_art_grp_count.csv', header=True, index=False)
```
- The resulting file was uploaded to google drive to work with from colab.

In [11]:
#In this iteration, only ids are used as attributes for simplicity
sample_size = 10000
counts = pd.read_csv('/content/drive/MyDrive/0_05_4312_cus_art_grp_count.csv')[['customer_id', 'article_id']].sample(sample_size)
#Generating training and test subsets
train, test = train_test_split(counts, test_size=0.2)
#Getting unique user ids (required to transform those into embeddings by the model)
unq_cids = counts.customer_id.unique()

In [12]:
#Getting product ids (required to transform those into embeddings by the model)
articles = pd.read_csv('/content/drive/MyDrive/articles.csv')[['article_id']]
unq_articles = articles.article_id.unique().astype(str)
article_ids = articles['article_id'].astype(str).map(lambda x: x.encode('utf-8'))

In [13]:
print(article_ids[:5])

0    b'108775015'
1    b'108775044'
2    b'108775051'
3    b'110065001'
4    b'110065002'
Name: article_id, dtype: object


In [14]:
print(unq_articles[:5])

['108775015' '108775044' '108775051' '110065001' '110065002']


In [15]:
len(unq_cids)

9874

In [16]:
len(unq_articles)

105542

# Converting from pandas df to tensorflow datasets for compatibility

In [17]:
train['customer_id'] = train.customer_id.astype(str)
train['article_id'] = train.article_id.astype(str)
train = train.to_dict(orient='list')
train = tf.data.Dataset.from_tensor_slices(train)

In [18]:
test['customer_id'] = test.customer_id.astype(str)
test['article_id'] = test.article_id.astype(str)
test = test.to_dict(orient='list')
test = tf.data.Dataset.from_tensor_slices(test)

In [19]:
for x in train.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'849859003',
 'customer_id': b'79b2bc962d80f91406ff8e85f580ff7470741f5a53b3a49f7292eaab0866'
                b'5479'}


In [20]:
for x in test.take(1).as_numpy_iterator():
  pprint.pprint(x)

{'article_id': b'758002004',
 'customer_id': b'c5f1f946ae8be31aca5c3a38996ed94005f3402b094ff1dc497147b5c2fa'
                b'021f'}


# Adapting the model

In [21]:
embedding_dimension = 32

# Converting features into embedding vectors


In [22]:
customer_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_cids, mask_token=None),
  # We add an additional embedding to account for unknown tokens.
  tf.keras.layers.Embedding(len(unq_cids) + 1, embedding_dimension)
])

In [23]:
product_model = tf.keras.Sequential([
  tf.keras.layers.StringLookup(
      vocabulary=unq_articles, mask_token=None),
  tf.keras.layers.Embedding(len(unq_articles) + 1, embedding_dimension)
])

In [24]:
#Generating embeddings from article ids to be used by evaluation layer to calculate the similarity score
article_embeddings = product_model(article_ids)
candidates_dataset = tf.data.Dataset.from_tensor_slices(article_embeddings)

In [25]:
metrics = tfrs.metrics.FactorizedTopK(
    candidates=candidates_dataset.batch(128)  # Batch the candidates for efficiency
)

In [26]:
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [27]:
class MovielensModel(tfrs.Model):

  def __init__(self, customer_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.customer_model: tf.keras.Model = customer_model
    self.task: tf.keras.layers.Layer = task

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    # We pick out the user features and pass them into the user model.
    customer_embeddings = self.customer_model(features["customer_id"])
    # And pick out the movie features and pass them into the movie model,
    # getting embeddings back.
    positive_product_embeddings = self.product_model(features["article_id"])

    # The task computes the loss and the metrics.
    return self.task(customer_embeddings, positive_product_embeddings)

In [28]:
class NoBaseClassMovielensModel(tf.keras.Model):

  def __init__(self, customer_model, product_model):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.customer_model: tf.keras.Model = customer_model
    self.task: tf.keras.layers.Layer = task

  def train_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

    # Set up a gradient tape to record gradients.
    with tf.GradientTape() as tape:

      # Loss computation.
      customer_embeddings = self.customer_model(features["customer_id"])
      positive_product_embeddings = self.product_model(features["article_id"])
      loss = self.task(customer_embeddings, positive_product_embeddings)

      # Handle regularization losses as well.
      regularization_loss = sum(self.losses)

      total_loss = loss + regularization_loss

    gradients = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss

    return metrics

  def test_step(self, features: Dict[Text, tf.Tensor]) -> tf.Tensor:

    # Loss computation.
    customer_embeddings = self.customer_model(features["customer_id"])
    positive_product_embeddings = self.product_model(features["article_id"])
    loss = self.task(customer_embeddings, positive_product_embeddings)

    # Handle regularization losses as well.
    regularization_loss = sum(self.losses)

    total_loss = loss + regularization_loss

    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss

    return metrics

In [29]:
#Hyperparameters
alpha = 0.5
batch_size = 128
epochs = 1

In [30]:
model = MovielensModel(customer_model, product_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=alpha))

In [31]:
cached_train = train.batch(batch_size).cache()
cached_test = test.batch(batch_size).cache()

In [32]:
model.fit(cached_train, epochs=epochs)



<keras.src.callbacks.History at 0x79241f12cc40>

In [33]:
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.010999999940395355,
 'factorized_top_k/top_5_categorical_accuracy': 0.012500000186264515,
 'factorized_top_k/top_10_categorical_accuracy': 0.014999999664723873,
 'factorized_top_k/top_50_categorical_accuracy': 0.02250000089406967,
 'factorized_top_k/top_100_categorical_accuracy': 0.029999999329447746,
 'loss': 350.56378173828125,
 'regularization_loss': 0,
 'total_loss': 350.56378173828125}