In [1]:
import pandas

In [2]:
import numpy as np
import tensorflow as tf
# import tensorflow_datasets as tfds

In [3]:
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

In [4]:
# Load the saved MovieLens rating datasets (full set plus train/test splits) and
# cache each one.
# NOTE(review): these are absolute, machine-specific paths; a configurable data
# directory would let the notebook run on another machine.
ratings = tf.data.Dataset.load(
    "D:/dev work/recommender systems/Atrad_CARS/data/movie_lens/ratings_all"
).cache()

train_ds = tf.data.Dataset.load(
    "D:/dev work/recommender systems/Atrad_CARS/data/movie_lens/ratings_train"
).cache()
test_ds = tf.data.Dataset.load(
    "D:/dev work/recommender systems/Atrad_CARS/data/movie_lens/ratings_test"
).cache()

In [5]:
next(iter(ratings))

{'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'681'>,
 'user_rating': <tf.Tensor: shape=(), dtype=float32, numpy=4.0>,
 'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Postman, The (1997)'>,
 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=885409515>}

In [6]:
# Build the lookup vocabularies: every distinct movie title and user id that
# appears in `ratings`, materialised batch by batch and de-duplicated with NumPy.
movies = ratings.map(lambda row: row["movie_title"])

title_batches = list(movies.batch(1000))
unique_movie_titles = np.unique(np.concatenate(title_batches))

user_id_batches = list(ratings.batch(1_000).map(lambda row: row["user_id"]))
unique_movie_user_ids = np.unique(np.concatenate(user_id_batches))

Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [7]:
tf.random.set_seed(42)

# Shuffle once with a fixed seed (no reshuffling between iterations) so that the
# take/skip split below always selects the same examples.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# Convert the per-rating examples into per-user lists of 5 movies each:
# 50 sampled lists per user for training, a single list per user for testing.
listwise_kwargs = dict(num_examples_per_list=5, seed=42)

train = tfrs.examples.movielens.sample_listwise(
    train, num_list_per_user=50, **listwise_kwargs
)
test = tfrs.examples.movielens.sample_listwise(
    test, num_list_per_user=1, **listwise_kwargs
)

In [30]:

class ItemModel(tf.keras.Model):
    """Item tower that embeds an item from its id and its GICS label.

    Each string feature goes through a `StringLookup` over its vocabulary and
    then an `Embedding` (32 units for the id, 16 for the GICS label). The two
    embeddings are concatenated along axis 2, so `call` expects inputs that
    produce rank-3 embeddings, e.g. `(batch, list_length)` string tensors.
    """

    def __init__(self, unique_item_ids, unique_item_gics):
        """Create the lookup/embedding stacks.

        Args:
            unique_item_ids: vocabulary of item id strings.
            unique_item_gics: vocabulary of GICS label strings.
        """
        super().__init__()

        self.max_tokens = 10000
        self.unique_item_ids = unique_item_ids
        self.unique_item_gics = unique_item_gics

        # Embedding tables are sized vocabulary + 1; with no mask token the
        # extra row presumably serves the lookup's out-of-vocabulary index.
        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=self.unique_item_ids, mask_token=None)
        id_table = tf.keras.layers.Embedding(
            input_dim=len(self.unique_item_ids) + 1, output_dim=32)
        self.embed_item_id = tf.keras.Sequential([id_lookup, id_table])

        gics_lookup = tf.keras.layers.StringLookup(
            vocabulary=self.unique_item_gics, mask_token=None)
        gics_table = tf.keras.layers.Embedding(
            input_dim=len(self.unique_item_gics) + 1, output_dim=16)
        self.embed_items_gics = tf.keras.Sequential([gics_lookup, gics_table])

    def call(self, inputs):
        """Embed a `(item_id, item_gics)` pair of string tensors.

        Returns:
            The id and GICS embeddings concatenated along axis 2.
        """
        item_id, item_gics = inputs

        id_vectors = self.embed_item_id(item_id)
        gics_vectors = self.embed_items_gics(item_gics)

        return tf.concat([id_vectors, gics_vectors], axis=2)


class UserModel(tf.keras.Model):
    """User tower: maps a user id string to a 32-dimensional embedding."""

    def __init__(self, unique_user_ids):
        """Create the lookup/embedding stack.

        Args:
            unique_user_ids: vocabulary of user id strings.
        """
        super().__init__()

        self.unique_user_ids = unique_user_ids

        # Table is sized vocabulary + 1; with no mask token the extra row
        # presumably serves the lookup's out-of-vocabulary index.
        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=self.unique_user_ids, mask_token=None)
        id_table = tf.keras.layers.Embedding(
            input_dim=len(self.unique_user_ids) + 1, output_dim=32)
        self.embed_user_id = tf.keras.Sequential([id_lookup, id_table])

    def call(self, inputs):
        """Return the embedding of the user id tensor `inputs`."""
        user_id = inputs
        return self.embed_user_id(user_id)
        

In [112]:
from tensorflow.keras import layers

def squeeze_layer(inputs, axis=-1):
  """Drop the size-1 dimension at `axis` (the last axis by default) of `inputs`."""
  squeezed = tf.squeeze(inputs, axis=axis)
  return squeezed

# Keras Lambda layer wrapping `squeeze_layer` with its default axis.
squeeze_custom_layer = layers.Lambda(squeeze_layer)

def global_average_mean(x):
  """Average `x` over its second-to-last axis (axis=-2), removing that axis.

  In this notebook it is applied to token embeddings, e.g. a
  (2, 5, 5, 32) tensor is reduced to (2, 5, 32).
  """
  return tf.reduce_mean(x, axis=-2)

def reshaper(x):
    """Reshape `x` to (-1, 5, 1): groups of 5 entries with a trailing unit axis.

    NOTE(review): the group size of 5 is hard-coded, presumably to match the
    number of items per list in the datasets used here.
    """
    target_shape = (-1, 5, 1)
    return tf.reshape(x, target_shape)

class ItemModel_v2(tf.keras.Model):
    """Item tower combining id, GICS-label and name-text embeddings.

    * id and GICS strings: `StringLookup` followed by a 16-unit `Embedding`.
    * name strings: reshaped by `reshaper`, tokenised with `TextVectorization`,
      embedded per token (32 units) and averaged over the token axis with
      `global_average_mean`.

    The three results are concatenated along axis 2 (16 + 16 + 32 = 64 units).
    """

    def __init__(self, unique_item_ids, unique_item_gics, unique_item_names):
        """Create the embedding stacks and fit the text vectoriser.

        Args:
            unique_item_ids: vocabulary of item id strings.
            unique_item_gics: vocabulary of GICS label strings.
            unique_item_names: item name strings used to adapt the
                `TextVectorization` layer.
        """
        super().__init__()

        self.max_tokens = 10000
        self.unique_item_ids = unique_item_ids
        self.unique_item_gics = unique_item_gics
        self.unique_item_names = unique_item_names

        # Tables are sized vocabulary + 1; with no mask token the extra row
        # presumably serves the lookup's out-of-vocabulary index.
        id_lookup = tf.keras.layers.StringLookup(
            vocabulary=self.unique_item_ids, mask_token=None)
        id_table = tf.keras.layers.Embedding(
            input_dim=len(self.unique_item_ids) + 1, output_dim=16)
        self.embed_item_id = tf.keras.Sequential([id_lookup, id_table])

        gics_lookup = tf.keras.layers.StringLookup(
            vocabulary=unique_item_gics, mask_token=None)
        gics_table = tf.keras.layers.Embedding(
            input_dim=len(unique_item_gics) + 1, output_dim=16)
        self.embed_items_gics = tf.keras.Sequential([gics_lookup, gics_table])

        self.textvectorizer = tf.keras.layers.TextVectorization(
            max_tokens=self.max_tokens)

        # NOTE(review): `reshaper` fixes the list length at 5.
        add_token_axis = tf.keras.layers.Lambda(reshaper)
        token_table = tf.keras.layers.Embedding(
            input_dim=self.max_tokens, output_dim=32, mask_zero=True)
        # NOTE(review): the plain reduce_mean in `global_average_mean` averages
        # over every token position, so zero-padded positions appear to be
        # included despite mask_zero=True - confirm this is intended.
        average_tokens = tf.keras.layers.Lambda(global_average_mean)
        self.embed_item_name = tf.keras.Sequential([
            add_token_axis,
            self.textvectorizer,
            token_table,
            average_tokens,
        ])

        # Learn the token vocabulary from the item names.
        self.textvectorizer.adapt(self.unique_item_names)

    def call(self, inputs):
        """Embed an `(item_id, item_gics, item_name)` triple of string tensors.

        Returns:
            The three embeddings concatenated along axis 2.
        """
        item_id, item_gics, item_name = inputs

        id_vectors = self.embed_item_id(item_id)
        gics_vectors = self.embed_items_gics(item_gics)
        name_vectors = self.embed_item_name(item_name)

        return tf.concat([id_vectors, gics_vectors, name_vectors], axis=2)

In [10]:
class RankingModel(tfrs.Model):
  """Listwise ranking model for the MovieLens lists.

  A user embedding is repeated across the list, concatenated with each movie's
  title embedding and scored by a small dense network (one score per movie).

  Relies on the notebook-level vocabularies `unique_movie_user_ids` and
  `unique_movie_titles`.

  Args:
    loss: Keras loss handed to `tfrs.tasks.Ranking`.
  """

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = UserModel(
        unique_user_ids = unique_movie_user_ids
    )

    # Compute embeddings for movies. MovieLens examples only carry a title, so
    # the titles are embedded directly here: `ItemModel` requires a GICS
    # vocabulary and an (id, gics) input pair, and cannot be built or called
    # with titles alone.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
          vocabulary = unique_movie_titles,
          mask_token = None
      ),
      tf.keras.layers.Embedding(
          input_dim = len(unique_movie_titles) + 1,
          output_dim = embedding_dimension
      )
    ])

    # Compute predictions: one scalar score per (user, movie) pair.
    self.score_model = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    """Score every movie in each list.

    Args:
      features: dict with "user_id" and "movie_title" tensors; the list length
        is read from the static second dimension of "movie_title".

    Returns:
      Scores with a trailing dimension of size 1 (one per list entry).
    """
    user_embeddings = self.user_embeddings(features["user_id"])

    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # Repeat the user embedding once per list entry so it can be concatenated
    # with every movie embedding in the list.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    """Compute the ranking task loss for one batch.

    Note: removes the "user_rating" entry from `features` (used as labels).
    """
    labels = features.pop("user_rating")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [11]:
epochs = 5

# Shuffle, batch and cache the listwise datasets once so that every epoch
# reuses the same prepared batches.
cached_train = (
    train
    .shuffle(100_000)
    .batch(8192)
    .cache()
)
cached_test = (
    test
    .batch(4096)
    .cache()
)

In [12]:
# Ranking model trained with a mean-squared-error loss, optimised with
# Adagrad at learning rate 0.1.
mse_model = RankingModel(loss=tf.keras.losses.MeanSquaredError())
mse_model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1),
)

In [13]:
# Train for the number of epochs configured above (`epochs`, currently 5).
mse_model.fit(cached_train, epochs=epochs, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x23cf96bce80>

# Atrad Model

In [14]:
# Load the saved portfolios dataset; it is used below to build the item/user
# vocabularies.
# NOTE(review): this path is relative to the working directory, while the other
# loads in this notebook use absolute paths - consider one shared data directory.
portfolios = tf.data.Dataset.load("../../data/portfolios_tfds_lists")

In [15]:
# Load the saved listwise train/test datasets for the portfolio data and cache them.
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
train_list_ds = tf.data.Dataset.load(
    "D:/dev work/recommender systems/Atrad_CARS/data/train_lists_ds"
).cache()
test_list_ds = tf.data.Dataset.load(
    "D:/dev work/recommender systems/Atrad_CARS/data/test_lists_ds"
).cache()

In [16]:
next(iter(train_list_ds))

{'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-23479-LI/00'>,
 'GICS': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Materials', b'Health Care Equipment & Services',
        b'Food Beverage & Tobacco', b'Consumer Durables & Apparel',
        b'Capital Goods'], dtype=object)>,
 'STOCKCODE': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'CIC', b'AMSL', b'RAL', b'REG', b'AEL'], dtype=object)>,
 'RATING': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4., 2., 5., 5., 2.], dtype=float32)>,
 'UNIX_TS': <tf.Tensor: shape=(5,), dtype=float32, numpy=
 array([1.6475418e+09, 1.6681050e+09, 1.7100954e+09, 1.7030970e+09,
        1.6655994e+09], dtype=float32)>,
 'STOCKNAME': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'C I C HOLDINGS PLC', b'ASIRI SURGICAL HOSPITAL PLC',
        b'RENUKA AGRI FOODS PLC', b'REGNIS (LANKA) PLC',
        b'ACCESS ENGINEERING PLC'], dtype=object)>}

In [17]:
# Project each feature of `portfolios` (read in batches of 10000) into its own
# dataset, then collect the distinct values of each feature with NumPy.
items_ids = portfolios.batch(10000).map(lambda row: row["STOCKCODE"])
item_names = portfolios.batch(10000).map(lambda row: row["STOCKNAME"])
item_GICS = portfolios.batch(10000).map(lambda row: row["GICS"])
user_ids = portfolios.batch(10000).map(lambda row: row["CDSACCNO"])

unique_item_ids = np.unique(np.concatenate(list(items_ids)))
unique_item_names = np.unique(np.concatenate(list(item_names)))
unique_item_gics = np.unique(np.concatenate(list(item_GICS)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

In [18]:
len(unique_item_ids)

268

In [19]:
# from user_embedding import UserModel
# # from item_embedding import ItemModel

# class RankingModel_trad(tfrs.Model):

#   def __init__(self, portfolios ,loss):
#     super().__init__()

#     embedding_dimension = 32,
#     self.loss = loss
    
#     self.portfolios = portfolios

#     self.items_ids = self.portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
#     self.item_GICS = self.portfolios.batch(10000).map(lambda x: x["GICS"])
#     self.unique_item_ids = np.unique(np.concatenate(list(self.items_ids)))
#     self.unique_item_gics = np.unique(np.concatenate(list(self.item_GICS)))

#     self.user_ids = self.portfolios.batch(10000).map(lambda x: x["CDSACCNO"])
#     self.unique_user_ids = np.unique(np.concatenate(list(self.user_ids)))


#     self.user_embeddings = UserModel(
#       unique_user_ids = unique_user_ids
#     )

#     # Compute embeddings for movies.
#     # self.embed_item_code = tf.keras.Sequential([
#     #   tf.keras.layers.StringLookup(
#     #     vocabulary= self.unique_item_ids),
#     #   tf.keras.layers.Embedding(270, 32) #len(unique_item_ids) + 2, embedding_dimension
#     # ])

#     # self.embed_items_gics = tf.keras.Sequential([
#     #         tf.keras.layers.StringLookup(
#     #             vocabulary = self.unique_item_gics,
#     #             mask_token = None
#     #         ),
#     #         tf.keras.layers.Embedding(
#     #             input_dim = len(self.unique_item_gics)+1,
#     #             output_dim = 16 #len(unique_item_gics)
#     #         )
#     #     ])

#     self.item_embeddings = ItemModel(
#       unique_item_ids = self.unique_item_ids,
#       # unique_item_gics = self.unique_item_gics
#     )

#     # Compute predictions.
#     self.score_model = tf.keras.Sequential([
#       tf.keras.layers.Dense(256, activation="relu"),
#       tf.keras.layers.Dense(64, activation="relu"),
#       tf.keras.layers.Dense(1)
#     ])

#     self.task = tfrs.tasks.Ranking(
#       loss=loss,
#       metrics=[
#         tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
#         tf.keras.metrics.RootMeanSquaredError()
#       ]
#     )

#   def call(self, features):
#     user_embeddings = self.user_embeddings(features["CDSACCNO"])
#     # code_embeddings = self.embed_item_code(features["STOCKCODE"])
#     # gics_embeddings = self.embed_items_gics(features['GICS'])

#     item_embeddings = self.item_embeddings((
#       features['STOCKCODE'],
#       # features['GICS']
#       ))


#     print()
#     list_length = features["STOCKCODE"].shape[1]
#     user_embedding_repeated = tf.repeat(
#         tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

#     concatenated_embeddings = tf.concat(
#         [user_embedding_repeated, item_embeddings], 2)

#     return self.score_model(concatenated_embeddings)

#   def compute_loss(self, features, training=False):
#     labels = features.pop("RATING")

#     scores = self(features)

#     return self.task(
#         labels=labels,
#         predictions=tf.squeeze(scores, axis=-1),
#     )

In [20]:
# import sys

# sys.exit()

In [32]:
class RankingModel_trad_v2(tfrs.Model):
  """Listwise ranking model for the portfolio data (stock code + GICS features).

  The vocabularies are derived from `portfolios` at construction time. A user
  embedding (`UserModel`) is repeated across the list, concatenated with each
  item's embedding (`ItemModel`) and scored by a small dense network.

  Args:
    portfolios: tf.data dataset whose elements provide the "STOCKCODE", "GICS",
      "STOCKNAME" and "CDSACCNO" features.
    loss: Keras loss handed to `tfrs.tasks.Ranking`.
  """

  def __init__(self, portfolios, loss):
    super().__init__()

    self.loss = loss

    self.portfolios = portfolios

    # For every feature: a batched view of the column, then its distinct values.
    self.items_ids = self.portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
    self.unique_item_ids = np.unique(np.concatenate(list(self.items_ids)))

    self.item_GICS = self.portfolios.batch(10000).map(lambda x: x["GICS"])
    self.unique_item_gics = np.unique(np.concatenate(list(self.item_GICS)))

    self.item_names = self.portfolios.batch(10000).map(lambda x: x["STOCKNAME"])
    # Built from this model's own `portfolios`, not from a notebook-level
    # `item_names` variable that may be stale or undefined in a fresh kernel.
    self.unique_item_names = np.unique(np.concatenate(list(self.item_names)))

    self.user_ids = self.portfolios.batch(10000).map(lambda x: x["CDSACCNO"])
    self.unique_user_ids = np.unique(np.concatenate(list(self.user_ids)))

    # Compute embeddings for users.
    self.user_embeddings = UserModel(
        unique_user_ids = self.unique_user_ids
    )

    # Compute embeddings for items from their stock code and GICS label.
    self.item_embeddings = ItemModel(
        unique_item_ids = self.unique_item_ids,
        unique_item_gics = self.unique_item_gics,
    )

    # Compute predictions: one scalar score per (user, item) pair.
    self.score_model = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    """Score every item in each list.

    Args:
      features: dict with "CDSACCNO", "STOCKCODE" and "GICS" tensors; the list
        length is read from the static second dimension of "STOCKCODE".

    Returns:
      Scores with a trailing dimension of size 1 (one per list entry).
    """
    user_embeddings = self.user_embeddings(features["CDSACCNO"])

    item_embeddings = self.item_embeddings((features["STOCKCODE"], features['GICS']))

    # Shape check left in from development; in the recorded run it printed
    # only during the first epoch.
    print(user_embeddings.shape,' | ' ,item_embeddings.shape)
    list_length = features["STOCKCODE"].shape[1]
    # Repeat the user embedding once per list entry so it can be concatenated
    # with every item embedding in the list.
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    print(user_embedding_repeated.shape,' | ' ,item_embeddings.shape)
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, item_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    """Compute the ranking task loss for one batch.

    Note: removes the "RATING" entry from `features` (used as labels).
    """
    labels = features.pop("RATING")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [33]:
epochs = 5

# Prepare the batched, cached portfolio datasets.
cached_train_trad = (
    train_list_ds
    .shuffle(100_000)
    .batch(128)
    .cache()
)
cached_test_trad = test_list_ds.batch(128).cache()

# Build, compile and train the stock-code + GICS ranking model with an MSE loss.
trad_model = RankingModel_trad_v2(
    portfolios=portfolios,
    loss=tf.keras.losses.MeanSquaredError(),
)
trad_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
trad_model.fit(cached_train_trad, epochs=epochs, verbose=1)

Epoch 1/5
(None, 32)  |  (None, 5, 48)
(None, 5, 32)  |  (None, 5, 48)
(None, 32)  |  (None, 5, 48)
(None, 5, 32)  |  (None, 5, 48)
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x23cfa80b2b0>

In [233]:
class RankingModel_trad_v3(tfrs.Model):
  """Listwise ranking model for the portfolio data (code + GICS + name text).

  The vocabularies are derived from `portfolios` at construction time. A user
  embedding (`UserModel`) is repeated across the list, concatenated with each
  item's embedding (`ItemModel_v2`) and scored by a small dense network.

  Args:
    portfolios: tf.data dataset whose elements provide the "STOCKCODE", "GICS",
      "STOCKNAME" and "CDSACCNO" features.
    loss: Keras loss handed to `tfrs.tasks.Ranking`.
  """

  def __init__(self, portfolios, loss):
    super().__init__()

    self.loss = loss

    self.portfolios = portfolios

    # For every feature: a batched view of the column, then its distinct values.
    self.items_ids = self.portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
    self.unique_item_ids = np.unique(np.concatenate(list(self.items_ids)))

    self.item_GICS = self.portfolios.batch(10000).map(lambda x: x["GICS"])
    self.unique_item_gics = np.unique(np.concatenate(list(self.item_GICS)))

    self.item_names = self.portfolios.batch(10000).map(lambda x: x["STOCKNAME"])
    # Built from this model's own `portfolios`, not from a notebook-level
    # `item_names` variable that may be stale or undefined in a fresh kernel;
    # these names are what the item tower's text vectoriser is adapted on.
    self.unique_item_names = np.unique(np.concatenate(list(self.item_names)))

    self.user_ids = self.portfolios.batch(10000).map(lambda x: x["CDSACCNO"])
    self.unique_user_ids = np.unique(np.concatenate(list(self.user_ids)))

    # Compute embeddings for users.
    self.user_embeddings = UserModel(
        unique_user_ids = self.unique_user_ids
    )

    # Compute embeddings for items from stock code, GICS label and name text.
    self.item_embeddings = ItemModel_v2(
        unique_item_ids = self.unique_item_ids,
        unique_item_gics = self.unique_item_gics,
        unique_item_names = self.unique_item_names
    )

    # Compute predictions: one scalar score per (user, item) pair.
    self.score_model = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    """Score every item in each list.

    Args:
      features: dict with "CDSACCNO", "STOCKCODE", "GICS" and "STOCKNAME"
        tensors; the list length is read from the static second dimension of
        "STOCKCODE".

    Returns:
      Scores with a trailing dimension of size 1 (one per list entry).
    """
    user_embeddings = self.user_embeddings(features["CDSACCNO"])

    item_embeddings = self.item_embeddings(
        (features["STOCKCODE"], features['GICS'], features['STOCKNAME']))

    list_length = features["STOCKCODE"].shape[1]
    # Repeat the user embedding once per list entry so it can be concatenated
    # with every item embedding in the list.
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Shape check left in from development; in the recorded run it printed
    # only during the first epoch.
    print(user_embedding_repeated.shape,' | ' ,item_embeddings.shape)
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, item_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    """Compute the ranking task loss for one batch.

    Note: removes the "RATING" entry from `features` (used as labels).
    """
    labels = features.pop("RATING")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [234]:
epochs = 5

# Prepare the batched, cached portfolio datasets.
cached_train_trad = (
    train_list_ds
    .shuffle(100_000)
    .batch(128)
    .cache()
)
cached_test_trad = test_list_ds.batch(128).cache()

# Build, compile and train the code + GICS + name ranking model with an MSE loss.
trad_model = RankingModel_trad_v3(
    portfolios=portfolios,
    loss=tf.keras.losses.MeanSquaredError(),
)
trad_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
trad_model.fit(cached_train_trad, epochs=epochs, verbose=1)

Epoch 1/5
(None, 5, 32)  |  (None, 5, 64)
(None, 5, 32)  |  (None, 5, 64)
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x23cfd0688b0>

#  Sandbox

In [104]:
test_batch = next(iter(train_list_ds.batch(2)))
test_batch

{'CDSACCNO': <tf.Tensor: shape=(2,), dtype=string, numpy=array([b'RPS-23479-LI/00', b'RPS-23479-LI/00'], dtype=object)>,
 'GICS': <tf.Tensor: shape=(2, 5), dtype=string, numpy=
 array([[b'Materials', b'Health Care Equipment & Services',
         b'Food Beverage & Tobacco', b'Consumer Durables & Apparel',
         b'Capital Goods'],
        [b'Capital Goods', b'Food Beverage & Tobacco',
         b'Diversified Financials', b'Real Estate', b'Utilities']],
       dtype=object)>,
 'STOCKCODE': <tf.Tensor: shape=(2, 5), dtype=string, numpy=
 array([[b'CIC', b'AMSL', b'RAL', b'REG', b'AEL'],
        [b'RICH', b'TAFL', b'ASIY', b'MHDL', b'HPWR']], dtype=object)>,
 'RATING': <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
 array([[4., 2., 5., 5., 2.],
        [3., 3., 2., 2., 3.]], dtype=float32)>,
 'UNIX_TS': <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
 array([[1.6475418e+09, 1.6681050e+09, 1.7100954e+09, 1.7030970e+09,
         1.6655994e+09],
        [1.6782138e+09, 1.6695738e+09, 1.6511

In [105]:
test_names = test_batch['STOCKNAME']
test_names

<tf.Tensor: shape=(2, 5), dtype=string, numpy=
array([[b'C I C HOLDINGS PLC', b'ASIRI SURGICAL HOSPITAL PLC',
        b'RENUKA AGRI FOODS PLC', b'REGNIS (LANKA) PLC',
        b'ACCESS ENGINEERING PLC'],
       [b'RICHARD PIERIS AND COMPANY PLC', b'THREE ACRE FARMS PLC',
        b'ASIA SIYAKA COMMODITIES LIMITED',
        b'MILLENNIUM HOUSING DEVELOPERS PLC', b'RESUS ENERGY PLC']],
      dtype=object)>

In [123]:
x1 = tf.keras.layers.Lambda(reshaper)(test_names)
x1

<tf.Tensor: shape=(2, 5, 1), dtype=string, numpy=
array([[[b'C I C HOLDINGS PLC'],
        [b'ASIRI SURGICAL HOSPITAL PLC'],
        [b'RENUKA AGRI FOODS PLC'],
        [b'REGNIS (LANKA) PLC'],
        [b'ACCESS ENGINEERING PLC']],

       [[b'RICHARD PIERIS AND COMPANY PLC'],
        [b'THREE ACRE FARMS PLC'],
        [b'ASIA SIYAKA COMMODITIES LIMITED'],
        [b'MILLENNIUM HOUSING DEVELOPERS PLC'],
        [b'RESUS ENERGY PLC']]], dtype=object)>

In [121]:

item_names = portfolios.batch(10000).map(lambda x: x["STOCKNAME"])

unique_item_names = np.unique(np.concatenate(list(item_names)))

textvectorizer = tf.keras.layers.TextVectorization(max_tokens = 10000)
textvectorizer.adapt(unique_item_names)

In [124]:
x2 = textvectorizer(x1)
x2

<tf.Tensor: shape=(2, 5, 5), dtype=int64, numpy=
array([[[ 15,  81,  15,   6,   2],
        [ 97, 132,  83,   2,   0],
        [ 21, 348,  22,   2,   0],
        [156,   3,   2,   0,   0],
        [352,  34,   2,   0,   0]],

       [[ 63,  67,  18,   8,   2],
        [123, 350,  89,   2,   0],
        [ 23, 139, 299, 213,   0],
        [196,  82, 289,   2,   0],
        [153,  50,   2,   0,   0]]], dtype=int64)>

In [127]:
(2.07468607e-02 + -3.57418135e-03 + 2.07468607e-02 + -7.21329451e-03 + -4.65601198e-02)/5

-0.0031707748519999987

In [182]:
from tensorflow.keras.utils import set_random_seed

seed_value = 123
set_random_seed(seed_value)

In [196]:
tf.random.set_seed(1234)

In [227]:
x3 = tf.keras.layers.Embedding(
                
                input_dim = 10000,
                output_dim = 32,
                mask_zero = True
            )(x2)
x3

<tf.Tensor: shape=(2, 5, 5, 32), dtype=float32, numpy=
array([[[[ 0.04647887,  0.01380355,  0.0191029 , ...,  0.04463607,
          -0.01402754,  0.04014876],
         [-0.02963115, -0.00749248,  0.04200221, ...,  0.03177402,
           0.02745868, -0.02750083],
         [ 0.04647887,  0.01380355,  0.0191029 , ...,  0.04463607,
          -0.01402754,  0.04014876],
         [ 0.03557892,  0.00042997,  0.03149417, ..., -0.02142387,
          -0.01747759,  0.03523878],
         [ 0.04741463, -0.0119987 , -0.00853122, ...,  0.01016723,
          -0.01890842, -0.04851372]],

        [[ 0.01256475, -0.02138459, -0.00165056, ..., -0.03795421,
          -0.02191701,  0.0158107 ],
         [-0.03303277,  0.04009474, -0.01997585, ...,  0.02339408,
           0.02161757, -0.02798464],
         [ 0.04065387, -0.03330387, -0.03906216, ...,  0.0360481 ,
           0.02510151, -0.02937832],
         [ 0.04741463, -0.0119987 , -0.00853122, ...,  0.01016723,
          -0.01890842, -0.04851372],
       

In [228]:
tf.keras.layers.Lambda(global_average_mean)(x3)

<tf.Tensor: shape=(2, 5, 32), dtype=float32, numpy=
array([[[ 0.02926403,  0.00170917,  0.02063419,  0.00716705,
          0.00048582,  0.00328937, -0.00435741, -0.00167292,
         -0.00647552, -0.01260974,  0.00584995, -0.02642995,
         -0.00546757, -0.01581376, -0.02787273, -0.00899226,
          0.01918392,  0.02188797,  0.00063136, -0.0173961 ,
         -0.02518235,  0.02134777, -0.0014424 ,  0.03179277,
          0.00770793,  0.0169559 , -0.01282751,  0.01455733,
          0.01220166,  0.02195791, -0.00739648,  0.00790435],
        [ 0.01069652, -0.00029755, -0.01060555, -0.00251505,
          0.00764672, -0.0084525 ,  0.01172255,  0.00454845,
         -0.00674037,  0.01325659, -0.0049839 ,  0.00029466,
          0.00998067, -0.0057143 , -0.00938127,  0.00261642,
          0.01537797, -0.00765257, -0.01713399,  0.00169834,
          0.01726803, -0.00675656,  0.00477711, -0.00860311,
         -0.0077878 ,  0.00031102, -0.00901622,  0.01927438,
          0.00833847,  0.0052849

In [129]:
name_embedder = trad_model.item_embeddings.embed_item_name

In [130]:
name_embedder(test_names)

<tf.Tensor: shape=(2, 5, 32), dtype=float32, numpy=
array([[[ 2.61454694e-02,  2.93889251e-02,  6.16274029e-03,
         -8.10009614e-03, -1.29013360e-02,  9.73228365e-03,
         -2.33040601e-02, -1.70757677e-02,  1.50211472e-02,
          6.73760986e-03, -7.71821011e-03,  1.32497596e-02,
          2.42977520e-03, -6.09293487e-03,  3.43111897e-04,
         -1.14609338e-02, -1.04145287e-02, -1.73679721e-02,
          2.68269032e-02, -7.20136147e-03, -1.33724418e-02,
         -2.52336939e-03, -1.24337226e-02,  2.14219075e-02,
          1.95146650e-02,  4.64443117e-03, -1.65033098e-02,
         -6.60325680e-03,  5.74517623e-03, -4.78223723e-04,
          1.84162650e-02,  2.57913526e-02],
        [-5.38586813e-04, -6.52092416e-03,  6.44294196e-04,
         -6.73938636e-03, -1.04875378e-02, -3.52804619e-03,
          3.01169907e-03,  1.57165024e-02, -1.24798827e-02,
          1.38406828e-03, -8.65799654e-03, -9.68982372e-03,
          2.74600722e-02,  1.32546853e-02,  5.60571346e-03,
    

In [111]:
# Experiment: compare a Keras Reshape layer with a plain tf.reshape.
# NOTE(review): this rebinds the name `reshaper`, which is also the function that
# ItemModel_v2 wraps in a Lambda layer when it is constructed. Any ItemModel_v2
# built after this cell runs would pick up the Reshape layer instead (the next
# cell's output shows it yields a (2, 1, 5, 1) tensor rather than (2, 5, 1)).
# Consider a different variable name here.
reshaper = tf.keras.layers.Reshape((-1,5,1))
tf.reshape(test_names, (-1,5,1))

<tf.Tensor: shape=(2, 5, 1), dtype=string, numpy=
array([[[b'C I C HOLDINGS PLC'],
        [b'ASIRI SURGICAL HOSPITAL PLC'],
        [b'RENUKA AGRI FOODS PLC'],
        [b'REGNIS (LANKA) PLC'],
        [b'ACCESS ENGINEERING PLC']],

       [[b'RICHARD PIERIS AND COMPANY PLC'],
        [b'THREE ACRE FARMS PLC'],
        [b'ASIA SIYAKA COMMODITIES LIMITED'],
        [b'MILLENNIUM HOUSING DEVELOPERS PLC'],
        [b'RESUS ENERGY PLC']]], dtype=object)>

In [110]:
reshaper(test_names)

<tf.Tensor: shape=(2, 1, 5, 1), dtype=string, numpy=
array([[[[b'C I C HOLDINGS PLC'],
         [b'ASIRI SURGICAL HOSPITAL PLC'],
         [b'RENUKA AGRI FOODS PLC'],
         [b'REGNIS (LANKA) PLC'],
         [b'ACCESS ENGINEERING PLC']]],


       [[[b'RICHARD PIERIS AND COMPANY PLC'],
         [b'THREE ACRE FARMS PLC'],
         [b'ASIA SIYAKA COMMODITIES LIMITED'],
         [b'MILLENNIUM HOUSING DEVELOPERS PLC'],
         [b'RESUS ENERGY PLC']]]], dtype=object)>

In [64]:
# Rebuild the feature vocabularies from `portfolios` (read in batches of 10000).
items_ids = portfolios.batch(10000).map(lambda row: row["STOCKCODE"])
item_names = portfolios.batch(10000).map(lambda row: row["STOCKNAME"])
item_GICS = portfolios.batch(10000).map(lambda row: row["GICS"])
user_ids = portfolios.batch(10000).map(lambda row: row["CDSACCNO"])

unique_item_ids = np.unique(np.concatenate(list(items_ids)))
unique_item_names = np.unique(np.concatenate(list(item_names)))
unique_item_gics = np.unique(np.concatenate(list(item_GICS)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# Standalone item tower used to inspect its sub-layers in the cells below.
test_model = ItemModel_v2(
    unique_item_ids=unique_item_ids,
    unique_item_gics=unique_item_gics,
    unique_item_names=unique_item_names,
)

In [65]:
embed_name = test_model.embed_item_name
embed_id = test_model.embed_item_id

In [66]:
test_batch_2 = next(iter(train_list_ds.batch(2)))

In [67]:
embed_id(test_batch_2['STOCKCODE']).shape

TensorShape([2, 5, 16])

In [68]:
test_names = test_batch_2['STOCKNAME']
test_names

<tf.Tensor: shape=(2, 5), dtype=string, numpy=
array([[b'C I C HOLDINGS PLC', b'ASIRI SURGICAL HOSPITAL PLC',
        b'RENUKA AGRI FOODS PLC', b'REGNIS (LANKA) PLC',
        b'ACCESS ENGINEERING PLC'],
       [b'RICHARD PIERIS AND COMPANY PLC', b'THREE ACRE FARMS PLC',
        b'ASIA SIYAKA COMMODITIES LIMITED',
        b'MILLENNIUM HOUSING DEVELOPERS PLC', b'RESUS ENERGY PLC']],
      dtype=object)>

In [69]:
tf.reshape(test_names, (-1,5,1))

<tf.Tensor: shape=(2, 5, 1), dtype=string, numpy=
array([[[b'C I C HOLDINGS PLC'],
        [b'ASIRI SURGICAL HOSPITAL PLC'],
        [b'RENUKA AGRI FOODS PLC'],
        [b'REGNIS (LANKA) PLC'],
        [b'ACCESS ENGINEERING PLC']],

       [[b'RICHARD PIERIS AND COMPANY PLC'],
        [b'THREE ACRE FARMS PLC'],
        [b'ASIA SIYAKA COMMODITIES LIMITED'],
        [b'MILLENNIUM HOUSING DEVELOPERS PLC'],
        [b'RESUS ENERGY PLC']]], dtype=object)>

In [70]:
textvectorizer = tf.keras.layers.TextVectorization(max_tokens = 500)
textvectorizer.adapt(unique_item_names)

In [116]:
vectors = textvectorizer(tf.reshape(test_names, (-1,5,1)))
vectors

<tf.Tensor: shape=(2, 5, 5), dtype=int64, numpy=
array([[[ 15,  81,  15,   6,   2],
        [ 97, 132,  83,   2,   0],
        [ 21, 348,  22,   2,   0],
        [156,   3,   2,   0,   0],
        [352,  34,   2,   0,   0]],

       [[ 63,  67,  18,   8,   2],
        [123, 350,  89,   2,   0],
        [ 23, 139, 299, 213,   0],
        [196,  82, 289,   2,   0],
        [153,  50,   2,   0,   0]]], dtype=int64)>

In [118]:
# Hand-computed mean of the first embedding component over the five token
# embeddings of the first title (values copied from the `embeddings` output);
# presumably a manual cross-check of the averaging over the token axis.
(-0.03526639 + -0.0114462 + -0.03526639 + 0.00864266 + -0.04725361)/5

-0.024117986

In [117]:
embedder = tf.keras.layers.Embedding(
                input_dim = 500,
                output_dim = 32,
                mask_zero = True
            )
embeddings = embedder(vectors)
embeddings

<tf.Tensor: shape=(2, 5, 5, 32), dtype=float32, numpy=
array([[[[-0.03526639,  0.0063043 ,  0.0020691 , ..., -0.0284256 ,
          -0.03538778, -0.02623254],
         [-0.0114462 , -0.01860519,  0.00267724, ...,  0.03415303,
           0.03719492,  0.03268776],
         [-0.03526639,  0.0063043 ,  0.0020691 , ..., -0.0284256 ,
          -0.03538778, -0.02623254],
         [ 0.00864266,  0.02571342, -0.00659993, ..., -0.01677393,
          -0.01116262, -0.04866154],
         [-0.04725361,  0.04160712,  0.04662491, ...,  0.04618387,
           0.00121211, -0.01943322]],

        [[-0.01647447, -0.01326246,  0.03902424, ..., -0.03844554,
           0.03941436, -0.04744345],
         [ 0.03146467, -0.04312798,  0.01367838, ...,  0.02890309,
          -0.01371483,  0.00424818],
         [ 0.01097937, -0.03166606, -0.01123408, ..., -0.04804216,
          -0.04659515, -0.04653599],
         [-0.04725361,  0.04160712,  0.04662491, ...,  0.04618387,
           0.00121211, -0.01943322],
       

In [73]:
import tensorflow as tf
from tensorflow.keras import layers

def global_average_mean(x):
  """Average `x` over its second-to-last axis (axis=-2), removing that axis.

  Same computation as the `global_average_mean` defined alongside ItemModel_v2;
  repeated here so this sandbox experiment is self-contained.
  """
  return tf.reduce_mean(x, axis=-2)

# Wrap the pooling function in a Lambda layer and apply it to the token
# embeddings: (2, 5, 5, 32) -> (2, 5, 32).
pooling_layer = layers.Lambda(global_average_mean)
pooled_output = pooling_layer(embeddings)

print(pooled_output.shape)


(2, 5, 32)


In [74]:
pooled_output

<tf.Tensor: shape=(2, 5, 32), dtype=float32, numpy=
array([[[ 0.0044423 , -0.0336897 ,  0.01354054, -0.00931381,
          0.00549586,  0.01362762, -0.03901863, -0.01187994,
          0.02129803, -0.00613534,  0.00781265, -0.01290489,
         -0.01340685, -0.00717874,  0.01737031, -0.00609877,
         -0.01554356, -0.01132214, -0.01526339,  0.01019208,
          0.0088367 , -0.00601292,  0.00354197, -0.00140394,
          0.01031147,  0.01117107,  0.01016952,  0.03967675,
         -0.01369134, -0.00948287, -0.01087032,  0.01436354],
        [-0.00204879, -0.00146163, -0.00058674,  0.02167712,
          0.02092775,  0.00589011,  0.00998723,  0.01519163,
          0.00787401,  0.0090885 ,  0.00409649, -0.00570285,
          0.00550621,  0.00910141,  0.00170779, -0.01646154,
         -0.01633897, -0.00600685,  0.02351312,  0.00239507,
          0.00744135,  0.00674596, -0.00980858,  0.00904744,
         -0.03092899, -0.01030598, -0.01142093, -0.00435836,
          0.00885344,  0.0201718

In [235]:
# embed_name(test_names) #tf.reshape(test_names, (-1, 5, 1))