In [92]:
import pandas

In [93]:
import numpy as np
import tensorflow as tf
# import tensorflow_datasets as tfds

In [94]:
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs

In [95]:
train_ds = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/movie_lens/ratings_train").cache() #data\ratings_train
test_ds = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/movie_lens/ratings_test").cache()

ratings = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/movie_lens/ratings_all").cache()

In [96]:
next(iter(ratings))

{'movie_title': <tf.Tensor: shape=(), dtype=string, numpy=b'Postman, The (1997)'>,
 'timestamp': <tf.Tensor: shape=(), dtype=int64, numpy=885409515>,
 'user_rating': <tf.Tensor: shape=(), dtype=float32, numpy=4.0>,
 'user_id': <tf.Tensor: shape=(), dtype=string, numpy=b'681'>}

In [97]:
movies = ratings.map(lambda x: x["movie_title"])

unique_movie_titles = np.unique(np.concatenate(list(movies.batch(1000))))
unique_movie_user_ids = np.unique(np.concatenate(list(ratings.batch(1_000).map(
    lambda x: x["user_id"]))))

In [98]:
tf.random.set_seed(42)

# Split between train and tests sets, as before.
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

# We sample 50 lists for each user for the training data. For each list we
# sample 5 movies from the movies the user rated.
train = tfrs.examples.movielens.sample_listwise(
    train,
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42
)
test = tfrs.examples.movielens.sample_listwise(
    test,
    num_list_per_user=1,
    num_examples_per_list=5,
    seed=42
)

In [99]:
class RankingModel(tfrs.Model):

  def __init__(self, loss):
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids),
      tf.keras.layers.Embedding(len(unique_user_ids) + 2, embedding_dimension)
    ])

    # Compute embeddings for movies.
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_movie_titles),
      tf.keras.layers.Embedding(len(unique_movie_titles) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.user_embeddings(features["user_id"])

    # Movie embeddings are a [batch_size, num_movies_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # We want to concatenate user embeddings with movie emebeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of movie embeddings.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("user_rating")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [100]:
epochs = 5

cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

In [101]:
mse_model = RankingModel(tf.keras.losses.MeanSquaredError())
mse_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [102]:
mse_model.fit(cached_train, epochs= 5 , verbose=1)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e30bd997f0>

# Atrad Model

In [103]:
portfolios = tf.data.Dataset.load("../../data/portfolios_tfds_lists")

In [104]:
train_list_ds = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/train_lists_ds").cache()
test_list_ds = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/test_lists_ds").cache()

In [105]:
next(iter(train_list_ds))

{'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-23479-LI/00'>,
 'STOCKCODE': <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'CIC', b'AMSL', b'RAL', b'REG', b'AEL'], dtype=object)>,
 'UNIX_TS': <tf.Tensor: shape=(5,), dtype=float32, numpy=
 array([1.6475418e+09, 1.6681050e+09, 1.7100954e+09, 1.7030970e+09,
        1.6655994e+09], dtype=float32)>,
 'GICS': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'Materials', b'Health Care Equipment & Services',
        b'Food Beverage & Tobacco', b'Consumer Durables & Apparel',
        b'Capital Goods'], dtype=object)>,
 'STOCKNAME': <tf.Tensor: shape=(5,), dtype=string, numpy=
 array([b'C I C HOLDINGS PLC', b'ASIRI SURGICAL HOSPITAL PLC',
        b'RENUKA AGRI FOODS PLC', b'REGNIS (LANKA) PLC',
        b'ACCESS ENGINEERING PLC'], dtype=object)>,
 'RATING': <tf.Tensor: shape=(5,), dtype=float32, numpy=array([4., 2., 5., 5., 2.], dtype=float32)>}

In [145]:
items_ids = portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
item_names = portfolios.batch(10000).map(lambda x: x["STOCKNAME"])

user_ids = portfolios.batch(10000).map(lambda x: x["CDSACCNO"])

items_ids = portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
item_names = portfolios.batch(10000).map(lambda x: x["STOCKNAME"])
item_GICS = portfolios.batch(10000).map(lambda x: x["GICS"])

user_ids = portfolios.batch(10000).map(lambda x: x["CDSACCNO"])

unique_item_ids = np.unique(np.concatenate(list(items_ids)))
unique_item_names = np.unique(np.concatenate(list(item_names)))
unique_item_gics = np.unique(np.concatenate(list(item_GICS)))

In [153]:
# # from user_embedding import UserModel

# class UserModel(tf.keras.Model):
#     def __init__(
#         self,
#         unique_user_ids, 
#         ):

#         super().__init__()

#         self.unique_user_ids = unique_user_ids
        
#         self.embed_user_id = tf.keras.Sequential([
#             tf.keras.layers.StringLookup(
#                 vocabulary = self.unique_user_ids,
#                 mask_token = None
#             ),
#             tf.keras.layers.Embedding(
#                 input_dim = len(self.unique_user_ids)+1,
#                 output_dim = 32
#             )
#         ])

    
#     def call(self, inputs):

#         (user_id) = inputs  #, timestamp
#         return self.embed_user_id(user_id)

In [159]:
from user_embedding import UserModel
from item_embedding import ItemModel

class RankingModel_trad(tfrs.Model):

  def __init__(self, portfolios ,loss):
    super().__init__()

    embedding_dimension = 32,
    self.loss = loss
    
    self.portfolios = portfolios

    self.items_ids = self.portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
    self.item_GICS = self.portfolios.batch(10000).map(lambda x: x["GICS"])
    self.unique_item_ids = np.unique(np.concatenate(list(self.items_ids)))
    self.unique_item_gics = np.unique(np.concatenate(list(self.item_GICS)))

    self.user_ids = self.portfolios.batch(10000).map(lambda x: x["CDSACCNO"])
    self.unique_user_ids = np.unique(np.concatenate(list(self.user_ids)))


    self.user_embeddings = UserModel(
      unique_user_ids = unique_user_ids
    )

    ## Compute embeddings for movies.
    # self.embed_item_code = tf.keras.Sequential([
    #   tf.keras.layers.StringLookup(
    #     vocabulary= self.unique_item_ids),
    #   tf.keras.layers.Embedding(len(unique_item_ids) + 2, embedding_dimension)
    # ])

    # self.embed_items_gics = tf.keras.Sequential([
    #         tf.keras.layers.StringLookup(
    #             vocabulary = self.unique_item_gics,
    #             mask_token = None
    #         ),
    #         tf.keras.layers.Embedding(
    #             input_dim = len(self.unique_item_gics)+1,
    #             output_dim = 16 #len(unique_item_gics)
    #         )
    #     ])

    self.item_embeddings = ItemModel(
      unique_item_ids = self.unique_item_ids,
      unique_item_gics = self.unique_item_gics
    )

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    user_embeddings = self.user_embeddings(features["CDSACCNO"])
    # code_embeddings = self.embed_item_code(features["STOCKCODE"])
    # gics_embeddings = self.embed_items_gics(features['GICS'])

    item_embeddings = self.item_embeddings((
      features['STOCKCODE'],
      features['GICS']
      ))

    print()
    list_length = features["STOCKCODE"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, item_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    labels = features.pop("RATING")

    scores = self(features)

    return self.task(
        labels=labels,
        predictions=tf.squeeze(scores, axis=-1),
    )

In [160]:
len(train_list_ds)


6950

In [161]:
epochs = 5

cached_train_trad = train_list_ds.shuffle(100_000).batch(128).cache()
cached_test_trad = test_list_ds.batch(128).cache()

In [162]:
# next(iter(cached_train_trad))

In [165]:
trad_model = RankingModel_trad(
    portfolios = portfolios,
    loss = tf.keras.losses.MeanSquaredError()
    )
trad_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [166]:
trad_model.fit(cached_train_trad, epochs= 5 , verbose=1)

Epoch 1/5


UnboundLocalError: in user code:

    File "c:\Users\bpadmin\anaconda3\envs\atrad_cars_v2\lib\site-packages\keras\engine\training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\bpadmin\anaconda3\envs\atrad_cars_v2\lib\site-packages\keras\engine\training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\bpadmin\anaconda3\envs\atrad_cars_v2\lib\site-packages\keras\engine\training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\bpadmin\anaconda3\envs\atrad_cars_v2\lib\site-packages\tensorflow_recommenders\models\base.py", line 68, in train_step
        loss = self.compute_loss(inputs, training=True)
    File "C:\Users\naradaw\AppData\Local\Temp\ipykernel_23188\1574606882.py", line 88, in compute_loss
        scores = self(features)
    File "c:\Users\bpadmin\anaconda3\envs\atrad_cars_v2\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\naradaw\AppData\Local\Temp\__autograph_generated_filekbjyz1n_.py", line 11, in tf__call
        item_embeddings = ag__.converted_call(ag__.ld(item_embeddings), ((ag__.ld(features)['STOCKCODE'], ag__.ld(features)['GICS']),), None, fscope)

    UnboundLocalError: Exception encountered when calling layer 'ranking_model_trad_5' (type RankingModel_trad).
    
    in user code:
    
        File "C:\Users\naradaw\AppData\Local\Temp\ipykernel_23188\1574606882.py", line 70, in call  *
            item_embeddings = item_embeddings((
    
        UnboundLocalError: local variable 'item_embeddings' referenced before assignment
    
    
    Call arguments received by layer 'ranking_model_trad_5' (type RankingModel_trad):
      • features={'CDSACCNO': 'tf.Tensor(shape=(None,), dtype=string)', 'STOCKCODE': 'tf.Tensor(shape=(None, 5), dtype=string)', 'UNIX_TS': 'tf.Tensor(shape=(None, 5), dtype=float32)', 'GICS': 'tf.Tensor(shape=(None, 5), dtype=string)', 'STOCKNAME': 'tf.Tensor(shape=(None, 5), dtype=string)'}


In [152]:
# next(iter(cached_train_trad))