In [16]:
import os
import pprint
import tempfile

from typing import Dict, Text

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

In [17]:
import tensorflow_recommenders as tfrs

In [18]:
# ratings = tfds.load("movielens/100k-ratings", split="train")

# ratings = ratings.map(lambda x: {
#     "movie_title": x["movie_title"],
#     "timestamp": x["timestamp"],
#     "user_id": x["user_id"],
#     "user_rating": x["user_rating"]
# })

In [19]:
portfolios = tf.data.Dataset.load("../../data/portfolios_tfds")

In [20]:
next(iter(portfolios))

{'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'SEMB'>,
 'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=5.0>,
 'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Diversified Financials'>,
 'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-696600287-VN/00'>,
 'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1643567400.0>,
 'STOCKNAME': <tf.Tensor: shape=(), dtype=string, numpy=b'S M B LEASING PLC'>}

In [21]:
len(portfolios)

3847

In [22]:
tf.random.set_seed(42)
shuffled = portfolios.shuffle(5000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(int(len(portfolios)*0.8))
test = shuffled.skip(int(len(portfolios)*0.8)).take(int(len(portfolios)*0.2))

In [94]:
train.save("../../data/train")
test.save("../../data/test")

In [23]:
items_ids = portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
item_names = portfolios.batch(10000).map(lambda x: x["STOCKNAME"])
item_GICS = portfolios.batch(10000).map(lambda x: x["GICS"])

user_ids = portfolios.batch(10000).map(lambda x: x["CDSACCNO"])

unique_item_ids = np.unique(np.concatenate(list(items_ids)))
unique_item_names = np.unique(np.concatenate(list(item_names)))
unique_item_gics = np.unique(np.concatenate(list(item_GICS)))

unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# need these to initialize timestamp embedding layers in future steps

timestamps = np.concatenate(list(portfolios.map(lambda x: x["UNIX_TS"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

In [24]:
'''
this handles embedding user Identifiers and contextual data.
time stamp is used as the contexual information here.
using timestamp is 
'''

class UserModel(tf.keras.Model):
    def __init__(
        self,
        use_timestamp,
        unique_user_ids, 
        timestamps,
        timestamp_buckets):

        super().__init__()

        self.use_timestamp = use_timestamp
        self.unique_user_ids = unique_user_ids
        self.timestamp_buckets = timestamp_buckets
        self.timestamps = timestamps
        
        self.embed_user_id = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary = self.unique_user_ids,
                mask_token = None
            ),
            tf.keras.layers.Embedding(
                input_dim = len(self.unique_user_ids)+1,
                output_dim = 32
            )
        ])

        if self.use_timestamp:
            self.embed_timestamp = tf.keras.Sequential([
                tf.keras.layers.Discretization(
                    bin_boundaries = list(self.timestamp_buckets)
                ),

                tf.keras.layers.Embedding(
                    input_dim = len(list(self.timestamp_buckets))+1 ,
                    output_dim = 32
                )
            ])

            self.normalize_timestamp = tf.keras.layers.Normalization(
                axis = None #calcuate a scaler mean and variance 
            )
            self.normalize_timestamp.adapt(self.timestamps)

    
    def call(self, inputs):

        user_id, timestamp = inputs

        if self.use_timestamp:
            user_id_embed = self.embed_user_id(user_id)
            timestamp_embed = self.embed_timestamp(timestamp)
            norm_timestamp = tf.reshape(self.normalize_timestamp(timestamp), (-1,1)) #(-1,1) means first dimension to be infered

            return tf.concat([user_id_embed, timestamp_embed, norm_timestamp], axis = 1) #concatenate vertically
            
        return self.embed_user_id(user_id)

In [25]:
'''
{'STOCKNAME': <tf.Tensor: shape=(), dtype=string, numpy=b'S M B LEASING PLC'>,
 'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Diversified Financials'>,
 'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=5.0>,
 'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-696600287-VN/00'>,
 'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1643567400.0>,
 'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'SEMB'>}
 '''

"\n{'STOCKNAME': <tf.Tensor: shape=(), dtype=string, numpy=b'S M B LEASING PLC'>,\n 'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Diversified Financials'>,\n 'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=5.0>,\n 'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-696600287-VN/00'>,\n 'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1643567400.0>,\n 'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'SEMB'>}\n "

In [63]:
'''
this handles embedding item Identifiers and contextual data.
movie title itself is used as the contexual information here.
using timestamp is 
'''

class ItemModel(tf.keras.Model):
    def __init__(
        self,
        unique_item_ids,
        unique_item_names,
        unique_item_gics

        ):
        super().__init__()

        self.max_tokens = 10000
        self.unique_item_ids = unique_item_ids
        self.unique_item_names = unique_item_names
        self.unique_item_gics = unique_item_gics

        self.embed_item_id = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary = self.unique_item_ids,
                mask_token =None
            ),
            tf.keras.layers.Embedding(
                input_dim = len(self.unique_item_ids)+1,
                output_dim = 32
            )
        ])

        self.embed_items_gics = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary = unique_item_gics,
                mask_token = None
            ),
            tf.keras.layers.Embedding(
                input_dim = len(unique_item_gics)+1,
                output_dim = len(unique_item_gics)
            )
        ])


        self.textvectorizer = tf.keras.layers.TextVectorization(
            max_tokens = self.max_tokens
        )

        self.embed_item_name = tf.keras.Sequential([
            self.textvectorizer,

            tf.keras.layers.Embedding(
                input_dim = self.max_tokens,
                output_dim = 32,
                mask_zero = True
            ),

            tf.keras.layers.GlobalAveragePooling1D() # reduces dimensionality to 1d (embedding layer embeddeds each word in a title one by one)
        ])

        self.textvectorizer.adapt(self.unique_item_names)
    
    def call(self, inputs):

        item_id, item_name, item_gics = inputs

        return tf.concat([
            self.embed_item_id(item_id),
            self.embed_item_name(item_name),
            self.embed_items_gics(item_gics)
        ],
        axis = 1)
        
        # return self.embed_item_title(inputs['movie_title'])

In [69]:
class RankingModel(tf.keras.Model):

  def __init__(

    self,
    use_timestamp,
    unique_user_ids, 
    timestamps, 
    timestamp_buckets,
    unique_item_ids,
    unique_item_names,
    unique_item_gics
    ):
    
    super().__init__()

    # embedding_dimension = 32
    self.use_timestamp = use_timestamp
    self.unique_user_ids = unique_user_ids 
    self.timestamps = timestamps
    self.timestamp_buckets = timestamp_buckets
    self.unique_item_ids = unique_item_ids
    self.unique_item_names = unique_item_names
    self.unique_item_gics = unique_item_gics

    self.user_embeddings = UserModel(
      use_timestamp = self.use_timestamp,
      unique_user_ids = self.unique_user_ids, 
      timestamps = self.timestamps, 
      timestamp_buckets = self.timestamp_buckets
      )

    self.item_embeddings = ItemModel(
      unique_item_ids = self.unique_item_ids,
      unique_item_names = self.unique_item_names,
      unique_item_gics = self.unique_item_gics
      )

    # Compute predictions.
    self.ratings = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
  ])

  def call(self, inputs):

    user_id, timestamp, item_id, item_name, item_gics = inputs

    user_embedding = self.user_embeddings((user_id,timestamp))
    item_embedding = self.item_embeddings((item_id, item_name, item_gics))

    return self.ratings(tf.concat([user_embedding, item_embedding], axis=1))

In [34]:
test_user_model = UserModel(
    use_timestamp = True,
    unique_user_ids = unique_user_ids, 
    timestamps = timestamps, 
    timestamp_buckets = timestamp_buckets,
)

In [37]:
next(iter(portfolios))

{'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'SEMB'>,
 'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=5.0>,
 'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Diversified Financials'>,
 'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-696600287-VN/00'>,
 'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1643567400.0>,
 'STOCKNAME': <tf.Tensor: shape=(), dtype=string, numpy=b'S M B LEASING PLC'>}

In [64]:
test_item_model = ItemModel(
    unique_item_ids = unique_item_ids,
    unique_item_names = unique_item_names,
    unique_item_gics = unique_item_gics
)

In [65]:
# test_user_model((["RPS-696600287-VN/00"],[1643567400.0]))
test_item_model((['SEMB'],['S M B LEASING PLC'],['Diversified Financials']))





<tf.Tensor: shape=(1, 97), dtype=float32, numpy=
array([[ 4.3705329e-03, -2.2394991e-02, -3.5920691e-02, -2.1699822e-02,
        -2.9262269e-02,  4.4337008e-02, -6.8609938e-03, -2.2273827e-02,
        -1.1803556e-02, -1.7403759e-02,  3.2857466e-02, -4.0586103e-02,
         3.7861932e-02, -1.7795421e-02, -4.2396702e-02, -7.9156756e-03,
         3.2449391e-02, -2.6859224e-02, -4.7786739e-02, -2.9492307e-02,
         3.7730385e-02, -5.2641630e-03,  2.5108252e-02, -2.3457885e-02,
        -4.8597634e-02, -1.6620744e-02, -6.5700896e-03, -2.9219568e-02,
        -8.4600076e-03,  3.0909069e-03, -3.2440647e-03, -4.2207129e-03,
         1.3744962e-02,  7.0432923e-03,  1.6609214e-03, -1.9461032e-02,
        -8.6226838e-04,  6.2110219e-03,  1.7358879e-02,  4.6559861e-03,
         2.9412438e-03, -1.7536521e-02,  1.2989625e-02,  1.5450200e-02,
         2.3383677e-02,  5.7675401e-03,  1.7021524e-02,  3.4264922e-03,
        -1.2886984e-02,  1.6103169e-02,  1.1692495e-02, -1.0436247e-02,
        -2.2977

In [70]:
test_ranking_model = RankingModel(
    use_timestamp = True,
    unique_user_ids = unique_user_ids, 
    timestamps = timestamps, 
    timestamp_buckets = timestamp_buckets,
    unique_item_ids = unique_item_ids,
    unique_item_names = unique_item_names,
    unique_item_gics = unique_item_gics
    )

test_ranking_model((["RPS-696600287-VN/00"],[1643567400.0],['SEMB'],['S M B LEASING PLC'],['Diversified Financials']))



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.13506046]], dtype=float32)>

In [73]:
class MovielensModel(tfrs.models.Model):

  def __init__(
    self,
    use_timestamp,
    unique_user_ids, 
    timestamps, 
    timestamp_buckets,
    unique_item_ids,
    unique_item_names,
    unique_item_gics
    ):

    super().__init__()

    self.use_timestamp = use_timestamp
    self.unique_user_ids = unique_user_ids 
    self.timestamps = timestamps
    self.timestamp_buckets = timestamp_buckets
    self.unique_item_ids = unique_item_ids
    self.unique_item_names = unique_item_names
    self.unique_item_gics = unique_item_gics

    self.ranking_model: tf.keras.Model = RankingModel(
      use_timestamp = self.use_timestamp,
      unique_user_ids = self.unique_user_ids,
      timestamps = self.timestamps,
      timestamp_buckets = self.timestamp_buckets,
      unique_item_ids = self.unique_item_ids,
      unique_item_names = self.unique_item_names,
      unique_item_gics = self.unique_item_gics
      )

    self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
      loss = tf.keras.losses.MeanSquaredError(),
      metrics=[tf.keras.metrics.RootMeanSquaredError()]
    )

  def call(self, features) -> tf.Tensor:
    return self.ranking_model(
        (features["CDSACCNO"],
        features['UNIX_TS'],
        features["STOCKCODE"],
        features["STOCKNAME"],
        features["GICS"]))

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    labels = features.pop("RATING")

    rating_predictions = self(features)

    return self.task(labels=labels, predictions=rating_predictions)

In [None]:
'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'SEMB'>,
 'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=5.0>,
 'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Diversified Financials'>,
 'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'RPS-696600287-VN/00'>,
 'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1643567400.0>,
 'STOCKNAME

In [74]:
model = MovielensModel(
    use_timestamp = True,
    unique_user_ids = unique_user_ids, 
    timestamps = timestamps, 
    timestamp_buckets = timestamp_buckets,
    unique_item_ids = unique_item_ids,
    unique_item_names = unique_item_names,
    unique_item_gics = unique_item_gics
    )
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [75]:
cached_train = train.shuffle(10000).batch(128).cache()
cached_test = test.batch(128).cache()

In [76]:
model.fit(cached_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2151580fca0>

In [77]:
model.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 1.5991066694259644,
 'loss': 0.31283149123191833,
 'regularization_loss': 0,
 'total_loss': 0.31283149123191833}

In [84]:
import pandas as pd

stock_info = pd.read_excel('../../data/stock_data.xlsx')
stock_info = stock_info.drop(['Unnamed: 0'],axis = 1)
stock_info.shape

(282, 4)

In [88]:
stock_info.head()

Unnamed: 0,symbol,name,buisnesssummary,gics_code
0,HBS,hSenid Business Solutions PLC,An indigenous multinational catering towards m...,45103010 - Application Software
1,TYRE,KELANI TYRES PLC,Manufacturing tyres and tubes and marketing lo...,Automobiles & Components
2,ABL,AMANA BANK PLC,unknown,Banks
3,DFCC,DFCC BANK PLC,The principal activities of DFCC Bank include ...,Banks
4,COMB,COMMERCIAL BANK OF CEYLON PLC,Commercial Banking,Banks


In [89]:
symb_to_name = dict(zip(stock_info.symbol,stock_info.name))
symb_to_gics = dict(zip(stock_info.symbol, stock_info.gics_code))

In [93]:
test_ratings = {}
test_item_ids = ["SINS"] #, "KZOO", "LOFC","DIST"
for item_id in test_item_ids:
  test_ratings[item_id] = model({
      "CDSACCNO": np.array(["RPS-797423181-VN/00"]),
      "UNIX_TS": np.array([1664821800.0]),
      "STOCKCODE": np.array([item_id]),
      "STOCKNAME":np.array([symb_to_name.get(item_id)]),
      "GICS":np.array([symb_to_gics.get(item_id)])
  })

print("Ratings:")
for title, score in sorted(test_ratings.items(), key=lambda x: x[1], reverse=True):
  print(f"{title}: {score}")

Ratings:
SINS: [[0.6293522]]


In [92]:
next(iter(test.batch(1)))

{'STOCKCODE': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'SINS'], dtype=object)>,
 'RATING': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.], dtype=float32)>,
 'GICS': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Retailing'], dtype=object)>,
 'CDSACCNO': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'RPS-797423181-VN/00'], dtype=object)>,
 'UNIX_TS': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.6648218e+09], dtype=float32)>,
 'STOCKNAME': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'SINGER (SRI LANKA) PLC'], dtype=object)>}