In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

import numpy as np
import pandas as pd

In [2]:
tf.__version__

'2.10.1'

In [3]:
from tqdm.keras import TqdmCallback

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
test_ds = tf.data.Dataset.load(r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_test").cache()

portfolios = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/portfolios_v2/portfolios").cache()

items_ids = portfolios.batch(10000).map(lambda x: x["STOCKCODE"])
item_names = portfolios.batch(10000).map(lambda x: x["STOCKNAME"])
item_GICS = portfolios.batch(10000).map(lambda x: x["GICS"])

user_ids = portfolios.batch(10000).map(lambda x: x["CDSACCNO"])

unique_item_ids = np.unique(np.concatenate(list(items_ids)))
unique_item_names = np.unique(np.concatenate(list(item_names)))
unique_item_gics = np.unique(np.concatenate(list(item_GICS)))

unique_user_ids = np.unique(np.concatenate(list(user_ids)))

# need these to initialize timestamp embedding layers in future steps

timestamps = np.concatenate(list(portfolios.map(lambda x: x["UNIX_TS"]).batch(100)))
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000,
)

In [5]:
len(portfolios)

157854

In [6]:
from retrieval_recommender import Retriever

retriever = Retriever(
    use_timestamp = True,
    portfolios = portfolios
)

retriever.load_weights(r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_05_27\retriever_port_v2")

retriever.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))


In [7]:
from ranker_recommender import Ranker

ranker = Ranker(
    loss = tf.keras.losses.MeanSquaredError(),
    portfolios = portfolios
)

ranker.load_weights(r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_05_27\tf_listwise_ranking_2024_05_27_11_20")
ranker.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))


In [8]:
stock_info = pd.read_excel('../../data/stock_data.xlsx')
stock_info = stock_info.drop(['Unnamed: 0','buisnesssummary'],axis = 1)
stock_info = stock_info.rename(columns = {
    'symbol':'STOCKCODE',
    'name' : 'STOCKNAME',
    'gics_code' : 'GICS'
})
stock_info = stock_info[~stock_info['GICS'].isna()]

stock_info.shape
print("items data shape :: {}".format(stock_info.shape))

items_ds = tf.data.Dataset.from_tensor_slices(stock_info.to_dict(orient= 'list'))

items data shape :: (280, 3)


# Retriever function

In [9]:
items_identiifiers = items_ds.map(lambda x: x["STOCKCODE"])
items_identiifiers = next(iter(items_identiifiers.batch(len(items_identiifiers))))
items_identiifiers.shape

TensorShape([280])

In [10]:
index = tfrs.layers.factorized_top_k.BruteForce(retriever.user_model)
retriever_item_model = retriever.item_model
mapped_items = items_ds.batch(len(items_ds)).map(lambda x : retriever_item_model(x, map_ = True))

mapped_items_tensor = next(iter(mapped_items))
index.index(mapped_items_tensor, items_identiifiers)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x1c0873b7430>

In [11]:
next(iter(test_ds))

{'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1663266600.0>,
 'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=2.0>,
 'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Real Estate'>,
 'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'OSEA'>,
 'STOCKNAME': <tf.Tensor: shape=(), dtype=string, numpy=b'OVERSEAS REALTY (CEYLON) PLC'>,
 'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'HDF-40956-LI/00'>}

In [12]:
# a test user ID & timestamp examples

test_user = str('RPS-797423181-VN/00')
test_timestamp = 1664821800.0

In [13]:
_, recommendations = index(
    (
        tf.constant([test_user]),
        tf.constant([test_timestamp])
        )
    )

recommendations = [reco.decode('utf-8') for reco in recommendations.numpy().flatten()]
print(f"Recommendations for user %s: {recommendations}" %(test_user))
# titles[0]

Recommendations for user RPS-797423181-VN/00: ['BBH', 'HSIG', 'MAL', 'JKL', 'SIGV', 'TAJ', 'LIOC', 'BALA', 'CARS', 'HOPL']


# Ranker function

In [14]:
stock_info.head(2)

Unnamed: 0,STOCKCODE,STOCKNAME,GICS
0,HBS,hSenid Business Solutions PLC,45103010 - Application Software
1,TYRE,KELANI TYRES PLC,Automobiles & Components


In [15]:
code2name = dict(zip(stock_info.STOCKCODE, stock_info.STOCKNAME))
code2gics = dict(zip(stock_info.STOCKCODE, stock_info.GICS))

In [16]:
names = np.array([code2name[code] for code in recommendations])
gics = np.array([code2gics[code] for code in recommendations])

In [17]:
names

array(['BROWNS BEACH HOTELS PLC', 'HOTEL SIGIRIYA PLC',
       'MALWATTE VALLEY PLANTATION PLC', 'JOHN KEELLS PLC',
       'SIGIRIYA VILLAGE HOTELS PLC', 'TAL LANKA HOTELS PLC',
       'LANKA IOC PLC', 'BALANGODA  PLANTATIONS  PLC',
       'CARSON CUMBERBATCH PLC', 'HORANA PLANTATIONS PLC'], dtype='<U30')

In [18]:
user = {
    'CDSACCNO' : np.array([test_user]),
    'STOCKCODE' : np.array(recommendations).reshape(-1,10),
    'GICS' : gics.reshape(-1,10),
    'STOCKNAME' : names.reshape(-1,10)
    }

In [19]:
user['STOCKNAME']

array([['BROWNS BEACH HOTELS PLC', 'HOTEL SIGIRIYA PLC',
        'MALWATTE VALLEY PLANTATION PLC', 'JOHN KEELLS PLC',
        'SIGIRIYA VILLAGE HOTELS PLC', 'TAL LANKA HOTELS PLC',
        'LANKA IOC PLC', 'BALANGODA  PLANTATIONS  PLC',
        'CARSON CUMBERBATCH PLC', 'HORANA PLANTATIONS PLC']], dtype='<U30')

In [20]:
pred_ratings = ranker(user)
pred_ratings.numpy().flatten()

array([0.99589384, 1.8489531 , 2.004943  , 2.0225039 , 5.455021  ,
       1.3958615 , 2.3344648 , 2.0428324 , 1.4689542 , 2.0648806 ],
      dtype=float32)

In [21]:
recommendations_w_ratings = pd.DataFrame()
recommendations_w_ratings['STOCKCODE'] = recommendations
recommendations_w_ratings['PRED_RATING'] = pred_ratings.numpy().flatten()
recommendations_w_ratings = recommendations_w_ratings.sort_values( by = ['PRED_RATING'], ascending= False)
recommendations_w_ratings

Unnamed: 0,STOCKCODE,PRED_RATING
4,SIGV,5.455021
6,LIOC,2.334465
9,HOPL,2.064881
7,BALA,2.042832
3,JKL,2.022504
2,MAL,2.004943
1,HSIG,1.848953
8,CARS,1.468954
5,TAJ,1.395862
0,BBH,0.995894


# complete recommender function

In [22]:
def recommend_(CDSACCNO, timestamp = 1664821800.0):
    _, recommendations = index(
    (
        tf.constant([CDSACCNO]),
        tf.constant([timestamp])
        )
    )

    recommendations = [reco.decode('utf-8') for reco in recommendations.numpy().flatten()]
    # print(f"Recommendations for user %s: {recommendations}" %(test_user))

    user = {
    'CDSACCNO' : np.array([CDSACCNO]),
    'STOCKCODE' : np.array(recommendations).reshape(-1,10),
    'GICS' : gics.reshape(-1,10),
    'STOCKNAME' : names.reshape(-1,10)
    }

    pred_ratings = ranker(user)

    recommendations_w_ratings = pd.DataFrame()
    recommendations_w_ratings['STOCKCODE'] = recommendations
    recommendations_w_ratings['PRED_RATING'] = pred_ratings.numpy().flatten()
    recommendations_w_ratings = recommendations_w_ratings.sort_values( by = ['PRED_RATING'], ascending= False)
    return recommendations_w_ratings

In [23]:
test_user = str('RPS-797423181-VN/00')
test_timestamp = 1664821800.0

recommend_(test_user, test_timestamp)

Unnamed: 0,STOCKCODE,PRED_RATING
4,SIGV,5.455021
6,LIOC,2.334465
9,HOPL,2.064881
7,BALA,2.042832
3,JKL,2.022504
2,MAL,2.004943
1,HSIG,1.848953
8,CARS,1.468954
5,TAJ,1.395862
0,BBH,0.995894


In [24]:
len(test_ds)

31570

In [25]:
train_ds = tf.data.Dataset.load(r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_train").cache()

In [26]:
train_ds_1 = train_ds.batch(len(train_ds))
test_ds_1 = test_ds.batch(len(test_ds))

In [27]:
train_ds_dict = next(iter(train_ds_1))
test_ds_dict = next(iter(test_ds_1))

In [28]:
#test
data_dict = {
    'CDSACCNO': test_ds_dict['CDSACCNO'].numpy(),
    'STOCKCODE': test_ds_dict['STOCKCODE'].numpy(),
    'STOCKNAME': test_ds_dict['STOCKNAME'].numpy(),
    'GICS': test_ds_dict['GICS'].numpy(),
    'UNIX_TS': test_ds_dict['UNIX_TS'].numpy(),
    'RATING': test_ds_dict['RATING'].numpy(),
}

test_df = pd.DataFrame.from_dict(data_dict)

test_df = test_df.astype(
    {
        'CDSACCNO' : 'str',
        'STOCKCODE' : 'str',
        'STOCKNAME' : 'str',
        'GICS' : 'str'
        })

#train
data_dict = {
    'CDSACCNO': train_ds_dict['CDSACCNO'].numpy(),
    'STOCKCODE': train_ds_dict['STOCKCODE'].numpy(),
    'STOCKNAME': train_ds_dict['STOCKNAME'].numpy(),
    'GICS': train_ds_dict['GICS'].numpy(),
    'UNIX_TS': train_ds_dict['UNIX_TS'].numpy(),
    'RATING': train_ds_dict['RATING'].numpy(),
}

train_df = pd.DataFrame.from_dict(data_dict)

train_df = train_df.astype(
    {
        'CDSACCNO' : 'str',
        'STOCKCODE' : 'str',
        'STOCKNAME' : 'str',
        'GICS' : 'str'
        })

In [29]:
test_df.dtypes

CDSACCNO      object
STOCKCODE     object
STOCKNAME     object
GICS          object
UNIX_TS      float32
RATING       float32
dtype: object

In [30]:
test_df.head()

Unnamed: 0,CDSACCNO,STOCKCODE,STOCKNAME,GICS,UNIX_TS,RATING
0,HDF-40956-LI/00,OSEA,OVERSEAS REALTY (CEYLON) PLC,Real Estate,1663267000.0,2.0
1,BMS-921630085-VN/00,EDEN,EDEN HOTEL LANKA PLC,Consumer Services,1707849000.0,1.0
2,BMS-54136-LI/00,HNB,HATTON NATIONAL BANK PLC,Banks,1654108000.0,2.0
3,BMS-902032762-VN/00,RCL,ROYAL CERAMICS LANKA PLC,Capital Goods,1643222000.0,3.0
4,SBL-762412608-VN/00,CINS,CEYLINCO INSURANCE PLC,Insurance,1704911000.0,3.0


In [31]:
test_tfds = tf.data.Dataset.from_tensor_slices(test_ds_dict)

In [32]:
type(test_tfds), type(test_ds)

(tensorflow.python.data.ops.dataset_ops.TensorSliceDataset,
 tensorflow.python.data.ops.dataset_ops.CacheDataset)

In [33]:
import tensorflow as tf

tf.__version__

'2.10.1'

In [34]:
# import tensorflow_datasets as tfds

# tfds.__version__
# test_tfds.as_dataframe(test_tfds)

In [35]:
test_users_ = test_df.groupby('CDSACCNO')
train_users_ = train_df.groupby('CDSACCNO')

In [36]:
name = 'SBL-762412608-VN/00'

In [37]:
test_df.iloc[test_users_.groups[name]]

Unnamed: 0,CDSACCNO,STOCKCODE,STOCKNAME,GICS,UNIX_TS,RATING
4,SBL-762412608-VN/00,CINS,CEYLINCO INSURANCE PLC,Insurance,1704911000.0,3.0
2594,SBL-762412608-VN/00,LIOC,LANKA IOC PLC,Energy,1692729000.0,4.0
14747,SBL-762412608-VN/00,HHL,HEMAS HOLDINGS PLC,Capital Goods,1686595000.0,3.0
15204,SBL-762412608-VN/00,TJL,TEEJAY LANKA PLC,Consumer Durables & Apparel,1683830000.0,2.0


In [41]:
user_test_port_ = train_df.iloc[train_users_.groups[name]].sort_values('RATING', ascending = False)
user_test_port_

Unnamed: 0,CDSACCNO,STOCKCODE,STOCKNAME,GICS,UNIX_TS,RATING
34707,SBL-762412608-VN/00,SAMP,SAMPATH BANK PLC,Banks,1706553000.0,5.0
45541,SBL-762412608-VN/00,GRAN,CEYLON GRAIN ELEVATORS PLC,FOOD BEVERAGE & TOBACCO,1684348000.0,5.0
12661,SBL-762412608-VN/00,JKH,JOHN KEELLS HOLDINGS PLC,Capital Goods,1681929000.0,4.0
109756,SBL-762412608-VN/00,MELS,MELSTACORP PLC,Food Beverage & Tobacco,1694025000.0,4.0
47129,SBL-762412608-VN/00,LFIN,LB FINANCE PLC,Diversified Financials,1706812000.0,3.0
59395,SBL-762412608-VN/00,HNB,HATTON NATIONAL BANK PLC,Banks,1687113000.0,3.0
23159,SBL-762412608-VN/00,ELPL,ELPITIYA PLANTATIONS PLC,Food Beverage & Tobacco,1683743000.0,1.0


In [40]:
recommendations_ = recommend_(name, test_timestamp)
recommendations_

Unnamed: 0,STOCKCODE,PRED_RATING
6,HNB,5.361472
4,JKH,4.698636
8,GRAN,3.02487
0,NEST,2.528221
5,LWL,2.344155
2,KGAL,2.207677
9,LFIN,2.179858
3,LLUB,1.733934
1,MELS,1.650706
7,LCEY,1.099964


In [53]:
recommendations_.iloc[5]

STOCKCODE          KGAL
PRED_RATING    2.207677
Name: 2, dtype: object

In [56]:
len(user_test_port_[user_test_port_.STOCKCODE == recommendations_.iloc[5]['STOCKCODE']].RATING)

0

In [50]:
recommendations_['ACT_RATING'] = recommendations_.apply(lambda x: user_test_port_[user_test_port_.STOCKCODE == x['STOCKCODE']].RATING.item(), axis = 1)
recommendations_

ValueError: can only convert an array of size 1 to a Python scalar