In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs

import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import os
import array
import collections

from typing import Dict, List, Optional, Text, Tuple

In [2]:
# NOTE(review): every path in this cell is an absolute path on one machine's D: drive;
# the notebook will not run elsewhere without editing them.

# Saved weights passed to `load_weights` for the retriever and the ranker respectively.
retriever_location_ = r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_06_07_22\retriever_port_v2_hoo" #r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_05_27\retriever_port_v2"
ranking_location_ = r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_05_27\tf_listwise_ranking_2024_05_27_11_20"
# Excel workbook read with `pd.read_excel` to build the item table.
stock_info_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\stock_data.xlsx"

# Saved tf.data datasets opened with `tf.data.Dataset.load` as the test / train splits.
test_ds_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_hoo_test"
train_ds_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_hoo_train"

# Saved tf.data dataset used for the vocabularies, the timestamp range and model construction.
portfolios_loc = r"D:/dev work/recommender systems/Atrad_CARS/data/portfolios_v2/portfolios"

In [3]:
# retriever_location_ = r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_06_11_34\retriever_port_v2" #r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_05_27\retriever_port_v2"
# ranking_location_ = r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_05_27\tf_listwise_ranking_2024_05_27_11_20"
# stock_info_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\stock_data.xlsx"

# test_ds_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_test"
# train_ds_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_train"

# portfolios_loc = r"D:/dev work/recommender systems/Atrad_CARS/data/portfolios_v2/portfolios"

In [4]:
# Presumably the directory where result files are written.
# NOTE(review): no visible cell reads `results_loc`; the save cell at the end defines its
# own `results_loc_` with the same value — confirm whether this cell is still needed.
results_loc = r"D:\dev work\recommender systems\Atrad_CARS\results"

In [5]:
# Train / test splits, cached after the first pass.
train_ds = tf.data.Dataset.load(train_ds_loc).cache()
test_ds = tf.data.Dataset.load(test_ds_loc).cache()

# Single-batch views that hold every element of each split.
train_ds_1 = train_ds.batch(len(train_ds))
test_ds_1 = test_ds.batch(len(test_ds))

portfolios = tf.data.Dataset.load(portfolios_loc).cache()


def _feature_batches(feature_name):
  """Return `portfolios` in batches of 10000, keeping only `feature_name`."""
  return portfolios.batch(10000).map(lambda batch: batch[feature_name])


def _distinct(feature_batches):
  """Concatenate all batches into one array and return its sorted unique values."""
  return np.unique(np.concatenate(list(feature_batches)))


# Batched per-feature views of the portfolio data.
items_ids = _feature_batches("STOCKCODE")
item_names = _feature_batches("STOCKNAME")
item_GICS = _feature_batches("GICS")
user_ids = _feature_batches("CDSACCNO")

# Vocabularies: distinct values of each feature.
unique_item_ids = _distinct(items_ids)
unique_item_names = _distinct(item_names)
unique_item_gics = _distinct(item_GICS)
unique_user_ids = _distinct(user_ids)

# Timestamp range and bucket boundaries; needed to initialize the timestamp
# embedding layers in later steps.
unix_ts_batches = portfolios.map(lambda row: row["UNIX_TS"]).batch(100)
timestamps = np.concatenate(list(unix_ts_batches))
min_timestamp = timestamps.min()
max_timestamp = timestamps.max()

timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

In [6]:
from retrieval_recommender_v2 import Retriever

# Build the retrieval model with timestamp features enabled, restore its saved
# weights, then compile it with an Adagrad optimizer.
retriever = Retriever(use_timestamp=True, portfolios=portfolios)
retriever.load_weights(retriever_location_)

retriever_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
retriever.compile(optimizer=retriever_optimizer)


In [7]:
from ranker_recommender import Ranker

# Build the ranking model with a mean-squared-error loss, restore its saved
# weights, then compile it with an Adagrad optimizer.
ranker_loss = tf.keras.losses.MeanSquaredError()
ranker = Ranker(loss=ranker_loss, portfolios=portfolios)
ranker.load_weights(ranking_location_)

ranker_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
ranker.compile(optimizer=ranker_optimizer)


In [35]:
# Item table: drop the unused columns and rename the rest to the feature keys
# used by the datasets (STOCKCODE / STOCKNAME / GICS).
stock_info = (
    pd.read_excel(stock_info_loc)
    .drop(columns=['Unnamed: 0', 'buisnesssummary'])
    .rename(columns={'symbol': 'STOCKCODE', 'name': 'STOCKNAME', 'gics_code': 'GICS'})
)
# Keep only the rows that have a GICS value.
stock_info = stock_info[stock_info['GICS'].notna()]
print("items data shape :: {}".format(stock_info.shape))

# Distinct stock codes seen in the training split (byte strings), used to
# restrict the item table to items that appear in training.
train_code_batches = train_ds.batch(1000).map(lambda batch: batch["STOCKCODE"])
unique_items_ = np.unique(np.concatenate(list(train_code_batches.as_numpy_iterator())))
train_codes = [code.decode('utf-8') for code in unique_items_]
stock_info = stock_info[stock_info['STOCKCODE'].isin(train_codes)]

# One dataset element per remaining item, keyed by column name.
items_ds = tf.data.Dataset.from_tensor_slices(stock_info.to_dict(orient='list'))

items data shape :: (280, 3)


# Evaluation function: precision@k and recall@k for the retriever

In [37]:
def evaluate(retriever,
             test: tf.data.Dataset,
             train: Optional[tf.data.Dataset] = None,
             timestamp: Optional[float] = None,
             k: int = 10):
  """Compute per-user and mean precision@k / recall@k for a retrieval model.

  The item catalogue is read from the notebook-level `items_ds`. For every user
  found in `test`, the user embedding is dotted with all item embeddings, items
  the user already has in `train` are masked out, and the `k` best-scoring
  remaining items are compared with that user's items in `test`.

  Args:
    retriever: Object exposing `item_model` (called on batches of `items_ds`)
      and `user_model` (called on a dict with 'CDSACCNO' and 'UNIX_TS').
    test: Dataset whose elements carry 'CDSACCNO' and 'STOCKCODE'.
    train: Optional dataset with the same two keys. When given, each user's
      training items are excluded from that user's recommendations.
    timestamp: Value fed to the user model as 'UNIX_TS' for every user. When
      omitted, the current time is taken at the moment `evaluate` is called.
    k: Number of top-scoring items kept per user; must be positive.

  Returns:
    Dict with "precision_at_k" and "recall_at_k" (means over the test users)
    and "results_df_", a DataFrame with one row per test user.

  Raises:
    ValueError: If `k` is not positive.
    KeyError: If a row's 'STOCKCODE' is not present in `items_ds`.
  """
  if k <= 0:
    raise ValueError("k must be a positive integer, got {}".format(k))

  # Resolve the default at call time. A `datetime.now()` expression in the
  # signature is evaluated only once, when the `def` statement runs, so every
  # later call would silently reuse that stale moment.
  if timestamp is None:
    timestamp = datetime.timestamp(datetime.now())

  item_ids = np.concatenate(list(items_ds.batch(1000).map(lambda x: x["STOCKCODE"]).as_numpy_iterator()))

  # Stock code <-> column index in the score vector.
  item_vocabulary = dict(zip(item_ids.tolist(), range(len(item_ids))))
  item_vocabulary_inv = {index: code for code, index in item_vocabulary.items()}

  # Per-user item indices, stored compactly as C-int arrays.
  train_user_to_items = collections.defaultdict(lambda: array.array("i"))
  test_user_to_items = collections.defaultdict(lambda: array.array("i"))

  if train is not None:
    for row in train.as_numpy_iterator():
      user_id = row["CDSACCNO"]
      item_id = item_vocabulary[row["STOCKCODE"]]
      train_user_to_items[user_id].append(item_id)

  for row in test.as_numpy_iterator():
    user_id = row["CDSACCNO"]
    item_id = item_vocabulary[row["STOCKCODE"]]
    test_user_to_items[user_id].append(item_id)

  # Embed the whole catalogue once, as a single batch.
  item_embeddings = np.concatenate(list(items_ds.batch(len(items_ds)).map(lambda x: retriever.item_model(x)).as_numpy_iterator()))

  user_ids = []
  precision_values = []
  recall_values = []
  num_test_items = []
  num_train_items = []
  recommendations = []

  for user_id, test_items in tqdm(test_user_to_items.items()):
    user_embedding = retriever.user_model(
      {
        'CDSACCNO' : tf.constant([user_id]),
        'UNIX_TS' : tf.constant([timestamp])
      }
      ).numpy()
    scores = (user_embedding @ item_embeddings.T).flatten()

    test_items = np.frombuffer(test_items, dtype=np.int32)

    if train is not None:
      train_items = np.frombuffer(
          train_user_to_items[user_id], dtype=np.int32)
      # Push items the user already holds to the bottom of the ranking.
      scores[train_items] = -1e6

    top_items = np.argsort(-scores)[:k]
    recommendations.append([item_vocabulary_inv[item_id].decode('utf-8') for item_id in top_items])

    # Number of the user's test items that made it into the top k.
    num_test_items_in_k = int(np.isin(test_items, top_items).sum())
    precision_values.append(num_test_items_in_k / k)
    recall_values.append(num_test_items_in_k / len(test_items))

    num_test_items.append(len(test_items))
    num_train_items.append(len(train_user_to_items[user_id]))
    user_ids.append(user_id)

  results_df_ = pd.DataFrame(
    columns = ['CDSACCNO','precision@k', 'recall@k','num_test_items','portfolio_size', 'recommendations'],
    data = list(zip(user_ids, precision_values, recall_values, num_test_items, num_train_items, recommendations))
  )

  return {
      "precision_at_k": np.mean(precision_values),
      "recall_at_k": np.mean(recall_values),
      "results_df_" : results_df_
  }

In [38]:
# Evaluate the retriever on the test split, excluding each user's training items.
results = evaluate(retriever, test=test_ds, train=train_ds)

100%|██████████| 5906/5906 [00:19<00:00, 305.46it/s]


In [39]:
# Mean metrics over all evaluated users, displayed as (precision@k, recall@k).
mean_precision_at_k = results['precision_at_k']
mean_recall_at_k = results['recall_at_k']
(mean_precision_at_k, mean_recall_at_k)

(0.05374195733152726, 0.12099826568855412)

In [40]:
# Account ids come out of the evaluation as byte strings; convert them to text.
# Values that are already `str` are left alone, so re-running this cell is safe
# (an unconditional `.decode` raises AttributeError on the second run).
results['results_df_']['CDSACCNO'] = results['results_df_']['CDSACCNO'].apply(
    lambda account: account.decode('utf-8') if isinstance(account, bytes) else account
)
results['results_df_']

Unnamed: 0,CDSACCNO,precision@k,recall@k,num_test_items,portfolio_size,recommendations
0,HDF-74565-LI/00,0.1,0.500000,2,9,"[SCAP, SEMB, CALT, CSF, LOLC, SLTL, COCR, ASPH..."
1,BMS-800262640-VN/00,0.0,0.000000,2,10,"[PLR, CINV, UAL, CALT, MARA, SEMB, ALLI, GRAN,..."
2,COM-69742-LC/00,0.1,0.100000,10,38,"[AHPL, CFIN, GUAR, CINS, RENU, CHMX, WAPO, LMF..."
3,BMS-861802000-VN/00,0.1,0.071429,14,58,"[HPFL, CARS, NEST, SHAL, CERA, CHL, AFS, BFL, ..."
4,HDF-743463188-VN/00,0.0,0.000000,2,9,"[ACAP, CALF, AMF, ATL, MFL, UBC, ACME, MEL, BF..."
...,...,...,...,...,...,...
5901,BMS-68660-LI/00,0.0,0.000000,3,10,"[LCBF, AMF, BFN, MFL, WAPO, ACAP, PMB, MBSL, C..."
5902,BMS-683600342-VN/00,0.0,0.000000,3,10,"[GUAR, COMD, CARG, SEYB, BUKI, CARS, AHUN, TAF..."
5903,CAS-86452-LI/00,0.0,0.000000,3,10,"[ABAN, LMF, SOY, GUAR, CINS, LVEN, RENU, UAL, ..."
5904,CMB-5826-LC/00,0.0,0.000000,2,8,"[SAMP, HASU, COMB, DFCC, HAYL, DIAL, NTB, CFIN..."


In [41]:
results_loc_ = r"D:\dev work\recommender systems\Atrad_CARS\results"

# Name the output file after the last path component of both weight locations.
retriever_name = os.path.basename(retriever_location_)
ranker_name = os.path.basename(ranking_location_)

results_file_name = retriever_name + "_&_" + ranker_name + "_results_hoo.csv"

# `to_csv` does not create missing parent directories, so make sure the
# results directory exists before writing.
os.makedirs(results_loc_, exist_ok=True)

results_save_path = os.path.join(results_loc_, results_file_name)
results['results_df_'].to_csv(results_save_path, index = False)