In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
import tensorflow as tf 
import ast 

import os
import plotly.express as px

In [2]:
import sys
sys.path.append(r'D:\dev work\recommender systems\Atrad_CARS\code\v5_integrated')

# Training Data

In [3]:
stock_info_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\stock_data.xlsx"

test_ds_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_test"
train_ds_loc = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_train"

portfolios_loc = r"D:/dev work/recommender systems/Atrad_CARS/data/portfolios_v2/portfolios"

retriever_location_ = r"D:\dev work\recommender systems\Atrad_CARS\model_weights\2024_06_11_34\retriever_port_v2"

In [4]:
test_ds = tf.data.Dataset.load(test_ds_loc).cache()

train_ds = tf.data.Dataset.load(train_ds_loc).cache()

portfolio_ds = tf.data.Dataset.load(portfolios_loc).cache()

In [5]:
from retrieval_recommender_v2 import Retriever

retriever = Retriever(
    use_timestamp = True,
    portfolios = portfolio_ds
)

retriever.load_weights(retriever_location_)

retriever.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))


In [6]:
stock_info = pd.read_excel(stock_info_loc)
stock_info = stock_info.drop(['Unnamed: 0','buisnesssummary'],axis = 1)
stock_info = stock_info.rename(columns = {
    'symbol':'STOCKCODE',
    'name' : 'STOCKNAME',
    'gics_code' : 'GICS'
})
stock_info = stock_info[~stock_info['GICS'].isna()]

stock_info.shape
print("items data shape :: {}".format(stock_info.shape))
unique_items_ = np.unique(np.concatenate(list(train_ds.batch(1000).map(lambda x: x["STOCKCODE"]).as_numpy_iterator())))
stock_info = stock_info[stock_info['STOCKCODE'].isin([item.decode('utf-8') for item in unique_items_])]

items_ds = tf.data.Dataset.from_tensor_slices(stock_info.to_dict(orient= 'list'))

items data shape :: (280, 3)


In [7]:
item_ids = np.concatenate(list(items_ds.batch(1000).map(lambda x: x["STOCKCODE"]).as_numpy_iterator()))

item_vocabulary = dict(zip(item_ids.tolist(), range(len(item_ids))))
item_vocabulary_inv = {v: k for k, v in item_vocabulary.items()}

item_embeddings = np.concatenate(list(items_ds.batch(len(items_ds)).map(lambda x: retriever.item_model(x)).as_numpy_iterator()))

In [8]:
# item_vocabulary

In [9]:
item_embeddings[0]

array([ 0.47059426, -0.03441931,  0.8039652 ,  0.49620765, -0.3432839 ,
       -0.34953424, -0.43810797,  0.07336081,  0.06476133,  0.05644546,
        0.6591988 ,  0.53730595,  0.18032736,  0.1575897 , -0.38884285,
        0.64881575,  0.39851853,  0.33278266, -0.3679638 ,  0.08010481,
       -0.04730263,  0.27726573,  0.08596109,  0.16110842,  0.14957792,
       -0.22457942,  0.42398468,  0.44594562, -0.15358464,  0.14417289,
        0.13290778, -0.18641517], dtype=float32)

In [10]:
def cosine_sim(vec1, vec2):

    return np.dot(vec1, vec2)/(norm(vec1)*norm(vec2))

In [11]:
cosine_sim(
    item_embeddings[0],
    item_embeddings[200]
)

NameError: name 'norm' is not defined

In [None]:
train_df = pd.DataFrame(
    data = list(train_ds.as_numpy_iterator())
)

train_df = train_df.astype(
    {
        'CDSACCNO' : 'str',
        'STOCKCODE' : 'str',
        'STOCKNAME' : 'str',
        'GICS' : 'str'
        })

In [None]:
test_df = pd.DataFrame(
    data = list(test_ds.as_numpy_iterator())
)

test_df = test_df.astype(
    {
        'CDSACCNO' : 'str',
        'STOCKCODE' : 'str',
        'STOCKNAME' : 'str',
        'GICS' : 'str'
        })

In [None]:
portfolio_df = pd.DataFrame(
    data = list(portfolio_ds.as_numpy_iterator())
)

portfolio_df = portfolio_df.astype(
    {
        'CDSACCNO' : 'str',
        'STOCKCODE' : 'str',
        'STOCKNAME' : 'str',
        'GICS' : 'str'
        })

In [None]:
unique_train_items = set(train_df['STOCKCODE'].unique())

unique_test_items = set(test_df['STOCKCODE'].unique())

unique_port_items = set(portfolio_df['STOCKCODE'].unique())

In [None]:
len(unique_train_items) , len(unique_test_items) , len(unique_port_items)

(275, 275, 275)

In [None]:
train_df.head()


Unnamed: 0,STOCKNAME,RATING,STOCKCODE,GICS,CDSACCNO,UNIX_TS
0,SOFTLOGIC LIFE INSURANCE PLC,1.0,AAIC,Insurance,BMS-10073-LC/00,1676313000.0
1,SOFTLOGIC HOLDINGS PLC,1.0,SHL,Capital Goods,BMS-10073-LC/00,1642617000.0
2,HOUSING DEVELOPMENT FINANCE CORPORATION BANK O...,1.0,HDFC,Banks,BMS-10073-LC/00,1644777000.0
3,LANKA IOC PLC,5.0,LIOC,Energy,BMS-10073-LC/00,1641494000.0
4,HAYLEYS LEISURE PLC,1.0,CONN,Consumer Services,BMS-10073-LC/00,1642531000.0


In [None]:
items_value_counts = train_df['STOCKCODE'].value_counts()

max_pop = items_value_counts.max()
min_pop = items_value_counts.min()

items_value_counts = items_value_counts.apply(lambda x: np.round((x-min_pop)*100/(max_pop-min_pop), 2))
items_pop_dict = dict(items_value_counts.items())
items_pop_dict

{'BIL': 100.0,
 'EXPO': 88.59,
 'LOFC': 78.87,
 'LIOC': 75.72,
 'RCL': 65.66,
 'HAYL': 63.06,
 'SCAP': 60.67,
 'SAMP': 59.48,
 'VONE': 56.02,
 'DIPD': 53.06,
 'ACL': 50.46,
 'SUN': 45.93,
 'AAIC': 44.86,
 'CALT': 43.27,
 'COMB': 43.27,
 'KOTA': 42.14,
 'AEL': 40.52,
 'AGST': 40.21,
 'CFVF': 40.06,
 'FCT': 39.72,
 'MGT': 39.2,
 'PLR': 38.56,
 'LOLC': 38.29,
 'LLUB': 36.54,
 'HELA': 36.33,
 'JKH': 35.78,
 'CIC': 35.38,
 'SLTL': 35.05,
 'DIAL': 34.31,
 'AGAL': 33.12,
 'TKYO': 32.91,
 'SEMB': 32.02,
 'RAL': 31.87,
 'PABC': 31.71,
 'PACK': 31.47,
 'COOP': 30.21,
 'HNB': 29.97,
 'JAT': 29.97,
 'LWL': 28.44,
 'ALUM': 28.35,
 'MELS': 27.98,
 'TILE': 27.86,
 'RICH': 27.43,
 'HHL': 26.7,
 'PLC': 25.78,
 'LDEV': 25.47,
 'NDB': 25.11,
 'HAYC': 24.37,
 'LFIN': 23.7,
 'KVAL': 23.67,
 'VFIN': 23.36,
 'TJL': 23.18,
 'PARQ': 22.81,
 'EDEN': 22.6,
 'DIST': 22.54,
 'LALU': 22.45,
 'ABL': 22.42,
 'BRWN': 22.23,
 'TESS': 22.05,
 'ASPH': 21.99,
 'COCR': 21.96,
 'HBS': 21.93,
 'VPEL': 21.62,
 'TAP': 21.53,
 

In [None]:
user_port_pop_map = dict(train_df.groupby('CDSACCNO').apply(lambda x: np.round(sum([items_pop_dict[item] for item in x['STOCKCODE'].values]), 2)).items())
# user_port_pop_map

  user_port_pop_map = dict(train_df.groupby('CDSACCNO').apply(lambda x: np.round(sum([items_pop_dict[item] for item in x['STOCKCODE'].values]), 2)).items())


# Results

In [None]:
results_file_path = r"D:\dev work\recommender systems\Atrad_CARS\results\retriever_port_v2_&_tf_listwise_ranking_2024_05_27_11_20_results.csv"
results_df = pd.read_csv(results_file_path)
results_df['recommendations'] = results_df["recommendations"].apply(lambda x: ast.literal_eval(x))
results_df.head()

Unnamed: 0,CDSACCNO,precision@k,recall@k,num_test_items,portfolio_size,recommendations
0,HDF-74565-LI/00,0.1,0.5,2,9,"[SCAP, SEMB, CALT, CSF, LOLC, SLTL, COCR, ASPH..."
1,BMS-800262640-VN/00,0.0,0.0,2,10,"[PLR, CINV, UAL, CALT, MARA, SEMB, ALLI, GRAN,..."
2,COM-69742-LC/00,0.1,0.1,10,38,"[AHPL, CFIN, GUAR, CINS, RENU, CHMX, WAPO, LMF..."
3,BMS-861802000-VN/00,0.1,0.071429,14,58,"[HPFL, CARS, NEST, SHAL, CERA, CHL, AFS, BFL, ..."
4,HDF-743463188-VN/00,0.0,0.0,2,9,"[ACAP, CALF, AMF, ATL, MFL, UBC, ACME, MEL, BF..."


In [None]:
results_df['portfolio_size'].max(), results_df['portfolio_size'].min()

portfolios_buckets = np.linspace(
    results_df['portfolio_size'].max(),
    1,
    11
    )

portfolios_buckets = np.sort(portfolios_buckets)
portfolios_buckets

labels = [idx for idx in range(1,11)]
labels

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [None]:
results_df['portfolio_size_bucket'] = pd.cut(
    results_df['portfolio_size'],
    bins = portfolios_buckets,
    labels = labels
)
results_df.head(3)

Unnamed: 0,CDSACCNO,precision@k,recall@k,num_test_items,portfolio_size,recommendations,portfolio_size_bucket
0,HDF-74565-LI/00,0.1,0.5,2,9,"[SCAP, SEMB, CALT, CSF, LOLC, SLTL, COCR, ASPH...",1
1,BMS-800262640-VN/00,0.0,0.0,2,10,"[PLR, CINV, UAL, CALT, MARA, SEMB, ALLI, GRAN,...",1
2,COM-69742-LC/00,0.1,0.1,10,38,"[AHPL, CFIN, GUAR, CINS, RENU, CHMX, WAPO, LMF...",2


In [None]:
items_pop_dict['SCAP']

60.67

In [None]:
results_df['recommendation_popularity'] = results_df['recommendations'].apply(lambda x: np.round( sum([items_pop_dict[item] for item in x]) ,2))
results_df['portfolio_item_popularity'] = results_df['CDSACCNO'].apply(lambda x: user_port_pop_map[x])
results_df.head(3)

Unnamed: 0,CDSACCNO,precision@k,recall@k,num_test_items,portfolio_size,recommendations,portfolio_size_bucket,recommendation_popularity,portfolio_item_popularity
0,HDF-74565-LI/00,0.1,0.5,2,9,"[SCAP, SEMB, CALT, CSF, LOLC, SLTL, COCR, ASPH...",1,316.73,533.24
1,BMS-800262640-VN/00,0.0,0.0,2,10,"[PLR, CINV, UAL, CALT, MARA, SEMB, ALLI, GRAN,...",1,266.38,471.11
2,COM-69742-LC/00,0.1,0.1,10,38,"[AHPL, CFIN, GUAR, CINS, RENU, CHMX, WAPO, LMF...",2,85.02,1203.91


# EDA

In [None]:
fig = px.histogram(results_df, x="portfolio_size_bucket")

fig.update_layout(width=600, height=400, bargap=0.2)
fig.show()

In [None]:
results_df.head(1)

Unnamed: 0,CDSACCNO,precision@k,recall@k,num_test_items,portfolio_size,recommendations,portfolio_size_bucket,recommendation_popularity,portfolio_item_popularity
0,HDF-74565-LI/00,0.1,0.5,2,9,"[SCAP, SEMB, CALT, CSF, LOLC, SLTL, COCR, ASPH...",1,316.73,533.24


In [None]:
fig = px.scatter(results_df, x="precision@k", y="recall@k", color="portfolio_size_bucket") #symbol
fig.update_traces(marker=dict(size=10))
fig.show()

In [None]:
fig = px.scatter(results_df, x="portfolio_item_popularity", y="recommendation_popularity", color="precision@k") #symbol
fig.update_traces(marker=dict(size=10))
fig.show()

In [None]:
fig = px.scatter(results_df, x="portfolio_item_popularity", y="recommendation_popularity", color="recall@k") #symbol
fig.update_traces(marker=dict(size=10))
fig.show()

In [None]:
results_df.head(1)

Unnamed: 0,CDSACCNO,precision@k,recall@k,num_test_items,portfolio_size,recommendations,portfolio_size_bucket,recommendation_popularity,portfolio_item_popularity
0,HDF-74565-LI/00,0.1,0.5,2,9,"[SCAP, SEMB, CALT, CSF, LOLC, SLTL, COCR, ASPH...",1,316.73,533.24


In [None]:
fig = px.scatter(
    results_df, 
    x="portfolio_size_bucket", 
    y="portfolio_item_popularity", 
    color = "precision@k",
    # size = "precision@k",
    size_max = 30
    ) #symbol #, color="portfolio_size_bucket"
fig.update_traces(marker=dict(size=15))
fig.show()

In [None]:
fig = px.scatter(
    results_df, 
    x="portfolio_size", 
    y="portfolio_item_popularity", 
    color = "precision@k",
    # size = "precision@k",
    # size_max = 30
    ) #symbol #, color="portfolio_size_bucket"
fig.update_traces(marker=dict(size=15))
fig.show()

In [None]:
fig = px.scatter(
    results_df, 
    x="portfolio_size_bucket", 
    y="portfolio_item_popularity", 
    color = "recall@k",
    # size = "precision@k",
    size_max = 30
    ) #symbol #, color="portfolio_size_bucket"
fig.update_traces(marker=dict(size=15))
fig.show()

In [None]:
fig = px.scatter(
    results_df, 
    x="portfolio_size", 
    y="portfolio_item_popularity", 
    color = "recall@k",
    # size = "precision@k",
    size_max = 30
    ) #symbol #, color="portfolio_size_bucket"
fig.update_traces(marker=dict(size=15))
fig.show()