In [9]:
import pandas as pd
import numpy as np
import time
import datetime
import os
import tensorflow as tf


# Utils

In [10]:
def split_symbol(symbol):
    return symbol.split('.')[0]

In [11]:
def mapper(symb, mapper):
    try:
        return mapper.get(symb)
    except:
        return 'Empty'

In [12]:
def infer_rating(df, qnt_col = 'SHARESQTY', price_col= 'SHAREPRICE', rating_col = 'RATING'):

    order_prices = df[qnt_col] * df[price_col]
    scaler = MinMaxScaler(feature_range=(1,5))
    scaled_price = scaler.fit_transform(order_prices.values.reshape(-1, 1))
    df[rating_col] = np.clip(scaled_price, 1, 5)
    return df

In [13]:
def get_max_values(df, col_name = 'RATING', round_method = 'round'):
    max_rating_row = df.loc[df[col_name].idxmax()]
    if round_method == 'round':
        max_rating_row[col_name] = max_rating_row[col_name].round(0)
        
    elif round_method == 'ceil':
        max_rating_row[col_name] = np.ceil(max_rating_row[col_name].array)
    
    return max_rating_row

In [14]:
def to_timestamp(date):
    return datetime.datetime.timestamp(date)

# Preprocessing

In [15]:
# portfolios = pd.read_excel(
#     "../../data/PORTFOLIODETAILS_0403.xlsx",
#     sheet_name = ['Sheet 1','Sheet 2'])
# # portfolios.shape

In [16]:
data_path = r"D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\raw"

portfolios = pd.DataFrame()
for fname in os.listdir(data_path):
    
    broker_df_ = pd.read_csv(os.path.join(data_path,fname), sep = '|')
    print("--- reading : {}".format(fname))
    
    portfolios = pd.concat([portfolios, broker_df_], ignore_index = True)

  broker_df_ = pd.read_csv(os.path.join(data_path,fname), sep = '|')


--- reading : Bartleet.txt


  broker_df_ = pd.read_csv(os.path.join(data_path,fname), sep = '|')


--- reading : CAS.txt
--- reading : FCE.txt
--- reading : NLE.txt
--- reading : RPS.txt


In [17]:
portfolios.shape

(3728894, 8)

In [18]:
portfolios.head()

Unnamed: 0,CDSACCNO,STOCKCODE,REFERANCE,TRAN_TYPE,SHARESQTY,SHAREPRICE,TRADE_DATE,TRADE_TIME
0,BMS-731900310-VN/00,AGPL.N0000,2024153263,B,125,7.5,5/13/2024,5/13/2024 12:25:45.000000 PM
1,BMS-800262640-VN/00,RIL.N0000,2024153264,S,-100,8.5,5/13/2024,5/13/2024 12:25:48.000000 PM
2,BMS-722151828-VN/00,PACK.N0000,2024153265,B,5000,15.0,5/13/2024,5/13/2024 12:26:03.000000 PM
3,BMS-42281-LI/00,PACK.N0000,2024153266,B,300,15.0,5/13/2024,5/13/2024 12:26:03.000000 PM
4,BMS-478-LC/00,HNB.N0000,2024153267,B,350,202.25,5/13/2024,5/13/2024 12:26:42.000000 PM


In [19]:
portfolios['TRADE_TIME'] = pd.to_datetime(portfolios['TRADE_TIME'])
portfolios['TRADE_DATE'] = pd.to_datetime(portfolios['TRADE_DATE'])

In [20]:
portfolios.dtypes

CDSACCNO              object
STOCKCODE             object
REFERANCE             object
TRAN_TYPE             object
SHARESQTY              int64
SHAREPRICE           float64
TRADE_DATE    datetime64[ns]
TRADE_TIME    datetime64[ns]
dtype: object

In [21]:
stock_info = pd.read_excel('../../data/stock_data.xlsx')
stock_info = stock_info.drop(['Unnamed: 0'],axis = 1)
stock_info.shape

(282, 4)

In [22]:
stock_info = stock_info.dropna()

In [23]:
stock_info.head()

Unnamed: 0,symbol,name,buisnesssummary,gics_code
0,HBS,hSenid Business Solutions PLC,An indigenous multinational catering towards m...,45103010 - Application Software
1,TYRE,KELANI TYRES PLC,Manufacturing tyres and tubes and marketing lo...,Automobiles & Components
2,ABL,AMANA BANK PLC,unknown,Banks
3,DFCC,DFCC BANK PLC,The principal activities of DFCC Bank include ...,Banks
4,COMB,COMMERCIAL BANK OF CEYLON PLC,Commercial Banking,Banks


In [24]:
unique_symbols = set(stock_info.symbol.unique())
len(unique_symbols)

280

In [25]:
# portfolios_df = pd.concat([portfolios['Sheet 1'],portfolios['Sheet 2']], ignore_index= True)
portfolios_df = portfolios.copy()
portfolios_df = portfolios_df.loc[portfolios_df.TRAN_TYPE == 'B']

In [26]:
portfolios_df.shape

(1869633, 8)

In [27]:
portfolios_df.head()

Unnamed: 0,CDSACCNO,STOCKCODE,REFERANCE,TRAN_TYPE,SHARESQTY,SHAREPRICE,TRADE_DATE,TRADE_TIME
0,BMS-731900310-VN/00,AGPL.N0000,2024153263,B,125,7.5,2024-05-13,2024-05-13 12:25:45
2,BMS-722151828-VN/00,PACK.N0000,2024153265,B,5000,15.0,2024-05-13,2024-05-13 12:26:03
3,BMS-42281-LI/00,PACK.N0000,2024153266,B,300,15.0,2024-05-13,2024-05-13 12:26:03
4,BMS-478-LC/00,HNB.N0000,2024153267,B,350,202.25,2024-05-13,2024-05-13 12:26:42
11,BMS-478-LC/00,HNB.N0000,2024153274,B,109,202.25,2024-05-13,2024-05-13 12:29:02


In [28]:
portfolios_df_fil_1 = portfolios_df.groupby(by = 'CDSACCNO').filter(lambda x: x['STOCKCODE'].nunique() > 10)
portfolios_df_fil_1['UNIX_TS'] = portfolios_df_fil_1['TRADE_DATE'].apply(lambda x: to_timestamp(x))
portfolios_df_fil_1.head()

Unnamed: 0,CDSACCNO,STOCKCODE,REFERANCE,TRAN_TYPE,SHARESQTY,SHAREPRICE,TRADE_DATE,TRADE_TIME,UNIX_TS
0,BMS-731900310-VN/00,AGPL.N0000,2024153263,B,125,7.5,2024-05-13,2024-05-13 12:25:45,1715539000.0
2,BMS-722151828-VN/00,PACK.N0000,2024153265,B,5000,15.0,2024-05-13,2024-05-13 12:26:03,1715539000.0
3,BMS-42281-LI/00,PACK.N0000,2024153266,B,300,15.0,2024-05-13,2024-05-13 12:26:03,1715539000.0
4,BMS-478-LC/00,HNB.N0000,2024153267,B,350,202.25,2024-05-13,2024-05-13 12:26:42,1715539000.0
11,BMS-478-LC/00,HNB.N0000,2024153274,B,109,202.25,2024-05-13,2024-05-13 12:29:02,1715539000.0


In [29]:
portfolios_df_fil_1.CDSACCNO.nunique(), portfolios_df_fil_1.STOCKCODE.nunique()

(5906, 346)

In [30]:
prev_symbols = set(list(portfolios_df_fil_1.STOCKCODE.unique()))
portfolios_df_fil_1.shape

(1674385, 9)

In [31]:
portfolios_df_fil_1['STOCKCODE'] = portfolios_df_fil_1.STOCKCODE.apply(lambda x : split_symbol(x))

In [32]:
portfolios_df_fil_1['STOCKCODE'] = portfolios_df_fil_1.STOCKCODE.apply(lambda x : split_symbol(x))

In [33]:
unique_port_symbols = set(portfolios_df_fil_1.STOCKCODE.unique())
len(unique_port_symbols)

288

In [34]:
to_remove = list(unique_port_symbols - unique_symbols)
to_remove

['GSF',
 'WATA',
 'AGPL',
 'UBF',
 'CLC',
 'CBNK',
 'SFL',
 'YORK',
 'CALI',
 'LGIL',
 'WIND',
 'PDL',
 'CITW']

In [35]:
portfolios_df_fil_1 = portfolios_df_fil_1[~portfolios_df_fil_1.STOCKCODE.isin(to_remove)]

In [36]:
symb_to_name = dict(zip(stock_info.symbol,stock_info.name))
symb_to_gics = dict(zip(stock_info.symbol, stock_info.gics_code))

In [37]:
portfolios_df_fil_1['STOCKNAME'] = portfolios_df_fil_1.STOCKCODE.apply(lambda x: mapper(x,symb_to_name))
portfolios_df_fil_1['GICS'] = portfolios_df_fil_1.STOCKCODE.apply(lambda x: mapper(x,symb_to_gics))

In [38]:
# portfolios_df_fil_1[portfolios_df_fil_1['STOCKNAME'] == np.nan]

In [39]:
portfolios_df_fil_1.CDSACCNO.nunique(), portfolios_df_fil_1.STOCKCODE.nunique()

(5906, 275)

In [40]:
from sklearn.preprocessing import MinMaxScaler

# portfolios_df_fil_3 = filter_portfolios(portfolios_df_fil_2)
portfolios_df_fil_3 = portfolios_df_fil_1.groupby('CDSACCNO', group_keys= False).apply(lambda x: infer_rating(x)).groupby(['CDSACCNO','STOCKCODE'], group_keys= False).apply(lambda x: get_max_values(x)).reset_index(drop =True).sort_values('RATING', ascending= False)

  portfolios_df_fil_3 = portfolios_df_fil_1.groupby('CDSACCNO', group_keys= False).apply(lambda x: infer_rating(x)).groupby(['CDSACCNO','STOCKCODE'], group_keys= False).apply(lambda x: get_max_values(x)).reset_index(drop =True).sort_values('RATING', ascending= False)
  portfolios_df_fil_3 = portfolios_df_fil_1.groupby('CDSACCNO', group_keys= False).apply(lambda x: infer_rating(x)).groupby(['CDSACCNO','STOCKCODE'], group_keys= False).apply(lambda x: get_max_values(x)).reset_index(drop =True).sort_values('RATING', ascending= False)


In [41]:
portfolios_df_fil_3.CDSACCNO.nunique(), portfolios_df_fil_3.STOCKCODE.nunique()

(5906, 275)

In [42]:
portfolios_df_fil_3.head(2)

Unnamed: 0,CDSACCNO,STOCKCODE,REFERANCE,TRAN_TYPE,SHARESQTY,SHAREPRICE,TRADE_DATE,TRADE_TIME,UNIX_TS,STOCKNAME,GICS,RATING
24126,BMS-48991-LI/00,LWL,24595,B,1365,49.0,2023-01-11,NaT,1673375000.0,LANKA WALLTILE PLC,Capital Goods,5.0
140760,HDF-743612299-VN/00,SEMB,2023178157,B,10000,0.7,2023-07-28,2023-07-28 01:15:30,1690483000.0,S M B LEASING PLC,Diversified Financials,5.0


In [43]:
portfolios_df_fil_3.GICS.nunique()

33

In [44]:
portfolios_df_fil_4 = portfolios_df_fil_3[['CDSACCNO','STOCKCODE','UNIX_TS','RATING','GICS','STOCKNAME']] #,'GICS','STOCKNAME'

In [45]:
data_dict = portfolios_df_fil_4.to_dict(orient='list')

In [46]:
dataset = tf.data.Dataset.from_tensor_slices(data_dict)

In [47]:
next(iter(dataset.batch(1)))

{'CDSACCNO': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'BMS-48991-LI/00'], dtype=object)>,
 'STOCKCODE': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'LWL'], dtype=object)>,
 'UNIX_TS': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.6733754e+09], dtype=float32)>,
 'RATING': <tf.Tensor: shape=(1,), dtype=float32, numpy=array([5.], dtype=float32)>,
 'GICS': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'Capital Goods'], dtype=object)>,
 'STOCKNAME': <tf.Tensor: shape=(1,), dtype=string, numpy=array([b'LANKA WALLTILE PLC'], dtype=object)>}

In [61]:
dataset.save("../../data/portfolios_v2/portfolios")

In [49]:
len(dataset)

157854

In [50]:
tf.random.set_seed(42)
shuffled = dataset.shuffle(100_000, seed=42, reshuffle_each_iteration=False)

train = shuffled.take(int(len(dataset)* 0.8))
test = shuffled.skip(int(len(dataset)* 0.8)).take(int(len(dataset)* 0.2))

In [51]:
train.save("../../data/portfolios_v2/retriver_train")
test.save("../../data/portfolios_v2/retriver_test")

In [52]:
# new_dataset = tf.data.Dataset.load("../../data/portfolios_v2/portfolios_tfds")

# Work Here

In [53]:
import array
import collections

from typing import Dict, List, Optional, Text, Tuple

def _create_feature_dict() -> Dict[Text, List[tf.Tensor]]:
  return {"STOCKCODE": [], "RATING": [], "GICS": [], "STOCKNAME": [], "UNIX_TS": []}

def _sample_list(
    feature_lists: Dict[Text, List[tf.Tensor]],
    num_examples_per_list: int,
    random_state: Optional[np.random.RandomState] = None,
) -> Tuple[tf.Tensor, tf.Tensor]:
  """Function for sampling a list example from given feature lists."""
  if random_state is None:
    random_state = np.random.RandomState()

  sampled_indices = random_state.choice(
      range(len(feature_lists["STOCKCODE"])),
      size=num_examples_per_list,
      replace=False,
  )
  sampled_STOCKCODE = [
      feature_lists["STOCKCODE"][idx] for idx in sampled_indices
  ]
  sampled_RATING = [
      feature_lists["RATING"][idx]
      for idx in sampled_indices
  ]
  sampled_GICS = [
      feature_lists["GICS"][idx] for idx in sampled_indices
  ]
  sampled_STOCKNAME = [
      feature_lists["STOCKNAME"][idx]
      for idx in sampled_indices
  ]
  sampled_UNIX_TS = [
      feature_lists["UNIX_TS"][idx] for idx in sampled_indices
  ]

  return (
      tf.stack(sampled_STOCKCODE, 0),
      tf.stack(sampled_RATING, 0),
      tf.stack(sampled_GICS, 0),
      tf.stack(sampled_STOCKNAME, 0),
      tf.stack(sampled_UNIX_TS, 0)
  )


def sample_listwise(
    rating_dataset: tf.data.Dataset,
    num_list_per_user: int = 10,
    num_examples_per_list: int = 10,
    seed: Optional[int] = None,
) -> tf.data.Dataset:
  
  random_state = np.random.RandomState(seed)

  example_lists_by_user = collections.defaultdict(_create_feature_dict)

  movie_title_vocab = set()
  for example in rating_dataset:
    user_id = example["CDSACCNO"].numpy()
    example_lists_by_user[user_id]["STOCKCODE"].append(
        example["STOCKCODE"])
    example_lists_by_user[user_id]["RATING"].append(
        example["RATING"])
    example_lists_by_user[user_id]["GICS"].append(
        example["GICS"])
    example_lists_by_user[user_id]["STOCKNAME"].append(
        example["STOCKNAME"])
    example_lists_by_user[user_id]["UNIX_TS"].append(
        example["UNIX_TS"])
    
    movie_title_vocab.add(example["STOCKNAME"].numpy())

    

  tensor_slices = {"CDSACCNO": [], "STOCKCODE": [], "RATING": [], "GICS": [], "STOCKNAME": [], "UNIX_TS": []}

  for user_id, feature_lists in example_lists_by_user.items():
    for _ in range(num_list_per_user):

      # Drop the user if they don't have enough ratings.
      if len(feature_lists["STOCKNAME"]) < num_examples_per_list:
        continue

        '''sampled_STOCKCODE, 0),
      tf.stack(sampled_RATING, 0),
      tf.stack(sampled_GICS, 0),
      tf.stack(sampled_STOCKNAME, 0),
      tf.stack(sampled_UNIX_TS'''

      sampled_STOCKCODE, sampled_RATING, sampled_GICS, sampled_STOCKNAME, sampled_UNIX_TS  = _sample_list(
          feature_lists,
          num_examples_per_list,
          random_state=random_state,
      )
      tensor_slices["CDSACCNO"].append(user_id)
      tensor_slices["STOCKCODE"].append(sampled_STOCKCODE)
      tensor_slices["RATING"].append(sampled_RATING)
      tensor_slices["GICS"].append(sampled_GICS)
      tensor_slices["STOCKNAME"].append(sampled_STOCKNAME)
      tensor_slices["UNIX_TS"].append(sampled_UNIX_TS)

  return tf.data.Dataset.from_tensor_slices(tensor_slices)

In [54]:
# portfolios = tf.data.Dataset.load("../../data/portfolios_tfds_lists")
portfolios = dataset

In [55]:
# train_ds = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/train_lists").cache() #data\ratings_train
# test_ds = tf.data.Dataset.load("D:/dev work/recommender systems/Atrad_CARS/data/test_lists").cache()

train_ds = train
test_ds = test

In [56]:
next(iter(train_ds)), len(train_ds)

({'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'HDF-733381418-VN/00'>,
  'STOCKCODE': <tf.Tensor: shape=(), dtype=string, numpy=b'LWL'>,
  'UNIX_TS': <tf.Tensor: shape=(), dtype=float32, numpy=1660501800.0>,
  'RATING': <tf.Tensor: shape=(), dtype=float32, numpy=2.0>,
  'GICS': <tf.Tensor: shape=(), dtype=string, numpy=b'Capital Goods'>,
  'STOCKNAME': <tf.Tensor: shape=(), dtype=string, numpy=b'LANKA WALLTILE PLC'>},
 126283)

In [57]:
train_v1 = sample_listwise(
    train_ds,
    num_list_per_user=50,
    num_examples_per_list=10,
    seed=42
)

test_v1 = sample_listwise(
    test_ds,
    num_list_per_user=1,
    num_examples_per_list=10,
    seed=42
)

In [58]:
next(iter(train_v1))

{'CDSACCNO': <tf.Tensor: shape=(), dtype=string, numpy=b'HDF-733381418-VN/00'>,
 'STOCKCODE': <tf.Tensor: shape=(10,), dtype=string, numpy=
 array([b'LIOC', b'LITE', b'MBSL', b'PLC', b'LWL', b'ALLI', b'SHL',
        b'HAYL', b'CFVF', b'KAHA'], dtype=object)>,
 'RATING': <tf.Tensor: shape=(10,), dtype=float32, numpy=array([2., 1., 1., 1., 2., 2., 1., 1., 1., 2.], dtype=float32)>,
 'GICS': <tf.Tensor: shape=(10,), dtype=string, numpy=
 array([b'Energy', b'Capital Goods', b'Diversified Financials',
        b'Diversified Financials', b'Capital Goods',
        b'Diversified Financials', b'Capital Goods', b'Capital Goods',
        b'Diversified Financials', b'Food Beverage & Tobacco'],
       dtype=object)>,
 'STOCKNAME': <tf.Tensor: shape=(10,), dtype=string, numpy=
 array([b'LANKA IOC PLC', b'LAXAPANA BATTERIES PLC',
        b'MERCHANT BANK OF SRI LANKA & FINANCE PLC',
        b"PEOPLE'S LEASING & FINANCE PLC", b'LANKA WALLTILE PLC',
        b'ALLIANCE FINANCE COMPANY PLC', b'SOFTLOGIC HOL

In [59]:
len(train_v1)

258200

In [60]:
train_v1.save("../../data/portfolios_v2/ranker_train")
test_v1.save("../../data/portfolios_v2/ranker_test")

In [65]:
train_ds = tf.data.Dataset.load("D:\dev work\recommender systems\Atrad_CARS\data\portfolios_v2\retriver_train").cache()

InvalidArgumentError: NewRandomAccessFile failed to Create/Open: D:\dev workecommender systems\Atrad_CARS\data\portfolios_v2etriver_train\dataset_spec.pb : The filename, directory name, or volume label syntax is incorrect.
; no protocol option