<a href="https://colab.research.google.com/github/omkar-salunke/Trading_algos/blob/main/Stock_ranking_using_graph.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/sumansaha66/stock-ranking-using-list-wise-approach.git

Cloning into 'stock-ranking-using-list-wise-approach'...
remote: Enumerating objects: 171, done.[K
remote: Counting objects: 100% (171/171), done.[K
remote: Compressing objects: 100% (160/160), done.[K
remote: Total 171 (delta 47), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (171/171), 2.59 MiB | 4.70 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [None]:
# Add the cloned repository's `training` directory to the import path;
# presumably the project modules imported by later cells (rbo, load_data,
# evaluator, ...) live there -- verify against the repository layout.

import sys
sys.path.append('/content/stock-ranking-using-list-wise-approach/training')

In [None]:
import math
import numpy as np
import scipy.stats as sps
from rbo import rbo_at_k_normalised_w

def bt_long_calculator(pre_topn, ground_truth, bt_longn, i):
    """Add one day's equally weighted top-n return to a running back-test total.

    Args:
        pre_topn: indices of the stocks selected for day ``i``.
        ground_truth: 2-D container indexed as ``ground_truth[stock][day]``.
        bt_longn: running total accumulated over the previous days.
        i: index of the day being back-tested.

    Returns:
        ``bt_longn`` plus the mean of the selected stocks' values on day ``i``.
    """
    day_total = sum(ground_truth[stock][i] for stock in pre_topn)
    return bt_longn + day_total / len(pre_topn)

def _rank_unmasked(scores, mask, day):
    """Return the unmasked stock indices for ``day``, highest score first.

    A stock is skipped when its mask entry for ``day`` is below 0.5.
    """
    best_first = np.argsort(scores[:, day])[::-1]
    return [stock for stock in best_first if not mask[stock][day] < 0.5]


def evaluate(prediction, ground_truth, mask, report=False):
    """Compute regression, ranking and back-testing metrics for a prediction matrix.

    Args:
        prediction: array indexed ``[stock, day]`` with predicted scores.
        ground_truth: array of the same shape with the realised values.
        mask: array indexed ``[stock, day]``; entries below 0.5 exclude that
            stock from the rankings of that day.
        report: unused; kept for interface compatibility.

    Returns:
        dict with the keys ``mse``, ``mrrt``, ``rbo_at_{5,10,20,50}_normalized``,
        ``btl`` and ``bt{5,10,20,50}_unweighted``.

    Raises:
        AssertionError: if ``prediction`` and ``ground_truth`` differ in shape.

    Notes:
        * RBO at depth k is computed on the top-k lists. Previously depths
          10, 20 and 50 were all fed the top-5 lists.
        * A day on which every stock is masked is counted in the "missed days"
          used for the MRR denominator and contributes nothing to the
          back-test or RBO sums (it used to raise ``IndexError``).
    """
    assert ground_truth.shape == prediction.shape, 'shape mis-match'
    performance = {}
    # Masked mean squared error (the regression loss).
    performance['mse'] = np.linalg.norm((prediction - ground_truth) * mask)**2\
        / np.sum(mask)

    num_days = prediction.shape[1]
    # Persistence parameter p passed to the RBO helper for each list depth.
    rbo_p = {5: 0.80, 10: 0.90, 20: 0.95, 50: 0.98}
    depths = tuple(rbo_p)

    mrr_top = 0.0
    all_miss_days_top = 0
    bt_long = 1.0
    bt_long_k = {k: 1.0 for k in depths}
    rbo_sum = {k: 0.0 for k in depths}

    for i in range(num_days):
        gt_ranked = _rank_unmasked(ground_truth, mask, i)
        pre_ranked = _rank_unmasked(prediction, mask, i)

        if not pre_ranked:
            # Nothing can be ranked on this day: count it as missed and move on.
            all_miss_days_top += 1
            continue

        # Reciprocal rank of the predicted top-1 stock within the actual ranking.
        top1 = pre_ranked[0]
        mrr_top += 1.0 / (gt_ranked.index(top1) + 1)

        # Back-test of the single best predicted stock.
        bt_long += ground_truth[top1][i]

        for k in depths:
            pre_topk = pre_ranked[:k]
            gt_topk = gt_ranked[:k]
            bt_long_k[k] = bt_long_calculator(pre_topk, ground_truth,
                                              bt_long_k[k], i)
            rbo_sum[k] += rbo_at_k_normalised_w(pre_topk, gt_topk,
                                                p=rbo_p[k], depth=k)

    ranked_days = num_days - all_miss_days_top
    # MRR is undefined when no day could be ranked.
    performance['mrrt'] = mrr_top / ranked_days if ranked_days else float('nan')
    for k in depths:
        performance['rbo_at_%d_normalized' % k] = rbo_sum[k] / num_days
    performance['btl'] = bt_long
    for k in depths:
        performance['bt%d_unweighted' % k] = bt_long_k[k]
    return performance

def make_df_loss(rr_lstm,epoch,cur_valid_perf,cur_test_perf,tra_loss,tra_reg_loss,val_loss,test_loss):
    """Return ``rr_lstm.df_loss`` extended by one row describing ``epoch``.

    Args:
        rr_lstm: trainer object; the attributes read are ``df_loss``,
            ``market_name``, ``relation_name``, ``loss_name``, ``valid_index``,
            ``test_index``, ``trade_dates``, ``steps`` and ``parameters['seq']``.
        epoch: epoch number stored in the row.
        cur_valid_perf: metrics dict as produced by ``evaluate`` (validation).
        cur_test_perf: metrics dict as produced by ``evaluate`` (test).
        tra_loss, tra_reg_loss, val_loss, test_loss: summed losses; each must
            expose ``.numpy()``.

    Returns:
        A new DataFrame; ``rr_lstm.df_loss`` itself is not modified.
    """
    # Local import: the surrounding module may not import pandas itself.
    import pandas as pd

    # Number of steps each summed loss was accumulated over.
    train_steps = rr_lstm.valid_index - rr_lstm.parameters['seq'] - rr_lstm.steps + 1
    valid_steps = rr_lstm.test_index - rr_lstm.valid_index
    test_steps = rr_lstm.trade_dates - rr_lstm.test_index

    row = {
        'epoch': epoch,
        'market': rr_lstm.market_name,
        'relation_name': rr_lstm.relation_name,
        'loss_name': rr_lstm.loss_name,
        'train_total_loss': tra_loss.numpy() / train_steps,
        'train_reg_loss': tra_reg_loss.numpy() / train_steps,
    }
    splits = (
        ('valid', val_loss.numpy() / valid_steps, cur_valid_perf),
        ('test', test_loss.numpy() / test_steps, cur_test_perf),
    )
    for prefix, mean_loss, perf in splits:
        row[prefix + '_total_loss'] = mean_loss
        row[prefix + '_reg_loss'] = perf['mse']
        row[prefix + '_mrrt'] = perf['mrrt']
        row[prefix + '_bt1'] = perf['btl']
        for k in (5, 10, 20, 50):
            key = 'bt%d_unweighted' % k
            row[prefix + '_' + key] = perf[key]
        for k in (5, 10, 20, 50):
            key = 'rbo_at_%d_normalized' % k
            row[prefix + '_' + key] = perf[key]

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    loss_df = pd.concat([rr_lstm.df_loss, pd.DataFrame([row])],
                        ignore_index=True)
    return loss_df

In [None]:
import copy
import numpy as np
import os


# Used for loading sequential embedding
def load_EOD_data(data_path, market_name, tickers, steps=1):
    """Load per-ticker end-of-day CSV files into dense arrays.

    One file named ``<market_name>_<ticker>_1.csv`` is read per ticker. The
    first column is dropped from the returned features and the last column is
    used as the price. The value -1234 marks a missing entry.

    Args:
        data_path: directory containing the CSV files.
        market_name: file-name prefix; for ``'NASDAQ'`` the last row of every
            file is discarded.
        tickers: sequence of ticker symbols.
        steps: look-back, in rows, used for the return calculation
            (expected to be non-negative).

    Returns:
        Tuple ``(eod_data, masks, ground_truth, base_price)``:
          * ``eod_data``  (num_tickers, num_days, num_columns - 1), with missing
            entries replaced by 1.1;
          * ``masks``     (num_tickers, num_days), 0.0 where the price is
            missing and 1.0 elsewhere;
          * ``ground_truth`` (num_tickers, num_days), the ``steps``-row return
            ``(p[t] - p[t - steps]) / p[t - steps]``, left at 0 when either
            price is missing or ``t < steps``;
          * ``base_price`` (num_tickers, num_days), the last column after the
            1.1 replacement.
        For an empty ``tickers`` the four values are empty lists.

    Note:
        Returns used to be computed against the 1.1 placeholder whenever the
        previous price was missing, because the placeholder had already been
        written by the time the "previous day missing" test ran. Missing days
        are now detected before any value is overwritten.
    """
    eod_data = []
    masks = []
    ground_truth = []
    base_price = []
    for index, ticker in enumerate(tickers):
        single_EOD = np.genfromtxt(
            os.path.join(data_path, market_name + '_' + ticker + '_1.csv'),
            dtype=np.float32, delimiter=',', skip_header=False
        )
        if market_name == 'NASDAQ':
            # remove the last day since lots of missing data
            single_EOD = single_EOD[:-1, :]
        if index == 0:
            # The first file fixes the number of days and columns for all tickers.
            print('single EOD data shape:', single_EOD.shape)
            num_days, num_cols = single_EOD.shape[0], single_EOD.shape[1]
            eod_data = np.zeros([len(tickers), num_days, num_cols - 1],
                                dtype=np.float32)
            masks = np.ones([len(tickers), num_days], dtype=np.float32)
            ground_truth = np.zeros([len(tickers), num_days], dtype=np.float32)
            base_price = np.zeros([len(tickers), num_days], dtype=np.float32)

        # Locate missing entries BEFORE overwriting them with the placeholder.
        missing = np.abs(single_EOD + 1234) < 1e-8
        price = single_EOD[:, -1]
        price_missing = missing[:, -1]
        masks[index, price_missing] = 0.0

        rows = price.shape[0]
        if 0 <= steps < rows:
            cur = price[steps:]
            prev = price[:rows - steps]
            # A return needs both the current and the look-back price.
            usable = ~(price_missing[steps:] | price_missing[:rows - steps])
            ground_truth[index, steps:][usable] = \
                (cur[usable] - prev[usable]) / prev[usable]

        # Placeholder value for missing entries in the returned features/prices.
        single_EOD[missing] = 1.1
        eod_data[index, :, :] = single_EOD[:, 1:]
        base_price[index, :] = single_EOD[:, -1]
    return eod_data, masks, ground_truth, base_price


def load_graph_relation_data(relation_file, lap=False):
    """Build a degree-normalised adjacency matrix from a relation encoding file.

    Two nodes count as adjacent when the relation encoding between them sums
    to a non-zero value over its last axis.

    Args:
        relation_file: path of a ``.npy`` file holding a 3-D relation encoding.
        lap: when true, return ``I - D^-1/2 A D^-1/2`` instead of
            ``D^-1/2 A D^-1/2``.

    Returns:
        A 2-D float array with one row and one column per node.
    """
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)

    # 1.0 where at least one relation type links the pair, 0.0 otherwise.
    unrelated = np.sum(relation_encoding, axis=2) == 0
    adjacency = np.where(unrelated, 0.0, 1.0)

    # D^-1/2 from the column degrees.
    inv_sqrt_degree = np.sqrt(1.0 / np.sum(adjacency, axis=0))
    scaling = np.diag(inv_sqrt_degree)
    normalised = np.dot(np.dot(scaling, adjacency), scaling)

    if not lap:
        return normalised
    return np.identity(adjacency.shape[0], dtype=float) - normalised

# Used for loading relational data
def load_relation_data(relation_file):
    """Load a relation encoding and derive an additive mask from it.

    Args:
        relation_file: path of a ``.npy`` file holding a 3-D relation encoding.

    Returns:
        Tuple ``(relation_encoding, mask)``. ``mask`` has the shape of the
        first two axes of the encoding and holds -1e9 where the encoding sums
        to zero over its last axis (no relation) and 0 everywhere else.
    """
    relation_encoding = np.load(relation_file)
    print('relation encoding shape:', relation_encoding.shape)

    # True for pairs that share no relation of any type.
    no_relation = np.sum(relation_encoding, axis=2) == 0
    mask = np.where(no_relation, -1e9, 0.0)
    return relation_encoding, mask


def build_SFM_data(data_path, market_name, tickers):
    """Write a gap-filled price matrix to ``<market_name>_sfm_data.npy``.

    The last column of each ``<market_name>_<ticker>_1.csv`` file is taken as
    the price; -1234 marks a missing value. A missing value in one of the
    first three rows is replaced by the next available price of that ticker;
    a later missing value is replaced by the mean of the three preceding
    (already filled) values. The file is saved relative to the current working
    directory and nothing is returned.

    Args:
        data_path: directory containing the CSV files.
        market_name: file-name prefix, also used for the output file name.
        tickers: sequence of ticker symbols.
    """
    eod_data = []
    for index, ticker in enumerate(tickers):
        csv_path = os.path.join(data_path, market_name + '_' + ticker + '_1.csv')
        single_EOD = np.genfromtxt(csv_path, dtype=np.float32, delimiter=',',
                                   skip_header=False)
        if index == 0:
            print('single EOD data shape:', single_EOD.shape)
            eod_data = np.zeros([len(tickers), single_EOD.shape[0]],
                                dtype=np.float32)

        num_days = single_EOD.shape[0]
        for row in range(num_days):
            price = single_EOD[row][-1]
            if not abs(price + 1234) < 1e-8:
                # Price present: copy it through unchanged.
                eod_data[index][row] = price
                continue
            if row >= 3:
                # Enough history: mean of the three previous filled values.
                eod_data[index][row] = np.sum(
                    eod_data[index, row - 3:row]) / 3
                continue
            # Too early for a trailing mean: borrow the next available price.
            for later in range(row + 1, num_days):
                if abs(single_EOD[later][-1] + 1234) > 1e-8:
                    eod_data[index][row] = single_EOD[later][-1]
                    break
    np.save(market_name + '_sfm_data', eod_data)

In [None]:
'''This code is the test with inclusion of node2vec'''

# Import packages
import argparse
import copy
import numpy as np
import os
import pandas as pd

import random
from time import time 

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense

from load_data import load_EOD_data, load_relation_data
from evaluator import evaluate, make_df_loss
from loss_functions_tgc import reg_loss_tgc, rank_loss_tgc, listnet_loss
from graph_embedding import relation_node2vec

# Set up random seeds: the same fixed seed is applied to Python's `random`,
# NumPy and TensorFlow so that runs are repeatable.
seed = 123456789
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Function for initialising arrays
def prediction_ground_truth_mask_initializer(dim1,dim2):
    """Allocate zeroed prediction, ground-truth and mask buffers.

    Args:
        dim1: number of rows of each buffer.
        dim2: number of columns of each buffer.

    Returns:
        Tuple ``(pred_array, gt_array, mask_array)`` of three independent
        float arrays of shape ``(dim1, dim2)`` filled with zeros.
    """
    shape = [dim1, dim2]
    pred_array, gt_array, mask_array = (
        np.zeros(shape, dtype=float) for _ in range(3)
    )
    return pred_array, gt_array, mask_array



# Another model for 'inner product weight' can be similarly built
class MyModel(Model):
    """Keras model that scores stocks from features joined with a node2vec embedding.

    The forward pass concatenates the input feature matrix with the output of
    ``relation_node2vec`` along axis 1, optionally passes the result through a
    hidden dense layer (``flat=True``) and maps it to one value per row.
    """

    def __init__(self, nCom, rel_mask, inner_prod, flat,rel_encoding,num_random_walks,len_random_walk,
                 p_val,q_val,n2vemb_size,units = 0):
        """Store the configuration and create the dense layers.

        Args:
            nCom: number of rows of the all-ones column vector kept on the model.
            rel_mask: stored as-is; not read by ``call``.
            inner_prod: stored as-is; not read by ``call``.
            flat: when true, a hidden dense layer with ``units`` outputs is
                inserted before the prediction layer.
            rel_encoding: relation encoding; cast to float32 and handed to
                ``relation_node2vec``.
            num_random_walks, len_random_walk, p_val, q_val, n2vemb_size:
                forwarded unchanged to ``relation_node2vec``.
            units: width of the optional hidden layer.
        """
        super().__init__()
        self.rel_mask = rel_mask
        self.inner_prod = inner_prod
        self.flat = flat
        self.all_one = tf.ones([nCom, 1], dtype=tf.float32)
        self.rel_encoding = rel_encoding.astype('float32')

        # node2vec settings, passed straight through in call().
        self.num_random_walks = num_random_walks
        self.len_random_walk = len_random_walk
        self.p_val = p_val
        self.q_val = q_val
        self.n2vemb_size = n2vemb_size

        # The prediction layer is created before the optional hidden layer.
        self.prediction_layer = Dense(
            1,
            activation=tf.keras.layers.LeakyReLU(),
            kernel_initializer='glorot_uniform',
        )
        if not self.flat:
            self.hidden_layer = None
        else:
            print('one more hidden layer')
            self.hidden_layer = Dense(
                units,
                activation=tf.keras.layers.LeakyReLU(),
                kernel_initializer='glorot_uniform',
            )

    def call(self, Feature):
        """Run the forward pass.

        Args:
            Feature: feature tensor; it is concatenated with the node2vec
                output along axis 1, so both need the same number of rows.

        Returns:
            Tuple ``(rel_weight, prediction)`` where ``rel_weight`` is the raw
            output of ``relation_node2vec`` and ``prediction`` is the output of
            the prediction layer.
        """
        # The node2vec result is used directly as the relational representation.
        node_embedding = relation_node2vec(
            self.rel_encoding, self.num_random_walks, self.len_random_walk,
            self.p_val, self.q_val, self.n2vemb_size)

        joined = tf.concat([Feature, node_embedding], axis=1)
        if self.flat:
            joined = self.hidden_layer(joined)

        prediction = self.prediction_layer(joined)
        print('prediction layer input shape: ', joined.shape)
        print('prediction layer output shape: ', prediction.shape)

        return node_embedding, prediction

class ReRaLSTM:
    """Train and evaluate the node2vec-based relational stock-ranking model.

    The constructor loads the ticker list, end-of-day data, relation encoding
    and pre-trained sequential embedding from disk and builds a ``MyModel``.
    ``train`` runs the epoch loop and keeps the predictions of the epoch with
    the lowest validation loss.
    """

    def __init__(self, data_path, market_name, tickers_fname, relation_name,
                 emb_fname, parameters, depth, loss_name, num_random_walks,len_random_walk,
                 p_val, q_val, n2vemb_size, steps=1,
                 epochs=50, batch_size=None, flat=False, gpu=False, in_pro=False):
        """Load all inputs and build the model.

        Args:
            data_path: directory with the per-ticker CSV files; the ticker
                file, ``relation`` and ``pretrain`` directories are looked up
                in its parent directory.
            market_name: market prefix used in file names.
            tickers_fname: name of the ticker list file.
            relation_name: ``'sector_industry'`` or ``'wikidata'``.
            emb_fname: file name of the pre-trained embedding (``.npy``).
            parameters: dict of hyper-parameters; the keys read by this class
                are ``'seq'``, ``'unit'`` and ``'alpha'``. A shallow copy is kept.
            depth: stored on the instance.
            loss_name: ``'reg_rank_loss'`` or ``'listnet_loss'``.
            num_random_walks, len_random_walk, p_val, q_val, n2vemb_size:
                node2vec settings forwarded to ``MyModel``.
            steps: look-ahead, in days, of the return being predicted.
            epochs: number of training epochs.
            batch_size: stored on the instance; defaults to the ticker count.
            flat: forwarded to ``MyModel`` (adds a hidden layer).
            gpu: only selects the device name printed by ``train``.
            in_pro: forwarded to ``MyModel`` as ``inner_prod``.

        Raises:
            ValueError: if ``relation_name`` is not supported.
        """
        seed = 123456789
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        self.data_path = data_path
        self.market_name = market_name
        self.tickers_fname = tickers_fname
        self.relation_name = relation_name
        self.df_loss=pd.DataFrame()
        self.depth=depth
        self.loss_name=loss_name
        self.parameters = copy.copy(parameters)
        self.steps = steps
        self.epochs = epochs
        self.flat = flat
        self.inner_prod = in_pro
        # Fixed day indices at which the validation and test periods start.
        self.valid_index = 756
        self.test_index = 1008
        self.fea_dim = 5
        self.gpu = gpu
        self.num_random_walks=num_random_walks
        self.len_random_walk=len_random_walk
        self.p_val=p_val
        self.q_val=q_val
        self.n2vemb_size=n2vemb_size

        # Fail early with a clear message instead of an AttributeError on
        # self.rel_mask further down.
        rname_tail = {'sector_industry': '_industry_relation.npy',
                      'wikidata': '_wiki_relation.npy'}
        if self.relation_name not in rname_tail:
            raise ValueError(
                'unsupported relation_name %r; expected one of %s'
                % (self.relation_name, sorted(rname_tail)))

        # load data
        self.tickers = np.genfromtxt(os.path.join(data_path, '..', tickers_fname),
                                         dtype=str, delimiter='\t', skip_header=False)

        print('#tickers selected:', len(self.tickers))

        # mask_data: (num_company, num_days), 0 where the price of a day is missing.
        # price_data: (num_company, num_days) price column of every stock.
        # gt_data: (num_company, num_days) return over `steps` days.
        self.eod_data, self.mask_data, self.gt_data, self.price_data = load_EOD_data(data_path, market_name, self.tickers, steps)
        print('price_data shape: ', self.price_data.shape)
        print('gt_data shape ', self.gt_data.shape)

        # rel_encoding: unmasked relations (num_company, num_company, rel_types).
        # rel_mask: (num_company, num_company); 0 where a relation exists,
        # -1e9 where there is none.
        self.rel_encoding, self.rel_mask = load_relation_data(
                os.path.join(self.data_path,'..', 'relation', self.relation_name,
                             self.market_name + rname_tail[self.relation_name])
                )
        # Only relevant when fewer tickers are used than the relation file
        # covers. Assumes the nodes of the relation file are in the same order
        # as the ticker file.
        num_loaded = self.gt_data.shape[0]
        self.rel_encoding=self.rel_encoding[:num_loaded,:num_loaded,:]
        self.rel_mask=self.rel_mask[:num_loaded,:num_loaded]

        self.rel_mask = self.rel_mask.astype('float32')
        print('relation encoding shape:', self.rel_encoding.shape)
        print('relation mask shape:', self.rel_mask.shape)

        # Pre-trained sequential embedding, indexed [company, day, feature].
        self.embedding = np.load(
            os.path.join(data_path, '..', 'pretrain', emb_fname))
        print('embedding shape:', self.embedding.shape)
        # Same truncation (and same ordering assumption) as for the relations.
        self.embedding=self.embedding[:num_loaded,:,:]

        print('embedding shape:', self.embedding.shape)
        if batch_size is None:
            self.batch_size = len(self.tickers)
        else:
            self.batch_size = batch_size

        self.trade_dates = self.mask_data.shape[1]
        self.numCompany = self.rel_mask.shape[0]
        self.model = MyModel(self.numCompany, self.rel_mask,
                             self.inner_prod, self.flat,self.rel_encoding, self.num_random_walks,self.len_random_walk,
                             self.p_val,self.q_val, self.n2vemb_size, self.parameters['unit'])

    def get_batch(self, offset=None):
        """Return the model inputs for the window starting at day ``offset``.

        Args:
            offset: first day of the window; a random day before
                ``valid_index`` is drawn when omitted.

        Returns:
            Tuple ``(embedding, mask, base_price, ground_truth)``:
              * the embedding slice at ``offset``;
              * a column vector holding, per stock, the minimum mask value
                over the ``seq + steps`` days of the window;
              * a column vector with the price on the last day of the
                ``seq``-day input window;
              * a column vector with the ground truth ``steps`` days later.
        """
        if offset is None:
            offset = random.randrange(0, self.valid_index)
        seq_len = self.parameters['seq']
        target_day = offset + seq_len + self.steps - 1
        # A stock is usable only if no day of the whole window is masked.
        window_mask = np.min(self.mask_data[:, offset: target_day + 1], axis=1)
        return (
            self.embedding[:, offset, :],
            np.expand_dims(window_mask, axis=1),
            np.expand_dims(self.price_data[:, offset + seq_len - 1], axis=1),
            np.expand_dims(self.gt_data[:, target_day], axis=1),
        )

    def train(self):
        """Train the model and evaluate it on the validation and test periods.

        After every epoch the validation and test windows are scored; the
        predictions of the epoch with the lowest mean validation loss are
        kept. Every tenth epoch the metrics from ``evaluate`` are appended to
        ``self.df_loss`` and written to a CSV file in the working directory.

        Returns:
            Tuple ``(model, best_valid_pred, best_valid_gt, best_valid_mask,
            best_test_pred, best_test_gt, best_test_mask)``.

        Raises:
            ValueError: if ``self.loss_name`` is not a supported loss.
        """
        seed = 123456789
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)

        if self.loss_name not in ('reg_rank_loss', 'listnet_loss'):
            # Previously surfaced as an unbound `loss` inside the step functions.
            raise ValueError(
                "unsupported loss_name %r; expected 'reg_rank_loss' or "
                "'listnet_loss'" % (self.loss_name,))

        # The device name is only reported; no device placement is applied here.
        if self.gpu == True:
            device_name = '/gpu:0'
        else:
            device_name = '/cpu:0'
        print('device name:', device_name)

        optimizer = tf.keras.optimizers.Adam()

        def compute_losses(prediction, base_price, ground_truth, mask):
            """Return ``(loss, reg_loss, rank_loss, return_ratio)`` for one batch."""
            # (num_company, 1) predicted return relative to the base price.
            return_ratio = tf.divide(tf.subtract(prediction, base_price), base_price)
            reg_loss = reg_loss_tgc(ground_truth, return_ratio, mask)
            rank_loss = rank_loss_tgc(ground_truth, return_ratio, mask, self)
            if self.loss_name == 'reg_rank_loss':
                # Use the instance's hyper-parameters, not a module-level global.
                loss = reg_loss + tf.cast(self.parameters['alpha'], tf.float32) * rank_loss
            else:
                loss = listnet_loss(ground_truth, return_ratio, mask, self)
            return loss, reg_loss, rank_loss, return_ratio

        @tf.function
        def train_step(Feature, base_price, ground_truth, mask):
            with tf.GradientTape() as tape:
                # training=True is only needed if there are layers with different
                # behavior during training versus inference (e.g. Dropout).
                rel_weight, prediction = self.model(Feature, training=True)
                loss, reg_loss, rank_loss, return_ratio = compute_losses(
                    prediction, base_price, ground_truth, mask)
            gradients = tape.gradient(loss, self.model.trainable_variables)
            print('trainable variables: ', self.model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))
            return rel_weight, loss, reg_loss, rank_loss, return_ratio

        @tf.function
        def test_step(Feature, base_price, ground_truth, mask):
            # Inference only: no gradient update happens here.
            rel_weight, prediction = self.model(Feature, training=False)
            return compute_losses(prediction, base_price, ground_truth, mask)

        def run_split(first_offset, num_days):
            """Score ``num_days`` consecutive windows starting at ``first_offset``.

            Returns ``(summed_loss, pred, gt, mask)``; the arrays have one
            column per scored day.
            """
            pred, gt, mask = prediction_ground_truth_mask_initializer(
                len(self.tickers), num_days)
            summed_loss = 0.0
            for col in range(num_days):
                emb_batch, mask_batch, price_batch, gt_batch = self.get_batch(
                    first_offset + col)
                cur_loss, _, _, cur_rr = test_step(
                    emb_batch, price_batch, gt_batch, mask_batch)
                summed_loss += cur_loss
                pred[:, col] = copy.copy(cur_rr[:, 0])
                gt[:, col] = copy.copy(gt_batch[:, 0])
                mask[:, col] = copy.copy(mask_batch[:, 0])
            return summed_loss, pred, gt, mask

        window = self.parameters['seq'] + self.steps - 1
        num_train_steps = self.valid_index - window
        num_valid_days = self.test_index - self.valid_index
        num_test_days = self.trade_dates - self.test_index

        best_valid_pred, best_valid_gt, best_valid_mask=prediction_ground_truth_mask_initializer(
                len(self.tickers), num_valid_days)
        # Same width as the per-epoch test buffers that later overwrite these.
        best_test_pred, best_test_gt, best_test_mask=prediction_ground_truth_mask_initializer(
                len(self.tickers), num_test_days)
        best_valid_loss = np.inf

        # NOTE(review): offsets are drawn from [0, valid_index) but only the
        # first `num_train_steps` shuffled entries are used, so some training
        # windows can reach into the validation period -- confirm this is intended.
        batch_offsets = np.arange(start=0, stop=self.valid_index, dtype=int)

        for epoch in range(self.epochs):
            t1 = time()

            # ---- train on training data ----
            np.random.shuffle(batch_offsets)
            tra_loss = 0.0
            tra_reg_loss = 0.0
            for j in range(num_train_steps):
                emb_batch, mask_batch, price_batch, gt_batch = self.get_batch(batch_offsets[j])
                rel_weight, train_cur_loss, train_cur_reg_loss, train_cur_rank_loss, cur_rr= train_step(
                        emb_batch, price_batch, gt_batch, mask_batch)
                tra_loss += train_cur_loss
                tra_reg_loss += train_cur_reg_loss

            print('Train Loss:', tra_loss.numpy() / num_train_steps)

            # ---- test on validation set ----
            val_loss, cur_valid_pred, cur_valid_gt, cur_valid_mask = run_split(
                self.valid_index - window, num_valid_days)
            print('Valid loss:', val_loss.numpy() / num_valid_days)

            # ---- test on testing set ----
            test_loss, cur_test_pred, cur_test_gt, cur_test_mask = run_split(
                self.test_index - window, num_test_days)
            print('Test loss:', test_loss.numpy() / num_test_days)

            # Keep the predictions of the epoch with the best validation loss.
            if val_loss / num_valid_days < best_valid_loss:
                best_valid_loss = val_loss.numpy() / num_valid_days
                best_valid_gt = copy.copy(cur_valid_gt)
                best_valid_pred = copy.copy(cur_valid_pred)
                best_valid_mask = copy.copy(cur_valid_mask)
                best_test_gt = copy.copy(cur_test_gt)
                best_test_pred = copy.copy(cur_test_pred)
                best_test_mask = copy.copy(cur_test_mask)
                print('Better valid loss:', best_valid_loss)

            # Full metric evaluation is done only every 10th epoch.
            if epoch%10==0:
                cur_valid_perf = evaluate(cur_valid_pred, cur_valid_gt, cur_valid_mask)
                print('\t Valid preformance:', cur_valid_perf)
                cur_test_perf = evaluate(cur_test_pred, cur_test_gt, cur_test_mask)
                print('\t Test performance:', cur_test_perf)
                self.df_loss=make_df_loss(self, epoch, cur_valid_perf, cur_test_perf, tra_loss, tra_reg_loss,
                                          val_loss, test_loss)
                # The file name is built from this instance's own settings
                # rather than a module-level `RR_LSTM` variable.
                self.df_loss.to_csv('df_loss_node2vec_'+self.loss_name+'_'+self.market_name+'_'+
                                    self.relation_name+'_'+
                                    str(self.epochs)+'_epochs_'+str(self.num_random_walks)+'_num_random_walks_'+
                                    str(self.len_random_walk)+'_len_random_walk_'+str(self.p_val)+'_p_val_'+
                                    str(self.q_val)+'_q_val_'+str(self.n2vemb_size)+'_n2vemb_size'+'.csv',index=False)
            t4 = time()
            print('epoch:', epoch, ('time: %.4f ' % (t4 - t1)))

        # The model is returned in addition to the best-epoch arrays.
        return self.model, best_valid_pred, best_valid_gt, best_valid_mask, best_test_pred, best_test_gt, best_test_mask



if __name__ == '__main__':
    # Script entry point: read the hyper-parameters from the command line,
    # build the ReRaLSTM model, train it and write the loss table to CSV.
    desc = 'train a relational rank lstm model'
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-paths', help='path of EOD data',
                        default='/content/stock-ranking-using-list-wise-approach/data')
    parser.add_argument('-m', help='market name', default='NASDAQ')
    parser.add_argument('-ls', default='reg_rank_loss',
                        help='listnet_loss or reg_rank_loss')
    parser.add_argument('-t', help='fname for selected tickers')
    parser.add_argument('-l', type=int, default=4,
                        help='length of historical sequence for feature')
    parser.add_argument('-u', type=int, default=64,
                        help='number of hidden units in lstm')
    # NOTE: '-s' is accepted for CLI compatibility, but the model below is
    # always constructed with steps=1.
    parser.add_argument('-s', type=int, default=1,
                        help='steps to make prediction')
    parser.add_argument('-r', type=float, default=0.001,
                        help='learning rate')
    parser.add_argument('-a', type=float, default=1,
                        help='alpha, the weight of ranking loss')
    parser.add_argument('-g', '--gpu', type=int, default=0, help='use gpu')

    # Other embedding files mentioned by the original author:
    # NASDAQ_rank_lstm_seq-16_unit-64_2.csv.npy, NYSE_rank_lstm_seq-8_unit-32_0.csv.npy
    parser.add_argument('-e', '--emb_file', type=str,
                        default='NASDAQ_rank_lstm_seq-16_unit-64_2.csv.npy',
                        help='fname for pretrained sequential embedding')
    parser.add_argument('-rn', '--rel_name', type=str,
                        default='wikidata',
                        help='relation type: sector_industry or wikidata')
    parser.add_argument('-ip', '--inner_prod', type=int, default=1)
    parser.add_argument('-depth', type=int, default=5)
    parser.add_argument('-epoch_num', type=int, default=15)
    parser.add_argument('-num_rw', type=int, default=20)
    parser.add_argument('-len_rw', type=int, default=8)
    parser.add_argument('-p_val', type=int, default=1)
    parser.add_argument('-q_val', type=int, default=1)
    parser.add_argument('-n2vemb_size', type=int, default=64)

    # When this cell runs inside a Jupyter/Colab kernel, sys.argv carries the
    # kernel's own "-f <connection file>" option. parse_args() aborts on it
    # with "unrecognized arguments", so tolerate unknown options and report
    # them instead of exiting.
    args, unknown_args = parser.parse_known_args()
    if unknown_args:
        print('ignoring unrecognized arguments:', unknown_args)

    if args.t is None:
        args.t = args.m + '_tickers_qualify_dr-0.98_min-5_smooth.csv'
    # The integer flags are turned into booleans (1 -> True, anything else -> False).
    args.gpu = (args.gpu == 1)

    args.inner_prod = (args.inner_prod == 1)

    # argparse does not apply `type` to non-string defaults, so the explicit
    # conversions are kept to guarantee int/float values in every case.
    parameters = {'seq': int(args.l), 'unit': int(args.u), 'lr': float(args.r),
                  'alpha': float(args.a)}
    print('arguments:', args)
    print('parameters:', parameters)

    # RR_LSTM must stay a module-level name: the training code in this file
    # reads RR_LSTM.* when it builds its own CSV file name.
    RR_LSTM = ReRaLSTM(
        data_path=args.paths,
        market_name=args.m,
        tickers_fname=args.t,
        relation_name=args.rel_name,
        emb_fname=args.emb_file,
        parameters=parameters,
        steps=1, epochs=args.epoch_num, batch_size=None, gpu=args.gpu,
        in_pro=args.inner_prod, depth=args.depth,
        loss_name=args.ls,
        num_random_walks=args.num_rw,
        len_random_walk=args.len_rw,
        p_val=args.p_val,
        q_val=args.q_val,
        n2vemb_size=args.n2vemb_size
    )
    pred_all = RR_LSTM.train()

    # Persist the loss table; the file name encodes the run configuration.
    df_loss = RR_LSTM.df_loss
    loss_csv_name = ('df_loss_node2vec_' + RR_LSTM.loss_name + '_' + RR_LSTM.market_name + '_' +
                     RR_LSTM.relation_name + '_' +
                     str(RR_LSTM.epochs) + '_epochs_' + str(RR_LSTM.num_random_walks) + '_num_random_walks_' +
                     str(RR_LSTM.len_random_walk) + '_len_random_walk_' + str(RR_LSTM.p_val) + '_p_val_' +
                     str(RR_LSTM.q_val) + '_q_val_' + str(RR_LSTM.n2vemb_size) + '_n2vemb_size' + '.csv')
    df_loss.to_csv(loss_csv_name, index=False)

usage: ipykernel_launcher.py [-h] [-paths PATHS] [-m M] [-ls LS] [-t T] [-l L]
                             [-u U] [-s S] [-r R] [-a A] [-g GPU]
                             [-e EMB_FILE] [-rn REL_NAME] [-ip INNER_PROD]
                             [-depth DEPTH] [-epoch_num EPOCH_NUM]
                             [-num_rw NUM_RW] [-len_rw LEN_RW] [-p_val P_VAL]
                             [-q_val Q_VAL] [-n2vemb_size N2VEMB_SIZE]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-18624252-679e-4554-bc6d-b29167a25627.json


SystemExit: ignored

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# !tar -xvf  '/content/stock-ranking-using-list-wise-approach/data/relation.tar.gz' -C 'content/cell_images'
!tar -xzvf "/content/stock-ranking-using-list-wise-approach/data/" "/content/stock-ranking-using-list-wise-approach/data/relation.tar.gz"

tar (child): /content/stock-ranking-using-list-wise-approach/data/: Cannot read: Is a directory
tar (child): At beginning of tape, quitting now
tar (child): Error is not recoverable: exiting now

gzip: stdin: unexpected end of file
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [None]:
# Install StellarGraph; the following cell imports StellarGraph and
# BiasedRandomWalk from it.
! pip install stellargraph

Collecting stellargraph
  Downloading stellargraph-1.2.1-py3-none-any.whl (435 kB)
[?25l[K     |▊                               | 10 kB 19.4 MB/s eta 0:00:01[K     |█▌                              | 20 kB 10.8 MB/s eta 0:00:01[K     |██▎                             | 30 kB 6.1 MB/s eta 0:00:01[K     |███                             | 40 kB 5.7 MB/s eta 0:00:01[K     |███▊                            | 51 kB 4.9 MB/s eta 0:00:01[K     |████▌                           | 61 kB 4.7 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 4.9 MB/s eta 0:00:01[K     |██████                          | 81 kB 5.5 MB/s eta 0:00:01[K     |██████▊                         | 92 kB 5.4 MB/s eta 0:00:01[K     |███████▌                        | 102 kB 5.0 MB/s eta 0:00:01[K     |████████▎                       | 112 kB 5.0 MB/s eta 0:00:01[K     |█████████                       | 122 kB 5.0 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 5.0 MB/s eta 0:

In [None]:
from stellargraph import StellarGraph
import networkx as nx
from stellargraph.data import BiasedRandomWalk
from gensim.models import Word2Vec

def relation_node2vec(relation_encoding_org,num_random_walks,len_random_walk,p_val,q_val,n2vemb_size):
    """Learn node2vec embeddings for the graph encoded by a relation tensor.

    The relation tensor is summed over its last axis (axis 2); every strictly
    positive entry of the resulting matrix becomes an unweighted edge. Biased
    random walks over that graph form the corpus for a skip-gram Word2Vec
    model, whose vectors are returned as the node embeddings.

    Args:
        relation_encoding_org: array with at least three axes; summing over
            axis 2 must give a square adjacency-like matrix (presumably
            nodes x nodes x relation types -- confirm against the caller).
        num_random_walks: number of random walks started from each node.
        len_random_walk: maximum length of each random walk.
        p_val: node2vec return parameter (unnormalised probability 1/p of
            returning to the source node).
        q_val: node2vec in-out parameter (unnormalised probability 1/q of
            moving away from the source node).
        n2vemb_size: dimensionality of the learned embeddings.

    Returns:
        Array of shape (number of nodes, n2vemb_size) in which row ``i`` is
        the embedding of graph node ``i``.
    """
    relation_encoding_2d_weighted = relation_encoding_org.sum(axis=2)  # weighted 2d graph with sum
    relation_encoding_2d_unweighted = (relation_encoding_2d_weighted > 0).astype(int)  # unweighted 2d graph
    G_nx = nx.from_numpy_array(relation_encoding_2d_unweighted)  # loading as networkx graph
    G = StellarGraph.from_networkx(G_nx)
    print(G.info())

    # Corpus generation using random walks
    rw = BiasedRandomWalk(G)
    walks = rw.run(
        nodes=list(G.nodes()),  # root nodes: every node starts its own walks
        length=len_random_walk,  # maximum length of a random walk
        n=num_random_walks,  # number of random walks per root node
        p=p_val,  # Defines (unnormalised) probability, 1/p, of returning to source node
        q=q_val,  # Defines (unnormalised) probability, 1/q, for moving away from source node
    )
    print("Number of random walks: {}".format(len(walks)))  # total number for all nodes in the graph

    # Representation learning using Word2Vec: node ids become "words".
    str_walks = [[str(n) for n in walk] for walk in walks]
    # window: maximum distance between the current and predicted word
    # min_count: 0 keeps every node, however rarely it was visited
    # sg: 1 selects skip-gram (otherwise CBOW)
    # workers: number of training threads
    common_kwargs = dict(window=5, min_count=0, sg=1, workers=2)
    try:
        # gensim >= 4 keyword names.
        model = Word2Vec(str_walks, vector_size=n2vemb_size, epochs=1, **common_kwargs)
    except TypeError:
        # gensim 3.x spells the same options `size` and `iter`.
        model = Word2Vec(str_walks, size=n2vemb_size, iter=1, **common_kwargs)

    # model.wv.vectors is ordered by Word2Vec's internal vocabulary, not by
    # node id, so look the vectors up by key in graph-node order. Every node
    # is the root of its own walks and min_count=0, so each key is present.
    node_keys = [str(node) for node in G_nx.nodes()]
    node_embeddings = model.wv[node_keys]  # (number of nodes, n2vemb_size)
    print(node_embeddings.shape)
    return node_embeddings