# Data packaging: trade_day , label , risk_exp
trade_day : daily trading data (price, volume, etc.) for each stock

label : 10-day future return of each stock, recorded for every day

risk_exp : risk model factors of each stock, recorded for every day


In [6]:
import os , tarfile

def list_files(directory , start_year = 2007 , end_year = 2020):
    '''
    Collect the path of every file stored under the yearly sub-folders of `directory`.

    For each year in [start_year , end_year] (both inclusive) the folder
    `{directory}/{year}` is walked recursively with `os.walk`. Years whose
    folder does not exist simply contribute no paths.

    Returns a flat list of file paths, ordered by year and then by the
    order in which `os.walk` reports them.
    '''
    return [
        os.path.join(folder , name)
        for year in range(start_year , end_year + 1)
        for folder , _ , names in os.walk(f'{directory}/{year}')
        for name in names
    ]

def pack(tgt_path = 'risk_exp.tar' , src_path = '/data/DataBase/DB_models/risk_exp' , start_year = 2007 , end_year = 2020):
    '''
    Add every yearly file found under `src_path` to the tar archive `tgt_path`.

    tgt_path   : tar file to write to; it is opened in append mode ('a')
    src_path   : root folder holding one sub-folder per year
    start_year : first year (inclusive) to include
    end_year   : last year (inclusive) to include

    Members are stored under their path relative to `src_path`.
    NOTE: because the archive is opened in append mode, calling this again
    on an existing archive adds the same members a second time.
    '''
    with tarfile.open(tgt_path , 'a') as archive:
        for member in list_files(src_path , start_year , end_year):
            # store the member by its path relative to the source root
            archive.add(member , arcname = os.path.relpath(member , src_path))

# Build one tar archive per dataset, but only when the local database folder exists.
if os.path.exists('./data/DataBase'):
    for tar_name , db_dir in (
        ('trade_day.tar' , './data/DataBase/DB_trade/day') ,
        ('label.tar' , './data/DataBase/DB_labels/ret10_lag') ,
        ('risk_exp.tar' , './data/DataBase/DB_models/risk_exp') ,
    ):
        pack(tar_name , db_dir)


# Sample training code

In [26]:
import tqdm , torch

from torch import nn
from typing import Any , Literal

class GRUTrainer:
    '''
    A simplified GRU trainer, for code illustration purpose.

    The trainer builds random train / valid / test tensors, a small GRU
    network with an Adam optimizer and a StepLR scheduler, and exposes
    `fit` (training with early stopping) and `predict` (inference on the
    test split). Every minibatch is the cross-section of stocks of one date.
    '''

    # which loss `loss` computes
    LOSS : Literal['pearson' , 'mse'] = 'pearson'
    # wrap the per-date batches in a tqdm progress bar
    TQDM : bool = True
    # maximum number of epochs run by `fit`
    MAX_EPOCH : int = 10
    # `fit` stops once more than this many epochs have passed since the best one
    EARLY_STOPPAGE : int = 3
    
    def __init__(self , num_stock = 100 , num_dates = 60 , num_feat = 6):
        '''
        Create the random datasets and the model components.

        num_stock : number of stocks in every cross-section
        num_dates : total number of dates, split into train and valid/test parts
        num_feat  : number of input features per bar
        '''
        self.data = self.random_input(num_stock , num_dates , num_feat)
        self.num_stock = num_stock
        self.num_dates = num_dates
        self.num_feat  = num_feat
        self.init_model()

    @staticmethod
    def random_input(num_stock = 100 , num_dates = 60 , num_feat = 6) -> dict[Literal['train','valid','test'],Any]:
        '''
        Make random input of train, valid, test dataset.

        Each dataset is a tuple (x , y) with
            x : (num_stock , dates , 1 , num_feat)
            y : (num_stock , dates , 1)
        where train gets int(num_dates * 0.8) dates and valid / test get the rest.
        Part of x is overwritten with NaN to imitate missing data.
        '''
        def rand_nan(x , remain = 0.2):
            '''
            In place, blank out a random-length leading span of dim 1 for
            some rows of dim 0. A row is left untouched with probability `remain`.
            '''
            for row in range(len(x)):
                if torch.rand(1).item() > remain:
                    ii = int(torch.rand(1).item() * x.shape[1])
                    x[row,:ii] = torch.nan
            return x

        # data dimensions: 
        # 0: number of stocks
        # 1: number of dates
        # 2: number of bars in a day (e.g. 1 for day candle , 240 for minute candle)
        # 3: number of features (e.g. open , close , high , low , volume , amount)
        num_train = int(num_dates * 0.8)
        num_other = num_dates - num_train

        train = (rand_nan(torch.rand(num_stock,num_train,1,num_feat)) , 
                 torch.rand(num_stock,num_train,1))
        valid = (rand_nan(torch.rand(num_stock,num_other,1,num_feat)) , 
                 torch.rand(num_stock,num_other,1))
        test  = (rand_nan(torch.rand(num_stock,num_other,1,num_feat)) , 
                 torch.rand(num_stock,num_other,1))

        return {'train':train , 'valid':valid , 'test' :test}
    
    def init_model(self , hidden_size = 32 , num_layers = 2 , dropout = 0.1 , learn_rate = 0.005):
        '''
        Initialize model components: nn , optimizer , scheduler.

        The network is moved to CUDA when a GPU is available. The learning
        rate is multiplied by 0.1 every 5 scheduler steps.
        '''
        self.net = self.GRU(self.num_feat , hidden_size , num_layers = num_layers , dropout = dropout)
        if torch.cuda.is_available():
            self.net = self.net.cuda()
        self.optimizer = torch.optim.Adam(self.net.parameters() , lr = learn_rate)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size= 5, gamma=0.1)
    
    @classmethod
    def dataloader(cls , tensor : tuple[torch.Tensor,torch.Tensor] , progress_bar = False):
        '''
        Dataloader creator of train, valid, test dataset.

        tensor       : tuple (x , y) sharing their first two dims (stocks , dates)
        progress_bar : if True, wrap the batches in a tqdm progress bar

        Returns one batch per date, each a tuple (x_valid , y_valid , valid)
        where `valid` is a boolean mask over stocks marking rows whose
        inputs and label are all finite, and x_valid / y_valid are the
        rows selected by that mask.
        '''
        assert tensor[0].shape[:2] == tensor[1].shape[:2]
        x_all , y_all = tensor
        batches = []
        for d in range(x_all.shape[1]):
            x , y = x_all[:,d] , y_all[:,d]
            # a stock is usable on this date only if nothing is NaN / inf
            valid = x.isfinite().flatten(1).all(dim=1) & y.isfinite().flatten(1).all(dim=1)
            batches.append((x[valid] , y[valid] , valid))
        if progress_bar:
            # the batch list is built once and then wrapped
            return tqdm.tqdm(iterable=batches , total=len(batches))
        return batches

    class GRU(nn.Module):
        '''A simple GRU followed by a linear head producing one value per sample'''
        def __init__(self, num_feat : int , hidden_size = 32 , num_layers = 2 , dropout = 0.1, **kwargs) -> None:
            super().__init__()
            # batch_first = True : inputs are (batch , sequence , feature), so
            # every stock is an independent sample and dim 1 is the sequence
            self.gru = nn.GRU(num_feat , hidden_size , num_layers = num_layers , dropout = dropout , batch_first = True)
            self.linear = nn.Linear(hidden_size , 1)

        def forward(self , x):
            '''
            x : (batch , sequence , num_feat)
            Returns (prediction , hidden) with prediction of shape (batch , 1)
            computed from the last sequence step, and hidden the full GRU
            output of shape (batch , sequence , hidden_size).
            '''
            x , _ = self.gru(x)
            return self.linear(x[:,-1]) , x
    
    def fit_epoch(self , epoch : int):
        '''
        Epoch of minibatches for train & valid dataset.

        Returns (metric_train , metric_valid , last_lr) where each metric is
        the negative of the average batch loss (so larger is better) and
        last_lr is the learning rate used during this epoch.
        '''
        metric_train , metric_valid = 0. , 0.

        iter_train = self.dataloader(self.data['train'] , self.TQDM)
        iter_valid = self.dataloader(self.data['valid'] , self.TQDM)
        
        self.net.train()
        num_train = 0
        for i, (feature , label , valid) in enumerate(iter_train):
            if torch.cuda.is_available():
                feature , label = feature.cuda() , label.cuda()
            self.optimizer.zero_grad()
            pred , hidden = self.net(feature)
            loss_batch = self.loss(label , pred)
            metric_train -= loss_batch.item()
            loss_batch.backward()
            self.optimizer.step()
            num_train = i + 1
            if isinstance(iter_train , tqdm.tqdm):  
                iter_train.set_description('Epoch #{:3d} train loss:{:.5f}'.format(epoch,-metric_train/num_train))
        # max(... , 1) keeps an empty loader from dividing by zero
        metric_train /= max(num_train , 1)
        
        self.net.eval()
        num_valid = 0
        with torch.no_grad():
            for i, (feature , label , valid) in enumerate(iter_valid):
                if torch.cuda.is_available():
                    feature , label = feature.cuda() , label.cuda()
                pred , hidden = self.net(feature)
                loss_batch = self.loss(label , pred)
                metric_valid -= loss_batch.item()
                num_valid = i + 1
                if isinstance(iter_valid , tqdm.tqdm):  
                    iter_valid.set_description('Epoch #{:3d} valid loss:{:.5f}'.format(epoch,-metric_valid/num_valid))
        metric_valid /= max(num_valid , 1)
        
        # read the learning rate before stepping, so it is the one this epoch used
        last_lr = self.scheduler.get_last_lr()[0]
        self.scheduler.step()
        return metric_train , metric_valid , last_lr
    
    def fit(self):
        '''
        Main loop of fitting process.

        Runs at most MAX_EPOCH epochs, tracks the best valid metric and stops
        once more than EARLY_STOPPAGE epochs have passed since the best one.
        '''
        best_metric_valid = -10000.
        best_epoch = -1
        for epoch in range(self.MAX_EPOCH):
            metric_train , metric_valid , last_lr = self.fit_epoch(epoch)
            print(f'At epoch {epoch}, train metric: {metric_train:.4f}, valid metric: {metric_valid:.4f}, last lr: {last_lr}')
            if metric_valid > best_metric_valid:
                best_metric_valid , best_epoch = metric_valid , epoch
            if epoch - best_epoch > self.EARLY_STOPPAGE:
                print(f'Early Stopped at epoch {best_epoch} , valid metric is {best_metric_valid:.4f}')
                break

    def predict(self):
        '''
        Make prediction of test dataset.

        Returns a tensor shaped like the test labels; entries of stocks that
        were not valid on a date stay NaN. Also prints the average metric
        (negative average batch loss) over the test dates.
        '''
        metric_test = 0.
        iter_test = self.dataloader(self.data['test'] , self.TQDM)
        
        preds = torch.full_like(self.data['test'][1] , fill_value=torch.nan)
        self.net.eval()
        num_test = 0
        with torch.no_grad():
            for i, (feature , label , valid) in enumerate(iter_test):
                if torch.cuda.is_available():
                    feature , label = feature.cuda() , label.cuda()
                pred , hidden = self.net(feature)
                # preds lives on the device of the test labels, pred may be on the GPU
                preds[valid,i] = pred.to(preds.device)
                loss_batch = self.loss(label , pred)
                metric_test -= loss_batch.item()
                num_test = i + 1
                if isinstance(iter_test , tqdm.tqdm):  
                    iter_test.set_description('Date #{:3d} valid loss:{:.5f}'.format(i,-metric_test/num_test))
        # max(... , 1) keeps an empty loader from dividing by zero
        metric_test /= max(num_test , 1)
        print(f'Test Dataset has average metric of {metric_test:.4f}')
        return preds

    @classmethod
    def loss(cls , label , pred) -> torch.Tensor:
        '''
        Loss calculator.

        'pearson' : negative Pearson correlation between label and pred
        'mse'     : mean squared error between label and pred

        Raises ValueError when cls.LOSS is neither of the two.
        '''
        if cls.LOSS == 'pearson':
            # flatten(1) keeps the (2 , n) layout even when n == 1, unlike squeeze()
            return -torch.stack((label, pred)).flatten(1).corrcoef()[0,1] # -pearson_r(labels, pred)
        elif cls.LOSS == 'mse':
            return nn.MSELoss()(label , pred)
        raise ValueError(f'Unknown loss type : {cls.LOSS}')

# Instantiate the trainer with its default sizes; the constructor creates the
# random datasets and initializes the model components.
gru_trainer = GRUTrainer()


# fit the trainer

In [27]:
# Run the training loop; per-epoch metrics are printed as it goes.
gru_trainer.fit()

  0%|          | 0/48 [00:00<?, ?it/s]

Epoch #  0 train loss:0.01869: 100%|██████████| 48/48 [00:01<00:00, 25.75it/s] 
Epoch #  0 valid loss:-0.05472: 100%|██████████| 12/12 [00:01<00:00,  6.02it/s]


At epoch 0, train metric: -0.0187, valid metric: 0.0547, last lr: 0.005


Epoch #  1 train loss:-0.05075: 100%|██████████| 48/48 [00:01<00:00, 26.13it/s]
Epoch #  1 valid loss:0.05103: 100%|██████████| 12/12 [00:02<00:00,  5.93it/s]


At epoch 1, train metric: 0.0508, valid metric: -0.0510, last lr: 0.005


Epoch #  2 train loss:-0.02408: 100%|██████████| 48/48 [00:02<00:00, 21.81it/s]
Epoch #  2 valid loss:0.01572: 100%|██████████| 12/12 [00:02<00:00,  5.09it/s]


At epoch 2, train metric: 0.0241, valid metric: -0.0157, last lr: 0.005


Epoch #  3 train loss:-0.03904: 100%|██████████| 48/48 [00:02<00:00, 21.33it/s]
Epoch #  3 valid loss:0.04985: 100%|██████████| 12/12 [00:02<00:00,  4.98it/s]


At epoch 3, train metric: 0.0390, valid metric: -0.0498, last lr: 0.005


Epoch #  4 train loss:-0.03944: 100%|██████████| 48/48 [00:02<00:00, 20.02it/s]
Epoch #  4 valid loss:0.02130: 100%|██████████| 12/12 [00:02<00:00,  4.70it/s]

At epoch 4, train metric: 0.0394, valid metric: -0.0213, last lr: 0.005
Early Stopped at epoch 0 , valid metric is 0.0547





In [30]:
def test(do = True):
    '''
    Produce the integers 0..9.

    do = True  : return them as a list
    do = False : return a generator that yields them lazily

    The `yield` lives in an inner function on purpose: a function whose own
    body contains `yield` is always a generator, so a `return list(...)` in
    it would never hand the list to the caller.
    '''
    def count():
        for i in range(10):
            yield i

    if do:
        return list(count())
    return count()

# Print every item obtained by iterating over test(True), one per line.
for i in test(True):
    print(i)

: 

# make prediction of test dataset

In [4]:
# Run inference on the test split, then show the shape and the values of
# what `predict` returned.
pred = gru_trainer.predict()
print(pred.shape)
print(pred)

Date # 11 valid loss:0.04140: : 12it [00:00, 135.73it/s]

Test Dataset has average metric of -0.0414



