# Train macro & fundamental-aware price models 
Pre-train on fundamental, macroeconomic, estimate and share price data to capture the patterns in the data.
Then use the embedded fundamental/macro/short-term information for return prediction.


## 1. Load the data

In [None]:
from data import FundamentalDataset, PriceDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import datetime as dt
import itertools
from utils import Defaults
from torch.utils.data import DataLoader, Dataset
from copy import deepcopy

# Alias of the project's Defaults class (not instantiated); the training cell
# reads DEFAULTS.padding_val from it.
DEFAULTS = Defaults
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


# Dataset objects shared by the cells below.
fund_data = FundamentalDataset()
# presumably the same data at weekly frequency — confirm against FundamentalDataset
fund_data_weekly = FundamentalDataset(freq="W")
price_data = PriceDataset()

def collate_fn(batch):
    """Combine ``(data, mask)`` samples into two batched tensors.

    :param batch: iterable of ``(data, mask)`` tensor pairs
    :return: tuple ``(stacked_data, stacked_masks)``; each tensor gains a new
        leading axis indexing the samples
    """
    # Single pass over ``batch`` so one-shot iterables are consumed only once.
    pairs = [(sample, sample_mask) for sample, sample_mask in batch]
    stacked_data = torch.stack([sample for sample, _ in pairs])
    stacked_masks = torch.stack([sample_mask for _, sample_mask in pairs])
    return (stacked_data, stacked_masks)

## 2. Train autoencoders as pre-training

### 2.1. Train encoders on fundamental data

In [2]:
from typing import Sequence

def expand_mask(mask: torch.tensor, target_dim: int) -> torch.tensor:
    """Repeat a mask along a new trailing axis of size ``target_dim``.

    :param mask: mask tensor with n dimensions
    :param target_dim: size of the trailing axis that is appended
    :return: tensor with n+1 dimensions whose every slice along the last axis
        equals ``mask``; it is built from a deep copy, so it does not share
        storage with ``mask``
    """
    column = deepcopy(mask).unsqueeze(-1)
    # Keep every leading size as-is and stretch only the new singleton axis.
    return column.expand(*column.shape[:-1], target_dim)

def expand_masks(masks: Sequence[torch.tensor], target_dims: Sequence[int]):
    """Apply ``expand_mask`` pairwise to masks and their target sizes.

    :param masks: masks to expand
    :param target_dims: trailing-axis size for each mask, in the same order
    :return: list of expanded masks; pairing stops at the shorter argument
    """
    return [expand_mask(mask, dim) for mask, dim in zip(masks, target_dims)]

In [3]:
def masked_mse_loss(
        input: torch.Tensor,
        target: torch.Tensor,
        mask: torch.Tensor,
        na_pad: "torch.Tensor | float",
        ) -> torch.Tensor:
    """Custom MSE loss that skips padded positions and missing values.

    :param input: original tensor; its last axis is the feature axis
    :param target: tensor compared against ``input``; expected to have the
        same shape as ``input``
    :param mask: padding mask with one axis fewer than ``input``; truthy
        entries mark positions that are excluded from the loss
    :param na_pad: placeholder value marking a missing entry in ``input``
    :return: scalar mean squared error over the entries of ``input`` that are
        neither padded nor equal to ``na_pad``
    """
    loss = nn.MSELoss()
    na_mask = input == na_pad
    # A trailing singleton axis lets the per-position mask broadcast over
    # every feature. torch tensors have no ``astype``; ``to(torch.bool)`` is
    # the torch spelling of the cast.
    padding_mask = mask.unsqueeze(-1).to(torch.bool)
    excluded = na_mask | padding_mask  # True where the entry must be ignored
    keep = ~excluded
    masked_input = torch.masked_select(input, keep)
    masked_target = torch.masked_select(target, keep)
    return loss(masked_input, masked_target)

def composite_mseloss(mse_losses: Sequence[torch.Tensor]):
    """Combine several scalar losses into a mean plus a dispersion penalty.

    :param mse_losses: non-empty sequence of scalar loss tensors
    :return: scalar tensor equal to the mean of the losses plus the sum of
        squared deviations of each loss from that mean
    :raises ValueError: if ``mse_losses`` is empty
    """
    losses = list(mse_losses)
    if not losses:
        raise ValueError("composite_mseloss requires at least one loss")
    # torch.mean / torch.sum operate on tensors, not Python lists, so the
    # individual losses are stacked into one tensor first (gradients flow
    # through torch.stack).
    stacked = torch.stack(losses)
    mean_loss = stacked.mean()
    penalty_loss = ((stacked - mean_loss) ** 2).sum()
    composite_loss = mean_loss + penalty_loss
    return composite_loss

def multiple_input_masked_mse_loss(
        inputs: Sequence[torch.tensor],
        targets: Sequence[torch.tensor],
        masks: Sequence[torch.tensor],
        na_pads: Sequence[torch.tensor]):
    """Compute one masked MSE per input and merge them into a single loss.

    :param inputs: original tensors, one per input stream
    :param targets: tensors compared against ``inputs``, in the same order
    :param masks: padding mask for each input
    :param na_pads: missing-value placeholder for each input
    :return: result of ``composite_mseloss`` applied to the per-input
        ``masked_mse_loss`` values
    """
    per_input_losses = [
        masked_mse_loss(source, reconstruction, padding_mask, pad_value)
        for source, reconstruction, padding_mask, pad_value in zip(
            inputs, targets, masks, na_pads)
    ]
    return composite_mseloss(per_input_losses)


In [14]:
# Inspect the first entry of model.linear_encoder_layers.
print(model.linear_encoder_layers[0])

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): BatchNorm1d(170, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): Linear(in_features=170, out_features=42, bias=True)
  (3): Linear(in_features=42, out_features=10, bias=True)
  (4): Linear(in_features=10, out_features=2, bias=True)
  (5): Linear(in_features=2, out_features=3, bias=True)
)


In [None]:
from typing import Sequence, Tuple, Optional
def encode(
        model, 
        inputs: Sequence[Tuple[torch.tensor, Optional[torch.tensor]]],
        padding_masks: Sequence[torch.tensor]
        ) -> Tuple[torch.tensor, Tuple[torch.tensor]]:
    """Encode several inputs into one embedding.

    Each input is passed through its own transformer encoder and linear
    encoder taken from ``model``; the per-input embeddings are stacked along
    a new last axis and passed through ``model.linear_encoder`` and
    ``model.tanh``.

    :param model: object exposing ``transformer_encoders``,
        ``linear_encoder_layers``, ``linear_encoder`` and ``tanh``
    :param inputs: one tensor per input stream
    :param padding_masks: one key-padding mask per input stream
    :return: ``(embedding, memories)`` where ``memories`` is the list of
        transformer outputs, one per input
    """
    memories = []
    embeddings = []
    branches = zip(
        inputs, padding_masks, model.transformer_encoders, model.linear_encoder_layers)
    for sequence, pad_mask, transformer, projection in branches:
        memory = transformer(sequence, src_key_padding_mask=pad_mask)
        memories.append(memory)
        branch_embedding = projection(memory)
        print(branch_embedding.shape)  # debug: per-input embedding shape
        embeddings.append(branch_embedding)
    stacked = torch.stack(embeddings, dim=-1)
    print(stacked.shape)  # debug: shape after stacking all inputs
    projected = model.linear_encoder(stacked)
    embedding = model.tanh(projected)
    return (embedding, memories)

# Run the notebook-level encode() on a single input/mask pair, each wrapped in a one-element list.
embedding, memories = encode(model, [input], [mask])

torch.Size([16, 3])
torch.Size([16, 3, 1])


In [None]:
# Call model.decode on the current embedding and encoder memories.
model.decode(embedding, memories)

torch.Size([16, 3, 3])

In [6]:
# Smoke test of one encode/decode round trip through the model's own methods.
# Move the current batch to the compute device first.
input, mask = input.to(DEVICE), mask.to(DEVICE)
embedding, memories = model.encode([input], [mask])
output = model.decode(embedding, memories)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x3 and 1x3)

In [5]:
from models.autoencoder import BaseAutoEncoder
from torch.utils.tensorboard import SummaryWriter

# --- Hyperparameters for this pre-training run (also encoded in the log dir name) ---
RUN = 1
LR = 1e-4
NUM_TRANSFORMER_LAYERS = 5
WINDOW_SIZE = 10
NHEADS = 1
ENCODING_DIM = 3
MAX_EPOCHS = 10
BATCH_SIZE = 16

# TensorBoard writer; one directory per hyperparameter combination.
logger_stem = "./traininglog/fundamental_encoder/runs/"
logger = SummaryWriter(f"{logger_stem}run{RUN};lr={LR};notflayrs={NUM_TRANSFORMER_LAYERS};wd={WINDOW_SIZE};nh={NHEADS};edim={ENCODING_DIM};bsize={BATCH_SIZE}")

fundamental_data_loader = DataLoader(
    fund_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    # NOTE(review): the worker count is tied to the batch size — confirm this is intended.
    num_workers=BATCH_SIZE
)

num_batches = len(fundamental_data_loader)

model = BaseAutoEncoder(
    window_sizes=[WINDOW_SIZE],
    encoding_dim=ENCODING_DIM, 
    num_transformer_layers=[NUM_TRANSFORMER_LAYERS], 
    dims=[17],
    activation_func=F.tanh,
    nheads=[NHEADS],
    device=DEVICE)
model = model.to(DEVICE)

optimizer = torch.optim.Adam(
    model.parameters(), lr=LR, betas=(0.9, 0.99), eps=1e-07)


# --- Training loop: reconstruct each batch and minimise the masked MSE ---
for epoch in range(MAX_EPOCHS):
    running_losses = []
    # `input` / `mask` stay module-level names because other cells reuse them.
    for i, (input, mask) in enumerate(fundamental_data_loader):
        # forward pass
        input, mask = input.to(DEVICE), mask.to(DEVICE)
        embedding, memories = model.encode([input], [mask])
        output = model.decode(embedding, memories)
        loss = masked_mse_loss(input, output, mask, na_pad=DEFAULTS.padding_val)
        step_loss = loss.item()
        running_losses.append(step_loss)
        # SummaryWriter's method is `add_scalar`, and its step keyword is `global_step`.
        logger.add_scalar("loss/train_step", step_loss, global_step=epoch*num_batches + i)

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # epoch-level average of the per-step losses
    mean_loss = np.mean(running_losses)
    logger.add_scalar("loss/train", mean_loss, global_step=epoch)

# Make sure every pending event reaches disk once training is finished.
logger.flush()




RuntimeError: mat1 and mat2 shapes cannot be multiplied (16x3 and 1x3)

In [7]:
# Echo the current value bound to `loss` (notebook cell output).
loss

NameError: name 'loss' is not defined

In [6]:
# Inspect the layer that encode() applies to the stacked per-input embeddings.
print(model.linear_encoder)

Linear(in_features=1, out_features=3, bias=True)


In [None]:
# Plain (unmasked) MSE criterion. NOTE: this rebinds the name `loss`, which the
# training loop also uses for the per-step loss value.
loss = nn.MSELoss()

In [None]:
# Call a notebook-level decode() helper; its definition is not visible in this excerpt.
decode(model, embedding, memories)

[tensor([[[-2.7758e-02,  1.7303e-03,  1.6838e+00, -1.1760e+00,  1.5005e+00,
           -1.3603e+00,  1.0560e+00, -5.9297e-01, -1.4075e+00, -1.1499e+00,
            9.4779e-01,  1.0356e+00, -9.9712e-01,  6.9963e-01, -1.0088e-01,
            4.1064e-01, -5.2325e-01],
          [-5.6339e-01, -6.8779e-02,  6.0846e-01, -1.1481e+00,  6.5743e-01,
            1.6345e+00,  3.5085e-01, -2.3056e+00, -1.0169e+00,  1.3147e+00,
            7.5867e-01,  2.5464e-01, -5.2802e-01,  1.1613e+00, -1.1111e+00,
           -2.9937e-01,  3.0079e-01],
          [ 3.4619e-01,  1.8169e+00, -1.0665e+00,  4.0032e-01,  5.3362e-01,
            6.3245e-01,  8.5335e-01, -1.3895e+00,  3.8059e-01, -1.2566e+00,
           -8.7481e-01,  1.5971e-01, -5.6410e-01,  8.6297e-01, -1.6452e+00,
           -6.7243e-01,  1.4831e+00],
          [ 5.5835e-01,  1.9708e+00, -8.1471e-02, -9.1841e-01, -3.0379e-02,
            2.9579e-01, -1.9699e+00, -6.5844e-01, -2.8596e-01, -1.1177e+00,
            1.4016e+00, -4.0935e-01,  1.8460e+00, 

In [None]:
# Decode with a single memory wrapped in a one-element list.
reconstructed = model.decode(embedding, [memory])

AttributeError: 'list' object has no attribute 'is_nested'

In [None]:
# NOTE(review): the tensors are passed bare here, not wrapped in lists as in the
# other encode() calls — presumably an earlier version of encode(); confirm.
encode(model, batch[0].to(DEVICE), batch[1].to(DEVICE))

torch.Size([10, 17])
torch.Size([10, 17])


RuntimeError: running_mean should contain 17 elements not 170

In [None]:
# Minimal transformer encoder for a sanity check: d_model=1, nhead=1, one layer,
# with batch-first inputs.
encoder_layer = nn.TransformerEncoderLayer(1, 1, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, 1)

# NOTE(review): with a single feature the layer norm presumably normalises each
# value against itself, which would explain an all-zero output — confirm.
encoder(input)

tensor([[[0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.]]], grad_fn=<NativeLayerNormBackward0>)

In [None]:
# Index the price dataset with an (int, date) tuple; the first element is
# presumably an asset identifier — confirm against PriceDataset.
price_data[(1, dt.date(2022, 1, 1))]

  result = func(self.values, **kwargs)


tensor([[-1.0000e+10,  2.2513e-01, -1.0000e+10,  8.9167e-01,  9.5669e-01],
        [-1.0000e+10,  2.2591e-01, -1.0000e+10,  8.9041e-01,  9.5619e-01],
        [-1.0000e+10,  2.2721e-01, -1.0000e+10,  8.8854e-01,  9.5544e-01],
        ...,
        [-1.0000e+10,  2.7621e-01, -6.6862e+00,  8.8338e-01,  9.2140e-01],
        [-1.0000e+10,  2.7616e-01, -2.5919e+00,  8.8354e-01,  9.2150e-01],
        [-1.0000e+10,  2.7332e-01, -1.0000e+10,  8.9210e-01,  9.2715e-01]])