# Train macro & fundamental-aware price models 
Pretraining with fundamental, macroeconomic, estimate and sharep price data to capture the data patterns.
Use embedded fundamental/macro/short-term information for return prediction


## 1. load in data

In [1]:
from data import FundamentalDataset, PriceDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import datetime as dt
import itertools
from utils import Defaults
from torch.utils.data import DataLoader, Dataset

DEFAULTS = Defaults
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


fund_data = FundamentalDataset()
fund_data_weekly = FundamentalDataset(freq="W")
price_data = PriceDataset()

def collate_fn(batch):
    data_ls, masks = [], []
    for data, mask in batch:
        data_ls.append(data)
        masks.append(mask)
    return (
        torch.stack(data_ls),
        torch.stack(masks)
    )

## 2. Train autoencoders as pre-training

### 2.1. Train encoders on fundamental data

In [2]:
fundamental_data_loader = DataLoader(
    fund_data,
    batch_size=5,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=5
)

In [3]:
batch = next(fundamental_data_loader.__iter__())

In [4]:
# instantiate fundamental model

from models.autoencoder import BaseAutoEncoder
model = BaseAutoEncoder(
    window_sizes=[10],
    encoding_dim=1, 
    num_transformer_layers=[1], 
    dims=[17],
    activation_func=F.tanh,
    nheads=[1],
    device=DEVICE)
model = model.to(DEVICE)



In [5]:
x_ = model.transformer_encoders[0].to(DEVICE)(batch[0].to(DEVICE))

In [6]:
model.linear_encoder_layers[0](x_)

tensor([[-0.4081],
        [-0.5553],
        [-0.6150],
        [-0.6810],
        [-0.5336]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [7]:
from typing import Sequence, Tuple, Optional
def encode(
        model, 
        inputs: Sequence[Tuple[torch.tensor, Optional[torch.tensor]]],
        padding_masks: Sequence[torch.tensor]
        ) -> Tuple[torch.tensor, Tuple[torch.tensor]]:
    """encode the inputs i"""
    embeddings, memories = [], []
    for input, mask, transformer_encoder, linear_encoder in zip(
        inputs, padding_masks, model.transformer_encoders, model.linear_encoder_layers):
        print(input.shape)
        x_ = transformer_encoder(input, src_key_padding_mask=mask)
        print(x_.shape)
        memories.append(x_)
        embedded = linear_encoder(x_)
        print(embedded.shape)
        embeddings.append(embedded)
    _embedding = torch.stack(embeddings, dim=0)
    embedding = model.linear_encoder(_embedding)
    embedding = model.tanh(embedding)
    return (embedding, memories)

In [11]:
embedding, memory = model.encode([batch[0].to(DEVICE)], [batch[1].to(DEVICE)])

In [13]:
reconstructed = model.decode(embedding, [memory])

TypeError: 'Tensor' object is not callable

In [49]:
encode(model, batch[0].to(DEVICE), batch[1].to(DEVICE))

torch.Size([10, 17])
torch.Size([10, 17])


RuntimeError: running_mean should contain 17 elements not 170

In [29]:
encoder_layer = nn.TransformerEncoderLayer(1, 1, batch_first=True)
encoder = nn.TransformerEncoder(encoder_layer, 1)

encoder(input)

tensor([[[0.],
         [0.],
         [0.]],

        [[0.],
         [0.],
         [0.]]], grad_fn=<NativeLayerNormBackward0>)

In [3]:
price_data[(1, dt.date(2022, 1, 1))]

  result = func(self.values, **kwargs)


tensor([[-1.0000e+10,  2.2513e-01, -1.0000e+10,  8.9167e-01,  9.5669e-01],
        [-1.0000e+10,  2.2591e-01, -1.0000e+10,  8.9041e-01,  9.5619e-01],
        [-1.0000e+10,  2.2721e-01, -1.0000e+10,  8.8854e-01,  9.5544e-01],
        ...,
        [-1.0000e+10,  2.7621e-01, -6.6862e+00,  8.8338e-01,  9.2140e-01],
        [-1.0000e+10,  2.7616e-01, -2.5919e+00,  8.8354e-01,  9.2150e-01],
        [-1.0000e+10,  2.7332e-01, -1.0000e+10,  8.9210e-01,  9.2715e-01]])