# Train macro & fundamental-aware price models 
Pretraining with fundamental, macroeconomic, estimate and sharep price data to capture the data patterns.
Use embedded fundamental/macro/short-term information for return prediction


## 1. Load in data

In [1]:
from data import FundamentalDataset, PriceDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import datetime as dt
import itertools
from utils import Defaults
from torch.utils.data import DataLoader, Dataset
from copy import deepcopy

DEFAULTS = Defaults
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



## 2. Train autoencoders as pre-training

### 2.1. Train encoders on fundamental data

In [5]:
from typing import Sequence

def expand_mask(mask: torch.tensor, target_dim: int) -> torch.tensor:
    """expand mask from n dimensions to n+1 dimensions"""
    newmask = deepcopy(mask).unsqueeze(-1)
    mask_dims = list(newmask.shape)
    mask_dims[-1] = target_dim
    mask_dims = tuple(mask_dims)
    return newmask.expand(mask_dims)

def expand_masks(masks: Sequence[torch.tensor], target_dims: Sequence[int]):
    expanded_masks = []
    for mask, dim in zip(masks, target_dims):
        newmask = expand_mask(mask, dim)
        expanded_masks.append(newmask)
    return expanded_masks

def masked_mse_loss(
        input: torch.tensor, 
        target: torch.tensor,
        mask: torch.tensor,
        na_pad: torch.tensor,
        ) -> torch.tensor:
    """custome MSE loss to mask padding & nan values
    :param input: original vector
    :param target: target vector
    :param 
    """
    loss = nn.MSELoss()
    dims = input.shape[-1]
    na_mask = input == na_pad
    expanded_mask = expand_mask(mask, dims)
    new_mask = na_mask.type(torch.bool) + expanded_mask.type(torch.bool)
    masked_input = torch.masked_select(input, ~new_mask) # mask itmodel is True if masked
    masked_target = torch.masked_select(target, ~new_mask)
    loss = loss(masked_input, masked_target)
    input_size = (masked_input == masked_input).sum()
    return loss / input_size # normalize input size

def composite_mseloss(mse_losses: Sequence[torch.tensor]):
    mean_loss = torch.stack(mse_losses).sum() / len(mse_losses)
    penalty_loss = torch.stack([(loss - mean_loss)**2 for loss in mse_losses]).sum()
    composite_loss = mean_loss + penalty_loss
    return composite_loss

def multiple_input_masked_mse_loss(
        inputs: Sequence[torch.tensor],
        targets: Sequence[torch.tensor],
        masks: Sequence[torch.tensor],
        na_pads: Sequence[torch.tensor]):
    losses = []
    for input, target, mask, na_pad in zip(
        inputs, targets, masks, na_pads):
        loss = masked_mse_loss(input, target, mask, na_pad)
        losses.append(loss)
    composite_loss = composite_mseloss(losses)
    return composite_loss


In [3]:
from typing import Tuple, Optional
def encode(
        model, 
        inputs: Sequence[Tuple[torch.tensor, Optional[torch.tensor]]],
        padding_masks: Sequence[torch.tensor]
        ) -> Tuple[torch.tensor, Tuple[torch.tensor]]:
    """encode a list of inputs of different lengths and dimensionalities
    into a single embedding vector
    """
    embeddings, memories = [], []
    for input, mask, transformer_encoder, linear_encoder in zip(
        inputs, padding_masks, model.transformer_encoders, model.linear_encoder_layers):
        x_ = transformer_encoder(input, src_key_padding_mask=mask)
        print(x_.shape)
        memories.append(x_)
        embedded = linear_encoder(x_)
        print(embedded.shape)
        embeddings.append(embedded)
    _embedding = torch.stack(embeddings, dim=-1)
    embedding = model.linear_encoder(_embedding)
    embedding = model.tanh(embedding)
    return (embedding, memories)

def decode(
        model, 
        embedding: torch.tensor, 
        memories: Sequence[torch.tensor]) -> Sequence[torch.tensor]:
    _embeddings = model.linear_decoder(embedding)
    reconstructed_xs = []
    for i in range(model.num_inputs):
        _output = model.linear_decoder_layers[i](_embeddings[:, :, i])
        output = model.transformer_decoders[i](_output, memory=memories[i])
        reconstructed_xs.append(output)
    return reconstructed_xs

In [7]:
from models.autoencoder import BaseAutoEncoder
from torch.utils.tensorboard import SummaryWriter

RUN = 5
LR = 1e-9
NUM_TRANSFORMER_LAYERS = 1
WINDOW_SIZE = 10
NHEADS = 1
ENCODING_DIM = 2
MAX_EPOCHS = 10
BATCH_SIZE = 128
PCT_NAN_THRES = 0.4

fund_data = FundamentalDataset(window_size=WINDOW_SIZE)
# fund_data_weekly = FundamentalDataset(freq="W")
price_data = PriceDataset()

def collate_fn(batch):
    data_ls, masks = [], []
    for data, mask in batch:
        data_ls.append(data)
        masks.append(mask)
    return (
        torch.stack(data_ls),
        torch.stack(masks)
    )

logger_stem = "./traininglog/fundamental_encoder/runs/"
logger = SummaryWriter(f"{logger_stem}run{RUN};lr={LR};notflayrs={NUM_TRANSFORMER_LAYERS};wd={WINDOW_SIZE};nh={NHEADS};edim={ENCODING_DIM};bsize={BATCH_SIZE}")

fundamental_data_loader = DataLoader(
    fund_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    num_workers=max(BATCH_SIZE, 8) # max 8 workers suggested
)

num_batches = len(fundamental_data_loader)

model = BaseAutoEncoder(
    window_sizes=[WINDOW_SIZE],
    encoding_dim=ENCODING_DIM, 
    num_transformer_layers=[NUM_TRANSFORMER_LAYERS], 
    dims=[17],
    activation_func=F.tanh,
    nheads=[NHEADS],
    device=DEVICE)
model = model.to(DEVICE)

optimizer = torch.optim.Adam(
    model.parameters(), lr=LR, betas=[0.9, 0.99], eps=1e-09)



for epoch in range(MAX_EPOCHS):
    running_losses = []
    for i, (input, mask) in enumerate(fundamental_data_loader):
        print(i)
        pct_nan = (input == DEFAULTS.padding_val).sum() / (input == input).sum().detach() 
        if pct_nan < PCT_NAN_THRES: # skip if a lot of nan
            # forward pass
            input, mask = input.to(DEVICE), mask.to(DEVICE)
            embedding, memories = model.encode([input], [mask])
            output = model.decode(embedding, memories)
            loss = multiple_input_masked_mse_loss([input], output, [mask], [DEFAULTS.padding_val])
            running_losses.append(loss.item())
            logger.add_scalar("loss/train_step", loss.item(), epoch*num_batches + i)

            # backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    mean_loss = np.mean(running_losses)
    logger.add_scalar("loss/train", mean_loss, epoch)




0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127


RuntimeError: Caught DBAPIError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1960, in _exec_single_context
    self.dialect.do_execute(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 924, in do_execute
    cursor.execute(statement, parameters)
pyodbc.Error: ('HY000', "[HY000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Resource ID : 1. The request limit for the database is 75 and has been reached. See 'https://docs.microsoft.com/azure/azure-sql/database/resource-limits-logical-server' for assistance. (10928) (SQLExecDirectW)")

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 51, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/mnt/batch/tasks/shared/LS_root/mounts/clusters/timeseries-t4/code/users/p.peng/stockencoder/data/__init__.py", line 364, in __getitem__
    slice = SQLDatabase.to_pandas(query).astype(float).ffill()
  File "/mnt/batch/tasks/shared/LS_root/mounts/clusters/timeseries-t4/code/users/p.peng/stockencoder/utils/database.py", line 59, in to_pandas
    return cls(**kwargs).query_to_pandas(query)
  File "/mnt/batch/tasks/shared/LS_root/mounts/clusters/timeseries-t4/code/users/p.peng/stockencoder/utils/database.py", line 54, in query_to_pandas
    df = pd.read_sql(sqlalchemy.text(query), con=conn)
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/pandas/io/sql.py", line 734, in read_sql
    return pandas_sql.read_query(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/pandas/io/sql.py", line 1836, in read_query
    result = self.execute(sql, params)
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/pandas/io/sql.py", line 1660, in execute
    return self.con.execute(sql, *args)
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1408, in execute
    return meth(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/sql/elements.py", line 513, in _execute_on_connection
    return connection._execute_clauseelement(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1630, in _execute_clauseelement
    ret = self._execute_context(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1839, in _execute_context
    return self._exec_single_context(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1979, in _exec_single_context
    self._handle_dbapi_exception(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 2335, in _handle_dbapi_exception
    raise sqlalchemy_exception.with_traceback(exc_info[2]) from e
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/base.py", line 1960, in _exec_single_context
    self.dialect.do_execute(
  File "/anaconda/envs/stockencoder_env/lib/python3.10/site-packages/sqlalchemy/engine/default.py", line 924, in do_execute
    cursor.execute(statement, parameters)
sqlalchemy.exc.DBAPIError: (pyodbc.Error) ('HY000', "[HY000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]Resource ID : 1. The request limit for the database is 75 and has been reached. See 'https://docs.microsoft.com/azure/azure-sql/database/resource-limits-logical-server' for assistance. (10928) (SQLExecDirectW)")
[SQL: SELECT operating_roic, normalized_roe, return_on_asset, return_com_eqy, ebit_margin, fcf_margin_after_oper_lea_pymt, gross_margin, eff_tax_rate, ebitda_margin, net_debt_to_shrhldr_eqty, fixed_charge_coverage_ratio, net_debt_to_ebitda, acct_rcv_days, cash_conversion_cycle, invent_days, net_income_growth, sales_rev_turn_growth FROM fundamental_data_stock_encoder WHERE (year between 2003 AND 2012) AND (figi = 'BBG000BDZJK6')]
(Background on this error at: https://sqlalche.me/e/20/dbapi)


In [None]:
price_data[(1, dt.date(2022, 1, 1))]

  result = func(self.values, **kwargs)


tensor([[-1.0000e+10,  2.2513e-01, -1.0000e+10,  8.9167e-01,  9.5669e-01],
        [-1.0000e+10,  2.2591e-01, -1.0000e+10,  8.9041e-01,  9.5619e-01],
        [-1.0000e+10,  2.2721e-01, -1.0000e+10,  8.8854e-01,  9.5544e-01],
        ...,
        [-1.0000e+10,  2.7621e-01, -6.6862e+00,  8.8338e-01,  9.2140e-01],
        [-1.0000e+10,  2.7616e-01, -2.5919e+00,  8.8354e-01,  9.2150e-01],
        [-1.0000e+10,  2.7332e-01, -1.0000e+10,  8.9210e-01,  9.2715e-01]])