In [1]:
import pandas as pd
import numpy as np
import torch

from torch import nn
from ISLP import load_data
from sklearn.preprocessing import StandardScaler
from torch.optim import RMSprop
from torch.utils.data import TensorDataset
from torchmetrics import R2Score
from torchinfo import summary
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning import seed_everything
from ISLP.torch import (SimpleDataModule,
                        SimpleModule,
                        ErrorTracker,
                        rec_num_workers)

In [2]:
# Single seed for the whole run; workers=True is forwarded to seed_everything
# so dataloader worker processes are covered by the same seeding call.
SEED = 0
seed_everything(SEED, workers=True)

# Request deterministic torch kernels. With warn_only=True, operations that
# have no deterministic implementation emit a warning instead of raising.
torch.use_deterministic_algorithms(True, warn_only=True)

Seed set to 0


In [3]:
# Load the NYSE data and standardize the three series used as predictors.
NYSE = load_data('NYSE')
cols = ['DJ_return', 'log_volume', 'log_volatility']

# Centre and scale each column, then wrap the array back into a DataFrame
# that shares the original row index.
scaler = StandardScaler(with_mean=True, with_std=True)
standardized = scaler.fit_transform(NYSE[cols])
X = pd.DataFrame(standardized, index=NYSE.index, columns=cols)

In [4]:
# Append lagged copies (lags 1 through 5) of every standardized series.
# Each new column is named "<series>_<lag>" and holds NaN in its first `lag`
# rows, where no earlier observation exists.
for lag in range(1, 6):
    for col in cols:
        newcol = X[col].shift(lag).to_numpy()
        X.insert(X.shape[1], f"{col}_{lag}", newcol)

# Carry the train/test indicator along so it is filtered together with the
# features when the incomplete leading rows are dropped.
X.insert(X.shape[1], 'train', NYSE['train'])
X = X.dropna()

In [5]:
# Response: the (unlagged) standardized log volume; `train` marks training rows.
Y = X['log_volume']
train = X['train']

# Keep only the lagged predictors: drop the indicator and the unlagged series.
X = X.drop(columns=[*cols, 'train'])
X.columns

Index(['DJ_return_1', 'log_volume_1', 'log_volatility_1', 'DJ_return_2',
       'log_volume_2', 'log_volatility_2', 'DJ_return_3', 'log_volume_3',
       'log_volatility_3', 'DJ_return_4', 'log_volume_4', 'log_volatility_4',
       'DJ_return_5', 'log_volume_5', 'log_volatility_5'],
      dtype='object')

In [6]:
# Reorder the columns so the oldest lag (5) comes first and the most recent
# lag (1) last, with the three series grouped together within each lag.
ordered_cols = []
for lag in reversed(range(1, 6)):
    ordered_cols.extend(f'{name}_{lag}' for name in cols)
X = X.reindex(columns=ordered_cols)
X.columns

Index(['DJ_return_5', 'log_volume_5', 'log_volatility_5', 'DJ_return_4',
       'log_volume_4', 'log_volatility_4', 'DJ_return_3', 'log_volume_3',
       'log_volatility_3', 'DJ_return_2', 'log_volume_2', 'log_volatility_2',
       'DJ_return_1', 'log_volume_1', 'log_volatility_1'],
      dtype='object')

In [7]:
# Fold the 15 lag-ordered columns of each row into a (5 lags, 3 series) block,
# giving an array of shape (observations, 5, 3).
n_lags, n_series = 5, 3
X_rnn = np.reshape(X.to_numpy(), (-1, n_lags, n_series))
X_rnn.shape

(6046, 5, 3)

In [8]:
class NYSEModel(nn.Module):
    """Single-hidden-layer regression network over flattened lag windows.

    Each input sample of shape ``(n_lags, n_features)`` is flattened, passed
    through a linear layer with ReLU and dropout, and mapped to one scalar.

    Parameters
    ----------
    n_lags : int, default 5
        Number of lagged time steps per sample.
    n_features : int, default 3
        Number of series observed at each lag.
    hidden_units : int, default 32
        Width of the hidden layer.
    dropout : float, default 0.5
        Dropout probability applied after the hidden activation.

    The defaults reproduce the original fixed architecture (15 -> 32 -> 1).
    """

    def __init__(self, n_lags=5, n_features=3, hidden_units=32, dropout=0.5):
        super().__init__()
        # Attribute name and layer order are kept so state_dict keys and
        # parameter-initialisation order match the fixed-size version.
        self._forward = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_lags * n_features, hidden_units),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_units, 1))

    def forward(self, x):
        """Return a 1-D tensor with one prediction per sample in ``x``."""
        # The final linear layer yields (batch, 1); flatten to (batch,) so the
        # output lines up with a 1-D target tensor.
        return torch.flatten(self._forward(x))

nyse_model = NYSEModel()

In [9]:
# Build one TensorDataset per split: training rows first, then the remaining
# (test) rows. Both inputs and targets are stored as float32 tensors.
datasets = []
for mask in [train, ~train]:
    X_rnn_t = torch.tensor(X_rnn[mask].astype(np.float32))
    # Convert the target to a NumPy array before handing it to torch, rather
    # than passing the pandas Series itself: torch then receives a plain
    # ndarray and never has to index into the Series.
    Y_t = torch.tensor(Y[mask].to_numpy(dtype=np.float32))
    datasets.append(TensorDataset(X_rnn_t, Y_t))
nyse_train, nyse_test = datasets

In [10]:
# Summarize the network on the test-split inputs. The tensor is read back from
# `nyse_test` instead of reusing `X_rnn_t`, a loop variable that only happens
# to still hold the test inputs after the dataset-building loop finishes.
summary(nyse_model, input_data=nyse_test.tensors[0],
        col_names=['input_size', 'output_size', 'num_params'])

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
NYSEModel                                [1770, 5, 3]              [1770]                    --
├─Sequential: 1-1                        [1770, 5, 3]              [1770, 1]                 --
│    └─Flatten: 2-1                      [1770, 5, 3]              [1770, 15]                --
│    └─Linear: 2-2                       [1770, 15]                [1770, 32]                512
│    └─ReLU: 2-3                         [1770, 32]                [1770, 32]                --
│    └─Dropout: 2-4                      [1770, 32]                [1770, 32]                --
│    └─Linear: 2-5                       [1770, 32]                [1770, 1]                 33
Total params: 545
Trainable params: 545
Non-trainable params: 0
Total mult-adds (M): 0.96
Input size (MB): 0.11
Forward/backward pass size (MB): 0.47
Params size (MB): 0.00
Estimated Total Size (MB): 0.58

In [11]:
# Use at most 4 dataloader workers, but never more than the number
# recommended for this machine by rec_num_workers().
max_num_workers = rec_num_workers()
nyse_dm = SimpleDataModule(nyse_train, nyse_test,
                           num_workers=min(4, max_num_workers),
                           validation=nyse_test, batch_size=64)

In [12]:
# Sanity check on the first three training batches: the model output should
# have the same size as the target batch.
for idx, (x, y) in zip(range(3), nyse_dm.train_dataloader()):
    out = nyse_model(x)
    print(y.size(), out.size())

torch.Size([64]) torch.Size([64])
torch.Size([64]) torch.Size([64])
torch.Size([64]) torch.Size([64])


In [13]:
# Optimize the network with RMSprop (learning rate 0.001) inside a regression
# module, reporting R^2 under the metric name 'r2'.
nyse_optimizer = RMSprop(nyse_model.parameters(), lr=0.001)
nyse_metrics = {'r2': R2Score()}
nyse_module = SimpleModule.regression(
    nyse_model,
    optimizer=nyse_optimizer,
    metrics=nyse_metrics,
)

In [14]:
# Fit for 20 epochs with the ErrorTracker callback attached, then evaluate on
# the data module's test split. The test call stays last so that its result
# is what the cell displays.
nyse_callbacks = [ErrorTracker()]
nyse_trainer = Trainer(
    deterministic=True,
    max_epochs=20,
    callbacks=nyse_callbacks,
)
nyse_trainer.fit(nyse_module, datamodule=nyse_dm)
nyse_trainer.test(nyse_module, datamodule=nyse_dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | NYSEModel | 545   
1 | loss  | MSELoss   | 0     
------------------------------------
545       Trainable params
0         Non-trainable params
545       Total params
0.002     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 67/67 [00:06<00:00,  9.70it/s, v_num=2]         

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 67/67 [00:06<00:00,  9.68it/s, v_num=2]
Testing DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 147.57it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.6067941188812256
         test_r2            0.42412275075912476
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.6067941188812256, 'test_r2': 0.42412275075912476}]