In [31]:
import pandas as pd
import numpy as np
import torch

from torch import nn
from ISLP import load_data
from sklearn.preprocessing import StandardScaler
from torch.optim import RMSprop
from torch.utils.data import TensorDataset
from torchmetrics import R2Score
from torchinfo import summary
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning import seed_everything
from ISLP.torch import (SimpleDataModule,
                        SimpleModule,
                        ErrorTracker,
                        rec_num_workers)

In [32]:
seed_everything(0, workers=True)
torch.use_deterministic_algorithms(True, warn_only=True)

Seed set to 0


In [33]:
NYSE = load_data('NYSE')
cols = ['DJ_return', 'log_volume', 'log_volatility']
X = pd.DataFrame(
    StandardScaler(with_mean=True, with_std=True).fit_transform(NYSE[cols]),
    columns=NYSE[cols].columns, index=NYSE.index)

In [34]:
for lag in range(1, 6):
    for col in cols:
        newcol = np.zeros(X.shape[0]) * np.nan
        newcol[lag:] = X[col].values[:-lag]
        X.insert(len(X.columns), "{0}_{1}".format(col, lag), newcol)
X.insert(len(X.columns), 'train', NYSE['train'])
X = X.dropna()

In [35]:
Y, train = X['log_volume'], X['train']
X = X.drop(columns=['train'] + cols)
X.columns

Index(['DJ_return_1', 'log_volume_1', 'log_volatility_1', 'DJ_return_2',
       'log_volume_2', 'log_volatility_2', 'DJ_return_3', 'log_volume_3',
       'log_volatility_3', 'DJ_return_4', 'log_volume_4', 'log_volatility_4',
       'DJ_return_5', 'log_volume_5', 'log_volatility_5'],
      dtype='object')

In [36]:
week_day_df_list = []
week_day_df = pd.get_dummies(NYSE['day_of_week'])
for i in range(1, 6):
    week_day_df_tmp = week_day_df.copy()
    for col in week_day_df_tmp.columns:
        week_day_df_tmp = week_day_df_tmp.rename(columns={col: col + f"_{i}"})
    X = pd.merge(X, week_day_df_tmp, on='date')

display(X)

Unnamed: 0_level_0,DJ_return_1,log_volume_1,log_volatility_1,DJ_return_2,log_volume_2,log_volatility_2,DJ_return_3,log_volume_3,log_volatility_3,DJ_return_4,...,fri_4,mon_4,thur_4,tues_4,wed_4,fri_5,mon_5,thur_5,tues_5,wed_5
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1962-12-10,0.046340,0.224779,-2.500970,-0.431397,0.935176,-2.366521,0.434813,2.283789,-2.418037,0.905200,...,0,1,0,0,0,0,1,0,0,0
1962-12-11,-1.304126,0.605918,-1.366028,0.046340,0.224779,-2.500970,-0.431397,0.935176,-2.366521,0.434813,...,0,0,0,1,0,0,0,0,1,0
1962-12-12,-0.006294,-0.013661,-1.505667,-1.304126,0.605918,-1.366028,0.046340,0.224779,-2.500970,-0.431397,...,0,0,0,0,1,0,0,0,0,1
1962-12-13,0.377081,0.042552,-1.551515,-0.006294,-0.013661,-1.505667,-1.304126,0.605918,-1.366028,0.046340,...,0,0,1,0,0,0,0,1,0,0
1962-12-14,-0.411718,-0.419836,-1.597607,0.377081,0.042552,-1.551515,-0.006294,-0.013661,-1.505667,-1.304126,...,1,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1986-12-24,-0.750046,1.964846,0.080250,-0.185178,1.602669,0.128004,0.968266,4.258402,0.263406,-0.360744,...,0,0,0,0,1,0,0,0,0,1
1986-12-26,0.751210,-0.974763,0.046886,-0.750046,1.964846,0.080250,-0.185178,1.602669,0.128004,0.968266,...,1,0,0,0,0,1,0,0,0,0
1986-12-29,0.195352,-5.623814,-0.083983,0.751210,-0.974763,0.046886,-0.750046,1.964846,0.080250,-0.185178,...,0,1,0,0,0,0,1,0,0,0
1986-12-30,-1.148951,-1.553082,0.019967,0.195352,-5.623814,-0.083983,0.751210,-0.974763,0.046886,-0.750046,...,0,0,0,1,0,0,0,0,1,0


In [37]:
ordered_cols = []
cols = ['DJ_return', 'log_volume', 'log_volatility', 'fri', 'mon', 'thur',
        'tues', 'wed']
for lag in range(5, 0, -1):
    for col in cols:
        ordered_cols.append('{0}_{1}'.format(col, lag))
X_day = X.reindex(columns=ordered_cols)
X.columns

Index(['DJ_return_1', 'log_volume_1', 'log_volatility_1', 'DJ_return_2',
       'log_volume_2', 'log_volatility_2', 'DJ_return_3', 'log_volume_3',
       'log_volatility_3', 'DJ_return_4', 'log_volume_4', 'log_volatility_4',
       'DJ_return_5', 'log_volume_5', 'log_volatility_5', 'fri_1', 'mon_1',
       'thur_1', 'tues_1', 'wed_1', 'fri_2', 'mon_2', 'thur_2', 'tues_2',
       'wed_2', 'fri_3', 'mon_3', 'thur_3', 'tues_3', 'wed_3', 'fri_4',
       'mon_4', 'thur_4', 'tues_4', 'wed_4', 'fri_5', 'mon_5', 'thur_5',
       'tues_5', 'wed_5'],
      dtype='object')

In [39]:
X_rnn = X_day.to_numpy().reshape((-1, 5, 8))
X_rnn.shape

(6046, 5, 8)

In [46]:
class NYSEModel(nn.Module):
    def __init__(self):
        super(NYSEModel, self).__init__()
        self.rnn = nn.RNN(8, 12, batch_first=True)
        self.dense = nn.Linear(12, 1)
        self.dropout = nn.Dropout(0.1)
    def forward(self, x):
        val, h_n = self.rnn(x)
        val = self.dense(self.dropout(val[:,-1]))
        return torch.flatten(val)
    
nyse_model = NYSEModel()


In [47]:
datasets = []
for mask in [train, ~train]:
    X_rnn_t = torch.tensor(X_rnn[mask].astype(np.float32))
    Y_t = torch.tensor(Y[mask].astype(np.float32))
    datasets.append(TensorDataset(X_rnn_t, Y_t))
nyse_train, nyse_test = datasets

In [48]:
summary(nyse_model, input_data=X_rnn_t,
        col_names=['input_size', 'output_size', 'num_params'])

Layer (type:depth-idx)                   Input Shape               Output Shape              Param #
NYSEModel                                [1770, 5, 8]              [1770]                    --
├─RNN: 1-1                               [1770, 5, 8]              [1770, 5, 12]             264
├─Dropout: 1-2                           [1770, 12]                [1770, 12]                --
├─Linear: 1-3                            [1770, 12]                [1770, 1]                 13
Total params: 277
Trainable params: 277
Non-trainable params: 0
Total mult-adds (M): 2.36
Input size (MB): 0.28
Forward/backward pass size (MB): 0.86
Params size (MB): 0.00
Estimated Total Size (MB): 1.15

In [49]:
nyse_dm = SimpleDataModule(nyse_train, nyse_test, num_workers=4,
                           validation=nyse_test, batch_size=64)

In [50]:
for idx, (x, y) in enumerate(nyse_dm.train_dataloader()):
    out = nyse_model(x)
    print(y.size(), out.size())
    if idx >= 2:
        break

torch.Size([64]) torch.Size([64])
torch.Size([64]) torch.Size([64])
torch.Size([64]) torch.Size([64])


In [51]:
nyse_optimizer = RMSprop(nyse_model.parameters(), lr=0.001)
nyse_module = SimpleModule.regression(nyse_model, optimizer=nyse_optimizer,
                                      metrics={'r2': R2Score()})

In [52]:
nyse_trainer = Trainer(deterministic=True, max_epochs=200,
                       callbacks=[ErrorTracker()])
nyse_trainer.fit(nyse_module, datamodule=nyse_dm)
nyse_trainer.test(nyse_module, datamodule=nyse_dm)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name  | Type      | Params
------------------------------------
0 | model | NYSEModel | 277   
1 | loss  | MSELoss   | 0     
------------------------------------
277       Trainable params
0         Non-trainable params
277       Total params
0.001     Total estimated model params size (MB)


Epoch 199: 100%|██████████| 67/67 [00:04<00:00, 13.49it/s, v_num=4]        

`Trainer.fit` stopped: `max_epochs=200` reached.


Epoch 199: 100%|██████████| 67/67 [00:05<00:00, 13.34it/s, v_num=4]
Testing DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 35.77it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.5719428658485413
         test_r2            0.4571983218193054
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_loss': 0.5719428658485413, 'test_r2': 0.4571983218193054}]