In [1]:
import numpy as np
import pandas as pd
import pathlib
import time
import matplotlib.pyplot as plt

import torch
from torch import nn
from pytorch_lightning import seed_everything
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from torch.nn import functional as F

In [2]:
# Seed the random number generators through Lightning's helper so that runs are
# repeatable; workers=True asks Lightning to also seed dataloader workers.
seed_everything(42, workers=True)

Global seed set to 42


42

In [3]:
# Set path: directory that holds the parquet data file read by MyDataModule.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook
# only runs on this machine -- consider a path relative to a configurable data dir.
path = pathlib.Path(r"C:\Users\Mathiass\OneDrive - Universität Zürich UZH\Documents\mt_literature")

In [4]:
def feature_engineer(data):
    """
    Derive the model features from the raw option data.

    The input frame is NOT modified; a new DataFrame is returned.

    Arguments:
    data: pandas.DataFrame that must have the columns "best_bid", "best_offer",
        "gamma", "theta", "vega", "spotprice", "days_to_exp", "strike_price",
        "forwardprice", "cfadj", "days_no_trading" and "adj_spot". Any other
        column is passed through unchanged.

    Returns:
    pandas.DataFrame with the same row index and column order as the input, where
        - "best_bid" becomes "ba_spread_option" = (ask - bid) / ask,
        - "gamma" is multiplied by the spot price and divided by 100,
        - "theta" and "vega" are divided by the spot price,
        - "days_to_exp" is divided by 365,
        - "strike_price" becomes "moneyness" = strike / spot,
        - "forwardprice" is divided by the spot price,
        - "best_offer", "cfadj", "days_no_trading", "spotprice" and "adj_spot"
          are dropped.

    Raises:
    KeyError: if one of the required columns is missing.
    """
    spot = data["spotprice"]

    # ``assign`` builds a new frame, so the caller's DataFrame is left untouched
    # (assigning into ``data[...]`` directly would overwrite the caller's columns).
    # Columns that already exist keep their position, so feature order is preserved.
    engineered = data.assign(
        # Bid-Ask spread: (Ask - Bid) / Ask
        best_bid=(data["best_offer"] - data["best_bid"]) / data["best_offer"],
        # Gamma: multiply by spotprice and divide by 100, following Bali et al. (2021)
        gamma=data["gamma"] * spot / 100,
        # Theta: scale by spotprice, following Bali et al. (2021)
        theta=data["theta"] / spot,
        # Vega: scale by spotprice, following Bali et al. (2021)
        vega=data["vega"] / spot,
        # Time to Maturity: scale by number of days in year: 365
        days_to_exp=data["days_to_exp"] / 365,
        # Moneyness: Strike / Spot (K / S)
        strike_price=data["strike_price"] / spot,
        # Forward Price ratio: Forward / Spot
        forwardprice=data["forwardprice"] / spot,
    )

    engineered = engineered.rename(
        columns={"best_bid": "ba_spread_option", "strike_price": "moneyness"}
    )

    # Drop redundant/ unimportant columns
    return engineered.drop(
        ["best_offer", "cfadj", "days_no_trading", "spotprice", "adj_spot"], axis=1
    )


In [5]:
# binary y label function
def binary_categorize(y):
    """Map a return to a binary class: 1 if it is strictly positive, otherwise 0."""
    return 1 if y > 0 else 0

In [6]:
# multiclass y label function
def multi_categorize(y):
    """Map a return to one of three classes using a +/- 0.05 band.

    Returns 1 for y > 0.05, -1 for y < -0.05 and 0 for everything in between
    (both band edges included).
    """
    if y > 0.05:
        return 1
    if y < -0.05:
        return -1
    return 0

In [7]:
# Scratch check: cast a float tensor to double precision (float64).
torch.Tensor([1, 2, 3]).type(torch.DoubleTensor)

tensor([1., 2., 3.], dtype=torch.float64)

In [8]:
# Record the installed PyTorch Lightning version; the Trainer arguments used
# further down are written against this version.
print(pl.__version__)

1.6.4


In [9]:
class MyDataModule(pl.LightningDataModule):
    """Data module that loads the option data set and splits it by date.

    The parquet file is read and feature-engineered once in ``__init__``;
    ``setup`` then splits features and labels into train / validation / test
    sets based on the ``date`` column.

    Arguments:
    path: pathlib.Path of the directory containing
        "final_df_filledmean_small.parquet".
    batch_size: batch size used by all three dataloaders.
    start_val: first date (compared against the ``date`` column) that belongs
        to the validation set; everything before it is training data.
    start_test: first date that belongs to the test set.
    """

    def __init__(self, path = path, batch_size: int = 32, start_val="2015", start_test="2016"):
        super().__init__()
        self.save_hyperparameters()
        self.batch_size = batch_size

        # read data from disk
        self.data = pd.read_parquet(path/"final_df_filledmean_small.parquet")

        # feature engineer data
        self.data = feature_engineer(self.data)

        # create y
        self.y = self.data["option_ret"]
        # make classification problem
        self.y = self.y.apply(binary_categorize)
        # create X
        self.X = self.data.drop(["option_ret"], axis=1)

        # save dates and drop
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # to torch Tensor
        self.X = torch.from_numpy(self.X.values).float()
        self.y = torch.from_numpy(self.y.values)

    def _take(self, mask):
        """Return the (X, y) rows selected by a boolean pandas mask.

        Features and labels are selected with the SAME mask, so they stay
        aligned even if the rows are not sorted by date.
        """
        rows = torch.tensor(mask.to_numpy(), dtype=torch.bool)
        return self.X[rows], self.y[rows]

    def _date_span(self, mask):
        """Return (earliest, latest) date of the masked rows as "%Y-%m-%d" strings.

        Returns ("n/a", "n/a") for an empty selection instead of raising.
        """
        selected = self.dates[mask]
        if selected.empty:
            return "n/a", "n/a"
        return selected.min().strftime("%Y-%m-%d"), selected.max().strftime("%Y-%m-%d")

    def _loader(self, X, y):
        """Wrap a feature/label tensor pair in a DataLoader with the shared settings."""
        return DataLoader(TensorDataset(X, y), batch_size=self.batch_size,
                         num_workers=4,
                         pin_memory=True,
                         )

    def setup(self, stage: str = None):
        """Split X and y into train / validation / test sets by date and print a summary.

        Raises:
        AssertionError: if the three splits do not cover every row exactly once.
        """
        start_val = self.hparams.start_val
        start_test = self.hparams.start_test

        train_mask = self.dates < start_val
        val_mask = (self.dates >= start_val) & (self.dates < start_test)
        test_mask = self.dates >= start_test

        # Labels are selected with the same mask as the features. Slicing y by
        # position (y[:n_train], y[-n_test:]) is only correct for date-sorted data
        # and returns ALL labels for an empty test split (y[-0:]).
        self.X_train, self.y_train = self._take(train_mask)
        self.X_val, self.y_val = self._take(val_mask)
        self.X_test, self.y_test = self._take(test_mask)

        n_split = len(self.X_train) + len(self.X_val) + len(self.X_test)
        assert n_split == len(self.data), (
            "sum of samples of splits is not equal length of dataset"
        )

        train_first, train_last = self._date_span(train_mask)
        val_first, val_last = self._date_span(val_mask)
        test_first, test_last = self._date_span(test_mask)

        print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
        print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
        print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
        print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
        print("train start date: ", train_first, ", train end date: ", train_last)
        print("val start date: ", val_first, ", val end date: ", val_last)
        print("test start date: ", test_first, ", test end date: ", test_last)

    def example(self):
        """Returns a random training example as an (x, y) tuple (requires setup())."""
        idx = np.random.randint(0, len(self.X_train))
        x, y = self.X_train[idx], self.y_train[idx]
        return (x, y)

    def train_dataloader(self):
        """DataLoader over the training split.

        NOTE(review): no ``shuffle`` is passed, so batches are served in stored
        row order every epoch -- confirm that this is intended for training.
        """
        return self._loader(self.X_train, self.y_train)

    def val_dataloader(self):
        """DataLoader over the validation split."""
        return self._loader(self.X_val, self.y_val)

    def test_dataloader(self):
        """DataLoader over the test split."""
        return self._loader(self.X_test, self.y_test)

In [10]:
# test =MyDataModule()

In [11]:
# type(test.dates[test.dates < "1997"])

In [12]:
# test.setup()

In [13]:
# Scratch check: fill a 3-element tensor with random integers below 2 (0 or 1).
torch.empty(3).random_(2)

tensor([0., 1., 0.])

In [14]:
class FFN(pl.LightningModule):
    """Feed-forward classifier with one hidden ReLU layer and two output logits.

    Arguments:
    input_dim: number of input features.
    hidden_dim: width of the hidden layer.
    learning_rate: learning rate passed to the Adam optimizer.
    """

    def __init__(self,
                input_dim,
                hidden_dim,
                learning_rate,
                ):
        super().__init__()

        self.save_hyperparameters()
        self.l1 = nn.Linear(input_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        """Return the raw (unnormalized) class logits for a batch of features."""
        return self.l2(torch.relu(self.l1(x)))

    def _batch_loss(self, batch):
        """Cross-entropy loss of the model on one (x, y) batch; shared by all steps."""
        x, y = batch
        return F.cross_entropy(self(x), y)

    def training_step(self, batch, batch_idx):
        """Compute and log the per-step training loss."""
        loss = self._batch_loss(batch)

        self.log("loss/loss", loss, on_step=True, on_epoch=False, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss; logged once per epoch as "loss/val_loss"."""
        loss = self._batch_loss(batch)

        self.log("loss/val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
        return {"val_loss": loss}

    def on_train_start(self):
        """Remember when training started (used by on_train_end)."""
        self.st_total = time.time()

    def on_train_epoch_start(self):
        """Remember the start time and global step of the current epoch."""
        self.st = time.time()
        self.steps = self.global_step

    def on_train_epoch_end(self):
        """Log the average wall-clock seconds per optimizer step of this epoch."""
        elapsed = time.time() - self.st
        steps_done = self.global_step - self.steps
        # An epoch without any optimizer step would otherwise divide by zero.
        if steps_done > 0:
            self.log("time/step", elapsed / steps_done)

    def on_train_end(self):
        """Print the total training time as HH:MM:SS."""
        elapsed = time.time() - self.st_total
        print(f"Total Training Time: {time.strftime('%H:%M:%S', time.gmtime(elapsed))}")

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss."""
        loss = self._batch_loss(batch)

        self.log("loss/test_loss", loss, prog_bar=True)
        return loss

    @staticmethod
    def accuracy(y_hat=None, y=None):
        """Fraction of samples whose arg-max logit equals the target class.

        Arguments:
        y_hat: tensor of logits with one row per sample and one column per class.
        y: tensor of integer class targets, one per sample.

        Returns:
        A scalar float tensor with the accuracy, or None when called without
        arguments (the behaviour of the former empty placeholder, which lacked
        ``self`` and therefore could not be called on an instance at all).
        """
        if y_hat is None or y is None:
            return None
        return (y_hat.argmax(dim=1) == y).float().mean()

In [15]:
# Hyperparameters: every value a reader might tune for a run lives in this cell.

# Data params
PATH = path  # directory containing the parquet file
BATCH_SIZE = 512
START_VAL = "2015"  # first date of the validation split (earlier rows are training data)
START_TEST = "2016"  # first date of the test split

# model params
INPUT_DIM = 15 # number of input features; must match the column count of X built by MyDataModule
HIDDEN_DIM = 10  # width of the single hidden layer
LEARNING_RATE = 1e-4  # Adam learning rate

# trainer params
NUM_EPOCHS = 200  # passed to the Trainer as max_epochs

In [16]:
# Build the data module; the parquet file is read and feature-engineered here,
# inside MyDataModule.__init__, not later during trainer.fit.
dm = MyDataModule(
    path=PATH, 
    batch_size=BATCH_SIZE, 
    start_val=START_VAL, 
    start_test=START_TEST,
)

In [17]:
# Instantiate the feed-forward classifier with the hyperparameters defined above.
model = FFN(
    input_dim=INPUT_DIM,
    hidden_dim=HIDDEN_DIM,
    learning_rate=LEARNING_RATE,
)

In [18]:
# TensorBoard logger writing below the relative "logs" directory.
logger = pl.loggers.TensorBoardLogger(
    'logs'
)

In [19]:
# Keep only the single best checkpoint, judged by the lowest epoch-level
# validation loss logged by FFN.validation_step under "loss/val_loss".
checkpoint_callback = ModelCheckpoint(
    monitor="loss/val_loss",
    save_top_k= 1,
    mode= "min",
    # The template only references metrics that are actually logged. The former
    # "other_metric" placeholder was never logged by the model and therefore
    # always rendered as a meaningless "other_metric=0.00" in the file name.
    filename='epoch={epoch}-val_loss={loss/val_loss:.2f}',
    # The metric name contains "/", so names are written explicitly in the template.
    auto_insert_metric_name=False,
)

In [20]:
# Configure the trainer: validate after every epoch and checkpoint via the
# ModelCheckpoint callback defined above.
trainer = pl.Trainer(
    max_epochs=NUM_EPOCHS,
    # Use one GPU when CUDA is available and fall back to CPU (gpus=0) otherwise;
    # a hard-coded gpus=1 fails on machines without a CUDA device.
    gpus=int(torch.cuda.is_available()),
    logger=logger, #=logger or False
    check_val_every_n_epoch=1,
    callbacks=checkpoint_callback, #default is after each training epoch
    # enable_checkpointing = False,
    num_sanity_val_steps=2,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [21]:
# Train the model and report the wall-clock time of the whole fit call.
s_time = time.time()
trainer.fit(model, dm)
e_time = time.time()
# Round to whole seconds first and split once, so the seconds part is always a
# zero-padded value in 00-59 (formatting the float remainder with ":2.0f" pads
# with a space and can display "60").
fit_minutes, fit_seconds = divmod(int(round(e_time - s_time)), 60)
print(f"Time to fit: {fit_minutes:2d}:{fit_seconds:02d} min")

Missing logger folder: logs\lightning_logs


# of input data: 3823386 with shape: (3823386, 17)
# of training samples: 1947057 with X_train of shape: torch.Size([1947057, 15])
# of validation samples: 191260 with X_val of shape: torch.Size([191260, 15])
# of test samples: 1685069 with X_test of shape: torch.Size([1685069, 15])
train start date:  1996-01-31 , train end date:  2014-12-31
val start date:  2015-01-31 , val end date:  2015-12-31
test start date:  2016-01-31 , test end date:  2021-11-30


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params
--------------------------------
0 | l1   | Linear | 160   
1 | l2   | Linear | 22    
--------------------------------
182       Trainable params
0         Non-trainable params
182       Total params
0.001     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Total Training Time: 01:59:48
Time to fit: 120:25 min


In [24]:
# Inspect the validation features (available once trainer.fit has called dm.setup).
dm.X_val

tensor([[8.7596e-01, 1.3174e-01, 4.8000e+01,  ..., 7.8000e+00, 1.0000e+00,
         0.0000e+00],
        [9.6356e-01, 5.7143e-02, 3.2000e+01,  ..., 3.4000e+00, 1.0000e+00,
         0.0000e+00],
        [1.0512e+00, 1.0345e-01, 3.0000e+00,  ..., 4.1250e+00, 0.0000e+00,
         1.0000e+00],
        ...,
        [1.1842e+00, 3.7500e-01, 4.6000e+01,  ..., 3.2500e-01, 1.0000e+00,
         0.0000e+00],
        [9.2105e-01, 2.0000e-01, 3.0200e+02,  ..., 6.7500e-01, 0.0000e+00,
         1.0000e+00],
        [7.3801e-01, 2.0000e-01, 5.2000e+01,  ..., 4.5000e-01, 0.0000e+00,
         1.0000e+00]])

In [58]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

In [54]:
# Count how many validation samples the model assigns to class 1.
model.eval()
with torch.no_grad():  # inference only: do not build an autograd graph for the full split
    # argmax over the logits equals argmax over their softmax (softmax is monotonic)
    val_pred = model(dm.X_val).argmax(dim=1)
val_pred.sum()

tensor(249)

In [56]:
# Cross-entropy of the model on the full validation split, computed in one pass.
# NOTE(review): evaluated with autograd enabled (the result carries a grad_fn).
F.cross_entropy(model(dm.X_val), dm.y_val)

tensor(0.6684, grad_fn=<NllLossBackward0>)

In [67]:
# Plain accuracy of the arg-max predictions on the validation split.
accuracy_score(dm.y_val.numpy(), model(dm.X_val).argmax(dim=1).detach().numpy())

0.6151260064833212

In [68]:
# Balanced accuracy on the validation split. A value of about 0.5, together with
# the predictions being almost exclusively class 0, means the plain accuracy
# above mostly reflects the class frequencies rather than learned signal.
balanced_accuracy_score(dm.y_val.numpy(), model(dm.X_val).argmax(dim=1).detach().numpy())

0.5000026573887721

In [63]:
# Ground-truth validation labels as a NumPy array.
dm.y_val.numpy()

array([1, 1, 0, ..., 1, 1, 1], dtype=int64)

In [66]:
# Predicted validation classes (arg-max of the logits) as a NumPy array.
model(dm.X_val).argmax(dim=1).detach().numpy()

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
# [{'loss/test_loss': 0.6668015718460083}]


In [25]:
# checkpoint_callback.best_model_path

'logs\\lightning_logs\\version_0\\checkpoints\\epoch=167-val_loss=0.67-other_metric=0.00.ckpt'