In [None]:
import numpy as np
import pandas as pd
import pathlib
import time
import matplotlib.pyplot as plt
from datetime import datetime

import torch
from torch import nn
from pytorch_lightning import seed_everything
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn import functional as F
import torchmetrics

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

In [None]:
# Seed the random number generators through pytorch_lightning with a fixed seed of 42;
# workers=True is passed so that dataloader worker processes are covered as well.
seed_everything(42, workers=True)

In [None]:
# Set path: directory from which MyDataModule reads the parquet files.
# NOTE(review): this is a hard-coded absolute path on one user's machine; the notebook
# will not run elsewhere without editing it. Consider a relative or configurable path.
path_data = pathlib.Path(r"C:\Users\Mathiass\OneDrive - Universität Zürich UZH\Documents\mt_literature")

In [None]:
def feature_engineer(data):
    """
    Turn raw option columns into scaled model features.

    Arguments:
    data: pandas.DataFrame that must contain the columns "best_bid",
        "best_offer", "gamma", "theta", "vega", "spotprice", "days_to_exp",
        "strike_price", "forwardprice", "cfadj", "days_no_trading" and
        "adj_spot". The frame passed in is NOT modified.

    Returns:
    A new pandas.DataFrame where
      * "best_bid"/"best_offer" are replaced by "ba_spread_option",
      * "strike_price" is replaced by "moneyness",
      * "gamma", "theta", "vega", "days_to_exp" and "forwardprice" are rescaled,
      * "cfadj", "days_no_trading", "spotprice" and "adj_spot" are dropped.
    All other columns are passed through unchanged.
    """
    # Work on a copy: the column assignments below would otherwise overwrite
    # "best_bid" (and nothing else) in the caller's frame as a hidden side effect.
    data = data.copy()

    # Bid-Ask spread: (Ask - Bid) / Ask
    data["best_bid"] = (data["best_offer"] - data["best_bid"]) / (data["best_offer"])
    data = data.rename(columns={"best_bid": "ba_spread_option"}).drop(["best_offer"], axis=1)

    # Gamma: multiply by spotprice and divide by 100
    data["gamma"] = data["gamma"] * data["spotprice"] / 100 #following Bali et al. (2021)

    # Theta: scale by spotprice
    data["theta"] = data["theta"] / data["spotprice"] #following Bali et al. (2021)

    # Vega: scale by spotprice
    data["vega"] = data["vega"] / data["spotprice"] #following Bali et al. (2021)

    # Time to Maturity: scale by number of days in year: 365
    data["days_to_exp"] = data["days_to_exp"] / 365

    # Moneyness: Strike / Spot (K / S)
    data["strike_price"] = data["strike_price"] / data["spotprice"] # K / S
    data = data.rename(columns={"strike_price": "moneyness"})

    # Forward Price ratio: Forward / Spot
    data["forwardprice"] = data["forwardprice"] / data["spotprice"]

    # Drop redundant/ unimportant columns
    data = data.drop(["cfadj", "days_no_trading", "spotprice", "adj_spot"], axis=1)

    return data


In [None]:
# binary y label function
def binary_categorize(y):
    """Map a value to a binary class label: 1 if it is strictly positive, otherwise 0."""
    return 1 if y > 0 else 0

In [None]:
# multiclass y label function
def multi_categorize(y):
    """Map a value to one of three class labels.

    Returns 0 for values below -0.05, 2 for values above 0.05 and 1 for
    everything else (both boundaries included).
    """
    if y < -0.05:
        return 0
    if y > 0.05:
        return 2
    return 1

In [None]:
# Scratch cell: build a small tensor and convert it to torch.DoubleTensor; not used by later cells.
torch.Tensor([1, 2, 3]).type(torch.DoubleTensor)

In [None]:
# Show the installed pytorch_lightning version (the Trainer/callback arguments below depend on it).
print(pl.__version__)

In [None]:
class MyDataModule(pl.LightningDataModule):
    """LightningDataModule that loads the option data and splits it by date.

    ``__init__`` reads the parquet file, runs ``feature_engineer`` on it and
    turns "option_ret" into class labels. ``setup`` creates the train /
    validation / test tensors and standardizes the features using the mean and
    standard deviation of the training split only.
    """

    def __init__(self,
                 dataset: str,
                 path,
                 batch_size: int,
                 start_val: str,
                 start_test: str,
                 label_fn: str,
                ):
        """
        Arguments:
        dataset: "small" or "big"; selects which parquet file is read.
        path: directory containing the parquet files (must support the ``/`` operator).
        batch_size: batch size used by all three dataloaders.
        start_val: first date of the validation split, compared against the "date" column.
        start_test: first date of the test split, compared against the "date" column.
        label_fn: "binary" or "multi"; selects how "option_ret" becomes a class label.

        Raises:
        ValueError: if ``dataset`` or ``label_fn`` is not one of the accepted values.
        """
        super().__init__()
        self.save_hyperparameters()
        self.batch_size = batch_size

        # read data from disk
        if dataset == "small":
            self.data = pd.read_parquet(path/"final_df_filledmean_small.parquet")
        elif dataset == "big":
            self.data = pd.read_parquet(path/"final_df_filledmean.parquet")
        else:
            raise ValueError("Specify dataset as either 'small' or 'big'")

        # feature engineer data
        self.data = feature_engineer(self.data)

        # create y and make it a classification target
        self.y = self.data["option_ret"]
        if label_fn == "binary":
            self.y = self.y.apply(binary_categorize)
        elif label_fn == "multi":
            self.y = self.y.apply(multi_categorize)
        else:
            raise ValueError("Specify label_fn as either 'binary' or 'multi'")

        # create X
        self.X = self.data.drop(["option_ret"], axis=1)

        # save dates (needed for the splits) and drop them from the features
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # to torch Tensor; X is standardized later in setup()
        self.X = torch.from_numpy(self.X.values).float()
        self.y = torch.from_numpy(self.y.values)

    def _select(self, mask):
        """Return (X, y) restricted to the rows where the boolean Series ``mask`` is True."""
        # Convert to a torch bool tensor so tensor indexing does not depend on
        # how torch interprets a pandas Series.
        idx = torch.tensor(mask.values, dtype=torch.bool)
        return self.X[idx], self.y[idx]

    def setup(self, stage: str = None):
        """Build the date-based splits and standardize features with training statistics."""
        train_mask = self.dates < self.hparams.start_val
        val_mask = (self.dates >= self.hparams.start_val) & (self.dates < self.hparams.start_test)
        test_mask = self.dates >= self.hparams.start_test

        # X and y are selected with the SAME mask. Slicing y by position (as
        # before) only matches the masked X when the rows are sorted by date.
        self.X_train, self.y_train = self._select(train_mask)
        self.X_val, self.y_val = self._select(val_mask)
        self.X_test, self.y_test = self._select(test_mask)

        n_split = len(self.X_train) + len(self.X_val) + len(self.X_test)
        assert n_split == len(self.data), "sum of samples of splits is not equal length of dataset"

        # statistics of the training split only (no look-ahead into val/test)
        mean = torch.mean(self.X_train, axis=0)
        std = torch.std(self.X_train, axis=0)
        # A feature that is constant in the training split has std == 0;
        # dividing by it would turn the whole column into NaN/inf.
        std = torch.where(std == 0, torch.ones_like(std), std)

        # Standardize X_train, X_val and X_test with mean/std from X_train
        self.X_train = (self.X_train - mean) / std
        self.X_val = (self.X_val - mean) / std
        self.X_test = (self.X_test - mean) / std

        print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
        print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
        print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
        print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
        for split, mask in (("train", train_mask), ("val", val_mask), ("test", test_mask)):
            split_dates = self.dates[mask]
            if split_dates.empty:
                # nothing to report; avoids an IndexError/NaT on an empty split
                print(f"{split} split is empty")
                continue
            # min/max instead of first/last row so the range is right for unsorted data too
            print(f"{split} start date: ", split_dates.min().strftime("%Y-%m-%d"),
                  f", {split} end date: ", split_dates.max().strftime("%Y-%m-%d"))

    def example(self):
        """Returns a random training example."""
        idx = np.random.randint(0, len(self.X_train))
        x, y = self.X_train[idx], self.y_train[idx]
        return (x, y)

    def _make_loader(self, X, y):
        """Wrap one split in a DataLoader with the settings shared by all splits."""
        dataset = TensorDataset(X, y)
        return DataLoader(dataset, batch_size=self.batch_size,
                         num_workers=4,
                         pin_memory=True,
                         )

    def train_dataloader(self):
        """DataLoader over the training split."""
        # NOTE(review): batches are served in stored order (no shuffle) - confirm this is intended.
        return self._make_loader(self.X_train, self.y_train)

    def val_dataloader(self):
        """DataLoader over the validation split."""
        return self._make_loader(self.X_val, self.y_val)

    def test_dataloader(self):
        """DataLoader over the test split."""
        return self._make_loader(self.X_test, self.y_test)

In [None]:
# Scratch cell: fill an uninitialised 3-element tensor in place via random_(2); not used by later cells.
torch.empty(3).random_(2)

In [None]:
class FFN(pl.LightningModule):
    """Feed-forward classifier with one hidden layer.

    Trained with cross-entropy, optionally weighted per class by the inverse
    class frequency of the training labels.
    """

    def __init__(self,
                num_classes,
                sample_weight,
                input_dim,
                hidden_dim,
                learning_rate,
                ):
        """
        Arguments:
        num_classes: number of output classes (size of the last layer).
        sample_weight: if truthy, weight the loss per class with
            (# training samples) / (# training samples of that class).
        input_dim: number of input features.
        hidden_dim: width of the hidden layer.
        learning_rate: learning rate passed to Adam.
        """
        super().__init__()
        self.save_hyperparameters() #init variables are saved, so that model can be reloaded cleanly if necessary

        #model
        self.l1 = nn.Linear(input_dim, hidden_dim)
        self.l2 = nn.Linear(hidden_dim, num_classes)

        #class weights for the loss
        if self.hparams.sample_weight:
            # NOTE(review): this reads the notebook-level variable `dm`, so the
            # datamodule cell must have been run before the model is created.
            train_idx = torch.tensor((dm.dates < dm.hparams.start_val).values, dtype=torch.bool)
            y_train = dm.y[train_idx].long()
            # bincount with minlength keeps one entry per class even if a class
            # never occurs in the training split; clamp avoids division by zero.
            counts = torch.bincount(y_train, minlength=num_classes).clamp(min=1)
            weight = len(y_train) / counts.float()
        else:
            weight = None
        # Registered as a buffer so it follows the module to whatever device the
        # Trainer uses (replaces the hard-coded .cuda(), which failed without a GPU).
        # Non-persistent: not written to / expected in checkpoints.
        self.register_buffer("weight", weight, persistent=False)
        print("sample_weight:", self.weight)
        if self.weight is not None:
            # guard: with sample_weight disabled there is no tensor to query
            print("device of sample_weight:", self.weight.device)
        print("device of class:", self.device)

        #metrics
        self.train_acc = torchmetrics.Accuracy()
        self.train_bal_acc = torchmetrics.Accuracy(num_classes=num_classes, average="macro") #should be equal to sklearn bal. acc.
        self.val_acc = torchmetrics.Accuracy()
        self.val_bal_acc= torchmetrics.Accuracy(num_classes=num_classes, average="macro")

    def forward(self, x):
        """Return the logits for a batch of feature vectors."""
        return self.l2(torch.relu(self.l1(x)))

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and accuracies for one batch; returns the loss."""
        x, y = batch
        y_hat = self(x) #logits

        loss = F.cross_entropy(y_hat, y, weight=self.weight)
        self.log("loss/loss", loss, on_step=True, on_epoch=False, prog_bar=True)

        self.train_acc(y_hat, y)
        self.log("accuracy/train", self.train_acc, on_step=False, on_epoch=True)

        self.train_bal_acc(y_hat, y)
        self.log("bal_accuracy/train", self.train_bal_acc, on_step=False, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss, accuracies and the mean predicted class for one batch."""
        x, y = batch
        y_hat = self(x) #logits

        # average predicted class index of the batch
        self.log("mean_pred", torch.mean(y_hat.argmax(dim=-1).float()).item(), prog_bar=True)

        loss = F.cross_entropy(y_hat, y, weight=self.weight)
        self.log("loss/val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

        self.val_acc(y_hat, y)
        self.log("accuracy/val", self.val_acc, on_step=False, on_epoch=True)

        self.val_bal_acc(y_hat, y)
        self.log("bal_accuracy/val", self.val_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return {"val_loss": loss}

    def on_train_start(self):
        """Remember when training started (for the total-time report)."""
        self.st_total = time.time()

    def on_train_epoch_start(self):
        """Remember start time and global step of the epoch (for the time-per-step log)."""
        self.st = time.time()
        self.steps = self.global_step

    def on_train_epoch_end(self):
        """Log the average wall-clock time per optimizer step of the finished epoch."""
        elapsed = time.time() - self.st
        steps_done = self.global_step - self.steps
        if steps_done > 0:
            # skip the log instead of dividing by zero when the epoch ran no step
            self.log("time/step", elapsed / steps_done)

    def on_train_end(self):
        """Print the total training time as HH:MM:SS."""
        elapsed = time.time() - self.st_total
        print(f"Total Training Time: {time.strftime('%H:%M:%S', time.gmtime(elapsed))}")

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch; returns the loss."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.weight)

        self.log("loss/test_loss", loss, prog_bar=True)
        return loss


In [None]:
# test = FFN(2, 15, 100, 0.01)

# test2 = MyDataModule(path_data, 512, "2015", "2016")

# test.hparams

# test.hparams

# string = ""
# for k, v in test2.hparams.items():
#     string += k
#     string += str(v)
#     string += "."

In [None]:
# Hyperparameters
# All tunable settings of the run live in this cell; the cells below only read them.

# datamodule params
PATH = path_data
BATCH_SIZE = 512
START_VAL = "2014"
START_TEST = "2016"
LABEL_FN = "binary"
DATASET = "small"

# model params
NUM_CLASSES = 2
INPUT_DIM = 15 # number of input features: 15 or 172
HIDDEN_DIM = 100 
LEARNING_RATE = 1e-4
SAMPLE_WEIGHT = True

# trainer params
MAX_EPOCHS = 100

# Checks
# Fail early if the settings above contradict each other:
# the label function fixes the number of classes ...
if (LABEL_FN == "multi"):
    assert NUM_CLASSES > 2, "number of classes must be bigger than 2 (LABEL_FN is 'multi')"
elif (LABEL_FN == "binary"):
    assert NUM_CLASSES == 2, "number of classes must be 2 (LABEL_FN is 'binary')"
# ... and the dataset fixes the expected number of input features.
# NOTE(review): INPUT_DIM is maintained by hand; it could be derived from the loaded data instead.
if (DATASET == "small"):
    assert INPUT_DIM == 15, "input dim should be 15 as DATASET='small'"
elif (DATASET == "big"):
    assert INPUT_DIM == 172, "input dim should be 172 as DATASET='big'"

In [None]:
# Create the datamodule from the settings above (this reads the parquet file from disk).
# The name `dm` is also read directly by FFN.__init__ and log_foldername.
dm = MyDataModule(
    dataset=DATASET,
    path=PATH, 
    batch_size=BATCH_SIZE, 
    start_val=START_VAL, 
    start_test=START_TEST,
    label_fn=LABEL_FN
)

In [None]:
# Create the model from the settings above.
# Must run after the `dm` cell: FFN.__init__ reads `dm` when sample_weight is enabled.
model = FFN(
    num_classes=NUM_CLASSES,
    sample_weight=SAMPLE_WEIGHT,
    input_dim=INPUT_DIM,
    hidden_dim=HIDDEN_DIM,
    learning_rate=LEARNING_RATE,
)

In [None]:
def log_foldername(to_add: dict = None, to_exclude: list = None):
    """Build a log folder name that encodes the hyperparameters of the run.

    The name is the concatenation of "<key><value>." for every entry of
    ``to_add``, then of ``model.hparams``, then of ``dm.hparams`` (in that
    order), skipping keys listed in ``to_exclude``.

    Arguments:
    to_add: extra key/value pairs to put in front; None (default) means none.
    to_exclude: keys to leave out of the name; None (default) means none.

    Returns:
    The folder name as a str.

    Reads the notebook-level variables ``model`` and ``dm``.
    """
    # None instead of mutable {} / [] defaults, which are shared between calls
    extra = {} if to_add is None else to_add
    skipped = () if to_exclude is None else to_exclude

    return "".join(
        key + str(value) + "."
        for params in (extra, model.hparams, dm.hparams)
        for key, value in params.items()
        if key not in skipped
    )

to_add = {"max_epochs": MAX_EPOCHS}
to_exclude = ["path"]

In [None]:
# Set logging directory
# Logs go to <log_dir>/<name>/<version>: `name` encodes the hyperparameters,
# `version` is the current timestamp (YYYYmmddHHMMSS).
log_dir = "logs"
name = log_foldername(to_add=to_add, to_exclude=to_exclude)
version = datetime.now().strftime("%Y%m%d%H%M%S")
# log_dir = os.path.join(log_dir, tag, datetime.now().strftime("%Y%m%d%H%M%S"))

In [None]:
# TensorBoard logger configured with the directory, name and version defined above.
logger = pl.loggers.TensorBoardLogger(
    save_dir= log_dir,
    name = name,
    version = version
)

In [None]:
# Early stopping on the logged metric "loss/val_loss" (lower is better) with a patience of 3.
early_stop_callback = EarlyStopping(monitor="loss/val_loss", mode="min", patience=3)

In [None]:
# Keep only the single best checkpoint, judged by the lowest "loss/val_loss".
# The filename template is written out by hand (auto_insert_metric_name=False)
# and references the logged metrics "loss/val_loss" and "bal_accuracy/val".
checkpoint_callback = ModelCheckpoint(
    monitor="loss/val_loss",
    save_top_k= 1,
    mode= "min",
    filename='epoch={epoch}-val_loss={loss/val_loss:.2f}-val_bacc={bal_accuracy/val:.2f}',
    auto_insert_metric_name=False,
)

In [None]:
# Trainer: at most MAX_EPOCHS epochs, validation every epoch, with the logger and
# the early-stopping / checkpoint callbacks defined above.
# NOTE(review): gpus=1 is hard-coded, so this cell presumably requires a GPU - confirm.
trainer = pl.Trainer(
    max_epochs=MAX_EPOCHS,
    gpus=1,
    logger=logger, #=logger or False
    check_val_every_n_epoch=1,
    callbacks=[early_stop_callback, checkpoint_callback], #early stop depends earliest after (patience*check_val_every_n_epoch)
    # enable_checkpointing = False,
    num_sanity_val_steps=2,
)

In [None]:
# Train the model and report the wall-clock time of the whole fit call.
s_time = time.time()
trainer.fit(model, datamodule=dm)
e_time = time.time()
# Split once into whole minutes and seconds and zero-pad both; formatting the
# float pieces with "2.0f" printed space-padded values and could show "60" seconds.
fit_minutes, fit_seconds = divmod(int(e_time - s_time), 60)
print(f"Time to fit: {fit_minutes:02d}:{fit_seconds:02d} min")

In [None]:
# import tensorboard as tb
# tb.notebook.list()

In [None]:
# %reload_ext tensorboard
# %tensorboard --logdir=logs/lightning_logs/ --port=6006

In [None]:
# from sklearn.preprocessing import StandardScaler

# dm.X_train.shape

# dm.X_train

# torch.mean(dm.X_train, axis=0)

# np.mean(dm.X_train.numpy(), axis=0)

# np.std(dm.X_train.numpy(), axis=0)

# np.mean(dm.X_train.numpy(), axis=0)

# torch.mean(dm.X_train, axis=0)

# np.std(dm.X_train.numpy(), axis=0)

# torch.std(dm.X_train, axis=0)

# (dm.X_val - torch.mean(dm.X_train, axis=0)) / torch.std(dm.X_train, axis=0)

# (dm.X_val.numpy() - np.mean(dm.X_train.numpy(), axis=0)) / np.std(dm.X_train.numpy(), axis=0)

# scaler = StandardScaler()

# scaler.fit_transform(dm.X_train.numpy())

# scaler.transform(dm.X_val.numpy())

In [None]:
# def standardize(x : torch.Tensor):
#     x = (x.numpy() - np.mean(x.numpy(), axis=0)) / np.std(x.numpy(), axis=0)
#     return x

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score #equal to torchmetrics.accuracy(average="macro")

In [None]:
# Switch the model to eval mode, then sum the predicted class indices on the validation split.
# With two classes (labels 0/1) this sum equals the number of class-1 predictions.
model.eval()
(model(dm.X_val).softmax(dim=1)).argmax(dim=1).sum()

In [None]:
# Cross-entropy on the validation split; no class weights are passed here,
# unlike the weighted loss used inside FFN when sample_weight is enabled.
F.cross_entropy(model(dm.X_val), dm.y_val)

In [None]:
# Validation accuracy computed with scikit-learn from the argmax predictions.
accuracy_score(dm.y_val.numpy(), model(dm.X_val).argmax(dim=1).detach().numpy())

In [None]:
# Validation balanced accuracy computed with scikit-learn from the argmax predictions.
balanced_accuracy_score(dm.y_val.numpy(), model(dm.X_val).argmax(dim=1).detach().numpy())

In [None]:
# Same validation accuracy via torchmetrics, for comparison with the scikit-learn value.
torchmetrics.functional.accuracy(model(dm.X_val), dm.y_val)

In [None]:
# Macro-averaged accuracy via torchmetrics, for comparison with scikit-learn's balanced accuracy.
torchmetrics.functional.accuracy(model(dm.X_val), dm.y_val, average="macro", num_classes=NUM_CLASSES)

In [None]:
# Confusion-matrix metric for NUM_CLASSES classes.
# NOTE(review): this import sits mid-notebook; it belongs in the import cell at the top.
from torchmetrics import ConfusionMatrix
confmat = ConfusionMatrix(num_classes=NUM_CLASSES)

In [None]:
# Confusion matrix of the model's predictions on the validation split.
confmat(model(dm.X_val), dm.y_val)

In [None]:
# Distinct validation labels and how often each occurs.
dm.y_val.unique(return_counts=True)

In [None]:
# Raw model outputs (logits) for the whole validation split.
model(dm.X_val)

In [None]:
# Validation labels as a NumPy array.
dm.y_val.numpy()

In [None]:
# Predicted class index per validation sample as a NumPy array.
model(dm.X_val).argmax(dim=1).detach().numpy()

In [None]:
# Toy example for checking the metrics: 10 random integer labels below 4.
target = torch.randint(4, (10, ))

In [None]:
# Display the toy labels.
target

In [None]:
# Toy scores: a 10 x 4 tensor of standard-normal values scaled by 3, one row per toy label.
preds = torch.randn(10, 4) * 3 

In [None]:
# Mean of all toy scores, computed with torch and returned as a Python number.
torch.mean(preds).item()


In [None]:
# Same mean computed with NumPy, for comparison with the torch value.
preds.numpy().mean()

In [None]:
# Display the toy scores.
preds

In [None]:
# Macro-averaged accuracy of the toy example via torchmetrics.
torchmetrics.functional.accuracy(preds, target, num_classes=4, average="macro")

In [None]:
# Predicted class per toy row (index of the largest score).
preds.argmax(dim=-1)

In [None]:
# Balanced accuracy of the toy example via scikit-learn, to compare with the torchmetrics macro accuracy.
balanced_accuracy_score(target, preds.argmax(dim=-1))

In [None]:
# Accuracy of the toy example via torchmetrics with default arguments, stored in `acc`.
acc = torchmetrics.functional.accuracy(preds, target)

In [None]:
# [{'loss/test_loss': 0.6668015718460083}]


In [None]:
# checkpoint_callback.best_model_path