In [None]:
import os
import numpy as np
import pandas as pd
import pathlib
import time
import matplotlib.pyplot as plt
from datetime import datetime

import torch
from torch import nn
from pytorch_lightning import seed_everything
from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn import functional as F
import torchmetrics

from pytorch_lightning.callbacks.early_stopping import EarlyStopping

from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score #equal to torchmetrics.accuracy(average="macro")

from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter

In [None]:
# Show the current working directory of the kernel.
os.getcwd()

In [None]:
# in tune: will seed hyperparam search space
# Seeds the global random number generators with 42 before anything else runs.
seed_everything(42, workers=True)

In [None]:
# NOTE(review): bare string literal -- evaluating it only echoes the path as cell output,
# it does not assign anything.
r"C:\Users\Mathiass\OneDrive - Universität Zürich UZH\Documents\mt_literature\data"

In [None]:
# Set path: use the folder given in the MT_DATA_DIR environment variable when it is set,
# otherwise fall back to the original local data folder.
path_data = pathlib.Path(os.environ.get(
    "MT_DATA_DIR",
    r"C:\Users\Mathiass\OneDrive - Universität Zürich UZH\Documents\mt_literature\data",
))

In [None]:
def feature_engineer(data):
    """
    Derive the scaled option features and drop the columns that are no longer needed.

    The input frame is not modified; a new DataFrame is returned.

    Arguments:
    data: pandas.DataFrame that must contain the columns "best_bid", "best_offer",
        "gamma", "theta", "vega", "days_to_exp", "strike_price", "forwardprice",
        "spotprice", "cfadj", "days_no_trading" and "adj_spot". All other columns
        are passed through unchanged.

    Returns:
    pandas.DataFrame with the same column order as the input, where
        - "best_bid" is replaced by "ba_spread_option" = (Ask - Bid) / Ask,
        - "gamma" is multiplied by the spot price and divided by 100,
        - "theta", "vega" and "forwardprice" are divided by the spot price,
        - "days_to_exp" is divided by 365,
        - "strike_price" is replaced by "moneyness" = Strike / Spot,
        - "best_offer", "cfadj", "days_no_trading", "spotprice" and "adj_spot"
          are dropped.

    Raises:
    KeyError: if one of the required columns is missing.
    """
    spot = data["spotprice"]

    # `assign` builds a new frame, so the caller's DataFrame keeps its original values
    # and calling this function twice on the same input gives the same result.
    engineered = data.assign(
        # Bid-Ask spread: (Ask - Bid) / Ask
        best_bid=(data["best_offer"] - data["best_bid"]) / (data["best_offer"]),
        # Gamma: multiply by spotprice and divide by 100, following Bali et al. (2021)
        gamma=data["gamma"] * spot / 100,
        # Theta: scale by spotprice, following Bali et al. (2021)
        theta=data["theta"] / spot,
        # Vega: scale by spotprice, following Bali et al. (2021)
        vega=data["vega"] / spot,
        # Time to Maturity: scale by number of days in year: 365
        days_to_exp=data["days_to_exp"] / 365,
        # Moneyness: Strike / Spot (K / S)
        strike_price=data["strike_price"] / spot,
        # Forward Price ratio: Forward / Spot
        forwardprice=data["forwardprice"] / spot,
    )

    engineered = engineered.rename(
        columns={"best_bid": "ba_spread_option", "strike_price": "moneyness"}
    )

    # Drop redundant/ unimportant columns
    return engineered.drop(
        columns=["best_offer", "cfadj", "days_no_trading", "spotprice", "adj_spot"]
    )


In [None]:
# binary y label function
def binary_categorize(y):
    """Return class 1 when `y` is strictly positive, otherwise class 0."""
    return 1 if y > 0 else 0

In [None]:
# multiclass y label function
def multi_categorize(y):
    """Return class 2 for `y` above 0.05, class 0 for `y` below -0.05, otherwise class 1."""
    if y > 0.05:
        return 2
    if y < -0.05:
        return 0
    # everything in [-0.05, 0.05] (and anything that fails both comparisons)
    return 1

In [None]:
class MyDataModule(pl.LightningDataModule):
    """Load the option dataset from parquet and expose date-based train/val/test splits.

    Rows dated before `start_val` form the training set, rows from `start_val`
    up to (excluding) `start_test` the validation set, and rows from
    `start_test` onwards the test set. Features are standardized with the mean
    and standard deviation of the training set.

    Arguments:
    path: pathlib.Path of the folder that contains the parquet files.
    dataset: "small" or "big"; selects which parquet file is read.
    start_val: first date (e.g. "2014") that belongs to the validation set.
    start_test: first date that belongs to the test set.
    label_fn: "binary" or "multi"; selects how "option_ret" is turned into class labels.
    config: dict that must contain the key "batch_size".

    Raises:
    ValueError: if `dataset` or `label_fn` has an unsupported value.
    """

    def __init__(self,
                 path,
                 dataset: str,
                 start_val: str,
                 start_test: str,
                 label_fn: str,
                 config: dict,
        ):
        super().__init__()
        self.save_hyperparameters(ignore=["path"])

        # the batch size is part of the tuned config, not a separate argument
        self.batch_size = config["batch_size"]

        # read data from disk
        if dataset == "small":
            self.data = pd.read_parquet(path/"final_df_filledmean_small.parquet")
        elif dataset == "big":
            self.data = pd.read_parquet(path/"final_df_filledmean.parquet")
        else:
            raise ValueError("Specify dataset as either 'small' or 'big'")

        # feature engineer data
        self.data = feature_engineer(self.data)

        # create y and turn it into a classification target
        self.y = self.data["option_ret"]
        if label_fn == "binary":
            self.y = self.y.apply(binary_categorize)
        elif label_fn == "multi":
            self.y = self.y.apply(multi_categorize)
        else:
            raise ValueError("Specify label_fn as either 'binary' or 'multi'")

        # create X
        self.X = self.data.drop(["option_ret"], axis=1)

        # save dates (needed for the splits in setup) and drop them from the features
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # to torch Tensor; standardization happens in setup because it needs the train split
        self.X = torch.from_numpy(self.X.values).float()
        self.y = torch.from_numpy(self.y.values)

    def _split(self, mask):
        """Return the rows of X and y selected by a boolean pandas mask.

        X and y are indexed with the same mask, so features and labels stay
        aligned even when the rows are not sorted by date or a split is empty.
        """
        rows = torch.from_numpy(mask.to_numpy())
        return self.X[rows], self.y[rows]

    def _make_loader(self, X, y):
        """Wrap a feature/label pair in a DataLoader with the configured batch size."""
        return DataLoader(TensorDataset(X, y), batch_size=self.batch_size,
                         num_workers=4,
                         pin_memory=True,
                         )

    def setup(self, stage: str = None):
        """Create the three splits, standardize the features and derive model inputs.

        Sets X_train/y_train, X_val/y_val, X_test/y_test as well as `input_dim`,
        `num_classes` and `class_weights`, and prints a summary of the splits.
        """
        # date masks of the three consecutive periods
        in_train = self.dates < self.hparams.start_val
        in_val = (self.dates >= self.hparams.start_val) & (self.dates < self.hparams.start_test)
        in_test = self.dates >= self.hparams.start_test

        self.X_train, self.y_train = self._split(in_train)
        self.X_val, self.y_val = self._split(in_val)
        self.X_test, self.y_test = self._split(in_test)

        assert (len(self.X_train) + len(self.X_val) + len(self.X_test) == len(self.data)), "sum of samples of splits\
        is not equal length of dataset"

        # statistics of X_train only, so no information leaks from val/test
        mean = torch.mean(self.X_train, dim=0)
        std = torch.std(self.X_train, dim=0)
        # a constant training feature has std 0; divide by 1 instead of producing NaN/inf
        std = torch.where(std == 0, torch.ones_like(std), std)

        # Standardize X_train, X_val and X_test with mean/std from X_train
        self.X_train = (self.X_train - mean) / std
        self.X_val = (self.X_val - mean) / std
        self.X_test = (self.X_test - mean) / std

        # Save variables to pass to model class
        # input dim
        self.input_dim = self.X_train.shape[1]
        # number of classes
        self.num_classes = len(self.y_train.unique())
        # class weights: number of training samples divided by the count of each class
        self.class_weights = len(self.y_train) / self.y_train.unique(return_counts=True)[1]

        print("class_weights:", self.class_weights)
        print("device of class_weights:", self.class_weights.device)
        print("---")
        print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
        print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
        print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
        print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
        # the strftime calls rely on the "date" column holding datetime-like values
        print("train start date: ", self.dates[in_train].iloc[0].strftime("%Y-%m-%d"),
              ", train end date: ", self.dates[in_train].iloc[-1].strftime("%Y-%m-%d"))
        print("val start date: ", self.dates[in_val].iloc[0].strftime("%Y-%m-%d"),
              ", val end date: ", self.dates[in_val].iloc[-1].strftime("%Y-%m-%d"))
        print("test start date: ", self.dates[in_test].iloc[0].strftime("%Y-%m-%d"),
              ", test end date: ", self.dates[in_test].iloc[-1].strftime("%Y-%m-%d"))
        print("---")

    def example(self):
        """Returns a random training example."""
        idx = np.random.randint(0, len(self.X_train))
        x, y = self.X_train[idx], self.y_train[idx]
        return (x, y)

    def train_dataloader(self):
        """DataLoader over the training split (not shuffled)."""
        return self._make_loader(self.X_train, self.y_train)

    def val_dataloader(self):
        """DataLoader over the validation split."""
        return self._make_loader(self.X_val, self.y_val)

    def test_dataloader(self):
        """DataLoader over the test split."""
        return self._make_loader(self.X_test, self.y_test)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the DataModule command-line arguments to `parent_parser` and return it."""
        parser = parent_parser.add_argument_group("DataModule")
        parser.add_argument("--dataset", type=str, default="small")
        # parser.add_argument("--path", type=str, help="path to folder that contains the data")
        parser.add_argument("--batch_size", type=int, default=512)
        parser.add_argument("--start_val", type=str, default="2014")
        parser.add_argument("--start_test", type=str, default="2016")
        parser.add_argument("--label_fn", type=str, default="binary")

        return parent_parser


In [None]:
class FFN(pl.LightningModule):
    """Feed-forward classifier trained with (optionally class-weighted) cross-entropy.

    Architecture: Linear(input_dim, hidden_dim) + ReLU, followed by three
    Linear(hidden_dim, hidden_dim) + ReLU + Dropout blocks and a final
    Linear(hidden_dim, num_classes) layer that returns logits.

    Arguments:
    input_dim: number of input features.
    num_classes: number of output classes.
    class_weights: tensor with one weight per class; only used when
        `no_class_weights` is False.
    no_class_weights: if True, the loss is not weighted.
    config: dict with the keys "hidden_dim" and "lr", or None to use the
        explicit `hidden_dim` / `learning_rate` arguments instead.
    hidden_dim: width of the hidden layers; only read when `config` is None.
    learning_rate: Adam learning rate; only read when `config` is None.

    Raises:
    ValueError: if `config` is None and `hidden_dim` or `learning_rate` is
        missing, or if class weighting is requested without `class_weights`.
    """

    def __init__(self,
                input_dim: int,
                num_classes: int,
                class_weights: torch.Tensor,
                no_class_weights: bool,
                config: dict,
                hidden_dim: int = None,
                learning_rate: float = None,
        ):
        super().__init__()
        # Init variables are saved, so that model can be reloaded cleanly if necessary
        self.save_hyperparameters()

        if config is not None:
            self.hidden_dim = config["hidden_dim"]
            self.learning_rate = config["lr"]
        else:
            # previously this branch read undefined names and failed with NameError
            if hidden_dim is None or learning_rate is None:
                raise ValueError(
                    "Pass either config with 'hidden_dim' and 'lr', or both hidden_dim and learning_rate"
                )
            self.hidden_dim = hidden_dim
            self.learning_rate = learning_rate

        # three identical hidden blocks
        middle_layer = []
        for _ in range(3):
            middle_layer.append(nn.Linear(self.hidden_dim, self.hidden_dim))
            middle_layer.append(nn.ReLU(inplace=True))
            middle_layer.append(nn.Dropout())

        #model
        self.first = nn.Sequential(nn.Linear(input_dim, self.hidden_dim), nn.ReLU())
        self.middle = nn.Sequential(*middle_layer)
        self.last = nn.Linear(self.hidden_dim, num_classes)

        #sample weights
        if not self.hparams.no_class_weights:
            if class_weights is None:
                raise ValueError("class_weights must be given when no_class_weights is False")
            # A buffer follows the module to whatever device it is moved to, so the
            # weights match the logits on CPU as well as on GPU. It is non-persistent
            # so that the state_dict (and thus existing checkpoints) stays unchanged.
            self.register_buffer("class_weights", class_weights.float(), persistent=False)
        else:
            self.class_weights = None

        #metrics
        self.train_acc = torchmetrics.Accuracy()
        self.train_bal_acc = torchmetrics.Accuracy(
            num_classes=num_classes, average="macro") # should be equal to sklearn bal. acc.

        self.val_acc = torchmetrics.Accuracy()
        self.val_bal_acc = torchmetrics.Accuracy(
            num_classes=num_classes, average="macro")

    def forward(self, x):
        """Return the class logits for a batch of feature rows."""
        x = self.first(x)
        x = self.middle(x)
        x = self.last(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss and accuracies for one batch; return the loss."""
        x, y = batch
        y_hat = self(x) #logits

        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)
        self.log("loss/loss", loss, on_step=True, on_epoch=False, prog_bar=True)

        self.train_acc(y_hat, y)
        self.log("acc/train", self.train_acc, on_step=False, on_epoch=True)

        self.train_bal_acc(y_hat, y)
        self.log("bal_acc/train", self.train_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def validation_step(self, batch, batch_idx):
        """Compute and log validation loss, accuracies and the mean predicted class."""
        x, y = batch
        y_hat = self(x) #logits

        # average predicted class index of the batch
        self.log("mean_pred", torch.mean(y_hat.argmax(dim=-1).float()).item(), prog_bar=True)

        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)
        self.log("loss/val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

        self.val_acc(y_hat, y)
        self.log("acc/val", self.val_acc, on_step=False, on_epoch=True)

        self.val_bal_acc(y_hat, y)
        self.log("bal_acc/val", self.val_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return {"val_loss": loss}

    def on_train_start(self):
        """Remember when training started (for the total training time)."""
        self.st_total = time.time()

    def on_train_epoch_start(self):
        """Remember start time and global step of the epoch (for time per step)."""
        self.st = time.time()
        self.steps = self.global_step

    def on_train_epoch_end(self):
        """Log the average wall-clock time per training step of the finished epoch."""
        elapsed = time.time() - self.st
        steps_done = self.global_step - self.steps
        # avoid ZeroDivisionError when an epoch finished without any step
        self.log("time/step", elapsed / max(steps_done, 1))

    def on_train_end(self):
        """Print the total training time as HH:MM:SS."""
        elapsed = time.time() - self.st_total
        print(f"Total Training Time: {time.strftime('%H:%M:%S', time.gmtime(elapsed))}")

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss for one batch; return the loss."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)

        self.log("loss/test_loss", loss, prog_bar=True)

        return loss

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the FFN command-line arguments to `parent_parser` and return it."""
        parser = parent_parser.add_argument_group("FFN")
        parser.add_argument("--no_class_weights", action='store_true')
        parser.add_argument("--hidden_dim", type=int, default=100)
        parser.add_argument("-lr", "--learning_rate", type=float, default=1e-2)

        return parent_parser
        

In [None]:
# Build an FFN with hand-picked settings and without class weights to inspect its layers.
# NOTE(review): input_dim=15 and num_classes=2 are hard-coded here -- presumably the number
# of feature columns and the binary label setting; confirm against the data module.
model = FFN(
input_dim=15,
num_classes=2,
class_weights=None,
no_class_weights=True,
#     hidden_dim=HIDDEN_DIM,
#     learning_rate=LEARNING_RATE,
config={"hidden_dim": 50, "lr": 1e-2},
)

In [None]:
# Print the module structure.
print(model)

In [None]:
# tune_callback = TuneReportCallback(
#     {
#         "val_loss": "loss/val_loss",
#         "val_bal_acc": "bal_acc/val",
#         "mean_pred": "mean_pred"
#     },
#     on="validation_end")

In [None]:
# checkpoint_callback = ModelCheckpoint(
#         monitor="loss/val_loss",
#         save_top_k=1,
#         mode="min",
#         filename='epoch={epoch}-val_loss={loss/val_loss:.3f}-val_bacc={bal_acc/val:.4f}',
#         auto_insert_metric_name=False,
#     )

In [None]:
# Early stopping on the validation loss: lower is better (mode="min"), patience of 10.
early_stop_callback = EarlyStopping(
        monitor="loss/val_loss", 
        mode="min", 
        patience=10
    )

In [None]:
# Ray Tune callback: maps the metric names reported to Tune (dict keys) to the names the
# model logs (dict values) and writes a checkpoint file called "checkpoint"; triggered on
# "validation_end".
tune_callback = TuneReportCheckpointCallback(
    metrics={
        "loss": "loss/loss",
        "mean_pred": "mean_pred",
        "val_loss": "loss/val_loss",
        "val_bal_acc": "bal_acc/val"
    },
    filename="checkpoint",
    on="validation_end")

In [None]:
def train_mnist_tune(config, max_epochs, num_gpus, checkpoint_dir=None):
    """Train one FFN on the small option dataset for a single Ray Tune trial.

    Despite its name the function does not use MNIST: it builds a MyDataModule
    (validation year 1997, test from 1998, binary labels) and an FFN with class
    weights, and fits them with a Lightning Trainer that reports to Ray Tune.

    Arguments:
    config: dict with the sampled hyperparameters "hidden_dim", "lr" and "batch_size".
    max_epochs: maximum number of training epochs.
    num_gpus: number of GPUs handed to the Trainer.
    checkpoint_dir: accepted for compatibility with Ray Tune's function API; it is
        not used, i.e. training always starts from scratch.
    """
    # will seed trainer (init of weights in NN?)
    seed_everything(42, workers=True)

    # sanity check of the seeding: the printed number should be identical in every trial
    print("------")
    print(np.random.uniform(0, 100, size=1).item())
    print("------")

    dm = MyDataModule(
        path=path_data,
        dataset="small",
        start_val="1997",
        start_test="1998",
        label_fn="binary",
        config=config,
    )

    # setup is called explicitly because the model needs input_dim/num_classes/class_weights
    dm.setup()

    model = FFN(
        input_dim=dm.input_dim,
        num_classes=dm.num_classes,
        class_weights=dm.class_weights,
        no_class_weights=False,
        config=config,
    )

    print(model)

    # Callbacks are created per call: EarlyStopping keeps internal counters, so sharing one
    # module-level instance between trials would carry state over and also make this
    # function depend on other notebook cells having been run.
    early_stopping = EarlyStopping(
        monitor="loss/val_loss",
        mode="min",
        patience=10,
    )
    tune_report = TuneReportCheckpointCallback(
        metrics={
            "loss": "loss/loss",
            "mean_pred": "mean_pred",
            "val_loss": "loss/val_loss",
            "val_bal_acc": "bal_acc/val",
        },
        filename="checkpoint",
        on="validation_end",
    )

    trainer = pl.Trainer(
        deterministic=True,
        max_epochs=max_epochs,
        gpus=num_gpus,
        # log into the directory of the current trial
        logger=pl.loggers.TensorBoardLogger(
            save_dir=tune.get_trial_dir(), name="", version="."),
        enable_progress_bar=True,
        callbacks=[
            early_stopping,
            tune_report,
        ],
        # checkpoints are written by the Tune callback, not by Lightning
        enable_checkpointing=False,
    )

    trainer.fit(model, datamodule=dm)

In [None]:
def tune_mnist_asha(num_samples=2, max_epochs=5, gpus_per_trial=1,):
    """Run an ASHA-scheduled Ray Tune search over the FFN hyperparameters.

    Arguments:
    num_samples: number of hyperparameter configurations to sample.
    max_epochs: maximum number of epochs per trial (also the scheduler's max_t).
    gpus_per_trial: number of GPUs requested for every trial.

    Returns:
    The analysis object returned by tune.run.
    """
    # search space: hidden width, learning rate (log-uniform) and batch size
    search_space = {
        "hidden_dim": tune.choice([50, 100]),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([256]),
    }

    asha = ASHAScheduler(max_t=max_epochs, grace_period=1, reduction_factor=2)

    cli_reporter = CLIReporter(
        parameter_columns=["hidden_dim", "lr", "batch_size"],
        metric_columns=["val_loss", "val_bal_acc", "mean_pred", "training_iteration"],
    )

    # bind the non-tuned arguments of the training function
    trainable = tune.with_parameters(
        train_mnist_tune,
        max_epochs=max_epochs,
        num_gpus=gpus_per_trial,
    )

    analysis = tune.run(
        trainable,
        local_dir="./logs",
        resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
        metric="val_loss",
        mode="min",
        config=search_space,
        num_samples=num_samples,
        scheduler=asha,
        progress_reporter=cli_reporter,
        name="tune_mnist_asha",
        keep_checkpoints_num=1, # only keep best checkpoint
        checkpoint_score_attr="min-val_loss",
    )

    print("Best hyperparameters found were: ", analysis.best_config)

    # best trial judged by the val_loss of its last reported result
    best_trial = analysis.get_best_trial("val_loss", "min", "last")
    last_result = best_trial.last_result
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial >>last<< validation loss: {last_result['val_loss']}")
    print(f"Best trial >>last epoch<< validation accuracy: {last_result['val_bal_acc']}")

    return analysis

In [None]:
# Run the hyperparameter search with the default arguments (2 samples, 5 epochs, 1 GPU per
# trial); `result` is the analysis object returned by tune.run.
result = tune_mnist_asha()

In [None]:
# Best trial by minimal val_loss; the third argument is the scope ("all").
best_trial = result.get_best_trial("val_loss", "min", "all")

In [None]:
# Hyperparameters of that trial.
best_trial.config

In [None]:
# Same lookup, returning the config directly.
result.get_best_config("val_loss", "min", "all")

In [None]:
# Best config using the metric/mode that were passed to tune.run.
result.best_config

In [None]:
# One row per trial, sorted by ascending val_loss.
result.dataframe(metric="val_loss", mode="min").sort_values("val_loss")

In [None]:
result.best_result

In [None]:
# First row of the sorted trial table (lowest val_loss) as a plain dict.
a = result.dataframe(metric="val_loss", mode="min").sort_values("val_loss").iloc[0, :].to_dict()

In [None]:
a

In [None]:
# Scratch cells: check how dict assignment and update behave.
test = {}

In [None]:
test["2008"] = 12

In [None]:
test

In [None]:
# `a` is bound to the same dict object as `test` (no copy is made); this also rebinds the
# name `a` used in an earlier cell.
a = test

In [None]:
# Mutates the shared dict, so the new key is visible through both `a` and `test`.
a.update({"2009": 3})

In [None]:
a

In [None]:
test

In [None]:
# Scratch cells: explore the trial results and the stored checkpoints.
df_results = result.results_df

In [None]:
df_results

In [None]:
result.get_best_trial("val_loss", "min", scope="all").checkpoint

In [None]:
result.best_result

In [None]:
result.best_checkpoint

In [None]:
result.get_best_trial().checkpoint.value

In [None]:
# NOTE(review): mode "max" on val_loss selects the trial with the highest loss.
result.get_best_trial("val_loss", "max")

In [None]:
result.get_best_trial("val_loss", "min", scope="all")

In [None]:
# NOTE(review): identical to the previous cell.
result.get_best_trial("val_loss", "min", scope="all")

In [None]:
result.dataframe()

In [None]:
# 0,lr=0.0245_2022-07-09_16-11-58\checkpoint_epoch=1-step=302\>

In [None]:
# Element [1] of the internal representation is presumably the checkpoint directory --
# TODO confirm for the installed Ray version.
result.get_best_checkpoint(result.get_best_trial("val_loss", "min", scope="all")).get_internal_representation()[1]

In [None]:
result.get_best_checkpoint(result.get_best_trial("val_loss", "min", scope="last")).get_internal_representation()[1]

In [None]:
result.get_best_checkpoint(result.get_best_trial(), metric="val_loss", mode="min").get_internal_representation()[1]

In [None]:
result.best_checkpoint.get_internal_representation()[1]

In [None]:
result.default_mode

In [None]:
# NOTE(review): import outside the import cell; `pathlib` itself is already imported at the top.
from pathlib import Path
# Location of the best checkpoint (by minimal val_loss) of the best trial; element [1] is
# presumably the checkpoint directory -- TODO confirm for the installed Ray version.
path = result.get_best_checkpoint(result.get_best_trial(), metric="val_loss", mode="min").get_internal_representation()[1]

In [None]:
# Restore the FFN from the file named "checkpoint" inside the best checkpoint location.
# This rebinds `model`, replacing the hand-built instance from the earlier cell.
model = FFN.load_from_checkpoint(Path(path)/"checkpoint")

In [None]:
result.get_best_trial("val_loss", "min", "last").checkpoint

In [None]:
# model = FFN()
# # trainer = Trainer()
# from pathlib import Path
# path = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\notebooks\logs\tune_mnist_asha\train_mnist_tune_5b9ea_00000_0_batch_size=256,hidden_dim=50,lr=0.0245_2022-07-04_21-20-26\.\checkpoints\epoch=0-val_loss=0.764-val_bacc=0.5235.ckpt")

# # automatically restores model, epoch, step, LR schedulers, apex, etc...
# # trainer.fit(model, ckpt_path=path)
# model = FFN.load_from_checkpoint(path)

In [None]:
# pd.read_json(r"C:\Users\Mathiass\Documents\Projects\master-thesis\notebooks\logs\tune_mnist_asha\train_mnist_tune_21877_00000_0_batch_size=256,hidden_dim=50,lr=0.0245_2022-07-05_09-36-06\result.json", 
# lines=True)

In [None]:
# Compare the learning rate of the current `model` with the best config found by the search.
model.learning_rate

In [None]:
result.best_config

In [None]:
# Data module for evaluating the current `model`, using the batch size of the best config.
# NOTE(review): the validation window here starts in 1998 (test from 1999), whereas the
# tuning function uses start_val="1997" / start_test="1998" -- confirm this shift is intended.
dm = MyDataModule(
path=path_data, 
dataset="small",
#     batch_size=BATCH_SIZE, 
start_val="1998", 
start_test="1999",
label_fn="binary",
config=result.best_config,
)

# Trainer without Ray Tune callbacks or a custom logger.
trainer = pl.Trainer(
deterministic=True,
max_epochs=1,
gpus=1,
# logger=pl.loggers.TensorBoardLogger(
# save_dir=tune.get_trial_dir(), name="", version="."), # SPECIFY SAVE_DIR FOR VALIDATION LOGGING -> default: lightning logs
enable_progress_bar=True,
# callbacks=[
#     tune_callback, 
#     early_stop_callback
# ]
)

In [None]:
# Run a validation pass of the current `model` on the validation split of `dm`.
val_result = trainer.validate(model, datamodule=dm)

In [None]:
val_result

In [None]:
# Best hyperparameters found were:  {'hidden_dim': 50, 'lr': 0.024526126311336768, 'batch_size': 256}
# Best trial config: {'hidden_dim': 50, 'lr': 0.024526126311336768, 'batch_size': 256}
# Best trial >>last<< validation loss: 0.6991798281669617
# Best trial >>last epoch<< validation accuracy: 0.5307202339172363

In [None]:
# NOTE(review): the method is referenced but not called, so this only shows the bound method.
result.get_best_trial

In [None]:
df_results

In [None]:
# `checkpoint_callback` is only created in a commented-out cell, so guard the lookup
# instead of raising NameError on a fresh Restart & Run All; shows None when it is absent.
checkpoint_callback.best_model_path if "checkpoint_callback" in globals() else None

In [None]:
# trainer = pl.Trainer(
#     deterministic=True,
#     max_epochs=MAX_EPOCHS,
#     gpus=1,
#     logger=logger, #=logger or False
#     check_val_every_n_epoch=1,
#     callbacks=[early_stop_callback, checkpoint_callback], #early stop depends earliest after (patience*check_val_every_n_epoch)
#     # enable_checkpointing = False,
#     num_sanity_val_steps=2,
# )

In [None]:
# s_time = time.time()
# trainer.fit(model, datamodule=dm)
# e_time = time.time()
# print(f"Time to fit: {divmod(e_time - s_time, 60)[0]:2.0f}:{divmod(e_time - s_time, 60)[1]:2.0f}\
#  min")

In [None]:
# Scratch cells: build a range of years.
idx = 0
init_train = 2

In [None]:
# Years from 1996 + init_train + idx up to and including 2020 (np.arange excludes 2021).
years = np.arange((1996+init_train+idx),(2021)) 

In [None]:
# All years except the last two.
years[:-2]

In [None]:
len(years)

In [None]:
def abc():
    """Return the constant 3 (placeholder function)."""
    value = 3
    return value

In [None]:
# Scratch cells: look up a function by name in the module namespace.
globals()["abc"]

In [None]:
# enumerate yields (index, element) pairs.
a = ["a", "b", "c"]
for i, k in enumerate(a):
    print(i, k)

In [None]:
# NOTE(review): `a` is rebound to a different kind of object in each of these cells.
a = iter([1, 2, 3])

In [None]:
a = {"test2009": 3}

In [None]:
b = None

In [None]:
# dict.update(None) raises TypeError; fall back to an empty mapping when `b` is None
# so the cell leaves `a` unchanged instead of failing.
a.update(b or {})