In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
import json
import time

In [2]:
import os
from ray.tune.schedulers import ASHAScheduler
from sklearn.model_selection import train_test_split
import xgboost as xgb

from ray import tune
from ray.tune.integration.xgboost import TuneReportCheckpointCallback

import pytorch_lightning as pl #for "seed everything"
from typing import Tuple, Dict, List

In [3]:
# Fix the global random seed up front so repeated runs of the notebook are comparable.
# (pytorch_lightning is imported only for this helper; XGBoost gets its own "seed" in the
# search space further down.)
pl.seed_everything(42, workers=True)

Global seed set to 42


42

In [4]:
def feature_engineer(data):
    """
    Turn raw option quotes into spot-normalised model features.

    The caller's frame is never modified: all work happens on a copy. This also
    avoids pandas' SettingWithCopyWarning when `data` is a slice of a larger frame.

    Arguments:
    data: pandas.DataFrame that must contain the columns
        best_bid, best_offer, gamma, theta, vega, days_to_exp, strike_price,
        forwardprice, spotprice, cfadj, days_no_trading, adj_spot.
        All other columns are passed through unchanged.

    Returns:
    A new pandas.DataFrame in which
        - best_bid is replaced (in the same column position) by ba_spread_option,
        - strike_price is replaced by moneyness,
        - gamma, theta, vega, days_to_exp and forwardprice are rescaled,
        - best_offer, cfadj, days_no_trading, spotprice and adj_spot are dropped.

    Raises:
    KeyError: if any of the required columns is missing.
    """
    required = (
        "best_bid", "best_offer", "gamma", "theta", "vega", "days_to_exp",
        "strike_price", "forwardprice", "spotprice",
        "cfadj", "days_no_trading", "adj_spot",
    )
    missing = [col for col in required if col not in data.columns]
    if missing:
        # fail before touching anything, and name every missing column at once
        raise KeyError(f"feature_engineer: missing required columns {missing}")

    # Work on a private copy so the input frame is left exactly as it was passed in.
    data = data.copy()
    spot = data["spotprice"]

    # Bid-Ask spread: (Ask - Bid) / Ask
    # Written into "best_bid" and renamed so the new feature keeps that column position.
    data["best_bid"] = (data["best_offer"] - data["best_bid"]) / (data["best_offer"])
    data = data.rename(columns={"best_bid": "ba_spread_option"}).drop(["best_offer"], axis=1)

    # Gamma: multiply by spotprice and divide by 100
    data["gamma"] = data["gamma"] * spot / 100 #following Bali et al. (2021)

    # Theta: scale by spotprice
    data["theta"] = data["theta"] / spot #following Bali et al. (2021)

    # Vega: scale by spotprice
    data["vega"] = data["vega"] / spot #following Bali et al. (2021)

    # Time to Maturity: scale by number of days in year: 365
    data["days_to_exp"] = data["days_to_exp"] / 365

    # Moneyness: Strike / Spot (K / S)
    data["strike_price"] = data["strike_price"] / spot # K / S
    data = data.rename(columns={"strike_price": "moneyness"})

    # Forward Price ratio: Forward / Spot
    data["forwardprice"] = data["forwardprice"] / spot

    # Drop redundant/ unimportant columns
    data = data.drop(["cfadj", "days_no_trading", "spotprice", "adj_spot"], axis=1)

    return data

# binary y label generator
def binary_categorize(y):
    """
    Map a continuous return to a binary class label.

    Input: continuous target variable (a single scalar)

    Output: 1 if the return is strictly positive,
            0 otherwise (zero, negative, or NaN -- any value for which
            `y > 0` is false)
    """
    return 1 if y > 0 else 0


# multiclass y label generator
def multi_categorize(y):
    """
    Map a continuous return to one of three class labels.

    Input: continuous target variable (a single scalar)
    CAREFUL: classes have to be between [0, C) for F.crossentropyloss.

    Output: 0 if y < -0.05,
            2 if y >  0.05,
            1 otherwise (the band [-0.05, 0.05]; NaN also lands here because
            both comparisons are false)
    """
    # guard clauses for the two tails; everything else is the middle class
    if y < -0.05:
        return 0
    if y > 0.05:
        return 2
    return 1
    
class CVSplitter:
    """ Generator for expanding-window data splits cut on calendar-year boundaries.

    Split i trains on everything up to the end of year (init_train_length + i),
    validates on the following `val_length` years and tests on the `test_length`
    years after that.

    Args:
    dates: pandas.Series of datetime; the year changes are located with
           `dates.dt.year.diff() == 1`, so rows are expected to be in ascending
           date order. NOTE(review): a jump of more than one year between
           consecutive rows is not detected as a boundary.
    init_train_length: int, number of years in the first training window (> 0),
    val_length: int, number of validation years per split,
    test_length: int, number of test years per split

    Attributes:
    eoy_idx: positional index one past the last row of each year (the final
             entry is len(dates)).
    train_eoy, val_eoy, test_eoy: per-split end positions (exclusive) of the
             train, validation and test windows; all three have the same length.

    Raises:
    AssertionError: if the requested windows do not fit into the number of
             years found in `dates`, or a length is out of range.
    """
    def __init__(self, dates, init_train_length=1, val_length=2, test_length=1):
        self.val_length = val_length
        self.test_length = test_length
        # positions where the year changes == one past the last row of the previous year
        self.eoy_idx =  np.where((dates.dt.year.diff() == 1))[0]
        self.eoy_idx = np.append(self.eoy_idx, len(dates)) #append end of year of last year in dates

        # one entry per year, so this is the number of years available
        n_years = len(self.eoy_idx)

        # At least one full split must fit (previously `<= n_years + 1`, which let
        # through configurations that produce no split at all).
        assert init_train_length + val_length + test_length <= n_years, \
        "defined train and val are larger than number of years in dataset"
        assert init_train_length > 0, "init_train_length must be strictly greater than 0"
        assert val_length >= 0 and test_length >= 0, \
        "val_length and test_length must be non-negative"

        # eoy_idx[k] is the end of year k+1, so the first training window ends at
        # index init_train_length - 1
        self.train_start_idx = init_train_length - 1
        holdout = val_length + test_length

        # Explicit end positions instead of negative slice ends: a `-0` slice end
        # would silently yield an empty array when test_length == 0.
        self.train_eoy = self.eoy_idx[self.train_start_idx:n_years - holdout]
        self.val_eoy = self.eoy_idx[self.train_start_idx + val_length:n_years - test_length]
        # For generate_idx():
        self.test_eoy = self.eoy_idx[self.train_start_idx + holdout:]

    def generate(self):
        """Yield (train_indices, val_indices) as lists of row positions, one pair per split."""
        # zip stops at the last valid split; the previous index-based loop ran
        # `test_length` steps too far and raised IndexError.
        for train_end, val_end in zip(self.train_eoy, self.val_eoy):
            yield (list(range(train_end)),
                   list(range(train_end, val_end)))

    def generate_idx(self):
        """Yield {"train", "val", "test"} dicts holding the exclusive end position of each window."""
        for train_end, val_end, test_end in zip(self.train_eoy, self.val_eoy, self.test_eoy):
            yield ({"train": train_end,
                    "val": val_end,
                    "test": test_end}
                )

In [None]:
# read data from disk
# Directory that holds the parquet files. Set the MT_DATA_DIR environment variable to
# point at your own copy; the original author's local folder is only the fallback.
path = Path(os.environ.get(
    "MT_DATA_DIR",
    r"C:\Users\Mathiass\OneDrive - Universität Zürich UZH\Documents\mt_literature\data",
))

class Dataset():
    """
    Load the option panel from parquet, engineer features and cut ONE
    train/val/test split on calendar-year boundaries.

    The split positions come from CVSplitter.generate_idx(); the frame is
    truncated at the end of the test window, passed through feature_engineer,
    and converted to numpy arrays. Rows are sliced by position, so the parquet
    file is assumed to be sorted by "date" (not verified here -- TODO confirm).

    Args:
    path: pathlib.Path of the directory containing the parquet files.
    year_idx: which split of CVSplitter.generate_idx() to use (0 = first).
    dataset: "small" or "big" -- selects the parquet file.
    init_train_length: years in the first training window (forwarded to CVSplitter).
    val_length: validation years (forwarded to CVSplitter); test length is fixed to 1.
    label_fn: "binary" (binary_categorize) or "multi" (multi_categorize).

    Attributes set:
    data, X, y, dates, eoy_train, eoy_val, eoy_test,
    X_train, y_train, X_val, y_val, X_test, y_test,
    input_dim, num_classes, class_weights.

    Raises:
    ValueError: for an unknown `dataset` or `label_fn`.
    IndexError: if `year_idx` is outside the available splits.
    """
    def __init__(self, path=path, year_idx=0, dataset="small", init_train_length=20, val_length=2, label_fn="binary"):
        filenames = {
            "small": "final_df_filledmean_small.parquet",
            "big": "final_df_filledmean.parquet",
        }
        label_fns = {
            "binary": binary_categorize,
            "multi": multi_categorize,
        }
        # validate both options before the (slow) parquet read
        if dataset not in filenames:
            raise ValueError("Specify dataset as either 'small' or 'big'")
        if label_fn not in label_fns:
            raise ValueError("Specify label_fn as either 'binary' or 'multi'")

        self.data = pd.read_parquet(path/filenames[dataset])

        # get splits
        splitter = CVSplitter(self.data["date"], init_train_length=init_train_length, 
                                val_length=val_length, test_length=1)
        eoy_indeces = list(splitter.generate_idx())
        self.eoy_train = eoy_indeces[year_idx]["train"]
        self.eoy_val = eoy_indeces[year_idx]["val"]
        self.eoy_test = eoy_indeces[year_idx]["test"]

        # Truncate data. Copy the slice so the feature engineering below never
        # writes into a view of the full frame (SettingWithCopyWarning).
        self.data = self.data.iloc[:self.eoy_test].copy()
        assert len(self.data) == self.eoy_test, "length of data is not equal to eoy_test"

        # feature engineer data
        self.data = feature_engineer(self.data)

        # create y and make it a classification problem
        self.y = self.data["option_ret"].apply(label_fns[label_fn])
        # create X
        self.X = self.data.drop(["option_ret"], axis=1)

        # save dates and drop
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # to numpy (no standardization is applied in this class)
        self.X = self.X.values
        self.y = self.y.values

        ############################### setup #########################################################
        # X and y are cut with the SAME positions so they cannot drift apart.
        # (y used to be sliced by lengths; `self.y[-len(self.X_test):]` returns the
        # whole array if the test window is ever empty.)
        # train
        self.X_train = self.X[:self.eoy_train]
        self.y_train = self.y[:self.eoy_train]

        #val
        self.X_val = self.X[self.eoy_train:self.eoy_val]
        self.y_val = self.y[self.eoy_train:self.eoy_val]

        # test
        self.X_test = self.X[self.eoy_val:self.eoy_test]
        self.y_test = self.y[self.eoy_val:self.eoy_test]

        assert (len(self.X_train)+len(self.X_val)+len(self.X_test)) == len(self.data), \
            "sum of X train, val, test is not equal length of dataset"
        assert (len(self.y_train)+len(self.y_val)+len(self.y_test) == len(self.data)), \
        "sum of y train, val, test is not equal to length of dataset"

        # Save variables
        # input dim
        self.input_dim = self.X_train.shape[1]
        # classes present in the training labels and how often each occurs
        classes, counts = np.unique(self.y_train, return_counts=True)
        # number of classes
        self.num_classes = len(classes)
        # class weights: n_train / count, in the (sorted) order returned by np.unique
        self.class_weights = len(self.y_train) / counts

        # date ranges of the three windows, for the summary below
        train_dates = self.dates.iloc[:self.eoy_train]
        val_dates = self.dates.iloc[self.eoy_train:self.eoy_val]
        test_dates = self.dates.iloc[self.eoy_val:self.eoy_test]

        print("*****************************************************************************************")
        print("Current dataset information:")
        print("---")
        print("class_weights:", self.class_weights)
        print("---")
        print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
        print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
        print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
        print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
        print("---")
        print("train start date: ", self.dates.iloc[0].strftime("%Y-%m-%d"), 
              ", train end date: ", train_dates.iloc[-1].strftime("%Y-%m-%d"))
        print("val start date: ", val_dates.iloc[0].strftime("%Y-%m-%d"), 
              ", val end date: ", val_dates.iloc[-1].strftime("%Y-%m-%d"))
        print("test start date: ", test_dates.iloc[0].strftime("%Y-%m-%d"), 
              ", test end date: ", test_dates.iloc[-1].strftime("%Y-%m-%d"))
        print("*****************************************************************************************")

    def get_datasets(self):
        """Return the feature arrays only: (X_train, X_val, X_test)."""
        return self.X_train, self.X_val, self.X_test

    def get_train_val_xgb(self):
        """Return (X_train, X_val, y_train, y_val) -- note the X, X, y, y order."""
        return self.X_train, self.X_val, self.y_train, self.y_val


In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
def bal_acc_xgb(preds: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    """
    Custom XGBoost evaluation metric: balanced accuracy of rounded predictions.

    Args:
    preds: predictions handed over by XGBoost for the evaluated DMatrix. Only
           1-D input (the two-class case) is supported; it is turned into hard
           labels with np.round.
    dtrain: the DMatrix being evaluated; its labels are the ground truth.

    Returns:
    ('val_bal_acc', score). The name is the same for every evaluation set, so
    XGBoost reports it with the set's prefix (e.g. "train-val_bal_acc").

    Raises:
    NotImplementedError: if `preds` has more than one dimension (more than 2 classes).
    """
    labels = dtrain.get_label()  # true y as np.array

    if preds.ndim > 1:
        # more than 2 classes: one score per class and row -- not handled yet
        raise NotImplementedError("Implement Softmax here")

    # 2 classes: round the probabilities to hard 0/1 labels
    hard_preds = np.round(preds)

    return 'val_bal_acc', balanced_accuracy_score(labels, hard_preds)

In [None]:
# Re-seed right before loading so this cell gives the same result when run on its own.
pl.seed_everything(42, workers=True)

# Load the default dataset ("small" file, binary labels, first split).
# The constructor reads the parquet file and prints a summary of the split.
data = Dataset()

In [None]:
# Pull out the train/validation arrays; note the order is X, X, y, y.
X_train, X_val, y_train, y_val = data.get_train_val_xgb()

In [None]:
# NOTE(review): repeats the two cells above (re-reads the parquet file and rebinds the
# same names) -- looks like a leftover from interactive work; confirm before removing.
data = Dataset()
X_train, X_val, y_train, y_val = data.get_train_val_xgb()

In [None]:
# NOTE(review): another reload of the same default dataset; `data` is rebound to an
# equivalent object. Probably redundant -- confirm.
data = Dataset()

In [None]:
# from sklearn.utils.class_weight import compute_class_weight

In [None]:
# y_train

In [None]:
# weights = len(y_train) / np.unique(y_train, return_counts=True)[1]

In [None]:
# weights

In [None]:
# 1. / 0.56205813

In [None]:
# class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [None]:
# weights / weights[weights.argmax()]

In [None]:
# 1.3895877 / 0.78102907

In [None]:
# w_array = np.ones(y_train.shape[0])

In [None]:
# for i, val in enumerate(y_train):
#     w_array[i] = class_weights[val]

In [None]:
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb


def train(config, data):
    """
    Ray Tune trainable: fit one XGBoost model with the sampled `config`.

    Args:
    config: dict of XGBoost parameters (constants plus the values Tune sampled).
    data: object exposing get_train_val_xgb() -> (X_train, X_val, y_train, y_val);
          injected through tune.with_parameters.

    Returns:
    The evaluation history filled in by xgb.train (`evals_result`).
    NOTE(review): depending on the Ray version, a dict returned from a function
    trainable may itself be reported as a final result -- confirm this is intended.

    Metrics are reported to Tune after each boosting round by
    TuneReportCheckpointCallback, which also writes a "model.xgb" checkpoint.
    """
    X_train, X_val, y_train, y_val = data.get_train_val_xgb()

    # Build input matrices for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    # Train the classifier; the set names "train"/"eval" become the metric
    # prefixes ("eval-logloss", "train-error", ...).
    eval_history = {}
    xgb.train(
        config,
        dtrain,
        num_boost_round=100,
        evals=[(dtrain, "train"), (dval, "eval")],
        evals_result=eval_history,
        custom_metric=bal_acc_xgb,
        callbacks=[TuneReportCheckpointCallback(filename="model.xgb")],
    )

    return eval_history


In [None]:
def get_best_model_checkpoint(analysis):
    """
    Load the booster stored in the best checkpoint of a finished Tune run.

    Args:
    analysis: the object returned by tune.run(); `best_checkpoint`,
              `best_result` and `best_config` are read from it.

    Returns:
    xgb.Booster loaded from "<best_checkpoint>/model.xgb".

    Also prints the best config and 1 - "eval-error" of the best result.
    """
    booster = xgb.Booster()
    model_file = os.path.join(analysis.best_checkpoint, "model.xgb")
    booster.load_model(model_file)

    # "eval-error" is the misclassification rate on the validation set
    eval_error = analysis.best_result["eval-error"]
    accuracy = 1. - eval_error

    print(f"Best model parameters: {analysis.best_config}")
    print(f"Best model total accuracy: {accuracy:.4f}")
    return booster


def tune_xgboost():
    """
    Run a small Ray Tune search over XGBoost hyperparameters.

    Loads the default Dataset once, shares it with every trial through
    tune.with_parameters, and lets ASHA stop weak trials early. Trials are
    compared on "eval-logloss" (lower is better).

    Returns:
    The analysis object returned by tune.run().
    """
    param_space = {
        # --- constants (can be mixed freely with search-space objects) ---
        "objective": "binary:logistic",
        "eval_metric": ["logloss", "error"],
        "seed": 42,
        "tree_method": "gpu_hist",  # to use GPU
        "disable_default_eval_metric": 1,
        # --- sampled per trial ---
        "max_depth": tune.randint(1, 9),
        "min_child_weight": tune.choice([1, 2, 3]),
        "subsample": tune.uniform(0.5, 1.0),
        "eta": tune.loguniform(1e-4, 1e-1),
    }

    dataset = Dataset()

    # ship the dataset to the trials instead of re-reading it in each one
    trainable = tune.with_parameters(train, data=dataset)

    # Aggressive early stopping of bad trials: a trial runs for at most max_t=3
    # reported iterations, although train() asks XGBoost for 100 boosting rounds.
    asha = ASHAScheduler(
        max_t=3,
        grace_period=1,
        reduction_factor=2,
    )

    # each trial reserves 4 CPUs and half a GPU
    return tune.run(
        trainable,
        metric="eval-logloss",
        mode="min",
        resources_per_trial={"cpu": 4, "gpu": 0.5},
        config=param_space,
        num_samples=2,
        scheduler=asha,
    )

In [None]:
# best_bst.best_iteration

In [None]:
# Run the hyperparameter search (long-running: loads the data and starts the Tune trials).
analysis = tune_xgboost()

# Load the booster saved in the best trial's checkpoint; also prints its config/accuracy.
best_bst = get_best_model_checkpoint(analysis)

# You could now do further predictions with
# best_bst.predict(...)

In [None]:
# Reload the default dataset in the notebook process to evaluate the tuned booster.
data = Dataset()
X_train, X_val, y_train, y_val = data.get_train_val_xgb()
# Build input matrices for XGBoost
D_train = xgb.DMatrix(X_train, label=y_train)
D_val = xgb.DMatrix(X_val, label=y_val)

In [None]:
# Inspect the validation labels (displays the array).
y_val

In [None]:
# Raw model output for the validation set.
best_bst.predict(D_val)

In [None]:
# NOTE(review): reshape(-1, 1) gives every row exactly one column, so argmax along
# axis=1 is 0 for each row and this sum is always 0. If the goal was to count
# predicted positives, the predictions need to be thresholded instead -- confirm intent.
np.argmax(best_bst.predict(D_val).reshape(-1, 1), axis=1).sum()

In [None]:
# Hard class labels for the validation set.
# A booster trained with "binary:logistic" returns a 1-D array from predict(), on which
# np.argmax(..., axis=1) raises AxisError. Round 1-D output to 0/1 (the same rule
# bal_acc_xgb uses) and keep the arg-max only for 2-D (one column per class) output.
val_preds = best_bst.predict(D_val)
np.argmax(val_preds, axis=1) if val_preds.ndim > 1 else np.round(val_preds).astype(int)

In [None]:
# Balanced accuracy of the tuned booster on the validation set, using rounded
# predictions as hard labels (mirrors the custom metric bal_acc_xgb).
balanced_accuracy_score(y_val, best_bst.predict(D_val).round())

In [None]:
# 17 sec, 4 cpu, 0.5 gpu

In [None]:
# Location of the checkpoint Tune considers best for the metric/mode given to tune.run().
analysis.best_checkpoint

In [None]:
# 16.99, 4 cpu, no gpu

In [None]:
# Total run time: 28.79 seconds (28.61 seconds for the tuning loop).

In [None]:
# Trial name	status	loc	eta	max_depth	min_child_weight	subsample	iter	total time (s)	train-logloss	train-error	train-val_bal_acc
# train_e9566_00000	TERMINATED	127.0.0.1:2424	0.021831	7	1	0.591717	10	23.8794	0.67544	0.352338	0.515156
# train_e9566_00001	TERMINATED	127.0.0.1:1592	0.00238642	2	3	0.549987	1	8.44267	0.692946	0.359819	0.5
# train_e9566_00002	TERMINATED	127.0.0.1:4856	0.0812325	8	3	0.510292	10	23.9471	0.647541	0.345555	0.525421
# train_e9566_00003	TERMINATED	127.0.0.1:11448	0.000354988	6	2	0.590912	1	8.15133	0.693114	0.357855	0.513158
# train_e9566_00004	TERMINATED	127.0.0.1:1896	0.000747631	6	1	0.715973	1	8.64184	0.693078	0.357727	0.51345
# train_e9566_00005	TERMINATED	127.0.0.1:15140	0.022674	2	3	0.728035	2	9.73432	0.689457	0.359819	0.5
# train_e9566_00006	TERMINATED	127.0.0.1:11076	0.00664714	7	1	0.523225	1	8.55218	0.692498	0.355293	0.512156
# train_e9566_00007	TERMINATED	127.0.0.1:6392	0.0788671	7	2	0.974443	10	24.4164	0.650617	0.348126	0.520765
# train_e9566_00008	TERMINATED	127.0.0.1:24340	0.0112901	1	2	0.548836	1	8.17584	0.692232	0.359819	0.5
# train_e9566_00009	TERMINATED	127.0.0.1:18876	0.000331203	7	1	0.916597	1	8.0835	0.693115	0.355767	0.511753


In [None]:
# def get_best_model_checkpoint(analysis):
#     best_bst = xgb.Booster()
#     best_bst.load_model(os.path.join(analysis.best_checkpoint, "model.xgb")) # CHANGE THIS TO BEST OVERALL CKPT
#     accuracy = 1. - analysis.best_result["eval-error"]
#     print(f"Best model parameters: {analysis.best_config}")
#     print(f"Best model total accuracy: {accuracy:.4f}")
#     return best_bst


In [None]:
# Same lookup as earlier in the notebook: the best checkpoint for the run's metric/mode.
analysis.best_checkpoint

In [None]:
# Pick a trial by "train-error" over all reported iterations (scope="all").
# NOTE(review): mode "max" selects the trial with the HIGHEST training error -- this looks
# like an experiment with the API rather than a real model selection; confirm.
best_trial = analysis.get_best_trial("train-error", "max", scope="all")

In [None]:
# Best checkpoint of the trial selected in the previous cell.
analysis.get_best_checkpoint(best_trial)

In [None]:
# One row per trial, taken at the iteration chosen by the given metric/mode.
# NOTE(review): mode="min" on a balanced-accuracy metric picks each trial's WORST
# iteration; "max" is presumably what was meant -- confirm.
analysis.dataframe(metric="eval-val_bal_acc", mode="min")

In [None]:
# Display the loaded booster object.
best_bst