In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import math
from tqdm import tqdm

In [None]:
# Experiment directory that the cells below scan for checkpoints and predictions.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
exp_dir = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\logs\tune\nn_loops\20220906163324")

In [None]:
# Load the experiment's saved arguments as a pandas Series, so that single
# settings can be read as attributes below (e.g. args_exp.dataset).
args_exp = pd.read_json(exp_dir/"args.json", typ="series")

In [None]:
def get_yearidx_bestmodelpaths(exp_path: Path) -> list:
    """Collect the 'best_ckpt????' files below exp_path, ordered by year.

    Every sub-directory of exp_path except 'predictions' and 'portfolios' is
    searched (non-recursively) for files matching 'best_ckpt????'. The last
    four characters of each resolved path are interpreted as the year.

        Args:
            exp_path (Path): Experiment directory to scan.
        Returns:
            idx_bestmodelpaths (list): Tuples (year_idx, best_ckpt_path), sorted
                                        by ascending year and enumerated from 0.
        Raises:
            ValueError: If the years are not strictly ascending (e.g. the same
                        year is found in two sub-directories), or if the
                        four-character suffix is not an integer.
    """
    skipped_dirs = {"predictions", "portfolios"}
    best_ckpt_paths = []
    for directory in exp_path.iterdir():
        if directory.is_dir() and directory.name not in skipped_dirs:
            # See https://docs.python.org/3/library/fnmatch.html#module-fnmatch
            # for the filename pattern matching below.
            best_ckpt_paths.extend(
                file.resolve() for file in directory.glob("best_ckpt????"))
    # IMPORTANT: year_idx is derived from the position in this sorted list.
    best_ckpt_paths.sort(key=lambda x: int(str(x)[-4:]))
    idx_bestmodelpaths = list(enumerate(best_ckpt_paths))
    # Verify the ordering. prev_year has to be advanced on every iteration,
    # otherwise each year is only compared against the initial sentinel and
    # the check can never fail.
    prev_year = -9999
    for yearidx, bestckpt_path in idx_bestmodelpaths:
        year = int(str(bestckpt_path)[-4:])
        if year <= prev_year:
            raise ValueError("(year_idx, bestmodel_ckpt_path) are not in the "
                             "correct ascending order.")
        print(f"({yearidx}, {year})\t is the correct (year_idx, year) tuple!")
        prev_year = year
    return idx_bestmodelpaths

In [None]:
# List of (year_idx, best_ckpt_path) tuples for the experiment, sorted by year.
yearidx_target_list = get_yearidx_bestmodelpaths(exp_dir)

In [None]:
# File name (without extension) of the first, i.e. earliest, checkpoint.
yearidx_target_list[0][1].stem

In [None]:
def load_data(data_path: Path, dataset: str):
    """Load one of the predefined parquet datasets from data_path.

    NOTE: a later cell in this notebook defines load_data again; that later
    definition replaces this one once its cell has been run.

        Args:
            data_path (Path): Directory that contains the parquet files.
            dataset (str):    One of 'small', 'medium' or 'big'.
        Returns:
            pandas.DataFrame: Content of the selected parquet file.
        Raises:
            ValueError: If dataset is not one of the names above.
    """
    filenames = {
        "small": "final_df_call_cao_small.parquet",
        "medium": "final_df_call_cao_med_fillmean.parquet",
        "big": "final_df_call_cao_big_fillmean.parquet",
    }
    try:
        filename = filenames[dataset]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which previously also ended
        # up in the ValueError branch.
        raise ValueError("Specify dataset as either 'small', 'medium' or 'big'.") from None
    return pd.read_parquet(data_path/filename)

In [None]:
# Display all saved experiment arguments.
args_exp

In [None]:
# Display the container type that read_json(..., typ="series") returned.
type(args_exp)

In [None]:
# Directory passed to load_data() below as the location of the parquet files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_data = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data")

In [None]:
# Column names of the experiment's dataset. Note that the complete parquet
# file is loaded here only to read its column labels.
cols = load_data(path_data, args_exp.dataset).columns.tolist()

In [None]:
# Keep only the feature columns, i.e. everything except "date" and the target
# "option_ret". Rebinding cols through a comprehension (instead of
# list.remove) keeps this cell re-runnable: running it a second time is a
# no-op rather than a ValueError.
cols = [col_name for col_name in cols if col_name not in ("date", "option_ret")]

In [None]:
# Display the remaining column names.
cols

In [None]:
# Non-feature columns, used by the filtering expression in the next cell.
to_drop = ["date", "option_ret"]

In [None]:
# Non-mutating alternative to cols.remove(...): display cols without the
# names listed in to_drop (cols itself is left unchanged).
[i for i in cols if i not in to_drop]

In [None]:
# Print one column name per line.
for col in cols:
    print(col)

In [None]:
import pdb
import time
import torch
from torch import nn
import torchmetrics
import pytorch_lightning as pl
from torch.nn import functional as F

# import pdb

class FFN(pl.LightningModule):
    """Feed-forward classifier trained with (optionally class-weighted) cross entropy.

    Architecture: Linear(input_dim, hidden_dim) + ReLU, followed by n_hidden
    blocks of Linear(hidden_dim, hidden_dim) [+ BatchNorm1d] + ReLU [+ Dropout],
    followed by Linear(hidden_dim, num_classes). forward() returns raw logits.

        Args:
            input_dim (int):              number of input features.
            num_classes (int):            number of output classes.
            class_weights (torch.Tensor): per-class weights for the loss; only
                                          used if no_class_weights is False.
            no_class_weights (bool):      if True, the loss is unweighted.
            learning_rate (float):        learning rate for Adam.
            hidden_dim (int):             width of every hidden layer.
            n_hidden (int):               number of middle blocks (0 = none).
            batch_norm (bool):            add BatchNorm1d to each middle block.
            dropout (bool):               add Dropout to each middle block.
            drop_prob (float):            dropout probability.
    """
    def __init__(self,
                input_dim: int,
                num_classes: int,
                class_weights: torch.Tensor,
                no_class_weights: bool,
                learning_rate: float,
                hidden_dim: int,
                n_hidden: int,
                batch_norm: bool,
                dropout: bool,
                drop_prob: float,
                # config: dict = None,
        ):
        super().__init__()
        # Init variables are saved, so that model can be reloaded cleanly if necessary
        # self.save_hyperparameters(ignore=["class_weights"])
        self.save_hyperparameters()

        middle_layers = []
        for _ in range(self.hparams.n_hidden):
            middle_layers.append(nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim))
            if self.hparams.batch_norm:
                middle_layers.append(nn.BatchNorm1d(self.hparams.hidden_dim))
            middle_layers.append(nn.ReLU(inplace=True))
            if self.hparams.dropout:
                middle_layers.append(nn.Dropout(p=self.hparams.drop_prob))

        #model
        self.first = nn.Sequential(nn.Linear(self.hparams.input_dim, self.hparams.hidden_dim),
                                    nn.ReLU(inplace=True))
        self.middle = nn.Sequential(*middle_layers)
        self.last = nn.Linear(self.hparams.hidden_dim, self.hparams.num_classes)

        #sample weights
        if not self.hparams.no_class_weights:
            # Registered as a buffer so that the weights follow the module to
            # whatever device it is moved to (no hard-coded .cuda(), which
            # fails on CPU-only machines). persistent=False keeps the buffer
            # out of the state_dict, so existing checkpoints still load.
            self.register_buffer("class_weights", class_weights, persistent=False)
        else:
            self.class_weights = None
        print("---")
        print("class_weights:", self.class_weights)
        # class_weights is None when no_class_weights is set; None has no .device.
        if self.class_weights is not None:
            print("device of class_weights:", self.class_weights.device)
        print("device of class:", self.device)
        print("---")

        #metrics
        self.train_acc = torchmetrics.Accuracy()
        self.train_bal_acc = torchmetrics.Accuracy(
            num_classes=self.hparams.num_classes, average="macro") # should be equal to sklearn bal. acc.

        self.val_acc = torchmetrics.Accuracy()
        self.val_bal_acc = torchmetrics.Accuracy(
            num_classes=self.hparams.num_classes, average="macro")

    def forward(self, x):
        """Return the logits for a batch of feature rows."""
        x = self.first(x)
        x = self.middle(x)
        x = self.last(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute the training loss and update/log the training metrics."""
        x, y = batch
        y_hat = self(x) #logits

        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)
        # Logging is done "log_every_n_steps" times (default=50 steps)
        self.log("loss/loss", loss, on_step=True, on_epoch=False, prog_bar=True)

        self.train_acc(y_hat, y)
        self.log("acc/train", self.train_acc, on_step=False, on_epoch=True)

        self.train_bal_acc(y_hat, y)
        self.log("bal_acc/train", self.train_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss and update/log the validation metrics."""
        x, y = batch
        y_hat = self(x) #logits

        # Mean of the predicted class indices (quick check for collapsed predictions).
        self.log("mean_pred", torch.mean(y_hat.argmax(dim=-1).float()).item(), prog_bar=True)

        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)
        self.log("loss/val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

        self.val_acc(y_hat, y)
        self.log("acc/val", self.val_acc, on_step=False, on_epoch=True)

        self.val_bal_acc(y_hat, y)
        self.log("bal_acc/val", self.val_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return {"val_loss": loss}

    def on_train_start(self):
        # Wall-clock start of the whole training run.
        self.st_total = time.time()

    def on_train_epoch_start(self):
        # Wall-clock start and global step at the beginning of the epoch.
        self.st = time.time()
        self.steps = self.global_step

    def on_train_epoch_end(self):
        # Log the average seconds per optimizer step of the finished epoch.
        elapsed = time.time() - self.st
        steps_done = self.global_step - self.steps
        # An epoch that ran no step would otherwise raise ZeroDivisionError.
        if steps_done > 0:
            self.log("time/step", elapsed / steps_done)

    def on_train_end(self):
        elapsed = time.time() - self.st_total
        print(f"Total Training Time: {time.strftime('%H:%M:%S', time.gmtime(elapsed))}")

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)

        self.log("loss/test_loss", loss, prog_bar=True)

        return loss

    def predict_step(self, batch, batch_idx):
        """Return the logits; batch is the feature tensor itself (no labels)."""
        return self(batch)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the FFN command line options to parent_parser and return it."""
        parser = parent_parser.add_argument_group("FFN")
        parser.add_argument("--no_class_weights", action='store_true')
        parser.add_argument("--hidden_dim", type=int, default=100)
        parser.add_argument("-lr", "--learning_rate", type=float, default=1e-2)
        parser.add_argument("--n_hidden", type=int, default=0)
        # NOTE(review): with action='store_false' the parsed attributes
        # `no_batch_norm` / `no_dropout` default to True and become False when
        # the flag is given, i.e. their values read like `batch_norm` /
        # `dropout`. Confirm how the caller maps them to the constructor.
        parser.add_argument("--no_batch_norm", action='store_false')
        parser.add_argument("--no_dropout", action='store_false')
        parser.add_argument("--drop_prob", type=float, default=0.5)

        return parent_parser
        

In [None]:
def feature_engineer(data):
    """Derive scaled option features and drop the columns no longer needed.

    The input frame is left untouched; a new DataFrame is returned.

    Arguments:
    data: pandas.DataFrame that must have the columns best_bid, best_offer,
          gamma, theta, vega, days_to_exp, strike_price, forwardprice,
          spotprice, cfadj, days_no_trading and adj_spot.

    Returns:
    pandas.DataFrame in which
      - best_bid is replaced by ba_spread_option = (ask - bid) / ask,
      - gamma is multiplied by spotprice and divided by 100,
      - theta and vega are divided by spotprice,
      - days_to_exp is divided by 365,
      - strike_price is replaced by moneyness = strike / spot,
      - forwardprice is divided by spotprice,
      - best_offer, cfadj, days_no_trading, spotprice, adj_spot are dropped.
    """
    spot = data["spotprice"]
    # assign() returns a new frame, so the caller's columns are never
    # overwritten (assigning to data["best_bid"] directly would clobber the
    # raw bid in the input frame).
    engineered = data.assign(
        # Bid-Ask spread: (Ask - Bid) / Ask
        best_bid=(data["best_offer"] - data["best_bid"]) / data["best_offer"],
        # Gamma, Theta, Vega: scaled following Bali et al. (2021)
        gamma=data["gamma"] * spot / 100,
        theta=data["theta"] / spot,
        vega=data["vega"] / spot,
        # Time to Maturity: scale by number of days in year: 365
        days_to_exp=data["days_to_exp"] / 365,
        # Moneyness: Strike / Spot (K / S)
        strike_price=data["strike_price"] / spot,
        # Forward Price ratio: Forward / Spot
        forwardprice=data["forwardprice"] / spot,
    )
    engineered = engineered.rename(
        columns={"best_bid": "ba_spread_option", "strike_price": "moneyness"})
    # Drop redundant/ unimportant columns
    return engineered.drop(
        ["best_offer", "cfadj", "days_no_trading", "spotprice", "adj_spot"], axis=1)


# Binary y label generator.
def binary_categorize(y):
    """Map a continuous return to a binary class label.

    Input:  continuous target variable

    Output: 1 if y is strictly positive (threshold 0%),
            0 otherwise (zero, negative or NaN)
    """
    return 1 if y > 0 else 0


# Multiclass y label generator.
def multi_categorize(y: float, classes: int):
    """
    Turn a continuous value into a class index.

        Args:
            y (float):      continuous target variable (option return)
            classes (int):  number of classes to create (3 or 5)
        Returns:
            (int):          class index in [0, classes)
        Raises:
            ValueError:     if classes is neither 3 nor 5
        CAREFUL: classes have to be between [0, C) for F.crossentropyloss.

    Class boundaries:
        3 classes:  0: y < -5%  |  1: -5% <= y <= 5%  |  2: y > 5%
        5 classes:  0: y < -5%  |  1: -5% <= y < -2.5%  |  2: -2.5% <= y <= 2.5%
                    3: 2.5% < y <= 5%  |  4: y > 5%
    Values for which no comparison holds (NaN) land in the middle class.
    """
    # Only the 3- and 5-class schemes exist (no 10-class variant).
    if classes not in (3, 5):
        raise ValueError("Only multi for 3 or 5 classes implemented right now.")

    if classes == 3:
        if y < -0.05:
            return 0
        return 2 if y > 0.05 else 1

    # classes == 5
    if y > 0.05:
        return 4
    if 0.025 < y <= 0.05:
        return 3
    if -0.05 <= y < -0.025:
        return 1
    if y < -0.05:
        return 0
    return 2


class YearEndIndeces:
    """Generator for indices where years change.

    The row positions at which the year of `dates` increases by one (plus
    len(dates) as the end of the last year) are used as exclusive slice ends:
    data[:idx] contains everything up to the end of the respective year.

        Args:
            dates (pandas.Series):      series of datetimes,
            init_train_length (int):    initial train length in years (> 0),
            val_length (int):           validation length in years,
            test_length (int):          test length in years
    """
    def __init__(self, dates, init_train_length, val_length, test_length):
        self.val_length = val_length
        self.test_length = test_length
        # Get end of year indeces for slicing.
        # TECHNICALLY its start of year indeces, i.e. the first row of the new
        # year, but because for slicing [:idx], idx is not included, we name
        # it end of year here.
        self.eoy_idx = np.where((dates.dt.year.diff() == 1))[0]
        # Append last row as end of year of last year.
        self.eoy_idx = np.append(self.eoy_idx, len(dates))

        assert init_train_length + val_length + test_length <= len(self.eoy_idx), \
            ("defined train and val are larger than eoy_indeces generated")
        assert init_train_length > 0, "init_train_length must be strictly greater than 0"

        # The 4th idx in eoy_idx is the end of year 5. -> Subtract 1.
        self.train_length_zeroindex = init_train_length - 1

        # Explicit stop positions instead of negative slice ends: eoy_idx[a:-0]
        # would be an EMPTY slice, which broke val_length/test_length == 0.
        n_years = len(self.eoy_idx)
        first = self.train_length_zeroindex
        train_stop = max(n_years - (val_length + test_length), 0)
        val_stop = max(n_years - test_length, 0)

        self.train_eoy = self.eoy_idx[first:train_stop]
        self.val_eoy = self.eoy_idx[first + val_length:val_stop]
        # For generate_idx():
        self.test_eoy = self.eoy_idx[first + val_length + test_length:]

    def generate_idx(self):
        """Yield one dict {"train", "val", "test"} of exclusive end indices per split."""
        n_splits = len(self.eoy_idx) - (self.train_length_zeroindex + self.val_length
                                        + self.test_length)
        for i in range(n_splits):
            yield ({"train": self.train_eoy[i],
                    "val": self.val_eoy[i],
                    "test": self.test_eoy[i]}
                )


class YearMonthEndIndeces:
    """Computes month-change and year-change row indices of `dates`.

    After construction, get_indeces() returns the year-change indices from the
    first test year onwards together with a dict that maps each predicted year
    to the month-change indices belonging to that year.

    NOTE(review): the class hard-codes properties of one specific dataset
    (first year 1996, last predicted year 2021, 26 * 12 - 2 month boundaries)
    and asserts them, so it presumably fails on any other date range.

        Args:
            dates (pandas.Series):      series of datetimes,
            init_train_length (int):    initial train length,
            val_length (int):           validation length,
            test_length (int):          accepted but not used by this class
    """
    def __init__(self, dates, init_train_length, val_length, test_length):
        # self.val_length = val_length
        # self.test_length = test_length
        # Get end of month indeces for slicing.
        # TECHNICALLY its start of month indeces, i.e. the first row of the
        # new month, but because for slicing [:idx], idx is not included, we
        # name it end of month here.
        # A month change is detected as month number +1, or -11 for Dec->Jan.
        self.eom_idx =  np.concatenate([
                        np.where((dates.dt.month.diff() == 1))[0], 
                        np.where((dates.dt.month.diff() == -11))[0] #Dec->Jan
                        ])
        # Sort, since Dec->Jan months indeces are only concatenated at the end.
        self.eom_idx.sort()
        # Append last row as end of month of last month.
        self.eom_idx = np.append(self.eom_idx, len(dates))

        # End of year indeces (rows where the year increases by one), plus the
        # last row as end of the last year.
        self.eoy_idx =  np.where((dates.dt.year.diff() == 1))[0]
        self.eoy_idx = np.append(self.eoy_idx, len(dates))

        # Careful: -2 because November (-> cao (2021) return calc.) and December 2021 is not in dataset.
        assert (26 * 12 - 2 == len(self.eom_idx)), ("Some end of month indeces are missing.")
        assert init_train_length > 0, "init_train_length must be strictly greater than 0."

        # The 4th idx in eoy_idx is the end of year 5. -> Subtract 1.
        self.train_length_zeroindex = init_train_length - 1

        # Get eoy indeces where we predicted on AND FIRST ENTRY IS EOY_VAL == SOY_TEST
        self.test_eoy = self.eoy_idx[self.train_length_zeroindex + val_length:]

        # Get first end of month idx of year X until first end of month of year Y
        # a prediction was made.
        years_predicted = np.arange(1996 + init_train_length + val_length, 2021 + 1) #upper limit not included.
        # Maps year -> array of (up to 13) consecutive entries of eom_idx,
        # starting at the entry that equals the year's start index.
        self.month_idx_per_year = {}
        for i, eoy_idx in enumerate(self.test_eoy[:-1]): #-1 because [last_index:last_index+13] not needed.
            idx_in_idx = np.where(np.in1d(self.eom_idx, eoy_idx))[0].item() #only one eom_idx equals one eoy_idx
            # + 13, so that slicing is from start of year until +12 months (end of year).
            self.month_idx_per_year[years_predicted[i]] = self.eom_idx[idx_in_idx:idx_in_idx+13]

        # Check that dictionary years are correct and that months are consecutive.
        # NOTE(review): check_month_years is not defined in this notebook
        # section — it has to be in scope before this class is instantiated.
        assert check_month_years(self.month_idx_per_year, dates=dates), ("Years of end "
        "of month indices are wrong or the months are not consecutive.")

    def get_indeces(self):
        """Return the tuple (test_eoy, month_idx_per_year) computed in __init__."""
        # Return Tuple.
        return (self.test_eoy, self.month_idx_per_year)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import PredefinedSplit
import torch

from torch.utils.data import TensorDataset, DataLoader
import pytorch_lightning as pl
from pathlib import Path
import pdb
from sklearn.utils.class_weight import compute_class_weight

def load_data(path: Path, dataset: str):
    """Load a dataset as DataFrame, depending on the specified name.

        Args:
            path (Path):    For 'small', 'medium' and 'big' the directory that
                            contains the predefined parquet files; for
                            'predict' the path of the parquet file itself.
            dataset (str):  One of 'small', 'medium', 'big' or 'predict'.
        Returns:
            pandas.DataFrame: Content of the selected parquet file.
        Raises:
            ValueError: If dataset is not one of the names above.
    """
    if dataset == "predict":
        # path already points to the parquet file to read.
        return pd.read_parquet(path)

    filenames = {
        "small": "final_df_call_cao_small.parquet",
        "medium": "final_df_call_cao_med_fillmean.parquet",
        "big": "final_df_call_cao_big_fillmean.parquet",
    }
    try:
        filename = filenames[dataset]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments, which previously also ended
        # up in the ValueError branch.
        raise ValueError("Specify dataset as either 'small', 'medium', 'big' or 'predict'. "
                         "For 'predict' the path to the parquet file is required.") from None
    return pd.read_parquet(path/filename)

class DataModule(pl.LightningDataModule):
    """Dataset Loader for Pytorch Lightning (Neural Network).

    Loads the data (from disk, or from `custom_data` if given), truncates it
    at the end of the test period of split `year_idx`, converts the target
    `option_ret` into class labels and exposes train/val/test/predict
    dataloaders. Features are standardized in setup() with the mean and
    standard deviation of the training part.

        Args:
            path (str):                 data directory; only used (and then
                                        required) if custom_data is None.
            year_idx (int):             which split of YearEndIndeces to use.
            dataset (str):              dataset name passed to load_data();
                                        only used if custom_data is None.
            batch_size (int):           batch size of all dataloaders.
            init_train_length (int):    initial train length in years.
            val_length (int):           validation length in years.
            test_length (int):          test length in years.
            label_fn (str):             'binary', 'multi3' or 'multi5'.
            custom_data (pd.DataFrame): optional frame used instead of
                                        reading from disk.
        Raises:
            ValueError: if label_fn is not one of the names above.
    """
    def __init__(self,
                 path: str, # will be converted to Path in __init__
                 year_idx: int,
                 dataset: str,
                 batch_size: int,
                 init_train_length: int,
                 val_length: int,
                 test_length: int,
                #  start_val: str,
                #  start_test: str,
                 label_fn: str,
                 custom_data: pd.DataFrame = None,
        ):
        super().__init__()
        # custom_data is excluded as well: otherwise the complete DataFrame
        # would be stored as a "hyperparameter" (and end up in checkpoints).
        self.save_hyperparameters(ignore=["path", "custom_data"])

        # Use the frame passed in, otherwise read the data from disk.
        if custom_data is not None:
            self.data = custom_data
        else:
            path = Path(path)
            self.data = load_data(path, dataset)

        # get splits
        splitter = YearEndIndeces(
                                self.data["date"],
                                init_train_length=init_train_length,
                                val_length=val_length,
                                test_length=test_length,
                                )
        eoy_indeces = list(splitter.generate_idx())
        self.eoy_train = eoy_indeces[year_idx]["train"]
        self.eoy_val = eoy_indeces[year_idx]["val"]
        self.eoy_test = eoy_indeces[year_idx]["test"]

        # Truncate data
        self.data = self.data.iloc[:self.eoy_test]
        assert len(self.data) == self.eoy_test, "length of data is not equal to eoy_test"

#         # feature engineer data
#         self.data = feature_engineer(self.data)

        # create y
        self.y = self.data["option_ret"]
        # make classification problem
        if label_fn == "binary":
            self.y = self.y.apply(binary_categorize)
        elif label_fn == "multi3":
            self.y = self.y.apply(multi_categorize, classes=3)
        elif label_fn == "multi5":
            self.y = self.y.apply(multi_categorize, classes=5)
        else:
            raise ValueError("Specify label_fn as either 'binary', 'multi3' or 'multi5'")
        # create X
        self.X = self.data.drop(["option_ret"], axis=1)

        # save dates and drop
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # to torch Tensor
        self.X = torch.from_numpy(self.X.values).float() #-> will be standardized in setup, so do it there.
        self.y = torch.from_numpy(self.y.values)

    def setup(self, stage: str = None):
        """Split X/y into train, val and test and standardize the features."""
        # X and y are sliced with the SAME indices. (Deriving the y slices
        # from the lengths of the X slices breaks for an empty test part:
        # y[-0:] is the whole tensor, not an empty one.)
        # train
        self.X_train = self.X[:self.eoy_train]
        self.y_train = self.y[:self.eoy_train]

        # val
        self.X_val = self.X[self.eoy_train:self.eoy_val]
        self.y_val = self.y[self.eoy_train:self.eoy_val]

        # test
        self.X_test = self.X[self.eoy_val:self.eoy_test]
        self.y_test = self.y[self.eoy_val:self.eoy_test]

        assert (len(self.X_train)+len(self.X_val)+len(self.X_test)) == len(self.data), \
            "sum of X train, val, test is not equal length of dataset"
        assert (len(self.y_train)+len(self.y_val)+len(self.y_test) == len(self.data)), \
            "sum of y train, val, test is not equal to length of dataset"

        # Statistics of the training part only (no look-ahead into val/test).
        # NOTE(review): a feature that is constant in the training part has
        # std == 0, which yields inf/NaN after scaling.
        mean = torch.mean(self.X_train, axis=0)
        std = torch.std(self.X_train, axis=0)

        # Standardize X_train, X_val and X_test with mean/std from X_train
        self.X_train = (self.X_train - mean) / std
        self.X_val = (self.X_val - mean) / std
        self.X_test = (self.X_test - mean) / std

        # Save variables to pass to model class
        # input dim
        self.input_dim = self.X_train.shape[1]
        # number of classes (as observed in the training labels)
        self.num_classes = len(self.y_train.unique())
        # class weights: n_samples / count per class
        self.class_weights = len(self.y_train) / self.y_train.unique(return_counts=True)[1]

        print("*****************************************************************************************")
        print("Current TORCH dataset information:")
        print("---")
        print("class counts: ", self.y_train.unique(return_counts=True))
        print("class_weights:", self.class_weights)
        print("device of class_weights:", self.class_weights.device)
        print("---")
        print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
        print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
        print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
        print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
        print("---")
        print(f"train start date: ", self.dates.iloc[0].strftime("%Y-%m-%d"),
              ", train end date: ", self.dates.iloc[:self.eoy_train].iloc[-1].strftime("%Y-%m-%d"))
        print(f"val start date: ", self.dates.iloc[self.eoy_train:self.eoy_val].iloc[0].strftime("%Y-%m-%d"),
              ", val end date: ", self.dates.iloc[self.eoy_train:self.eoy_val].iloc[-1].strftime("%Y-%m-%d"))
        print(f"test start date: ", self.dates.iloc[self.eoy_val:self.eoy_test].iloc[0].strftime("%Y-%m-%d"),
              ", test end date: ", self.dates.iloc[self.eoy_val:self.eoy_test].iloc[-1].strftime("%Y-%m-%d"))
        print("*****************************************************************************************")


    def example(self):
        """Returns a random training example."""
        idx = np.random.randint(0, len(self.X_train))
        x, y = self.X_train[idx], self.y_train[idx]
        return (x, y)

    def train_dataloader(self):
        dataset = TensorDataset(self.X_train, self.y_train)
        return DataLoader(dataset, batch_size=self.hparams.batch_size,
                         num_workers=0, #uses just the main worker, see https://stackoverflow.com/questions/71713719/runtimeerror-dataloader-worker-pids-15876-2756-exited-unexpectedly
                         # there are issues occuring on windows where PID workers exit unexpectedly.
                         pin_memory=True,
                         shuffle=True, #shuffle training data
                         )

    def val_dataloader(self):
        dataset = TensorDataset(self.X_val, self.y_val)
        return DataLoader(dataset, batch_size=self.hparams.batch_size,
                         num_workers=0,
                         pin_memory=True,
                         shuffle=False,
                         )

    def test_dataloader(self):
        dataset = TensorDataset(self.X_test, self.y_test)
        return DataLoader(dataset, batch_size=self.hparams.batch_size,
                         num_workers=0,
                         pin_memory=True,
                         shuffle=False, #must not shuffle here!
                         )

    def predict_dataloader(self):
        dataset = self.X_test # predict_step expects tensor not a list
        return DataLoader(dataset, batch_size=self.hparams.batch_size,
                        num_workers=0,
                        pin_memory=True,
                        shuffle=False, #must not shuffle here!
                        )

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the DataModule command line options to parent_parser and return it."""
        parser = parent_parser.add_argument_group("DataModule for Lightning")
        parser.add_argument("--batch_size", type=int, default=512)
        return parent_parser

In [None]:
# Reference prediction ("old" way): the DataModule reads the dataset from disk.
# The most recent year's (year_idx, best_ckpt_path) tuple is used; the next
# cell reuses `yearidx_target`.
yearidx_target = yearidx_target_list[-1]

best_path = yearidx_target[1]

# No parquet file is read in this cell: with custom_data=None the DataModule
# loads the data itself from args_exp.path_data.
print(f"Loading model to predict from path: {best_path}")
model = FFN.load_from_checkpoint(best_path)
dm = DataModule(
    path=args_exp.path_data,
    year_idx=yearidx_target[0],  # must belong to the same year as best_path
    dataset="small",  # NOTE(review): hard-coded; presumably equals args_exp.dataset for this run
    batch_size=128,  # fixed batch size; predictions are made row by row, so it should not affect them
    init_train_length=args_exp.init_train_length,
    val_length=args_exp.val_length,
    test_length=args_exp.test_length,
    label_fn=args_exp.label_fn,
    custom_data=None,
)
trainer = pl.Trainer(
    deterministic=True,
    gpus=1, #fractional gpus here not possible.
    logger=False, #deactivate logging for prediction
)
# predict
preds = trainer.predict(model=model, datamodule=dm) #returns list of batch predictions.
preds = torch.cat(preds) #-> one tensor of shape [n_test, num_classes]
preds_argmax = preds.argmax(dim=1).numpy()
preds_argmax_df_old = pd.DataFrame(preds_argmax, columns=["pred"])

In [None]:
# Same prediction as in the previous cell ("new" way), but the data is handed
# to the DataModule as a DataFrame instead of being read from disk inside it.
# Relies on `yearidx_target` set in the previous cell.
best_path = yearidx_target[1]

# Frame passed in via custom_data. It is read from the same "small" parquet
# file that load_data(..., "small") uses, so both ways should see equal data.
rand_data = pd.read_parquet(Path(args_exp.path_data)/"final_df_call_cao_small.parquet")

print(f"Loading model to predict from path: {best_path}")
model = FFN.load_from_checkpoint(best_path)
dm = DataModule(
    path=None,  # not needed, custom_data is given
    year_idx=yearidx_target[0],  # must belong to the same year as best_path
    dataset=None,  # not needed, custom_data is given
    batch_size=128,  # same fixed batch size as in the previous cell
    init_train_length=args_exp.init_train_length,
    val_length=args_exp.val_length,
    test_length=args_exp.test_length,
    label_fn=args_exp.label_fn,
    custom_data=rand_data,
)
trainer = pl.Trainer(
    deterministic=True,
    gpus=1, #fractional gpus here not possible.
    logger=False, #deactivate logging for prediction
)
# predict
preds = trainer.predict(model=model, datamodule=dm) #returns list of batch predictions.
preds = torch.cat(preds) #-> one tensor of shape [n_test, num_classes]
preds_argmax = preds.argmax(dim=1).numpy()
preds_argmax_df_new = pd.DataFrame(preds_argmax, columns=["pred"])

In [None]:
# # CHECK IF METHODS ARE EQUAL, when loading data from path vs. inputting them.

# # load best model
# list_equal = []

# for yearidx_target in tqdm(yearidx_target_list):

#     best_path = yearidx_target[1] ######## IMPORTANT ############################################

#     # RANDOMIZED DATA
#     rand_data = pd.read_parquet(Path(args.path_data)/"final_df_call_cao_small.parquet")

#     # Copy best model checkpoint to loop folder for later analysis.
#     # test_year_end = val_year_end + args.test_length
#     # shutil.copy2(best_path, loop_path/f"best_ckpt{test_year_end}")
#     print(f"Loading model to predict from path: {best_path}")
#     model = FFN.load_from_checkpoint(best_path)
#     dm = DataModule(
#         path=args.path_data, ########################## SET TO NONE
#         year_idx=yearidx_target[0], ######## IMPORTANT ############################################
#         dataset="small", ############################ SET TO NONE
#         batch_size=128, # #################TAKE FIXED BATCH SIZE, SHOULD AFFECT PREDICTIONS?
#         init_train_length=args.init_train_length,
#         val_length=args.val_length,
#         test_length=args.test_length,
#         label_fn=args.label_fn,
#         custom_data=None, #################################### NEW
#         # config=model.hparams.config, # so that config is not hyperparam search again
#     )
#     trainer = pl.Trainer(
#         deterministic=True,
#         gpus=math.ceil(args.gpus_per_trial), #fractional gpus here not possible.
#         logger=False, #deactivate logging for prediction
#     )
#     # predict
#     preds = trainer.predict(model=model, datamodule=dm) #returns list of batch predictions.
#     preds = torch.cat(preds) #preds is a list already of [batch_size, num_classes]. 
#     preds_argmax = preds.argmax(dim=1).numpy()
#     preds_argmax_df_old = pd.DataFrame(preds_argmax, columns=["pred"])
#     # # prediction path
#     # save_to_dir = loop_path/f"prediction{test_year_end}.csv"
#     #         preds_argmax_df.to_csv(save_to_dir, index_label="id")
    
#     # load best model
#     best_path = yearidx_target[1] ######## IMPORTANT ############################################

#     # RANDOMIZED DATA
#     rand_data = pd.read_parquet(Path(args.path_data)/"final_df_call_cao_small.parquet")

#     # Copy best model checkpoint to loop folder for later analysis.
#     # test_year_end = val_year_end + args.test_length
#     # shutil.copy2(best_path, loop_path/f"best_ckpt{test_year_end}")
#     print(f"Loading model to predict from path: {best_path}")
#     model = FFN.load_from_checkpoint(best_path)
#     dm = DataModule(
#         path=None, ########################## SET TO NONE
#         year_idx=yearidx_target[0], ######## IMPORTANT ############################################
#         dataset=None, ############################ SET TO NONE
#         batch_size=128, # #################TAKE FIXED BATCH SIZE, SHOULD AFFECT PREDICTIONS?
#         init_train_length=args.init_train_length,
#         val_length=args.val_length,
#         test_length=args.test_length,
#         label_fn=args.label_fn,
#         custom_data=rand_data, #################################### NEW
#         # config=model.hparams.config, # so that config is not hyperparam search again
#     )
#     trainer = pl.Trainer(
#         deterministic=True,
#         gpus=math.ceil(args.gpus_per_trial), #fractional gpus here not possible.
#         logger=False, #deactivate logging for prediction
#     )
#     # predict
#     preds = trainer.predict(model=model, datamodule=dm) #returns list of batch predictions.
#     preds = torch.cat(preds) #preds is a list already of [batch_size, num_classes]. 
#     preds_argmax = preds.argmax(dim=1).numpy()
#     preds_argmax_df_new = pd.DataFrame(preds_argmax, columns=["pred"])
#     # # prediction path
#     # save_to_dir = loop_path/f"prediction{test_year_end}.csv"
#     #         preds_argmax_df.to_csv(save_to_dir, index_label="id")
    
#     list_equal.append((yearidx_target[0], (preds_argmax_df_old == preds_argmax_df_new).all().all()))
#     print(yearidx_target[0], (preds_argmax_df_old == preds_argmax_df_new).all().all())

# list_equal

In [None]:
# True if every prediction from the disk-loaded run equals the corresponding
# prediction from the custom_data run.
(preds_argmax_df_old == preds_argmax_df_new).all().all()

In [None]:
# ALTERNATIVE PREDICTION: call the model directly instead of using the Trainer.
# eval() switches dropout off and makes batch norm use its running statistics;
# no_grad() avoids building the autograd graph for a pure inference pass.
model.eval()
with torch.no_grad():
    # The input is moved to wherever the model currently lives.
    preds = model(dm.X_test.to(model.device)) #one tensor of logits: [n_test, num_classes]
preds_argmax = preds.argmax(dim=1).cpu().numpy()
preds_argmax_df2 = pd.DataFrame(preds_argmax, columns=["pred"])

In [None]:
# Display the checkpoint path that was used for the predictions above.
best_path

In [None]:
# Display the experiment directory.
exp_dir

In [None]:
import shutil
def collect_preds(exp_dir: Path) -> None:
    """Copy every yearly 'prediction????.csv' into '<exp_dir>/predictions'.

    Each direct sub-directory of `exp_dir` (except 'predictions' itself) is
    searched for files named 'prediction' + a four-character year starting
    with 1 or 2 + '.csv'. Files that already exist in the target folder are
    not overwritten; a message is printed instead.

    Args:
        exp_dir: Experiment directory that contains the per-loop sub-directories.
    """
    preds_dir = exp_dir/"predictions"
    preds_dir.mkdir(exist_ok=True, parents=True)
    for subdir in exp_dir.iterdir():
        # Skip plain files and the target folder itself.
        if not subdir.is_dir() or subdir.name == "predictions":
            continue
        # See https://docs.python.org/3/library/fnmatch.html#module-fnmatch
        # '[12]' matches exactly one of '1' or '2' (years 1995, ..., 2000, ...).
        # The former '[1,2]' additionally matched a literal comma.
        for file in subdir.glob("prediction[12]???.csv"):
            if (preds_dir/file.name).is_file():
                print(f"File {file.name} already exists in '{preds_dir.name}' folder.")
                continue
            try:
                shutil.copy2(file, preds_dir)
            except shutil.SameFileError:
                print("Source and Destination are the same file...")

In [None]:
collect_preds(exp_dir)

In [None]:
# Read from loop folder directly.
read_preds = pd.read_csv(best_path.parent/f"prediction{str(yearidx_target[1])[-4:]}.csv", index_col="id")
# Read from collected 'predictions' folder.
read_preds2 = pd.read_csv(exp_dir/"predictions"/f"prediction{str(yearidx_target[1])[-4:]}.csv", index_col="id")

In [None]:
assert (read_preds == read_preds2).all().all(), "Predictions from loop folder different than in the 'predictions'."
assert (preds_argmax_df_old == read_preds).all().all(), "Predictions from loaded model not equal to the saved predictions."
assert (preds_argmax_df_new == read_preds).all().all(), "Predictions from loaded model not equal to the saved predictions."
assert preds_argmax_df_new.equals(read_preds), "Predictions from loaded model not equal to the saved predictions."

## Randomize Feature and test significance of difference of monthly PF returns

In [None]:
exp_dir

### Compare balanced accuracy

In [None]:
## Assuming we have all predictions from permutation in a dataframe, like so:
preds_permut = pd.read_csv(exp_dir/"all_pred.csv", index_col=0)

In [None]:
preds_permut

In [None]:
## Original preds
# NOTE(review): this reads the same 'all_pred.csv' as `preds_permut`, so both frames are
# identical until the permuted predictions are stored under their own path -- confirm intent.
preds_orig = pd.read_csv(exp_dir/"all_pred.csv", index_col=0)

In [None]:
preds_orig

In [None]:
# Calc. difference in bal_accuracy. Significant?

In [None]:
preds_orig

In [None]:
assert dm.data.iloc[-len(preds_orig):].equals(dm.data[dm.data["date"] > "2008"])

In [None]:
y_true = dm.y[-len(preds_orig):].numpy()

In [None]:
from sklearn.metrics import balanced_accuracy_score

In [None]:
preds_orig.iloc[33000:]

In [None]:
preds_orig["pred"]

In [None]:
# Sanity check before scoring: the labels and both prediction sets must have
# the same number of rows. (The previous condition compared len(y_true) with
# itself and therefore was always true.)
assert len(y_true) == len(preds_orig) == len(preds_permut), (
    "y_true, preds_orig and preds_permut differ in length.")

In [None]:
(y_true == y_true).all()

In [None]:
y_true.shape

In [None]:
balanced_accuracy_score(y_true, preds_permut["pred"])

In [None]:
balanced_accuracy_score(y_true, preds_orig["pred"])

### Compare monthly returns

In [None]:
dm.X_test

In [None]:
yearidx_target

In [None]:
# Load original PF returns.
pd.read_parquet(Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data")/"final_df_call_cao_small.parquet")

In [None]:
dm.y.shape

In [None]:
dm.y[-len(preds_orig):].shape

In [None]:
dm.y[234731:].shape

In [None]:
preds_orig

In [None]:
from pathlib import Path
import shutil
import numpy as np
import pandas as pd
import dataframe_image as dfi
from tqdm import tqdm

def collect_preds(exp_dir: Path) -> None:
    """Copy every yearly 'prediction????.csv' into '<exp_dir>/predictions'.

    Each direct sub-directory of `exp_dir` (except 'predictions' itself) is
    searched for files named 'prediction' + a four-character year starting
    with 1 or 2 + '.csv'. Every copy is logged; files that already exist in
    the target folder are not overwritten and are reported instead.

    Args:
        exp_dir: Experiment directory that contains the per-loop sub-directories.
    """
    def _shown(path: Path) -> Path:
        # Prefer a path relative to the working directory for shorter log
        # lines, but fall back to the path itself: Path.relative_to raises
        # ValueError when `path` does not live below the working directory.
        try:
            return path.relative_to(Path.cwd())
        except ValueError:
            return path

    preds_dir = exp_dir/"predictions"
    preds_dir.mkdir(exist_ok=True, parents=True)
    for subdir in exp_dir.iterdir():
        # Skip plain files and the target folder itself.
        if not subdir.is_dir() or subdir.name == "predictions":
            continue
        # See https://docs.python.org/3/library/fnmatch.html#module-fnmatch
        # '[12]' matches exactly one of '1' or '2' (years 1995, ..., 2000, ...).
        # The former '[1,2]' additionally matched a literal comma.
        for file in subdir.glob("prediction[12]???.csv"):
            if (preds_dir/file.name).is_file():
                print(f"File {file.name} already exists in '{preds_dir.name}' folder.")
                continue
            print(f"Copy file: '{_shown(file)}' to '{_shown(preds_dir)}'")
            try:
                shutil.copy2(file, preds_dir)
            except shutil.SameFileError:
                print("Source and Destination are the same file...")


def concat_and_save_preds(exp_dir: Path) -> pd.DataFrame:
    """Stack all yearly prediction files and store the result as 'all_pred.csv'.

    All '*.csv' files in '<exp_dir>/predictions' are read in ascending
    (sorted path) order. The year is taken from the last four characters of
    each file stem; the first year must be positive and every following year
    must be exactly the previous year plus one (2009, 2010, 2011, ...).

    Args:
        exp_dir: Experiment directory containing the 'predictions' folder.

    Returns:
        The row-wise concatenation of all files with a fresh 0..n-1 index.
        The same frame is written to '<exp_dir>/all_pred.csv'.

    Raises:
        AssertionError: If the years are not positive / not consecutive.
    """
    preds_dir = exp_dir/"predictions"
    # Ascending sort is essential: years must be visited as 2003 -> 2004 -> ...
    csv_files = sorted(preds_dir.glob("*.csv"))
    frames = []
    last_year = 0
    for position, csv_file in enumerate(csv_files):
        year = int(csv_file.stem[-4:])
        if position == 0:
            # First file: only require a sensible (positive) year.
            assert year > last_year, "ERROR: year is not a positive integer"
        else:
            # Later files: no gaps allowed between consecutive years.
            assert year == last_year + 1, "ERROR: year is not succeeding previous year"
        last_year = year
        frames.append(pd.read_csv(csv_file))
    preds_concat_df = pd.concat(frames).reset_index(drop=True)
    preds_concat_df.to_csv(exp_dir/"all_pred.csv")
    return preds_concat_df


def check_month_years(dic, dates):
    """Validate the month-boundary row indices stored per year in `dic`.

    For every year key, the i-th index (zero-based) must point at a row of
    `dates` whose year equals the key and whose month equals i + 1. The last
    index of each list is treated differently: it is the first row of the
    *following* year, so the row just before it is inspected and must fall
    into month i of the key year.

    Example:
        A year with 12 months of data has 13 indices: the first is the first
        row of the year, the last is the first row of the next year.

    Args:
        dic:   Mapping year -> sequence of row indices.
        dates: Indexable collection of date-like objects (with `.year` and
               `.month`), addressed by the indices in `dic`.

    Returns:
        True if every index is consistent, otherwise False.
    """
    for year, boundaries in dic.items():
        closing_pos = len(boundaries) - 1  # zero-based position of the overlap entry
        for pos, eom_idx in enumerate(boundaries):
            expected_year = int(year)
            if pos == closing_pos:
                # Overlap entry: step one row back into the current year.
                row_date = dates[eom_idx-1]
                expected_month = pos
            else:
                row_date = dates[eom_idx]
                expected_month = pos + 1
            if expected_year != row_date.year or expected_month != row_date.month:
                return False
    return True

# Verifies that every row with id == 0 sits at a start-of-year position.
def check_eoy(concat_df: pd.DataFrame, eoy_indeces: np.ndarray):
    """Compare the rows whose 'id' is 0 against the given start-of-year indices.

    The k-th row (in row order) with `id == 0` must carry the value
    `eoy_indeces[k]` in its 'index' column.

    Args:
        concat_df:   Frame with at least the columns 'id' and 'index'.
        eoy_indeces: Expected 'index' values, one per row with id 0.

    Returns:
        True if all such rows match, otherwise False.
    """
    zero_id_rows = np.flatnonzero((concat_df["id"] == 0).to_numpy())
    index_col = concat_df["index"]
    return all(
        index_col.iloc[row] == eoy_indeces[k]
        for k, row in enumerate(zero_id_rows)
    )


def get_and_check_min_max_pred(concat_df: pd.DataFrame, labelfn_exp: str):
    """Check that the lowest and the highest class are actually predicted.

    Both extreme classes are needed to form long-short portfolios. The
    function asserts that every theoretical class occurs in 'pred' at least
    once overall, and prints in how many months (groups of 'date') the short
    (lowest) and long (highest) class is missing.

    Arguments:
        concat_df:      Dataframe with the columns 'date' and 'pred'.
        labelfn_exp:    The label_fn of the experiment: 'binary' or
                        'multi{number of classes}'.
    Returns:
        max_real:       Max realized prediction over all the data.
        min_real:       Min realized prediction over all the data.
        classes:        All distinct predicted classes in ascending order.
    """
    classes = list(concat_df["pred"].unique())
    classes.sort()  # ascending order
    # Theoretical class range: 0 .. (number of classes - 1).
    min_theor = 0
    # 'multi3', 'multi5', 'multi10' -> 3, 5, 10 classes.
    max_theor = 1 if labelfn_exp=="binary" else int(labelfn_exp[5:]) - 1
    assert len(classes) == max_theor + 1, "At least one class is not predicted at all."
    assert classes[0] == min_theor and classes[-1] == max_theor, "List 'classes' is not sorted in ascending order."

    pred_by_month = concat_df.groupby("date")["pred"]

    # Short side: smallest prediction per month.
    min_real_series = pred_by_month.min()
    min_real = min_real_series.min()
    assert min_theor == min_real, (
        "Not a single month has the prediction of the theoretical minimum class.")
    months_no_min = min_real_series[min_real_series != min_real].count()
    print(f"Number of months where Short class {min_real} is not predicted:", 
            months_no_min, "out of", f"{len(min_real_series)}.")

    # Long side: largest prediction per month.
    max_real_series = pred_by_month.max()
    max_real = max_real_series.max()
    assert max_theor == max_real, (
        "Not a single month has the prediction of the theoretical maximum class.")
    months_no_max = max_real_series[max_real_series != max_real].count()
    print(f"Number of months where Long class {max_real} is not predicted:", 
            months_no_max, "out of", f"{len(max_real_series)}.")
    return max_real, min_real, classes


def various_tests(agg_dict: dict, concat_df: pd.DataFrame, col_list: list, classes: list, class_ignore: dict) -> None:
    """Perform various sanity checks on the monthly aggregated results in agg_dict.

    Args:
        agg_dict:     Mapping 'class{c}' -> dataframe aggregated per 'date'.
        concat_df:    Row-level dataframe with 'date', 'pred', 'if_long_short'
                      and one 'weights_{c}' column per class.
        col_list:     Columns that were aggregated.
        classes:      All classes in ascending order (first = short, last = long).
        class_ignore: Mapping 'class{c}' -> dates on which class c was never
                      predicted; these months are dropped before Tests 2-5.

    Raises:
        AssertionError: If any of the five checks fails.
    """
    # Test1: Compare agg_dict with agg_dict2, calculated via 'weighted_avg' function 
    # and not via 'np.average'. They should yield the same (up to small precision).
    agg_dict2 = {}
    for c in tqdm(classes):
        agg_df = concat_df.groupby("date").aggregate(weighted_means_by_column2, col_list, f"weights_{c}")
        agg_dict2[f"class{c}"] = agg_df
    for key in agg_dict.keys():
        pd.testing.assert_frame_equal(agg_dict[key], agg_dict2[key])
    print("Test1: Successful! Weighted_avg function seems to yield the same as np.average.")

    # Work on copies: the dict entries are re-bound below (drop returns new
    # frames), so the caller's agg_dict and concat_df stay untouched.
    agg_dict_copy = agg_dict.copy()
    concat_df_copy = concat_df.copy()
    # Drop 'class_ignore' rows:
    for c in classes:
        agg_dict_copy[f"class{c}"] = agg_dict_copy[f"class{c}"].drop(class_ignore[f"class{c}"])
    # Test2: Check whether first and last month aggregation yield same as first 
    # and last entries of agg_dict_copy for each class.
    for c in classes:
        concat_df_copy_c = concat_df_copy[~concat_df_copy["date"].isin(class_ignore[f"class{c}"])]
        # First/last month are taken from the first/last row, i.e. this relies
        # on concat_df being ordered by 'date'.
        first_month = concat_df_copy_c.loc[concat_df_copy_c["date"] == concat_df_copy_c["date"].iloc[0]]
        last_month = concat_df_copy_c.loc[concat_df_copy_c["date"] == concat_df_copy_c["date"].iloc[-1]]
        for k in col_list:
            assert np.average(first_month[k], weights=first_month[f"weights_{c}"]) == agg_dict_copy[f"class{c}"].iloc[0][k]
            assert np.average(last_month[k], weights=last_month[f"weights_{c}"]) == agg_dict_copy[f"class{c}"].iloc[-1][k]
            # abs() is required: without it any (arbitrarily large) negative
            # deviation would satisfy '< 0.0001' and slip through.
            assert abs(weighted_avg(first_month, k, f"weights_{c}") - agg_dict_copy[f"class{c}"].iloc[0][k]) < 0.0001
            assert abs(weighted_avg(last_month, k, f"weights_{c}") - agg_dict_copy[f"class{c}"].iloc[-1][k]) < 0.0001
    print("Test2: Successful! First and last month individual aggregation (of non-to-ignore months) yield the same "
         "as first and last entries of the aggregated dataframe for the respective class.")

    # Test3: If "pred" column in aggregated df's corresponds to class in each row (month).
    for c in classes:
        assert (agg_dict_copy[f"class{c}"]["pred"] == c).all(), "Aggregated 'pred' is not equal to the class in at least one month."
    print("Test3: Successful! Aggregated 'pred' column is equal to the class in each month.")
    # Test4: If short and long portfolios are aggregated correctly.
    assert ((agg_dict_copy[f"class{classes[0]}"]["if_long_short"] == -1).all() and
            (agg_dict_copy[f"class{classes[-1]}"]["if_long_short"] == 1).all()), ("Long "
            "or short portfolio aggregation does not yield 1 or -1 in 'if_long_short' column.")
    print("Test4: Successful! Both the lowest class and the highest class corrrespond "
        "to -1 and 1 in the column 'if_long_short', respectively.")
    # Test5: Check if one-hot encoding columns correspond to 'preds' and 'if_long_short'.
    for c in classes:
        for k in classes:
            if c == k:
                # Own class: weight 1 everywhere and 'pred' equal to the class.
                assert (agg_dict_copy[f"class{c}"][f"weights_{k}"] == 1).all()
                assert (agg_dict_copy[f"class{c}"]["pred"] == k).all()
                if c==classes[0]:
                    assert (agg_dict_copy[f"class{c}"]["if_long_short"] == -1).all()
                elif c==classes[-1]:
                    assert (agg_dict_copy[f"class{c}"]["if_long_short"] == 1).all()
            else:
                # Foreign class: weight must be 0 in every month.
                assert (agg_dict_copy[f"class{c}"][f"weights_{k}"] == 0).all()
    print("Test5: Successful! Check whether one-hot encoding columns make sense "
        "with the columns 'preds' and 'if_long_short'.")


# Weighted-average helper used to aggregate the portfolios (based on np.average).
def weighted_means_by_column(x, cols, w):
    """Weighted mean of every column in `cols`, weighted by column `w` of `x`.

    Args:
        x:    DataFrame holding the data and the weight column.
        cols: Names of the columns to average.
        w:    Name of the weight column.

    Returns:
        Series indexed by `cols` with the weighted means. If np.average
        raises ZeroDivisionError (weights sum to zero, i.e. a month without
        any prediction for the class), all entries are 0 instead.
    """
    means = []
    try:
        for col in cols:
            means.append(np.average(x[col], weights=x[w]))
    except ZeroDivisionError:
        # No weight at all in this group -> report zeros for every column.
        return pd.Series(0, cols)
    return pd.Series(means, cols)


# Reference implementation, only used by the sanity tests:
def weighted_avg(df, values, weights):
    """Weighted average of column `values` using column `weights` of `df`.

    Raises:
        ZeroDivisionError: If the weights sum to zero.
    """
    weight_col = df[weights]
    total_weight = weight_col.sum()
    if total_weight == 0:
        raise ZeroDivisionError
    # Built-in sum on purpose: keeps the element-by-element summation order.
    weighted_total = sum(df[values] * weight_col)
    return weighted_total / total_weight

def weighted_means_by_column2(x, cols, w):
    """Same contract as the np.average based aggregation, built on `weighted_avg`.

    Averages each column in `cols` of DataFrame `x`, weighting the rows by
    column `w`. If `weighted_avg` raises ZeroDivisionError (weights sum to
    zero), a Series of zeros indexed by `cols` is returned.
    """
    means = []
    try:
        for col in cols:
            means.append(weighted_avg(x, col, weights=w))
    except ZeroDivisionError:
        # Month without any prediction for this class -> all zeros.
        return pd.Series(0, cols)
    return pd.Series(means, cols)
# ---


def export_dfi(perfstats: pd.DataFrame, path: str) -> None:
    """Export `perfstats` as an image via dataframe_image, with one fallback.

    The default `dfi.export` call is tried first (per the original author's
    note it uses Google Chrome, which can fail on Linux). If that raises
    OSError, the export is retried with table_conversion="matplotlib".

    Raises:
        OSError: If the matplotlib-based export fails as well.
    """
    try:
        dfi.export(perfstats, path)
    except OSError:
        print("Exporting performance stats via chrome failed. Trying with "
            "table conversion='matplotlib'...")
    else:
        # First attempt worked, nothing left to do.
        return
    try:
        dfi.export(perfstats, path, table_conversion="matplotlib")
    except OSError as err:
        raise OSError("Try different dataframe .png exporter.") from err


def get_class_ignore_dates(concat_df: pd.DataFrame, classes: list) -> dict:
    """For each class, collect the months in which it was never predicted.

    A month counts as "not predicted" for class c when the one-hot column
    'weights_{c}' sums to 0 within that 'date' group. A summary line is
    printed per class; the first entry of `classes` is labelled as the short
    class and the last entry as the long class.

    Args:
        concat_df: Dataframe with a 'date' column and one 'weights_{c}'
                   column per class.
        classes:   All classes in ascending order.

    Returns:
        class_ignore (dict): Maps 'class{c}' to the index of group dates
            without any prediction for class c (empty if there are none).
    """
    class_ignore = {}
    for c in classes:
        sum_onehot = concat_df.groupby("date")[f"weights_{c}"].sum()
        months_noclass = sum_onehot[sum_onehot == 0].index  # dates of the empty months
        nr_months_noclass = len(months_noclass)
        # Pick the label for the log line. `sep` reproduces the punctuation of
        # the individual messages.
        if c == classes[0]:  # short class
            label, sep = "Short Class", ","
        elif c == classes[-1]:  # long class
            label, sep = "Long Class", ""
        else:  # remaining classes
            label, sep = "Class", ","
        if not nr_months_noclass:
            # The long class used to be reported as "Short Class" here.
            print(f"{label} {c} was predicted in every month.")
        else:
            print(f"{label} {c}{sep} was not predicted in the following {nr_months_noclass} months:", 
                months_noclass.strftime("%Y-%m-%d").tolist())
        class_ignore[f"class{c}"] = months_noclass
    return class_ignore

In [None]:
def aggregate(preds_concat_df: pd.DataFrame, exp_dir: Path, datapath: Path) -> pd.Series:
    """Build the monthly long-short return series from class predictions.

    Steps:
        1. Load 'final_df_call_cao_small.parquet' from `datapath`, cut it to
           the prediction period and join it column-wise with the predictions.
        2. Add 'if_long_short' (-1 lowest class, 1 highest class, else 0) and
           one one-hot 'weights_{c}' column per class.
        3. Aggregate all non-date columns per 'date' and class as weighted
           means, then sanity-check the result with `various_tests`.
        4. Subtract the short (lowest class) from the long (highest class)
           portfolio; months in which either side has no prediction are 0.

    Args:
        preds_concat_df: Concatenated predictions (columns 'id' and 'pred');
            must have as many rows as the sliced option data.
        exp_dir: Experiment directory; 'args.json' is read from it.
        datapath: Directory containing 'final_df_call_cao_small.parquet'.

    Returns:
        The 'option_ret' column of the long-minus-short dataframe
        (one value per 'date' group).

    Raises:
        AssertionError: If lengths/indices do not line up or a sanity test fails.
    """
    # Get path where datasets reside:
    print("Concat the dataframe with the respective option data...")
    df_small = pd.read_parquet(datapath/"final_df_call_cao_small.parquet")
    print(datapath)
    dates = df_small["date"]
    # Get args from experiment.
    # Alternatively: Load json with json.load() and convert dict/list to df.
    args_exp = pd.read_json(exp_dir/"args.json", typ="series")
    # Get start of year index and all end of month indeces. Load with args that 
    # were used in the actual experiment.
    eoy_indeces, eom_indeces = YearMonthEndIndeces(
                                dates=dates, 
                                init_train_length=args_exp["init_train_length"],
                                val_length=args_exp["val_length"],
                                test_length=args_exp["test_length"]
                                ).get_indeces()
    # Slice df_small to prediction period.
    # Get first month of first year of eventual predictions.
    preds_start_idx = list(eom_indeces.values())[0][0]
    df_small = df_small.iloc[preds_start_idx:]
    # Make sure df_small and preds_concat_df are of same length.
    assert len(preds_concat_df) == len(df_small), ("length of prediction dataframe "
                                    "is not equal the sliced option return dataframe")
    # Align indeces with preds_concat_df, but dont drop old index.
    df_small = df_small.reset_index(drop=False)
    # Concatenate option return data and predictions.
    concat_df = pd.concat([df_small, preds_concat_df], axis=1)
    # Checks whether rows with id of 0 correspond to start of new years.
    assert check_eoy(concat_df, eoy_indeces), ("Id 0 and eoy indeces do not match.")
    # Set df_small index back to main index.
    concat_df = concat_df.set_index("index", drop=True)
    print("Done.")

    # Create single weight column 'if_long_short' with -1 for lowest and 1 for 
    # highest predicted class. Rest is 0.
    print("Create weight columns for each class...")
    max_pred, min_pred, classes = get_and_check_min_max_pred(concat_df, args_exp["label_fn"])
    # 1.5x faster than pd.map...
    condlist = [concat_df["pred"] == min_pred, concat_df["pred"] == max_pred]
    choicelist = [-1, 1]
    no_alloc_value = 0
    concat_df["if_long_short"] = np.select(condlist, choicelist, no_alloc_value)
    # Create separate one-hot weight columns for each class in concat_df.
    for c in classes:
        concat_df[f"weights_{c}"] = np.select([concat_df["pred"] == c], [1], 0)

    # Only calculate weighted average for numerical columns (have to drop 'date').
    col_list = [val for val in concat_df.columns.tolist() if "date" not in val]
    print("Done.")
    # Aggregate and collect all portfolios in a dictionary with key 'class0', 'class1', etc.
    print("Aggregate for each class and collect the dataframes...")
    agg_dict = {}
    for c in classes:
        agg_df = concat_df.groupby("date").aggregate(weighted_means_by_column, col_list, f"weights_{c}")
        agg_dict[f"class{c}"] = agg_df
    print("Done.")

    print("Which classes were not predicted at all in a respective month?...")
    # For each class print out months where no prediction was allocated for that class, 
    # and save these indeces for short and long class to later ignore the returns of 
    # these months.
    class_ignore = get_class_ignore_dates(concat_df, classes) #returns dict
    print("Done.")

    # Perform various tests to check our calculations.
    # Snapshot the inputs first. The per-class frames must be copied one by
    # one: dict.copy() is shallow, so comparing against it would compare each
    # frame with itself and could never detect an in-place modification.
    test_concat = concat_df.copy()
    test_agg_dict = {key: df.copy() for key, df in agg_dict.items()}
    print("Sanity test the aggregated results...")
    various_tests(agg_dict, concat_df, col_list, classes, class_ignore)
    print("Done.")
    # Make sure tests did not alter dataframes.
    pd.testing.assert_frame_equal(test_concat, concat_df)
    for c in classes:
        pd.testing.assert_frame_equal(test_agg_dict[f"class{c}"], agg_dict[f"class{c}"])

    # NOTE: Saving the per-class dataframes to the 'portfolios' subfolder is
    # commented out in this notebook version; only the return series is returned.
#     print("Save each dataframe in the 'portfolios' subfolder...")
#     pf_dir = exp_dir/"portfolios"
#     try: # raise error if 'portfolio' folder exists already
#         pf_dir.mkdir(exist_ok=False, parents=False) # raise error if parents are missing.
#         for class_c, df in agg_dict.items():
#             df.to_csv(pf_dir/f"{class_c}.csv")
#     except FileExistsError as err: # from 'exist_ok' -> portfolios folder already exists, do nothing.
#         raise FileExistsError("Directory 'portfolios' already exists. Will not "
#         "touch folder and exit code.") from err
#     print("Done.")

    print("Create Long Short Portfolio while ignoring months where one side "
        "is not allocated...")
    # Long-Short PF (highest class (long) - lowest class (short))
    short_class = classes[0] #should be 0
    assert short_class == 0, "Class of short portfolio not 0. Check why."
    long_class = classes[-1] #highest class: 1 for 'binary', 2 for 'multi3', etc.
    print(f"Subtract Short portfolio (class {short_class}) from Long portfolio "
            f"(class {long_class}) and save to long{long_class}short{short_class}.csv...")
    # Subtract short from long portfolio.
    long_df = agg_dict[f"class{long_class}"].copy() #deep copy to not change original agg_dict
    short_df = agg_dict[f"class{short_class}"].copy() #deep copy to not change original agg_dict
    months_no_inv = class_ignore[f"class{long_class}"].union(class_ignore[f"class{short_class}"]) #union of months to set to 0.
    long_df.loc[months_no_inv, :] = 0
    short_df.loc[months_no_inv, :] = 0
    long_short_df = long_df - short_df #months that are 0 in both dfs stay 0 everywhere.
    assert ((long_short_df.drop(months_no_inv)["pred"] == (long_class - short_class)).all() and #'pred' should be long_class - short_class
            (long_short_df.drop(months_no_inv)["if_long_short"] == 2).all()) #'if_long_short' should be 2 (1 - (-1) = 2)
    # Drop one-hot "weight" columns here.
    cols_to_keep = [col for col in long_short_df.columns.tolist() if "weight" not in col]
    long_short_df = long_short_df[cols_to_keep]
#     long_short_df.to_csv(pf_dir/f"long{long_class}short{short_class}.csv")
    # (The former "Done." / "All done!" prints sat after this return and were unreachable.)
    return long_short_df["option_ret"]

In [None]:
exp_dir

In [None]:
data_path = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data")

In [None]:
# Long-short returns based on the PERMUTED-feature predictions. The original
# returns are loaded separately from the saved portfolio file.
# (Previously this passed `preds_orig`, which contradicts the variable name.)
ls_df_permuted = aggregate(preds_permut, exp_dir, data_path)

In [None]:
ls_df_permuted

In [None]:
ret_orig = pd.read_csv(exp_dir/"portfolios"/"long4short0.csv")["option_ret"]

In [None]:
diff = ret_orig - ls_df_permuted.values

In [None]:
import statsmodels.api as sm

In [None]:
X = np.ones_like(diff)

In [None]:
y = diff

In [None]:
ols = sm.OLS(y, X) #long_short_return regressed on X.
# ols_result = ols.fit()
ols_result = ols.fit(cov_type="HAC", cov_kwds={"maxlags": 5}, use_t=True)

In [None]:
ols_result.summary()

In [None]:
a = [1, 2, 3]

In [None]:
a[:3]

### For large samples t-test is similar to z-test:

In [None]:
# P value for t score
import scipy.stats

#find p-value for two-tailed test
scipy.stats.t.sf(abs(-0.486), df=165)*2

In [None]:
# zscore
import scipy.stats as st
(1 - st.norm.cdf(abs(-0.425)))*2

In [None]:
# Is the test statistic significant at the 10% level (two-sided, normal critical value)?
# `tvalues` is used instead of `summary2().tables[1].z`: the model above is fit with
# use_t=True, in which case the summary column is named 't' and '.z' does not exist.
np.abs(ols_result.tvalues) > 1.645

In [None]:
# Is the test statistic significant at the 5% level (two-sided, normal critical value)?
# `tvalues` is used instead of `summary2().tables[1].z`: the model above is fit with
# use_t=True, in which case the summary column is named 't' and '.z' does not exist.
np.abs(ols_result.tvalues) > 1.96

In [None]:
# Is the test statistic significant at the 1% level (two-sided, normal critical value)?
# `tvalues` is used instead of `summary2().tables[1].z`: the model above is fit with
# use_t=True, in which case the summary column is named 't' and '.z' does not exist.
np.abs(ols_result.tvalues) > 2.58

### Permute features

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import math
from tqdm import tqdm

In [None]:
# Load original small/medium/big dataset.
df_orig = pd.read_parquet(Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data")/"final_df_call_cao_small.parquet")

In [None]:
df = df_orig.copy()

In [None]:
features_list = df.columns.tolist()
features_list.remove("date")
features_list.remove("option_ret")

In [None]:
df["moneyness"]

In [None]:
df["moneyness"].describe()

In [None]:
df[df["moneyness"] > 10]

In [None]:
df[df["impl_volatility"] == 2.860694]

In [None]:
df

In [None]:
np.random.seed(42)

In [None]:
df["moneyness"] = np.random.permutation(df["moneyness"])

In [None]:
df

In [None]:
df["moneyness"]

In [None]:
df["moneyness"].describe()

In [None]:
df["moneyness"].plot.hist(bins=1000)

In [None]:
df[df["moneyness"] > 10]

In [None]:
df

In [None]:
df_orig

In [24]:
a = [1, 2, 3] * 100

In [25]:
from tqdm import tqdm

In [26]:
for i in tqdm(a):
    print(i)

100%|████████████████████████████████████| 300/300 [00:00<00:00, 293171.30it/s]

1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3
1
2
3



