### Check whether checkpoints saved on the remote server (Linux) can be loaded locally on Windows

In [91]:
import pandas as pd
import numpy as np

In [158]:
test = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6]]))

In [159]:
test[3] = "*"

In [162]:
test.iloc[1, 3] = "**"

In [180]:
a = 1 * "*"

In [181]:
a

'*'

In [163]:
test

Unnamed: 0,0,1,2,3
0,1,2,3,*
1,4,5,6,**


In [164]:
from pandas.api.types import is_numeric_dtype

In [165]:
def mean_str(col):
    """Return the mean of a column, counting asterisks for non-numeric columns.

    Args:
        col: a pandas Series. Numeric columns are averaged directly; any other
            column is accessed through ``.str`` and each entry is replaced by
            the number of literal ``*`` characters it contains.

    Returns:
        The mean of the values (numeric column) or of the per-entry asterisk
        counts (non-numeric column).
    """
    if is_numeric_dtype(col):
        return col.mean()
    # Raw string: "\*" in a normal literal is an invalid escape sequence, which
    # Python reports with a warning. The regex itself (a literal '*') is unchanged.
    return col.str.count(r"\*").mean()

In [176]:
test.apply(mean_str).to_dict()

{0: 2.5, 1: 3.5, 2: 4.5, 3: 1.5}

In [None]:
def check_month_years(dic, dates):
    """Validate the per-year end-of-month indices stored in 'dic'.

    For every year key, each index must point at a date in that year, and the
    indices must walk through the months in consecutive order, e.g.
    31.12.2019[31.01.2020,......,31.12.2020, 31.01.2021].

    The final index of a year is special: it is the first row of the *next*
    period, so the row right before it (index - 1) is inspected instead and
    must still belong to the year.

    ---
    Example:
        If a year has 12 months in the data, its index list has length 13.
        The first index is the first "row" of the year, the last index is the
        first row of the next year.

    Returns:
        True if every index passes the year/month check, otherwise False.
    """
    for year, eom_indices in dic.items():
        final_pos = len(eom_indices) - 1
        for pos, eom_idx in enumerate(eom_indices):
            if pos == final_pos:
                # Closing index: look one row back; positions are zero-based,
                # so the expected month number equals the position itself.
                row, expected_month = eom_idx - 1, pos
            else:
                row, expected_month = eom_idx, pos + 1
            if int(year) != dates[row].year or expected_month != dates[row].month:
                return False
    return True


In [None]:
import pdb
import numpy as np

# Binary y label generator.
def binary_categorize(y):
    """
    Map a continuous target value to a binary label (threshold 0%).

    Input: continuous target variable

    Output: 1 for strictly positive values,
            0 otherwise (zero, negative, or NaN, for which the comparison is False)
    """
    return 1 if y > 0 else 0


# Multiclass y label generator.
def multi_categorize(y: float, classes: int):
    """
    Bucket a continuous value into one of `classes` ordinal labels.

        Args:
            y (float):      continuous target variable (option return)
            classes (int):  number of classes to create (3 or 5)
        Returns:
            (int):          class assignment in [0, classes)
        Raises:
            ValueError:     if `classes` is neither 3 nor 5
        CAREFUL: classes have to be between [0, C) for F.crossentropyloss.

    Values for which no threshold comparison is True (including NaN) land in the
    middle class. A 10-class variant was sketched in the past but never enabled.
    """
    if classes == 3:
        # thresholds: +/- 5%
        if y > 0.05:
            return 2
        if y < -0.05:
            return 0
        return 1

    if classes == 5:
        # thresholds: +/- 2.5% and +/- 5%
        if y > 0.05:
            return 4
        if 0.025 < y <= 0.05:
            return 3
        if -0.05 <= y < -0.025:
            return 1
        if y < -0.05:
            return 0
        return 2  # all remaining returns, i.e. within [-0.025, 0.025]

    raise ValueError("Only multi for 3 or 5 classes implemented right now.")


class YearEndIndeces:
    """Generator for the row indices at which the calendar year changes.

    ``eoy_idx`` holds the position of the first row of every new year, followed
    by ``len(dates)``. Because a slice ``[:idx]`` excludes ``idx``, each entry
    acts as the (exclusive) end of the preceding year.

        Args:
            dates (pandas.Series):      series of datetimes
                                        (presumably sorted ascending -- verify against caller),
            init_train_length (int):    initial train length, counted in eoy entries (> 0),
            val_length (int):           validation length, counted in eoy entries,
            test_length (int):          test length, counted in eoy entries
    """
    def __init__(self, dates, init_train_length, val_length, test_length):
        self.val_length = val_length
        self.test_length = test_length

        # Rows where the year increases by exactly one, i.e. the first row of a
        # new year; usable as exclusive "end of year" slice bounds.
        self.eoy_idx = np.where((dates.dt.year.diff() == 1))[0]
        # Append last row as end of year of last year.
        self.eoy_idx = np.append(self.eoy_idx, len(dates))

        assert init_train_length + val_length + test_length <= len(self.eoy_idx), \
            ("defined train and val are larger than eoy_indeces generated")
        assert init_train_length > 0, "init_train_length must be strictly greater than 0"

        # The 4th idx in eoy_idx is the end of year 5. -> Subtract 1.
        self.train_length_zeroindex = init_train_length - 1

        # Explicit (non-negative) stops instead of negative ones: a stop of "-0"
        # would be interpreted as 0 and yield an empty array whenever
        # test_length (or val_length + test_length) is 0.
        n_eoy = len(self.eoy_idx)
        val_start = self.train_length_zeroindex + val_length
        test_start = val_start + test_length

        self.train_eoy = self.eoy_idx[self.train_length_zeroindex:n_eoy - (val_length + test_length)]
        self.val_eoy = self.eoy_idx[val_start:n_eoy - test_length]
        # For generate_idx():
        self.test_eoy = self.eoy_idx[test_start:]

    def generate_idx(self):
        """Yield one dict per split window with the exclusive end row of
        "train", "val" and "test"."""
        # The three arrays have equal length by construction, so zip visits
        # every window exactly once.
        for train_end, val_end, test_end in zip(self.train_eoy, self.val_eoy, self.test_eoy):
            yield {"train": train_end, "val": val_end, "test": test_end}


class YearMonthEndIndeces:
    """Collects the row indices where months (and years) change.

    For every predicted year it stores the 13 month boundaries that delimit the
    12 months of that year (fewer for a trailing, incomplete year).

        Args:
            dates (pandas.Series):      series of datetimes,
            init_train_length (int):    initial train length (> 0),
            val_length (int):           validation length,
            test_length (int):          accepted for signature parity; not used here

    NOTE: the constants 26 * 12 - 2, 1996 and 2021 below are hard-coded, so the
    class only accepts data with exactly that number of month boundaries.
    """
    def __init__(self, dates, init_train_length, val_length, test_length):
        # Get end of month indeces for slicing.
        # TECHNICALLY these are start of month indeces (first row of the new
        # month), but because idx is excluded in a slice [:idx], they serve as
        # exclusive ends of the previous month.
        month_step = dates.dt.month.diff()
        self.eom_idx = np.concatenate([
            np.where((month_step == 1))[0],
            np.where((month_step == -11))[0],  # Dec->Jan
        ])
        # Sort, since Dec->Jan months indeces are only concatenated at the end.
        self.eom_idx.sort()
        # Append last row as end of month of last month.
        self.eom_idx = np.append(self.eom_idx, len(dates))

        # End of year indeces
        self.eoy_idx = np.where((dates.dt.year.diff() == 1))[0]
        self.eoy_idx = np.append(self.eoy_idx, len(dates))

        # Careful: -2 because November (-> cao (2021) return calc.) and December 2021 is not in dataset.
        assert (26 * 12 - 2 == len(self.eom_idx)), ("Some end of month indeces are missing.")
        assert init_train_length > 0, "init_train_length must be strictly greater than 0."

        # The 4th idx in eoy_idx is the end of year 5. -> Subtract 1.
        self.train_length_zeroindex = init_train_length - 1

        # Get eoy indeces where we predicted on AND FIRST ENTRY IS EOY_VAL == SOY_TEST
        self.test_eoy = self.eoy_idx[self.train_length_zeroindex + val_length:]

        # Get first end of month idx of year X until first end of month of year Y
        # a prediction was made.
        years_predicted = np.arange(1996 + init_train_length + val_length, 2021 + 1)  # upper limit not included.
        self.month_idx_per_year = {}
        for i, eoy_idx in enumerate(self.test_eoy[:-1]):  # -1 because [last_index:last_index+13] not needed.
            # Position of this year boundary inside eom_idx; .item() requires
            # exactly one match. np.isin replaces the deprecated np.in1d.
            idx_in_idx = np.where(np.isin(self.eom_idx, eoy_idx))[0].item()
            # + 13, so that slicing is from start of year until +12 months (end of year).
            self.month_idx_per_year[years_predicted[i]] = self.eom_idx[idx_in_idx:idx_in_idx+13]

        # Check that dictionary years are correct and that months are consecutive.
        assert check_month_years(self.month_idx_per_year, dates=dates), ("Years of end "
        "of month indices are wrong or the months are not consecutive.")

    def get_indeces(self):
        """Return the tuple (test_eoy, month_idx_per_year)."""
        return (self.test_eoy, self.month_idx_per_year)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import PredefinedSplit
import torch
from pathlib import Path
import pytorch_lightning as pl

from torch.utils.data import TensorDataset, DataLoader
from sklearn.utils.class_weight import compute_class_weight

class DataModule(pl.LightningDataModule):
    """Dataset Loader for Pytorch Lightning (Neural Network).

    Loads the data (or takes `custom_data`), picks the `year_idx`-th
    train/val/test window produced by `YearEndIndeces`, turns the `option_ret`
    column into class labels and serves tensors standardized with the training
    statistics through the Lightning dataloader hooks.

        Args:
            path (str):                 data directory (only used if custom_data is None),
            year_idx (int):             which split window to use,
            dataset (str):              dataset name passed to load_data,
            batch_size (int):           batch size of all dataloaders,
            init_train_length (int):    passed to YearEndIndeces,
            val_length (int):           passed to YearEndIndeces,
            test_length (int):          passed to YearEndIndeces,
            label_fn (str):             'binary', 'multi3' or 'multi5',
            custom_data (pd.DataFrame): optional data used instead of loading from path
        Raises:
            ValueError:                 if label_fn is not one of the supported names
    """
    def __init__(self,
                 path: str, # will be converted to Path in __init__
                 year_idx: int,
                 dataset: str,
                 batch_size: int,
                 init_train_length: int,
                 val_length: int,
                 test_length: int,
                 label_fn: str,
                 custom_data: pd.DataFrame = None,
        ):
        super().__init__()
        self.save_hyperparameters(ignore=["path"])

        # If the data is provided at initialization, use it. (E.g. used in feature importance)
        if custom_data is not None:
            self.data = custom_data
        else:
            path = Path(path)
            self.data = load_data(path, dataset)

        # Get year train, val, test split indeces.
        splitter = YearEndIndeces(
                                self.data["date"],
                                init_train_length=init_train_length,
                                val_length=val_length,
                                test_length=test_length,
                                )
        eoy_indeces = list(splitter.generate_idx())
        self.eoy_train = eoy_indeces[year_idx]["train"]
        self.eoy_val = eoy_indeces[year_idx]["val"]
        self.eoy_test = eoy_indeces[year_idx]["test"]

        # Truncate data to only use current train, val and test.
        self.data = self.data.iloc[:self.eoy_test]
        assert len(self.data) == self.eoy_test, "length of data is not equal to eoy_test"

        # Get the y vector.
        self.y = self.data["option_ret"]
        # Classify returns (floats) into classes.
        if label_fn == "binary":
            self.y = self.y.apply(binary_categorize)
        elif label_fn == "multi3":
            self.y = self.y.apply(multi_categorize, classes=3)
        elif label_fn == "multi5":
            self.y = self.y.apply(multi_categorize, classes=5)
        else:
            # The message now lists the values that are actually accepted.
            raise ValueError("Specify label_fn as either 'binary', 'multi3' or 'multi5'")
        # Get the features X.
        self.X = self.data.drop(["option_ret"], axis=1)

        # Save dates and drop it from X.
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # Convert X and y to torch tensors (for GPU).
        self.X = torch.from_numpy(self.X.values).float() #-> will be standardized in setup, so do it there.
        self.y = torch.from_numpy(self.y.values)

    def setup(self, stage: str = None):
        """Split X/y into train, val and test and standardize X with the
        training mean/std. Also derives input_dim, num_classes and class_weights."""
        # X and y are sliced with the same row bounds. Slicing y by
        # "-len(X_test)" would return the whole vector for an empty test window.
        # Training data.
        self.X_train = self.X[:self.eoy_train]
        self.y_train = self.y[:self.eoy_train]

        # Validation data.
        self.X_val = self.X[self.eoy_train:self.eoy_val]
        self.y_val = self.y[self.eoy_train:self.eoy_val]

        # Test data.
        self.X_test = self.X[self.eoy_val:self.eoy_test]
        self.y_test = self.y[self.eoy_val:self.eoy_test]

        assert (len(self.X_train)+len(self.X_val)+len(self.X_test)) == len(self.data), \
            "sum of X train, val, test is not equal length of dataset"
        assert (len(self.y_train)+len(self.y_val)+len(self.y_test) == len(self.data)), \
        "sum of y train, val, test is not equal to length of dataset"

        # Get mean and std of X_train.
        mean = torch.mean(self.X_train, axis=0)
        std = torch.std(self.X_train, axis=0)

        # Standardize X_train, X_val and X_test with mean/std from X_train.
        # NOTE(review): a feature that is constant within the training window has
        # std 0 and would turn into NaN/inf here -- confirm the data rules this out.
        self.X_train = (self.X_train - mean) / std
        self.X_val = (self.X_val - mean) / std
        self.X_test = (self.X_test - mean) / std

        # Save important variables to pass to model class.
        # Input dim of features (int).
        self.input_dim = self.X_train.shape[1]
        # Number of classes (int): distinct labels present in the training split.
        self.num_classes = len(self.y_train.unique())
        # Class weights (torch.tensor): n_train / count of each present class.
        self.class_weights = len(self.y_train) / self.y_train.unique(return_counts=True)[1]

        banner = "*****************************************************************************************"
        if self.hparams.custom_data is None:
            train_first, train_last = self._date_span(0, self.eoy_train)
            val_first, val_last = self._date_span(self.eoy_train, self.eoy_val)
            test_first, test_last = self._date_span(self.eoy_val, self.eoy_test)
            print(banner)
            print("Current TORCH dataset information:")
            print("---")
            print("class counts: ", self.y_train.unique(return_counts=True))
            print("class_weights:", self.class_weights)
            print("device of class_weights:", self.class_weights.device)
            print("---")
            print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
            print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
            print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
            print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
            print("---")
            print("train start date: ", train_first, ", train end date: ", train_last)
            print("val start date: ", val_first, ", val end date: ", val_last)
            print("test start date: ", test_first, ", test end date: ", test_last)
            print(banner)
        else:
            test_first, test_last = self._date_span(self.eoy_val, self.eoy_test)
            print(banner)
            print("test start date: ", test_first, ", test end date: ", test_last)

    def _date_span(self, start, stop):
        """Return first and last date of rows [start, stop) as 'YYYY-MM-DD' strings."""
        window = self.dates.iloc[start:stop]
        return window.iloc[0].strftime("%Y-%m-%d"), window.iloc[-1].strftime("%Y-%m-%d")

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the settings shared by all four loader hooks."""
        return DataLoader(dataset, batch_size=self.hparams.batch_size,
                          num_workers=0, #uses just the main worker, see https://stackoverflow.com/questions/71713719/runtimeerror-dataloader-worker-pids-15876-2756-exited-unexpectedly
                          # there are issues occuring on windows where PID workers exit unexpectedly.
                          pin_memory=True,
                          shuffle=shuffle,
                          )

    def example(self):
        """Returns a random training example."""
        idx = np.random.randint(0, len(self.X_train))
        x, y = self.X_train[idx], self.y_train[idx]
        return (x, y)

    def train_dataloader(self):
        # Only the training data is shuffled.
        return self._make_loader(TensorDataset(self.X_train, self.y_train), shuffle=True)

    def val_dataloader(self):
        return self._make_loader(TensorDataset(self.X_val, self.y_val), shuffle=False)

    def test_dataloader(self):
        # must not shuffle here!
        return self._make_loader(TensorDataset(self.X_test, self.y_test), shuffle=False)

    def predict_dataloader(self):
        # predict_step expects a tensor, not an (x, y) list, so the raw feature
        # tensor is used as the dataset. Must not shuffle here!
        return self._make_loader(self.X_test, shuffle=False)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the --batch_size option to `parent_parser` and return the parser."""
        parser = parent_parser.add_argument_group("DataModule for Lightning")
        parser.add_argument("--batch_size", type=int, default=512)
        return parent_parser


#****************************************************************************************

class Dataset():
    """Dataset for non-torch classifiers. Provides train, val and test set
    in numpy. Can also output predefinded cv splits for gridsearch.

    All splitting happens in __init__: the `year_idx`-th window from
    `YearEndIndeces` is selected, `option_ret` is converted to class labels and
    the features are kept unscaled (scaling is left to the consumer).

        Args:
            path (str):                 data directory (only used if custom_data is None),
            year_idx (int):             which split window to use,
            dataset (str):              dataset name passed to load_data,
            init_train_length (int):    passed to YearEndIndeces,
            val_length (int):           passed to YearEndIndeces,
            test_length (int):          passed to YearEndIndeces,
            label_fn (str):             'binary', 'multi3' or 'multi5',
            custom_data (pd.DataFrame): optional data used instead of loading from path
        Raises:
            ValueError:                 if label_fn is not one of the supported names
    """
    def __init__(self,
                path: str,
                year_idx: int,
                dataset: str,
                init_train_length: int,
                val_length: int,
                test_length: int,
                label_fn: str,
                custom_data: pd.DataFrame = None,
                ):

        # If the data is provided at initialization, use it. (E.g. used in feature importance)
        if custom_data is not None:
            self.data = custom_data
        else:
            path = Path(path)
            self.data = load_data(path, dataset)

        # Get year train, val, test split indeces.
        splitter = YearEndIndeces(
                                self.data["date"],
                                init_train_length=init_train_length,
                                val_length=val_length,
                                test_length=test_length,
                                )
        eoy_indeces = list(splitter.generate_idx())
        self.eoy_train = eoy_indeces[year_idx]["train"]
        self.eoy_val = eoy_indeces[year_idx]["val"]
        self.eoy_test = eoy_indeces[year_idx]["test"]

        # Truncate data to only use train, val and current test.
        self.data = self.data.iloc[:self.eoy_test]
        assert len(self.data) == self.eoy_test, "length of data is not equal to eoy_test"

        # Get the y vector.
        self.y = self.data["option_ret"]
        # Classify returns (floats) into classes.
        if label_fn == "binary":
            self.y = self.y.apply(binary_categorize)
        elif label_fn == "multi3":
            self.y = self.y.apply(multi_categorize, classes=3)
        elif label_fn == "multi5":
            self.y = self.y.apply(multi_categorize, classes=5)
        else:
            # The message now lists the values that are actually accepted.
            raise ValueError("Specify label_fn as either 'binary', 'multi3' or 'multi5'")
        # Get the features X.
        self.X = self.data.drop(["option_ret"], axis=1)

        # Save dates and drop it from X.
        self.dates = self.X["date"]
        self.X = self.X.drop(["date"], axis=1)

        # Convert X and y to numpy arrays (left unscaled here).
        self.X = self.X.values
        self.y = self.y.values

        ############################### setup #########################################################
        # X and y are sliced with the same row bounds. Slicing y by
        # "-len(X_test)" would return the whole vector for an empty test window.
        # Training data.
        self.X_train = self.X[:self.eoy_train]
        self.y_train = self.y[:self.eoy_train]

        # Validation data.
        self.X_val = self.X[self.eoy_train:self.eoy_val]
        self.y_val = self.y[self.eoy_train:self.eoy_val]

        # Test data.
        self.X_test = self.X[self.eoy_val:self.eoy_test]
        self.y_test = self.y[self.eoy_val:self.eoy_test]

        assert (len(self.X_train)+len(self.X_val)+len(self.X_test)) == len(self.data), \
            "sum of X train, val, test is not equal length of dataset"
        assert (len(self.y_train)+len(self.y_val)+len(self.y_test) == len(self.data)), \
        "sum of y train, val, test is not equal to length of dataset"

        # Save variables
        # Input dim of features (int).
        self.input_dim = self.X_train.shape[1]
        # Number of classes (int): distinct labels present in the training split.
        self.num_classes = len(np.unique(self.y_train))

        # Class weights (dict: label -> weight).
        # calculate "balanced" class weights manually (class_weight="balanced" not possible for TuneSearch)
        labels = np.unique(self.y_train)
        weights = compute_class_weight('balanced', classes=labels, y=self.y_train)
        self.class_weights = dict(zip(labels, weights))

        banner = "*****************************************************************************************"
        if custom_data is None:
            train_first, train_last = self._date_span(0, self.eoy_train)
            val_first, val_last = self._date_span(self.eoy_train, self.eoy_val)
            test_first, test_last = self._date_span(self.eoy_val, self.eoy_test)
            print(banner)
            print("Current NUMPY dataset information:")
            print("---")
            print("class counts: ", np.unique(self.y_train, return_counts=True))
            print("class_weights:", self.class_weights)
            print("---")
            print(f"# of input data: {len(self.data)} with shape: {self.data.shape}")
            print(f"# of training samples: {len(self.y_train)} with X_train of shape: {self.X_train.shape}")
            print(f"# of validation samples: {len(self.y_val)} with X_val of shape: {self.X_val.shape}")
            print(f"# of test samples: {len(self.y_test)} with X_test of shape: {self.X_test.shape}")
            print("---")
            print("train start date: ", train_first, ", train end date: ", train_last)
            print("val start date: ", val_first, ", val end date: ", val_last)
            print("test start date: ", test_first, ", test end date: ", test_last)
            print(banner)
        else:
            test_first, test_last = self._date_span(self.eoy_val, self.eoy_test)
            print(banner)
            print("test start date: ", test_first, ", test end date: ", test_last)
            print(banner)

    def _date_span(self, start, stop):
        """Return first and last date of rows [start, stop) as 'YYYY-MM-DD' strings."""
        window = self.dates.iloc[start:stop]
        return window.iloc[0].strftime("%Y-%m-%d"), window.iloc[-1].strftime("%Y-%m-%d")

    def get_datasets(self):
        """Return the feature arrays (X_train, X_val, X_test)."""
        return self.X_train, self.X_val, self.X_test

    def get_cv_data(self):
        """For scikitlearn classifiers: return datasets and predefined cv split
        needed for gridsearchcv (only 1 train/val split)"""
        # careful: if predicting on X_val later... -> cheating
        X = np.concatenate((self.X_train, self.X_val))
        y = np.concatenate((self.y_train, self.y_val))
        # Fold vector: -1 for every training row, 1 for every validation row.
        test_fold = np.concatenate((np.zeros(len(self.X_train)) - 1, np.ones(len(self.X_val))))
        ps = PredefinedSplit(test_fold)

        assert (self.X_train.shape[0] + self.X_val.shape[0] == X.shape[0] and
                (self.X_train.shape[1] == self.X_val.shape[1] == X.shape[1]))
        assert ps.get_n_splits() == 1, "There should only be 1 train/ val split in PredefinedSplit."

        return X, y, ps

    def get_train_val(self):
        """Used in xgboost trainer."""
        return self.X_train, self.X_val, self.y_train, self.y_val

    def get_test(self):
        """Return (X_test, y_test)."""
        return self.X_test, self.y_test

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Register the (currently empty) argument group and return the parser."""
        parser = parent_parser.add_argument_group("Dataset for Scikitlearn + xgboost")
        return parent_parser


def load_data(path_data: Path, dataset: str):
    """Loads specific dataset from path, depending on specified size.

        Args:
            path_data (Path | str):  directory that contains the parquet files,
            dataset (str):           'small', 'medium' or 'big'
        Returns:
            (pd.DataFrame):          content of the selected parquet file
        Raises:
            ValueError:              if `dataset` is not one of the three names
    """
    filenames = {
        "small": "final_df_call_cao_small.parquet",
        "medium": "final_df_call_cao_med_fillmean.parquet",
        "big": "final_df_call_cao_big_fillmean.parquet",
    }
    if dataset not in filenames:
        # Quote around 'big' was unbalanced in the previous message.
        raise ValueError("Specify dataset as either 'small', 'medium' or 'big'")
    # Path() is a no-op for Path inputs and additionally allows plain strings.
    return pd.read_parquet(Path(path_data) / filenames[dataset])

In [None]:
import pdb
import time
import torch
from torch import nn
import torchmetrics
import pytorch_lightning as pl
from torch.nn import functional as F


class FFN(pl.LightningModule):
    """Feed-forward classifier: Linear+ReLU input layer, `n_hidden` hidden
    blocks and a linear output layer producing one logit per class.

        Args:
            input_dim (int):                number of input features,
            num_classes (int):              number of output logits,
            class_weights (torch.Tensor):   per-class weights for the cross-entropy loss,
            no_class_weights (bool):        if True, the loss is unweighted,
            learning_rate (float):          Adam learning rate,
            hidden_dim (int):               width of every hidden layer,
            n_hidden (int):                 number of hidden blocks after the input layer,
            batch_norm (bool):              add BatchNorm1d to every hidden block,
            dropout (bool):                 add Dropout to every hidden block,
            drop_prob (float):              dropout probability
    """
    def __init__(self,
                input_dim: int,
                num_classes: int,
                class_weights: torch.Tensor,
                no_class_weights: bool,
                learning_rate: float,
                hidden_dim: int,
                n_hidden: int,
                batch_norm: bool,
                dropout: bool,
                drop_prob: float,
        ):
        super().__init__()
        # Init variables are saved, so that model can be reloaded cleanly if necessary
        self.save_hyperparameters()

        # Hidden blocks: Linear -> [BatchNorm1d] -> ReLU -> [Dropout].
        # They are created before the input/output layers; keep this order so
        # seeded weight initialization stays reproducible.
        middle_layers = []
        for _ in range(self.hparams.n_hidden):
            middle_layers.append(nn.Linear(self.hparams.hidden_dim, self.hparams.hidden_dim))
            if self.hparams.batch_norm:
                middle_layers.append(nn.BatchNorm1d(self.hparams.hidden_dim))
            middle_layers.append(nn.ReLU(inplace=True))
            if self.hparams.dropout:
                middle_layers.append(nn.Dropout(p=self.hparams.drop_prob))

        #model
        self.first = nn.Sequential(nn.Linear(self.hparams.input_dim, self.hparams.hidden_dim),
                                    nn.ReLU(inplace=True))
        self.middle = nn.Sequential(*middle_layers)
        self.last = nn.Linear(self.hparams.hidden_dim, self.hparams.num_classes)

        #sample weights
        if self.hparams.no_class_weights:
            self.class_weights = None
        else:
            # Registered as a buffer so the weights follow the module to whatever
            # device it is moved to (a hard-coded .cuda() fails on CPU-only
            # machines and mismatches on CPU runs). persistent=False keeps the
            # buffer out of the state_dict, so existing checkpoints still load.
            self.register_buffer("class_weights", class_weights, persistent=False)

        #metrics
        self.train_acc = torchmetrics.Accuracy()
        self.train_bal_acc = torchmetrics.Accuracy(
        num_classes=self.hparams.num_classes, average="macro") # should be equal to sklearn bal. acc.

        self.val_acc = torchmetrics.Accuracy()
        self.val_bal_acc= torchmetrics.Accuracy(
            num_classes=self.hparams.num_classes, average="macro")

    def forward(self, x):
        """Return the logits for a batch of feature rows."""
        x = self.first(x)
        x = self.middle(x)
        x = self.last(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute the (optionally class-weighted) cross-entropy loss and log
        loss, accuracy and macro-averaged accuracy for the training batch."""
        x, y = batch
        y_hat = self(x) #logits

        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)
        # Logging is done "log_every_n_steps" times (default=50 steps)
        self.log("loss/loss", loss, on_step=True, on_epoch=False, prog_bar=True)

        self.train_acc(y_hat, y)
        self.log("acc/train", self.train_acc, on_step=False, on_epoch=True)

        self.train_bal_acc(y_hat, y)
        self.log("bal_acc/train", self.train_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        return torch.optim.Adam(self.parameters(), lr=self.hparams.learning_rate)

    def validation_step(self, batch, batch_idx):
        """Log validation loss/accuracies and the mean predicted class index."""
        x, y = batch
        y_hat = self(x) #logits

        # Average predicted class index of the batch.
        self.log("mean_pred", torch.mean(y_hat.argmax(dim=-1).float()).item(), prog_bar=True)

        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)
        self.log("loss/val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)

        self.val_acc(y_hat, y)
        self.log("acc/val", self.val_acc, on_step=False, on_epoch=True)

        self.val_bal_acc(y_hat, y)
        self.log("bal_acc/val", self.val_bal_acc, on_step=False, on_epoch=True, prog_bar=True)

        return {"val_loss": loss}

    def on_train_start(self):
        # Wall-clock start of the whole fit, reported in on_train_end.
        self.st_total = time.time()

    def on_train_epoch_start(self):
        # Remember time and global step so the epoch end can log seconds/step.
        self.st = time.time()
        self.steps = self.global_step

    def on_train_epoch_end(self):
        elapsed = time.time() - self.st
        steps_done = self.global_step - self.steps
        # Skip logging for an epoch without optimizer steps instead of raising
        # ZeroDivisionError.
        if steps_done > 0:
            self.log("time/step", elapsed / steps_done)

    def on_train_end(self):
        elapsed = time.time() - self.st_total
        print(f"Total Training Time: {time.strftime('%H:%M:%S', time.gmtime(elapsed))}")

    def test_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss on a test batch."""
        x, y = batch
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y, weight=self.class_weights)

        self.log("loss/test_loss", loss, prog_bar=True)

        return loss

    def predict_step(self, batch, batch_idx):
        """Return the logits; `batch` is the feature tensor itself (no labels)."""
        return self(batch)

    @staticmethod
    def add_model_specific_args(parent_parser):
        """Add the FFN command line options to `parent_parser` and return it."""
        parser = parent_parser.add_argument_group("FFN")
        parser.add_argument("--no_class_weights", action='store_true')
        parser.add_argument("--hidden_dim", type=int, default=100)
        parser.add_argument("-lr", "--learning_rate", type=float, default=1e-2)
        parser.add_argument("--n_hidden", type=int, default=0)
        # store_false: the attributes `no_batch_norm` / `no_dropout` default to
        # True and become False when the flag is passed.
        # NOTE(review): __init__ expects `batch_norm` / `dropout`; the mapping
        # from these attributes presumably happens in the caller -- confirm.
        parser.add_argument("--no_batch_norm", action='store_false')
        parser.add_argument("--no_dropout", action='store_false')
        parser.add_argument("--drop_prob", type=float, default=0.5)

        return parent_parser
        

In [None]:
# bestmodelpath = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data\20220908133630\train2006_val2008")/"best_ckpt2009"
bestmodelpath = Path(r"C:\Users\Mathiass\Documents\Projects\master-thesis\data\20220908133630\train2018_val2020")/"best_ckpt2021"

In [None]:
model = FFN.load_from_checkpoint(bestmodelpath)

In [None]:
# Load the checkpointed model and predict on the test window of one split.
model = FFN.load_from_checkpoint(bestmodelpath)
dm = DataModule(
    path=r"C:\Users\Mathiass\Documents\Projects\master-thesis\data", # data directory; only read when custom_data is None
    year_idx=0, # selects which train/val/test window is used. NOTE(review): 0 is the first window, while the checkpoint path refers to train2018_val2020 -- confirm they match.
    dataset="small", # dataset name passed to load_data; only used when custom_data is None
    batch_size=100000000, # larger than the data, so prediction runs in one step
    init_train_length=10,
    val_length=2,
    test_length=1,
    label_fn="multi5",
    custom_data=None, # None -> data is loaded from `path`; pass a DataFrame to provide data directly
)
trainer = pl.Trainer(
    deterministic=True,
    gpus=1, # number of GPUs is fixed to one here
    logger=False, # deactivate logging for prediction
    enable_progress_bar=False,
)
# Predict.
preds = trainer.predict(model=model, datamodule=dm) # returns a list with one prediction tensor per batch
preds = torch.cat(preds) # concatenate the per-batch [batch_size, num_classes] outputs along the batch axis
# Predicted class = index of the largest logit per row.
preds_argmax = preds.argmax(dim=1).numpy()
preds_argmax_df = pd.DataFrame(preds_argmax, columns=["pred"])

In [None]:
preds_argmax_df