In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
#%%
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
#%%
###
### NOTE TO SELF: This type of dataset reduces the total amount of data
### and leads to a high increase in number of input features.
### This dataset is not adjusted to include macro-indicators, only
### data for multiple assets (such as stocks, commodities etc.)
###

class MultiAssetDataset(Dataset):
    def __init__(self, df, tickers, features, window=60, horizon=1):
        """
        df: DataFrame with columns ['Ticker', features..., 'Return']
        tickers: List of ticker-strings
        features: List of feature column names (excluding 'Return')
        window: Number of time steps in the input sequence
        horizon: Int or list of ints for the prediction horizon
                ex horizon=1 means predicting the next time step
                ex horizon=[1, 2] means predicting the next 1 and 2 time steps
        """
        self.df = df
        self.tickers = tickers
        self.features = features
        self.window = window
        self.horizon = list(horizon) if isinstance(horizon, list) else [horizon]
        
        dates = df.index.unique().sort_values()

        T = len(dates)
        A = len(tickers)
        F = len(features)
        H = len(self.horizon)
        max_h = max(self.horizon)

        arr = np.zeros((T, A, F), dtype=float)
        rets = np.zeros((T, A), dtype=float)
        for i, t in enumerate(tickers):
            sub = df[df['Ticker'] == t].reindex(dates)
            arr[:, i, :] = sub[features].values
            rets[:, i] = sub['Return'].values
        
        Xs = []
        ys = []
        n_samples = T - window - max_h + 1

        for i in range(n_samples):
            Xs.append(arr[i : i + window])

            labels = []
            for h in self.horizon:
                labels.append(rets[i + window + h - 1])
            
            y_i = np.stack(labels, axis = -1)
            ys.append(y_i)

        self.X = torch.tensor(np.stack(Xs), dtype=torch.float32) # (N, W, A, F)
        self.y = torch.tensor(np.stack(ys), dtype=torch.float32) # (N, A, H)

    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

if __name__ == "__main__":
    print("Testing MultiAssetDataset...")
    # Example usage
    df = pd.read_csv("..\\data\\OMXS22_model_features_raw.csv", index_col= 'Date', parse_dates=True)
  
    tickers = df["Ticker"].unique().tolist()
    features = ["Return", "Volume", "SMA20", "EMA20", "RSI14", "ReturnVola20"]
    window = 60
    horizon = 1

    ds = MultiAssetDataset(df, tickers, features, window, horizon)

    print(f"Dataset length: {len(ds)} samples")
    print(f"First sample: {ds[0]}")
    for x, y in ds:
        print(f"First sample X shape: {x.shape}")
        print(f"First sample y shape: {y.shape}")
        break

    loader = DataLoader(ds, batch_size=32, shuffle=True)

    print("DataLoader test:")
    print(f"Batch X shape: {next(iter(loader))[0].shape}")
    print(f"Batch y shape: {next(iter(loader))[1].shape}")
    print("TickerEmbeddedDataset test complete.")


Testing MultiAssetDataset...
Dataset length: 6368 samples
First sample: (tensor([[[-5.0000e-02,  1.6820e+06,  0.0000e+00,  2.6790e+01,  7.4256e+01,
           2.7978e-02],
         [-4.4748e-02,  2.9996e+07,  0.0000e+00,  3.9910e+00,  4.8970e+01,
           2.4745e-02],
         [-4.7808e-02,  3.9562e+06,  0.0000e+00,  3.1217e+00,  5.2314e+01,
           2.9787e-02],
         ...,
         [-1.5686e-02,  1.0422e+06,  0.0000e+00,  2.7014e+01,  3.7933e+01,
           1.7279e-02],
         [-8.5515e-02,  2.7243e+06,  0.0000e+00,  4.9224e+01,  5.2661e+01,
           3.3042e-02],
         [-2.2989e-02,  8.5792e+07,  0.0000e+00,  1.6981e+01,  5.3064e+01,
           1.4362e-02]],

        [[ 0.0000e+00,  1.6820e+06,  0.0000e+00,  2.6790e+01,  7.4256e+01,
           2.7978e-02],
         [-1.8330e-02,  2.0738e+07,  0.0000e+00,  3.9910e+00,  4.8970e+01,
           2.4745e-02],
         [-2.5105e-02,  2.4806e+06,  0.0000e+00,  3.1217e+00,  5.2314e+01,
           2.9787e-02],
         ...,
      

In [15]:
print("Testing SingleAssetDataset...")
# Example usage
df = pd.read_csv("..\\data\\OMXS22_model_features_raw.csv", index_col= 'Date', parse_dates=True)

tickers = ["VOLV-B.ST"]
features = ["Return", "Volume", "SMA20", "EMA20", "RSI14", "ReturnVola20"]
window = 5
horizon = 1
singleds = MultiAssetDataset(df, tickers, features, window, horizon)
print(f"Dataset length: {len(singleds)} samples")
print(f"First sample: {singleds[0]}")
for x, y in singleds:
    print(f"First sample X shape: {x.shape}")
    print(f"First sample y shape: {y.shape}")
    break
loader = DataLoader(singleds, batch_size=32, shuffle=False)
print("DataLoader test:")
print(f"Batch X shape: {next(iter(loader))[0].shape}")
print(f"Batch y shape: {next(iter(loader))[1].shape}")
print("TickerEmbeddedDataset test complete.")

Testing SingleAssetDataset...
Dataset length: 6423 samples
First sample: (tensor([[[-2.2989e-02,  8.5792e+07,  0.0000e+00,  1.6981e+01,  5.3064e+01,
           1.4362e-02]],

        [[ 0.0000e+00,  8.5792e+07,  0.0000e+00,  1.6981e+01,  5.3064e+01,
           1.4362e-02]],

        [[ 0.0000e+00,  8.5792e+07,  0.0000e+00,  1.6981e+01,  5.3064e+01,
           1.4362e-02]],

        [[ 4.7060e-03,  9.5302e+07,  0.0000e+00,  1.6981e+01,  5.3064e+01,
           1.4362e-02]],

        [[ 3.7470e-02,  8.0079e+07,  0.0000e+00,  1.6981e+01,  5.3064e+01,
           1.4362e-02]]]), tensor([[0.0248]]))
First sample X shape: torch.Size([5, 1, 6])
First sample y shape: torch.Size([1, 1])
DataLoader test:
Batch X shape: torch.Size([32, 5, 1, 6])
Batch y shape: torch.Size([32, 1, 1])
TickerEmbeddedDataset test complete.
