In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [9]:
# For a single stock, the data structure is a two-dimensional array.
# Synthetic placeholders: a 2520 x 70 feature matrix and one label per row.
X_2D = np.random.normal(loc=100, scale=10, size=(2520, 70))
y = np.random.normal(loc=0.02, scale=0.01, size=(2520,))

In [12]:
class MyDataSet(Dataset):
    """Map-style dataset pairing each entry of ``X`` with the entry of ``y`` at the same index."""

    def __init__(self, X, y):
        """Store features ``X`` and labels ``y``; both must report the same length."""
        # Same guard as before: a bare assert, so a mismatch raises AssertionError.
        assert len(X) == len(y)
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the feature container."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, index):
        """Return ``(X[index], y[index])``; ``index`` is forwarded unchanged to both containers."""
        features = self.X[index]
        label = self.y[index]
        return features, label

In [15]:
# Wrap the synthetic feature matrix and labels in the simple index-based dataset.
ds = MyDataSet(X=X_2D, y=y)

In [34]:
# Slicing the dataset returns a (features, labels) pair; show the shape of the feature slice.
X_slice, y_slice = ds[10:12]
X_slice.shape

(2, 70)

# Time-series (Ts) dataset, for RNN

In [25]:
# Suppose we have a time series with n features

In [None]:
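# If the construction of y uses future data, then ts_data ... (TODO: this note was left unfinished; presumably ts_data must be aligned or truncated so the features do not overlap the label's look-ahead window — confirm)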

In [29]:
# Synthetic time series: 2520 rows by 70 columns, plus one label per row.
ts_data = np.random.normal(loc=100, scale=10, size=(2520, 70))
ts_data_label = np.random.normal(loc=0.02, scale=0.1, size=(2520,))

In [75]:
import typing
import torch
import numpy as np
class TsDataSet(Dataset):
    """Sliding-window dataset that turns a 2-D series into fixed-length sequences.

    Each sample is ``(window, label)`` where ``window`` has shape
    ``(len_seq, n_features)`` and its last row is the row the label belongs to.

    * ``zero_padding=True`` (default): the series is front-padded with
      ``len_seq - 1`` rows of zeros, so there is exactly one window per input
      row and the earliest windows are partly zeros.
    * ``zero_padding=False``: only complete windows are produced, so the
      dataset holds ``len(X) - len_seq + 1`` samples (zero if the series is
      shorter than ``len_seq``) and sample ``i`` is paired with
      ``y[i + len_seq - 1]``.

    Args:
        X: 2-D array or tensor of shape ``(n_rows, n_features)``.
        y: labels, one per row of ``X``.
        len_seq: number of consecutive rows in each window (must be >= 1).
        zero_padding: whether to front-pad with zeros as described above.

    Raises:
        ValueError: if ``len_seq < 1`` or ``X`` and ``y`` differ in length.
    """

    offset: int
    X_2D: typing.Union[np.ndarray, torch.Tensor]
    y: typing.Union[np.ndarray, torch.Tensor]

    def __init__(
            self,
            X: typing.Union[np.ndarray, torch.Tensor],
            y: typing.Union[np.ndarray, torch.Tensor],
            len_seq: int = 5,
            zero_padding: bool = True,
    ):
        if len_seq < 1:
            raise ValueError(f"len_seq must be >= 1, got {len_seq}")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )

        self.offset = len_seq
        self.zero_padding = zero_padding
        self.X_2D = X
        self.y = y

        if zero_padding:
            self.X_2D_padded = self._zero_padding()
            label_start = 0
        else:
            # No padding: windows are cut from the raw series, and the first
            # complete window ends at row len_seq - 1, so labels start there.
            self.X_2D_padded = X
            label_start = len_seq - 1

        # Labels aligned with X_3D, so sample i always pairs with the label of
        # the window's last row.
        self._y_aligned = y[label_start:]
        n_windows = max(len(self.X_2D_padded) - len_seq + 1, 0)
        self.X_3D = self._build_windows(n_windows)

    def _zero_padding(self):
        """Return ``X_2D`` with ``offset - 1`` rows of zeros prepended (same dtype)."""
        n_pad = self.offset - 1
        n_features = self.X_2D.shape[1]
        if isinstance(self.X_2D, torch.Tensor):
            # new_zeros keeps the dtype and device of the input tensor.
            padding_tensor = self.X_2D.new_zeros((n_pad, n_features))
            return torch.cat([padding_tensor, self.X_2D], dim=0)
        padding_array = np.zeros(shape=(n_pad, n_features), dtype=self.X_2D.dtype)
        return np.concatenate([padding_array, self.X_2D], axis=0)

    def _build_windows(self, n_windows: int):
        """Stack ``n_windows`` consecutive slices of length ``offset`` into a 3-D container."""
        source = self.X_2D_padded
        is_tensor = isinstance(source, torch.Tensor)
        if n_windows == 0:
            # Stacking an empty list raises, so build the empty result explicitly.
            empty_shape = (0, self.offset, source.shape[1])
            if is_tensor:
                return source.new_zeros(empty_shape)
            return np.zeros(empty_shape, dtype=source.dtype)
        windows = [source[start: start + self.offset] for start in range(n_windows)]
        if is_tensor:
            return torch.stack(windows, dim=0)
        return np.stack(windows, axis=0)

    def __len__(self):
        # Count windows rather than raw rows: they differ when zero_padding=False.
        return len(self.X_3D)

    def __getitem__(self, index) -> typing.Tuple[typing.Union[np.ndarray, torch.Tensor], typing.Union[np.ndarray, torch.Tensor]]:
        """Return ``(window, label)`` for ``index`` (an int or slice into the windows)."""
        return self.X_3D[index], self._y_aligned[index]

In [76]:
# Build the windowed dataset (default window length) and serve it in shuffled batches of 16.
ts_data_set = TsDataSet(X=ts_data, y=ts_data_label)
ts_data_loader = DataLoader(ts_data_set, batch_size=16, shuffle=True)

In [85]:
# Create an iterator over the loader so batches can be pulled one at a time.
data_iter = iter(ts_data_loader)

In [89]:
# Fetch the next batch; each dataset item is an (X window, y label) pair.
data_batch = next(data_iter)

In [92]:
# Display the first element of the batch (the stacked feature windows).
data_batch[0]

tensor([[[104.8995, 113.0671, 102.3795,  ...,  90.2112, 115.3298, 101.6391],
         [108.6183,  96.8552, 121.7096,  ...,  82.1136,  97.6914,  95.3650],
         [ 92.9488,  96.4737, 116.0551,  ...,  85.0252, 100.0508, 126.7118],
         [115.8829,  83.6619, 108.0222,  ..., 109.5998, 102.3212, 103.7188],
         [ 92.9737, 127.4365,  97.3127,  ..., 111.8935, 102.1500, 106.5334]],

        [[ 93.1962, 104.3539,  92.6209,  ..., 124.5690, 103.3286,  95.9846],
         [106.8587, 105.0528, 101.2982,  ...,  94.8759, 101.6539,  89.1772],
         [ 94.4634, 117.4887, 109.2796,  ...,  98.1306, 104.6419,  95.8360],
         [ 97.7332, 100.3376,  99.3156,  ..., 103.8390, 113.0837, 104.2476],
         [ 97.8045, 104.7634, 105.7869,  ...,  93.2516,  97.7637, 110.6219]],

        [[108.0323,  97.6574, 124.4590,  ..., 110.7517,  96.6154, 105.4796],
         [ 95.0918,  99.0115,  91.1631,  ...,  89.4445, 107.8128,  83.7360],
         [ 82.1950, 107.2381,  93.4295,  ..., 101.0105,  90.7488, 103.60