In [1]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset

In [2]:
class CoinDataset(Dataset):
    """Sliding-window dataset over a multi-coin price table stored as CSV.

    The CSV's first column is ``open_time``; it is followed by four columns
    per coin (open, close, low, high) for BTC, ETH, BNB and XRP, in that order.
    Each sample pairs ``analysis_window`` consecutive rows of *all* feature
    columns (the input) with the next ``prediction_window`` rows of the selected
    coin's four columns (the target).

    Args:
        csv_path: Path of the CSV holding the (already normalised) series.
        coin_symbol: One of ``'BTC'``, ``'ETH'``, ``'BNB'``, ``'XRP'``; selects
            the target columns. Any other value raises ``KeyError``.
        analysis_window: Number of rows fed to the model as input.
        prediction_window: Number of rows that follow the input and form the target.
        augmentation_p: Probability that a fetched input gets noise augmentation.
        augmentation_noise_std: Standard deviation of the Gaussian noise.
        z_norm_means_path: CSV with the per-column means used by
            ``rescale_to_real_price``.
        z_norm_stds_path: CSV with the per-column standard deviations used by
            ``rescale_to_real_price``.
    """

    # Row layout of the transposed input matrix, one entry per coin:
    # (open_row, close_row) -> (low_row, high_row). Augmentation uses it to keep
    # low <= open, close <= high after noise has been added.
    _CLIP_RULES = {(0, 1): (2, 3), (4, 5): (6, 7), (8, 9): (10, 11), (12, 13): (14, 15)}

    def __init__(self, csv_path, coin_symbol, analysis_window, prediction_window, augmentation_p = 0.2, augmentation_noise_std=0.01,
                 z_norm_means_path="./data/BTC_ETH_BNB_XRP_6h_z_norm_means.csv",
                 z_norm_stds_path="./data/BTC_ETH_BNB_XRP_6h_z_norm_stds.csv"):
        self.df = pd.read_csv(csv_path)

        # first column is open_time, so skip it
        start, end = {'BTC': (1, 5), 'ETH': (5, 9), 'BNB': (9, 13), 'XRP': (13, 17)}[coin_symbol]
        self.coin_cols = self.df.columns[start: end]

        self.analysis_window = analysis_window
        self.prediction_window = prediction_window

        self.augmentation_noise_std = augmentation_noise_std
        self.augmentation_p = augmentation_p

        # Normalisation statistics, only needed to map predictions back to prices.
        self.z_norm_means_df = pd.read_csv(z_norm_means_path)
        self.z_norm_stds_df = pd.read_csv(z_norm_stds_path)

    def __len__(self):
        """Number of complete (input, target) windows; 0 when the table is too short."""
        # Clamp at zero: a negative value would make len() raise ValueError.
        return max(0, len(self.df) - self.analysis_window - self.prediction_window + 1)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as ``(x, y)`` float32 tensors.

        ``x`` has one row per feature column (``open_time`` excluded) and one
        column per time step of the analysis window; ``y`` has one row per
        column of the selected coin and one column per predicted time step.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx = idx + n_windows
        # iloc slicing never raises on overshoot, so bounds must be enforced here;
        # otherwise out-of-range indices yield truncated/empty tensors and
        # plain iteration over the dataset would never stop.
        if not 0 <= idx < n_windows:
            raise IndexError(f"index {idx} is out of range for a dataset with {n_windows} windows")

        split = idx + self.analysis_window
        analysis_rows = self.df.iloc[idx:split]
        prediction_rows = self.df.iloc[split:split + self.prediction_window]

        # Columns after open_time: open/close/low/high for BTC, then ETH, BNB, XRP.
        analysis_matrix = analysis_rows[analysis_rows.columns[1:]].to_numpy()
        prediction_target = prediction_rows[self.coin_cols].to_numpy()

        # Transpose so rows are features and columns are time steps.
        x, y = analysis_matrix.T, prediction_target.T

        if np.random.rand() < self.augmentation_p:
            x = self.augment(x)

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    def rescale_to_real_price(self, price):
        """Undo the z-normalisation and the base-10 log for the selected coin.

        Args:
            price: Values laid out like the target ``y`` (one row per coin
                column, one column per time step).

        Returns:
            ``10 ** (price * std + mean)`` in the same layout as ``price``.
        """
        means = self.z_norm_means_df[self.coin_cols].to_numpy()
        stds = self.z_norm_stds_df[self.coin_cols].to_numpy()

        # Transpose so the coin columns line up with the per-column statistics.
        real_price = np.power(10, price.T * stds + means)

        return real_price.T

    def augment(self, x):
        """Return a noisy copy of ``x`` that still satisfies low <= open, close <= high.

        Args:
            x: Feature-by-time matrix with the 16-row layout described by
                ``_CLIP_RULES``. It is not modified.
        """
        x_aug = x + np.random.normal(scale=self.augmentation_noise_std, size=x.shape)

        for (open_row, close_row), (low_row, high_row) in self._CLIP_RULES.items():
            # Independent noise can push low above high; np.clip with min > max
            # would then return the max everywhere, so restore the ordering first.
            low = np.minimum(x_aug[low_row], x_aug[high_row])
            high = np.maximum(x_aug[low_row], x_aug[high_row])
            x_aug[low_row], x_aug[high_row] = low, high

            x_aug[open_row] = np.clip(x_aug[open_row], low, high)
            x_aug[close_row] = np.clip(x_aug[close_row], low, high)

        return x_aug

In [43]:
# BTC training windows: 30 input steps -> 10 predicted steps, augmentation switched off.
BTC_cd = CoinDataset(
    "./data/BTC_ETH_BNB_XRP_6h_log_and_z_norm_train.csv",
    coin_symbol="BTC",
    analysis_window=30,
    prediction_window=10,
    augmentation_p=0,
)
# The other coins use the same file and windows (default augmentation):
# ETH_cd = CoinDataset("./data/BTC_ETH_BNB_XRP_6h_log_and_z_norm_train.csv", coin_symbol="ETH", analysis_window=30, prediction_window=10)
# BNB_cd = CoinDataset("./data/BTC_ETH_BNB_XRP_6h_log_and_z_norm_train.csv", coin_symbol="BNB", analysis_window=30, prediction_window=10)
# XRP_cd = CoinDataset("./data/BTC_ETH_BNB_XRP_6h_log_and_z_norm_train.csv", coin_symbol="XRP", analysis_window=30, prediction_window=10)

In [44]:
# Sanity check on the first BTC window: sizes, normalised target, target mapped
# back to prices, and the raw input matrix.
print(
    "dataset len:", len(BTC_cd),
    "batch's x data shape:", BTC_cd[0][0].shape,
    "batch's y data shape:", BTC_cd[0][1].shape,
)
print(
    "target data:", BTC_cd[0][1],
    "scaled target prices", BTC_cd.rescale_to_real_price(BTC_cd[0][1]),
)
print(
    "train data", BTC_cd[0][0],
)
# Equivalent checks for the other coins (enable together with their datasets):
# print(len(ETH_cd), ETH_cd[0][0].shape, ETH_cd[0][1].shape, ETH_cd.rescale_to_real_price(ETH_cd[0][1]), ETH_cd[0][0])
# print(len(BNB_cd), BNB_cd[0][0].shape, BNB_cd[0][1].shape, BNB_cd.rescale_to_real_price(BNB_cd[0][1]), BNB_cd[0][0])
# print(len(XRP_cd), XRP_cd[0][0].shape, XRP_cd[0][1].shape, XRP_cd.rescale_to_real_price(XRP_cd[0][1]), XRP_cd[0][0])

dataset len: 9168 batch's x data shape: torch.Size([16, 30]) batch's y data shape: torch.Size([4, 10])
target data: tensor([[-0.9189, -0.9445, -0.9625, -0.9283, -0.9488, -0.9350, -0.9499, -0.9241,
         -0.9252, -0.9073],
        [-0.9455, -0.9627, -0.9285, -0.9491, -0.9364, -0.9501, -0.9243, -0.9251,
         -0.9074, -0.9498],
        [-0.9396, -0.9517, -0.9661, -0.9523, -0.9458, -0.9456, -0.9368, -0.9168,
         -0.9112, -0.9485],
        [-0.9197, -0.9479, -0.9326, -0.9247, -0.9391, -0.9413, -0.9297, -0.9175,
         -0.9091, -0.9163]]) scaled target prices tensor([[8593.0003, 8405.9400, 8277.0604, 8524.0204, 8374.9000, 8475.4403,
         8367.0001, 8555.0003, 8547.0001, 8679.7100],
        [8400.0002, 8277.0702, 8524.0202, 8374.0903, 8465.9402, 8367.0001,
         8555.0000, 8549.0000, 8679.7103, 8369.1402],
        [8341.0004, 8254.7002, 8153.0001, 8250.0004, 8296.1200, 8298.0003,
         8361.1304, 8505.9900, 8547.0000, 8277.0003],
        [8684.0000, 8476.3600, 8588.000

In [20]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset

class LogReturnCoinDataset(Dataset):
    """Sliding-window dataset over a multi-coin series stored as CSV.

    The CSV's first column is ``open_time``; it is followed by four columns
    per coin (open, close, low, high) for BTC, ETH, BNB and XRP, in that order.
    Each sample pairs ``input_window`` consecutive rows of *all* feature columns
    (the input) with the next ``output_window`` rows of the selected coin's four
    columns (the target).

    Args:
        csv_path: Path of the CSV holding the series.
        coin_symbol: One of ``'BTC'``, ``'ETH'``, ``'BNB'``, ``'XRP'``; selects
            the target columns. Any other value raises ``KeyError``.
        input_window: Number of rows fed to the model as input.
        output_window: Number of rows that follow the input and form the target.
        augmentation_p: Probability used for every augmentation gate (see
            ``__getitem__`` and ``augment``).
        augmentation_noise_std: Standard deviation of the additive Gaussian noise.
        augment_constant_c: Half-width of the uniform range for the additive shift.
        augment_scale_s: Half-width of the uniform range for the relative scaling.
    """

    # Row layout of the transposed input matrix, one entry per coin:
    # (open_row, close_row) -> (low_row, high_row). Augmentation uses it to keep
    # low <= open, close <= high after noise has been added.
    _CLIP_RULES = {(0, 1): (2, 3), (4, 5): (6, 7), (8, 9): (10, 11), (12, 13): (14, 15)}

    def __init__(self, csv_path, coin_symbol, input_window, output_window, augmentation_p, augmentation_noise_std, augment_constant_c, augment_scale_s):
        self.df = pd.read_csv(csv_path)

        # first column is open_time, so skip it
        start, end = {'BTC': (1, 5), 'ETH': (5, 9), 'BNB': (9, 13), 'XRP': (13, 17)}[coin_symbol]
        self.coin_cols = self.df.columns[start: end]

        self.input_window = input_window
        self.output_window = output_window

        self.augmentation_p = augmentation_p
        self.augmentation_noise_std = augmentation_noise_std
        self.augment_constant_c = augment_constant_c
        self.augment_scale_s = augment_scale_s

    def __len__(self):
        """Number of complete (input, target) windows; 0 when the table is too short."""
        # Clamp at zero: a negative value would make len() raise ValueError.
        return max(0, len(self.df) - self.input_window - self.output_window + 1)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as ``(x, y)`` float32 tensors.

        ``x`` has one row per feature column (``open_time`` excluded) and one
        column per time step of the input window; ``y`` has one row per column
        of the selected coin and one column per predicted time step.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if idx < 0:
            idx = idx + n_windows
        # iloc slicing never raises on overshoot, so bounds must be enforced here;
        # otherwise out-of-range indices yield truncated/empty tensors and
        # plain iteration over the dataset would never stop.
        if not 0 <= idx < n_windows:
            raise IndexError(f"index {idx} is out of range for a dataset with {n_windows} windows")

        split = idx + self.input_window
        analysis_rows = self.df.iloc[idx:split]
        prediction_rows = self.df.iloc[split:split + self.output_window]

        # Columns after open_time: open/close/low/high for BTC, then ETH, BNB, XRP.
        analysis_matrix = analysis_rows[analysis_rows.columns[1:]].to_numpy()
        prediction_target = prediction_rows[self.coin_cols].to_numpy()

        # Transpose so rows are features and columns are time steps.
        x, y = analysis_matrix.T, prediction_target.T

        if np.random.rand() < self.augmentation_p:
            x = self.augment(x)

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    def augment(self, x):
        """Apply up to three independent random augmentations and return the result.

        Each step (Gaussian noise with OHLC repair, constant shift, relative
        scaling) is applied with probability ``augmentation_p``. The caller's
        array is never modified: every applied step builds a new array first.

        NOTE(review): ``__getitem__`` already gates this call with
        ``augmentation_p``, so each step effectively fires with probability
        ``augmentation_p ** 2`` per sample -- confirm this is intended. The gates
        draw from torch's RNG while the noise uses NumPy's, so both need seeding
        for reproducible runs.
        """
        if torch.rand(1) < self.augmentation_p:
            x = x + np.random.normal(scale=self.augmentation_noise_std, size=x.shape)

            for (open_row, close_row), (low_row, high_row) in self._CLIP_RULES.items():
                # Independent noise can push low above high; np.clip with min > max
                # would then return the max everywhere, so restore the ordering first.
                low = np.minimum(x[low_row], x[high_row])
                high = np.maximum(x[low_row], x[high_row])
                x[low_row], x[high_row] = low, high

                x[open_row] = np.clip(x[open_row], low, high)
                x[close_row] = np.clip(x[close_row], low, high)
        if torch.rand(1) < self.augmentation_p:
            # One shared offset for the whole window.
            x = x + np.random.uniform(-self.augment_constant_c, self.augment_constant_c)
        if torch.rand(1) < self.augmentation_p:
            # One shared relative scale factor for the whole window.
            x = x * (1 + np.random.uniform(-self.augment_scale_s, self.augment_scale_s))

        return x

In [21]:
# BTC log-return training windows (28 steps in, 8 steps out) with every augmentation disabled.
d = LogReturnCoinDataset(
    "data/BTC_ETH_BNB_XRP_6h_log_returns_train.csv",
    "BTC",
    input_window=28,
    output_window=8,
    augmentation_p=0,
    augmentation_noise_std=0,
    augment_constant_c=0,
    augment_scale_s=0,
)