In [None]:
import pandas as pd
import numpy as np

from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset

In [None]:
class LogZNormCoinDataset(Dataset):
    """Sliding-window dataset over a CSV of per-coin features.

    The CSV's first column is ``open_time``; the following columns are read
    positionally as four columns per coin (BTC at positions 1-4, ETH 5-8,
    BNB 9-12, XRP 13-16), ordered open, close, low, high.

    A sample is a pair ``(x, y)`` of ``float32`` tensors:

    * ``x`` -- every column after ``open_time`` for ``input_window``
      consecutive rows, transposed to ``(n_feature_columns, input_window)``.
    * ``y`` -- the four columns of ``coin_symbol`` for the next
      ``output_window`` rows, transposed to ``(4, output_window)``.

    Args:
        csv_path: CSV with the features (presumably z-normalised log10
            prices, given ``rescale_to_real_price`` -- confirm against the
            preprocessing code).
        coin_symbol: one of ``'BTC'``, ``'ETH'``, ``'BNB'``, ``'XRP'``.
        input_window: number of rows in ``x``.
        output_window: number of rows in ``y``.
        augmentation_p: probability used both to decide whether ``augment``
            is called at all and, inside it, for each individual augmentation.
        augmentation_noise_std: standard deviation of the additive Gaussian noise.
        augment_constant_c: half-width of the uniform additive offset.
        augment_scale_s: half-width of the uniform relative scale change.
        z_norm_means_csv_path: optional CSV with per-column means, only
            needed by ``rescale_to_real_price``.
        z_norm_stds_csv_path: optional CSV with per-column standard
            deviations, only needed by ``rescale_to_real_price``.

    Raises:
        KeyError: if ``coin_symbol`` is not one of the supported symbols.
    """

    # Positional column range (start inclusive, end exclusive) of each coin's
    # four columns; position 0 is open_time, so it is skipped.
    _COIN_COLUMN_RANGES = {'BTC': (1, 5), 'ETH': (5, 9), 'BNB': (9, 13), 'XRP': (13, 17)}

    def __init__(
        self,
        csv_path,
        coin_symbol,
        input_window,
        output_window,
        augmentation_p,
        augmentation_noise_std,
        augment_constant_c,
        augment_scale_s,
        z_norm_means_csv_path="",
        z_norm_stds_csv_path="",
    ):
        self.df = pd.read_csv(csv_path)

        if coin_symbol not in self._COIN_COLUMN_RANGES:
            raise KeyError(
                f"unknown coin_symbol {coin_symbol!r}; "
                f"expected one of {sorted(self._COIN_COLUMN_RANGES)}"
            )
        start, end = self._COIN_COLUMN_RANGES[coin_symbol]
        self.coin_cols = self.df.columns[start:end]

        self.input_window = input_window
        self.output_window = output_window

        self.augmentation_p = augmentation_p
        self.augmentation_noise_std = augmentation_noise_std
        self.augment_constant_c = augment_constant_c
        self.augment_scale_s = augment_scale_s

        # The statistics are only used by rescale_to_real_price. An empty
        # path (the default) means "not provided" and must not be handed to
        # pd.read_csv, which cannot open it.
        self.z_norm_means_df = pd.read_csv(z_norm_means_csv_path) if z_norm_means_csv_path else None
        self.z_norm_stds_df = pd.read_csv(z_norm_stds_csv_path) if z_norm_stds_csv_path else None

    def __len__(self):
        """Number of complete (input, output) windows; 0 if the data is too short."""
        # Clamp at 0: a negative value would make the built-in len() raise.
        return max(0, len(self.df) - self.input_window - self.output_window + 1)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensors of the window starting at row ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without it, ``for sample in dataset`` would never stop and
                out-of-range indices would yield truncated windows.
        """
        n_samples = len(self)
        if idx < 0:
            idx = idx + n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for dataset with {n_samples} samples")

        analysis_rows = self.df.iloc[idx : idx + self.input_window]
        prediction_rows = self.df.iloc[idx + self.input_window : idx + self.input_window + self.output_window]

        # Drop open_time; what remains is open/close/low/high for BTC, then
        # the same four for ETH, BNB and XRP. After the transpose each
        # column of x / y is one timestamp.
        analysis_matrix = analysis_rows[analysis_rows.columns[1:]].to_numpy()
        prediction_target = prediction_rows[self.coin_cols].to_numpy()

        x, y = analysis_matrix.T, prediction_target.T

        # NOTE(review): augment() applies augmentation_p again to every
        # individual augmentation, so each one fires with probability p**2
        # overall -- confirm this is intended.
        if np.random.rand() < self.augmentation_p:
            x = self.augment(x)

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    def rescale_to_real_price(self, price):
        """Undo the normalisation: ``10 ** (price * std + mean)`` per coin column.

        Args:
            price: array whose first axis matches the four coin columns
                (same layout as ``y``).

        Returns:
            Array with the same layout as ``price``.

        Raises:
            ValueError: if the dataset was built without the z-norm CSV paths.
        """
        if self.z_norm_means_df is None or self.z_norm_stds_df is None:
            raise ValueError(
                "rescale_to_real_price needs z_norm_means_csv_path and "
                "z_norm_stds_csv_path to be passed to the constructor"
            )

        means = self.z_norm_means_df[self.coin_cols].to_numpy()
        stds = self.z_norm_stds_df[self.coin_cols].to_numpy()

        # Transpose so the coin columns are on the last axis and broadcast
        # against the per-column statistics, then transpose back.
        real_price = np.power(10, price.T * stds + means)

        return real_price.T

    def augment(self, x):
        """Randomly perturb ``x``; each step fires independently with ``augmentation_p``.

        Steps, in order: additive Gaussian noise (followed by clipping open
        and close into ``[low, high]``), a uniform additive offset shared by
        all entries, and a uniform relative scale shared by all entries.
        The input array is not modified; a new array is returned whenever
        any step fires.
        """
        if torch.rand(1) < self.augmentation_p:
            x = x + np.random.normal(scale=self.augmentation_noise_std, size=x.shape)

            # (open_row, close_row) -> (low_row, high_row) for each coin:
            # keeps low <= open, close <= high after the noise.
            # NOTE(review): the noise is also added to the low/high rows, so
            # low can end up above high and the clip bounds are then
            # inverted -- confirm whether they should be re-ordered first.
            clip_rules = {(0,1): (2, 3), (4,5): (6, 7), (8,9): (10, 11), (12,13): (14, 15)}

            for ((open_row, close_row), (low_row, high_row)) in clip_rules.items():
                x[open_row] = np.clip(x[open_row], x[low_row], x[high_row])
                x[close_row] = np.clip(x[close_row], x[low_row], x[high_row])
        if torch.rand(1) < self.augmentation_p:
            x = x + np.random.uniform(-self.augment_constant_c, self.augment_constant_c)
        if torch.rand(1) < self.augmentation_p:
            x = x * (1 + np.random.uniform(-self.augment_scale_s, self.augment_scale_s))

        return x

In [None]:
class LogReturnCoinDataset(Dataset):
    """Sliding-window dataset over a CSV of per-coin features, scaled and clipped.

    The CSV's first column is ``open_time``; the following columns are read
    positionally as four columns per coin (BTC at positions 1-4, ETH 5-8,
    BNB 9-12, XRP 13-16), ordered open, close, low, high. On load, every
    column except ``open_time`` is multiplied by ``distribution_scale`` and
    then clipped to ``[-distribution_clip, distribution_clip]``; the result
    is kept in ``self.df``.

    A sample is a pair ``(x, y)`` of ``float32`` tensors:

    * ``x`` -- every column after ``open_time`` for ``input_window``
      consecutive rows, transposed to ``(n_feature_columns, input_window)``.
    * ``y`` -- the four columns of ``coin_symbol`` for the next
      ``output_window`` rows, transposed to ``(4, output_window)``.

    Args:
        csv_path: CSV with the features (presumably log returns, going by
            the class name -- confirm against the preprocessing code).
        coin_symbol: one of ``'BTC'``, ``'ETH'``, ``'BNB'``, ``'XRP'``.
        input_window: number of rows in ``x``.
        output_window: number of rows in ``y``.
        augmentation_p: probability used both to decide whether ``augment``
            is called at all and, inside it, for each individual augmentation.
        augmentation_noise_std: standard deviation of the additive Gaussian noise.
        augment_constant_c: half-width of the uniform additive offset.
        augment_scale_s: half-width of the uniform relative scale change.
        distribution_scale: factor applied to every feature column on load.
        distribution_clip: symmetric bound applied after scaling.

    Raises:
        KeyError: if ``coin_symbol`` is not one of the supported symbols.
    """

    # Positional column range (start inclusive, end exclusive) of each coin's
    # four columns; position 0 is open_time, so it is skipped.
    _COIN_COLUMN_RANGES = {'BTC': (1, 5), 'ETH': (5, 9), 'BNB': (9, 13), 'XRP': (13, 17)}

    def __init__(
        self,
        csv_path,
        coin_symbol,
        input_window,
        output_window,
        augmentation_p,
        augmentation_noise_std,
        augment_constant_c,
        augment_scale_s,
        distribution_scale,
        distribution_clip,
    ):
        self.df = pd.read_csv(csv_path)

        # Scale first, then clip, so distribution_clip is expressed in
        # scaled units. open_time (column 0) is left untouched.
        feature_cols = self.df.columns[1:]
        self.df[feature_cols] = (self.df[feature_cols] * distribution_scale).clip(
            -distribution_clip, distribution_clip
        )

        if coin_symbol not in self._COIN_COLUMN_RANGES:
            raise KeyError(
                f"unknown coin_symbol {coin_symbol!r}; "
                f"expected one of {sorted(self._COIN_COLUMN_RANGES)}"
            )
        start, end = self._COIN_COLUMN_RANGES[coin_symbol]
        self.coin_cols = self.df.columns[start:end]

        self.input_window = input_window
        self.output_window = output_window

        self.augmentation_p = augmentation_p
        self.augmentation_noise_std = augmentation_noise_std
        self.augment_constant_c = augment_constant_c
        self.augment_scale_s = augment_scale_s
        # Both preprocessing parameters are kept so the transformation
        # applied to self.df can be inspected or undone later.
        self.distribution_scale = distribution_scale
        self.distribution_clip = distribution_clip

    def __len__(self):
        """Number of complete (input, output) windows; 0 if the data is too short."""
        # Clamp at 0: a negative value would make the built-in len() raise.
        return max(0, len(self.df) - self.input_window - self.output_window + 1)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensors of the window starting at row ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Without it, ``for sample in dataset`` would never stop and
                out-of-range indices would yield truncated windows.
        """
        n_samples = len(self)
        if idx < 0:
            idx = idx + n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index out of range for dataset with {n_samples} samples")

        analysis_rows = self.df.iloc[idx:idx + self.input_window]
        prediction_rows = self.df.iloc[idx + self.input_window:idx + self.input_window + self.output_window]

        # Drop open_time; what remains is open/close/low/high for BTC, then
        # the same four for ETH, BNB and XRP. After the transpose each
        # column of x / y is one timestamp.
        analysis_matrix = analysis_rows[analysis_rows.columns[1:]].to_numpy()
        prediction_target = prediction_rows[self.coin_cols].to_numpy()

        x, y = analysis_matrix.T, prediction_target.T

        # NOTE(review): augment() applies augmentation_p again to every
        # individual augmentation, so each one fires with probability p**2
        # overall -- confirm this is intended.
        if np.random.rand() < self.augmentation_p:
            x = self.augment(x)

        return torch.tensor(x, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)

    def augment(self, x):
        """Randomly perturb ``x``; each step fires independently with ``augmentation_p``.

        Steps, in order: additive Gaussian noise (followed by clipping open
        and close into ``[low, high]``), a uniform additive offset shared by
        all entries, and a uniform relative scale shared by all entries.
        The input array is not modified; a new array is returned whenever
        any step fires.
        """
        if torch.rand(1) < self.augmentation_p:
            x = x + np.random.normal(scale=self.augmentation_noise_std, size=x.shape)

            # (open_row, close_row) -> (low_row, high_row) for each coin:
            # keeps low <= open, close <= high after the noise.
            # NOTE(review): the noise is also added to the low/high rows, so
            # low can end up above high and the clip bounds are then
            # inverted; it is also unclear whether low <= open/close <= high
            # holds for these features in the first place -- confirm.
            clip_rules = {(0,1): (2, 3), (4,5): (6, 7), (8,9): (10, 11), (12,13): (14, 15)}

            for ((open_row, close_row), (low_row, high_row)) in clip_rules.items():
                x[open_row] = np.clip(x[open_row], x[low_row], x[high_row])
                x[close_row] = np.clip(x[close_row], x[low_row], x[high_row])
        if torch.rand(1) < self.augmentation_p:
            x = x + np.random.uniform(-self.augment_constant_c, self.augment_constant_c)
        if torch.rand(1) < self.augmentation_p:
            x = x * (1 + np.random.uniform(-self.augment_scale_s, self.augment_scale_s))

        return x

In [None]:
def plot_features(df):
    """Show three diagnostic figures for a feature frame.

    Column 0 of ``df`` is skipped throughout; all columns are addressed by
    position.

    1. A 2x2 grid of line plots, one panel per group of columns
       (positions 1-4, 5-8, 9-12 and 13 to the end).
    2. A 2x2 grid with the columns at positions 1-4 drawn one per panel,
       titled open / close / low / high.
    3. A 4x4 grid of density histograms (30 bins) for the columns at
       positions 1-16, each titled with its column name.
    """
    sample_axis = range(len(df))
    columns = df.columns

    # Figure 1: one panel per group of columns.
    grouped_panels = (
        (slice(1, 5), 'Columns 1-4'),
        (slice(5, 9), 'Columns 5-8'),
        (slice(9, 13), 'Columns 9-12'),
        (slice(13, None), 'Columns 13-end'),
    )
    _, group_axes = plt.subplots(2, 2, figsize=(10, 8))
    for panel, (col_slice, title) in zip(group_axes.flat, grouped_panels):
        panel.plot(sample_axis, df[columns[col_slice]])
        panel.set_title(title)
    plt.tight_layout()
    plt.show()

    # Figure 2: the first four feature columns, one per panel.
    single_titles = ('open', 'close', 'low', 'high')
    _, single_axes = plt.subplots(2, 2, figsize=(10, 8))
    for panel, position, title in zip(single_axes.flat, range(1, 5), single_titles):
        panel.plot(sample_axis, df[columns[position]])
        panel.set_title(title)
    plt.tight_layout()
    plt.show()

    # Figure 3: value distribution of each of the first 16 feature columns.
    hist_fig, hist_grid = plt.subplots(4, 4, figsize=(12, 12))
    for panel, col in zip(hist_grid.flatten(), columns[1:17]):
        panel.hist(df[col], bins=30, color='skyblue', edgecolor='black', density=True)
        panel.set_title(col)

    hist_fig.subplots_adjust(top=0.9, bottom=0.05, left=0.05, right=0.95, hspace=0.4, wspace=0.4)
    plt.show()

In [None]:
# Symmetric bound passed as `distribution_clip` to every LogReturnCoinDataset
# built below: after scaling, the dataset clips its feature columns to
# [-distribution_clip, distribution_clip].
distribution_clip = 10

In [None]:
# Training split: BTC targets, 28-row input window, 8-row output window,
# all augmentation disabled so the plots show the data as loaded.
dataset_train = LogReturnCoinDataset(
    csv_path="data/BTC_ETH_BNB_XRP_6h_log_returns_train.csv",
    coin_symbol="BTC",
    input_window=28,
    output_window=8,
    augmentation_p=0,
    augmentation_noise_std=0,
    augment_constant_c=0,
    augment_scale_s=0,
    distribution_scale=100,
    distribution_clip=distribution_clip,
)
train_loader = DataLoader(
    dataset_train,
    batch_size=32,
    shuffle=False,
    drop_last=False,
)

# Inspect the scaled and clipped frame, then show its summary statistics.
plot_features(dataset_train.df)
dataset_train.df.describe()

In [None]:
# Validation split, same settings as the training split.
dataset_val = LogReturnCoinDataset(
    csv_path="data/BTC_ETH_BNB_XRP_6h_log_returns_val.csv",
    coin_symbol="BTC",
    input_window=28,
    output_window=8,
    augmentation_p=0,
    augmentation_noise_std=0,
    augment_constant_c=0,
    augment_scale_s=0,
    distribution_scale=100,
    distribution_clip=distribution_clip,
)
# Bound to its own name: assigning this to `train_loader` would silently
# replace the training loader with validation data.
val_loader = DataLoader(dataset_val, batch_size=32, shuffle=False, drop_last=False)

# Inspect the scaled and clipped frame, then show its summary statistics.
plot_features(dataset_val.df)
dataset_val.df.describe()

In [None]:
# Toy training split, same settings as the full splits.
# Bound to its own names: this is training data, so storing it in
# `dataset_val` / `train_loader` would clobber the real validation dataset
# and the real training loader.
dataset_train_toy = LogReturnCoinDataset(
    csv_path="data/BTC_ETH_BNB_XRP_6h_log_returns_train_toy.csv",
    coin_symbol="BTC",
    input_window=28,
    output_window=8,
    augmentation_p=0,
    augmentation_noise_std=0,
    augment_constant_c=0,
    augment_scale_s=0,
    distribution_scale=100,
    distribution_clip=distribution_clip,
)
train_toy_loader = DataLoader(dataset_train_toy, batch_size=32, shuffle=False, drop_last=False)

# Inspect the scaled and clipped frame, then show its summary statistics.
plot_features(dataset_train_toy.df)
dataset_train_toy.df.describe()

In [None]:
# Toy validation split, same settings as the full splits.
# Bound to its own names so that it replaces neither the full validation
# dataset (`dataset_val`) nor the training loader (`train_loader`).
dataset_val_toy = LogReturnCoinDataset(
    csv_path="data/BTC_ETH_BNB_XRP_6h_log_returns_val_toy.csv",
    coin_symbol="BTC",
    input_window=28,
    output_window=8,
    augmentation_p=0,
    augmentation_noise_std=0,
    augment_constant_c=0,
    augment_scale_s=0,
    distribution_scale=100,
    distribution_clip=distribution_clip,
)
val_toy_loader = DataLoader(dataset_val_toy, batch_size=32, shuffle=False, drop_last=False)

# Inspect the scaled and clipped frame, then show its summary statistics.
plot_features(dataset_val_toy.df)
dataset_val_toy.df.describe()