In [1]:
import os
import time
from pathlib import Path
from dataclasses import dataclass, field

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# from transformers import AutoformerConfig, AutoformerModel

from typing import Optional

from utils.time_features import time_features
from modeling_DLinear.models.Autoformer import Model as AutoFormerModel
from modeling_DLinear.utils.tools import EarlyStopping

In [2]:
# Choose the compute backend in priority order: Apple MPS, then CUDA, then CPU.
DEVICE = (
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

In [3]:
# --- Column names -----------------------------------------------------------
# Timestamp column used for chronological sorting and calendar features.
DATETIME_COLUMN = 'candle_date_time_kst'
# Column passed to preprocess_data as the forecasting target.
TARGET_COLUMN = 'best_profit_rate'

# --- Window sizes (in time steps) -------------------------------------------
PREDICTION_LENGTH = 24 * 4      # forecast horizon
LABEL_LENGTH = 24 * 4           # known steps handed to the decoder ahead of the horizon
SEQUENCE_LENGTH = 24 * 4 * 4    # encoder look-back window

# --- Optimisation settings --------------------------------------------------
BATCH_SIZE = 128
EPOCHS = 50
LEARNING_RATE = 5e-4

# NOTE(review): the two settings below are not referenced by any visible cell.
NUM_BATCHES_PER_EPOCH = 100
SCALING = 'std'

In [4]:
class DataUtils:
    """Helpers for locating and loading the notebook's data files."""

    # Default data directory: the `data` folder beside the parent of the current
    # working directory. Evaluated once, when the class body runs.
    default_path = os.path.join(Path(os.getcwd()).parent, 'data')

    @staticmethod
    def load_parquet(file_name: str, file_dir: Optional[str] = None):
        """Read a parquet file into a DataFrame.

        Args:
            file_name: File name, joined onto ``file_dir``. Must end in ``.parquet``.
            file_dir: Directory containing the file. Falls back to
                ``DataUtils.default_path`` when omitted or empty.

        Returns:
            The loaded ``pandas.DataFrame``, or ``None`` when the file name does not
            end in ``.parquet`` or the file does not exist. A ``RuntimeWarning``
            naming the reason is emitted before ``None`` is returned, so the failure
            is visible at the point where it happens.
        """
        import warnings

        if not file_dir:
            file_dir = DataUtils.default_path

        path = os.path.join(file_dir, file_name)

        if file_name.split('.')[-1] != 'parquet':
            warnings.warn(
                f'Not loading {path!r}: file name does not end in ".parquet".',
                RuntimeWarning,
                stacklevel=2,
            )
            return None

        if not os.path.exists(path):
            warnings.warn(
                f'Not loading {path!r}: file does not exist.',
                RuntimeWarning,
                stacklevel=2,
            )
            return None

        print(f'Loading parquet file from: {path}')

        return pd.read_parquet(path)
  

In [5]:
# Custom Dataset for Multivariate Time Series
class TimeSeriesDataset(Dataset):
    """Sliding-window dataset yielding encoder and decoder windows.

    Sample ``idx`` uses rows ``[idx, idx + sequence_length)`` as the encoder window
    and rows ``[idx + sequence_length - label_length,
    idx + sequence_length + prediction_length)`` as the decoder window, so the
    decoder window overlaps the last ``label_length`` encoder rows.
    """

    def __init__(self, features, target, time_features, sequence_length, label_length, prediction_length):
        """Store the aligned tables and the window sizes.

        Args:
            features: Row-sliceable 2-D array of per-step feature values.
            target: Array with a ``flatten`` method; stored flattened. It is kept
                for reference only and is not read by ``__getitem__``.
            time_features: Row-sliceable table (DataFrame or array) aligned
                row-for-row with ``features``.
            sequence_length: Number of rows in the encoder window.
            label_length: Number of trailing encoder rows repeated at the start of
                the decoder window.
            prediction_length: Number of rows after the encoder window.
        """
        self.features = features
        self.target = target.flatten()
        self.time_features = time_features
        self.sequence_length = sequence_length
        self.label_length = label_length
        self.prediction_length = prediction_length

    def __len__(self):
        """Return the number of complete windows (0 when the data is too short)."""
        n_windows = len(self.features) - self.sequence_length - self.prediction_length + 1
        # A negative value would make len() raise, so clamp to an empty dataset.
        return max(0, n_windows)

    def __getitem__(self, idx):
        """Return the tensors for window ``idx``.

        Args:
            idx: Window index; negative values count from the end.

        Returns:
            Dict of float32 tensors: ``x`` and ``mark_x`` for the encoder window,
            ``y`` and ``mark_y`` for the decoder window.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``. This also
                lets plain ``for item in dataset`` iteration terminate.
        """
        n_windows = len(self)
        position = idx + n_windows if idx < 0 else idx
        if not 0 <= position < n_windows:
            raise IndexError(f'window index {idx} out of range for {n_windows} windows')

        start_x = position
        end_x = position + self.sequence_length
        start_y = end_x - self.label_length
        end_y = end_x + self.prediction_length

        x = self.features[start_x:end_x]
        y = self.features[start_y:end_y]
        # np.asarray accepts both a DataFrame row slice and a plain array slice.
        mark_x = np.asarray(self.time_features[start_x:end_x])
        mark_y = np.asarray(self.time_features[start_y:end_y])

        return {
            'x': torch.tensor(x, dtype=torch.float32),
            'y': torch.tensor(y, dtype=torch.float32),
            'mark_x': torch.tensor(mark_x, dtype=torch.float32),
            'mark_y': torch.tensor(mark_y, dtype=torch.float32),
        }

    def _make_time_features(self):
        """Build raw calendar fields from the datetime column of ``self.features``.

        Not called by the dataset itself. It only works when ``self.features`` is a
        DataFrame that still contains the ``DATETIME_COLUMN`` column.

        Returns:
            Array with one row per input row and the columns month, day, weekday,
            hour, minute, second. ``self.time_features`` is left untouched.

        Raises:
            TypeError: If ``self.features`` is not a DataFrame.
        """
        if not isinstance(self.features, pd.DataFrame):
            raise TypeError('_make_time_features requires self.features to be a pandas DataFrame')

        stamps = pd.to_datetime(self.features[DATETIME_COLUMN])
        calendar = pd.DataFrame({
            'month': stamps.dt.month,
            'day': stamps.dt.day,
            'weekday': stamps.dt.weekday,
            'hour': stamps.dt.hour,
            'minute': stamps.dt.minute,
            'second': stamps.dt.second,
        })
        return calendar.values

In [6]:
def generate_time_features(data, datetime_column: Optional[str] = None):
    """Encode each row's timestamp as sine/cosine pairs of its calendar fields.

    Every field is mapped onto a circle with the field's own period: 60 for seconds
    and minutes, 24 for hours, the length of the row's month for the day of month,
    and 12 for the month.

    Args:
        data: DataFrame containing the timestamp column.
        datetime_column: Name of the timestamp column. Defaults to the notebook-wide
            ``DATETIME_COLUMN`` when omitted.

    Returns:
        DataFrame sharing ``data``'s index, with the columns ``second_sin``,
        ``second_cos``, ``minute_sin``, ``minute_cos``, ``hour_sin``, ``hour_cos``,
        ``day_sin``, ``day_cos``, ``month_sin``, ``month_cos`` in that order.
    """
    if datetime_column is None:
        datetime_column = DATETIME_COLUMN

    timestamps = pd.to_datetime(data[datetime_column])
    clock = timestamps.dt

    # (column prefix, field values, period of the field)
    cyclical_fields = (
        ('second', clock.second, 60),
        ('minute', clock.minute, 60),
        ('hour', clock.hour, 24),
        ('day', clock.day, clock.days_in_month),
        ('month', clock.month, 12),
    )

    encoded = {}
    for prefix, values, period in cyclical_fields:
        angle = 2 * np.pi * values / period
        encoded[f'{prefix}_sin'] = np.sin(angle)
        encoded[f'{prefix}_cos'] = np.cos(angle)

    return pd.DataFrame(encoded)

def preprocess_data(data, feature_columns, target_column, sequence_length, label_length, prediction_length):
    """Turn the raw candle table into scaled train/validation window datasets.

    Steps:
        1. Sort rows chronologically by ``DATETIME_COLUMN``.
        2. Drop rows with a missing value in any feature column or in the target.
        3. Split the remaining rows 80/20 in time order (no shuffling).
        4. Fit min-max scalers on the training part only and apply them to both
           parts, so validation statistics never influence training inputs.
        5. Wrap each part in a ``TimeSeriesDataset``.

    Args:
        data: DataFrame containing ``DATETIME_COLUMN``, the feature columns and the
            target column.
        feature_columns: Names of the columns used as model channels.
        target_column: Name of the target column.
        sequence_length: Encoder window length passed to the datasets.
        label_length: Decoder label length passed to the datasets.
        prediction_length: Forecast horizon passed to the datasets.

    Returns:
        Tuple ``(train_dataset, val_dataset, scaler_features, scaler_target,
        num_time_features)`` where the scalers are the fitted ``MinMaxScaler``
        objects and ``num_time_features`` is the number of calendar columns.
    """
    data = data.sort_values(DATETIME_COLUMN)

    features = data[feature_columns].values
    target = data[target_column].values.reshape(-1, 1)

    calendar_features = generate_time_features(data=data)
    num_time_features = calendar_features.shape[1]

    # Positional mask of rows that are complete in every feature and in the target.
    has_missing = data[feature_columns].isna().any(axis=1) | data[target_column].isna()
    complete_rows = ~has_missing.to_numpy()

    # Chronological split: the last 20% of the rows form the validation set.
    (
        train_features,
        val_features,
        train_target,
        val_target,
        train_time_features,
        val_time_features,
    ) = train_test_split(
        features[complete_rows],
        target[complete_rows],
        calendar_features[complete_rows],
        test_size=0.2,
        shuffle=False,
    )

    # Fit on the training rows only. Validation values are transformed with the
    # training min/max and may therefore fall outside [0, 1].
    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    train_features = scaler_features.fit_transform(train_features)
    val_features = scaler_features.transform(val_features)
    train_target = scaler_target.fit_transform(train_target)
    val_target = scaler_target.transform(val_target)

    window_sizes = dict(
        sequence_length=sequence_length,
        prediction_length=prediction_length,
        label_length=label_length,
    )
    train_dataset = TimeSeriesDataset(train_features, train_target, train_time_features, **window_sizes)
    val_dataset = TimeSeriesDataset(val_features, val_target, val_time_features, **window_sizes)

    return train_dataset, val_dataset, scaler_features, scaler_target, num_time_features

In [7]:
# Candle file to train on; resolved against DataUtils.default_path.
DATA_FILE_NAME = 'IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet'

data = DataUtils.load_parquet(DATA_FILE_NAME)
if data is None:
    # load_parquet returns None instead of raising when the file is missing or is
    # not a .parquet file; stop here rather than failing later with an unrelated error.
    raise FileNotFoundError(
        f'Could not load {DATA_FILE_NAME!r} from {DataUtils.default_path!r}'
    )

# Columns fed to the model as input channels.
# The diff_* columns (diff_opening_price, diff_high_price, diff_mid_price,
# diff_low_price, diff_candle_acc_trade_volume) are currently excluded.
feature_columns = [
    'variance',
    'worst_profit_rate_before',
    'opening_price',
    'high_price',
    'mid_price',
    'low_price',
    'candle_acc_trade_volume',
    'timedelta_after',
    # Kept last: the training cell computes its loss on the final channel.
    'best_profit_rate',
]

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet


In [8]:
# Build the scaled window datasets; the fitted scalers and the number of
# calendar features are returned alongside them.
(
    train_dataset,
    val_dataset,
    scaler_features,
    scaler_target,
    NUM_TIME_FEATURES,
) = preprocess_data(
    data,
    feature_columns,
    TARGET_COLUMN,
    sequence_length=SEQUENCE_LENGTH,
    label_length=LABEL_LENGTH,
    prediction_length=PREDICTION_LENGTH,
)

# Both loaders walk their windows in stored order (no shuffling).
# NOTE(review): consecutive training windows overlap heavily; consider
# shuffle=True for the training loader — confirm this ordering is intended.
train_loader, val_loader = (
    DataLoader(split, batch_size=BATCH_SIZE, shuffle=False)
    for split in (train_dataset, val_dataset)
)

In [9]:
@dataclass
class AutoFormerConfig:
    """Settings object handed to the Autoformer model constructor.

    Field meanings are defined by the imported model implementation, which is not
    part of this notebook; the comments below only record what this notebook shows.
    """

    # Window sizes, taken from the notebook-level constants.
    seq_len: int = SEQUENCE_LENGTH
    pred_len: int = PREDICTION_LENGTH
    label_len: int = LABEL_LENGTH
    embed_type: int = 0
    # Channel counts. The defaults match the nine entries of feature_columns;
    # the instantiation below overrides them from the list itself.
    enc_in: int = 9
    dec_in: int = 9
    c_out: int = 9
    d_model: int = 512
    n_heads: int = 8
    e_layers: int = 2
    d_layers: int = 1
    d_ff: int = 2048
    moving_avg: int = 25
    factor: int = 1
    distill: bool = True
    dropout: float = 0.1
    activation: str = 'gelu'
    output_attention: bool = False
    embed: str = 'timeF'
    do_predict: bool = False  # whether to predict unseen future data
    # NOTE(review): 'ex' is not a standard pandas frequency code; presumably the
    # local model code maps it to the calendar features built here — confirm.
    freq: str = 'ex'


# Derive the channel counts from the feature list so that editing feature_columns
# cannot silently disagree with the model's input/output width.
num_channels = len(feature_columns)
configs = AutoFormerConfig(enc_in=num_channels, dec_in=num_channels, c_out=num_channels)
model = AutoFormerModel(configs)

In [10]:
device = torch.device(DEVICE)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.MSELoss()

# The loss is computed on the last channel only, so that channel must be the target.
f_dim = -1
assert feature_columns[-1] == TARGET_COLUMN, (
    'the last entry of feature_columns must be TARGET_COLUMN because the loss '
    'is computed on the final channel'
)


def _forecast_loss(batch):
    """Run one forward pass and return the loss over the forecast horizon.

    Args:
        batch: Dict with the tensors ``x``, ``y``, ``mark_x`` and ``mark_y`` as
            produced by the data loaders.

    Returns:
        Scalar loss tensor comparing the last ``PREDICTION_LENGTH`` steps of the
        model output with the same steps of ``y``, on channel ``f_dim`` only.
    """
    x = batch['x'].to(device)
    y = batch['y'].to(device)
    mark_x = batch['mark_x'].to(device)
    mark_y = batch['mark_y'].to(device)

    # Decoder input: the known label steps followed by zeros standing in for the
    # horizon the model has to predict.
    horizon_placeholder = torch.zeros_like(y[:, -PREDICTION_LENGTH:, :])
    x_dec = torch.cat([y[:, :LABEL_LENGTH, :], horizon_placeholder], dim=1)

    outputs = model(x, mark_x, x_dec, mark_y)

    predicted = outputs[:, -PREDICTION_LENGTH:, f_dim:]
    expected = y[:, -PREDICTION_LENGTH:, f_dim:]
    return criterion(predicted, expected)


for epoch in range(EPOCHS):
    train_losses = []
    valid_losses = []

    print(f'EPOCH {epoch + 1}')

    model.train()
    for i, batch in enumerate(train_loader):
        print(f'\rBatch {i + 1}/{len(train_loader)}', end='')

        optimizer.zero_grad()
        loss = _forecast_loss(batch)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

    print()

    # Validation: held-out windows, no gradient tracking, same loss definition as
    # training so the two numbers are comparable.
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            valid_losses.append(_forecast_loss(batch).item())

    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {valid_loss:.4f}")

EPOCH 1
Batch 176/5898

KeyboardInterrupt: 

In [None]:
# Save the model.
# `save_pretrained` is a Hugging Face method; a plain torch.nn.Module does not have
# it, so fall back to writing the state dict into the same directory.
MODEL_DIR = './autoformer_model'
if hasattr(model, 'save_pretrained'):
    model.save_pretrained(MODEL_DIR)
else:
    os.makedirs(MODEL_DIR, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, 'model_state.pt'))