In [23]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoformerConfig, AutoformerModel

from typing import Optional

In [49]:
if torch.backends.mps.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [28]:
class DataUtils:
  
  default_path = os.path.join(Path(os.getcwd()).parent, 'data')
  
  @staticmethod
  def load_parquet(file_name: str, file_dir: Optional[str] = None):
    if not file_dir:
        file_dir = DataUtils.default_path
        
    path = os.path.join(file_dir, file_name)

    if not os.path.exists(path) or file_name.split('.')[-1] != 'parquet':
        return

    print(f'Loading parquet file from: {path}')

    return pd.read_parquet(path)
  

In [74]:
# Custom Dataset for Multivariate Time Series
class TimeSeriesDataset(Dataset):
    def __init__(self, features, target, input_length, output_length):
        self.features = features
        self.target = target
        self.input_length = input_length
        self.output_length = output_length

    def __len__(self):
        return len(self.features) - self.input_length - self.output_length + 1

    def __getitem__(self, idx):
        x = self.features[idx : idx + self.input_length]
        y = self.target[idx : idx + self.input_length]
        # y = self.target[idx + self.input_length : idx + self.input_length + self.output_length]
        past_observed_mask = np.ones((self.input_length, x.shape[1]))
        return (
            torch.tensor(x, dtype=torch.float32), 
            torch.tensor(y, dtype=torch.float32),
            torch.tensor(past_observed_mask, dtype=torch.float32),
        )

# Preprocessing Function
def preprocess_data(data, feature_columns, target_column, input_length, output_length):
    # Sort by timestamp if necessary
    data = data.sort_values('candle_date_time_kst')

    # Normalize the features and target
    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()

    # features = data.drop(columns=[target_column]).values
    features = data[feature_columns].values
    target = data[target_column].values.reshape(-1, 1)

    features_normalized = scaler_features.fit_transform(features)
    target_normalized = scaler_target.fit_transform(target)

    # Split into train and validation sets
    train_features, val_features, train_target, val_target = train_test_split(
        features_normalized, target_normalized, test_size=0.2, shuffle=False
    )

    # Create datasets
    train_dataset = TimeSeriesDataset(train_features, train_target.flatten(), input_length, output_length)
    val_dataset = TimeSeriesDataset(val_features, val_target.flatten(), input_length, output_length)

    return train_dataset, val_dataset, scaler_features, scaler_target

In [75]:
data = DataUtils.load_parquet('IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet')
feature_columns = [
    # 'best_profit_rate', 
    'variance', 
    'worst_profit_rate_before', 
    'opening_price', 
    'high_price', 
    'mid_price', 
    'low_price', 
    'candle_acc_trade_volume', 
    # 'diff_opening_price',
    # 'diff_high_price',
    # 'diff_mid_price',
    # 'diff_low_price', 
    # 'diff_candle_acc_trade_volume',
    'timedelta_after',
]
# dataset = preprocess_data(data, feature_columns, 'best_profit_rate', 60, 10)
  # data[[
  #       'variance', 
  #       # 'best_profit_rate_before',
  #       'worst_profit_rate_before', 
  #       'opening_price', 
  #       'high_price', 
  #       'mid_price', 
  #       'low_price', 
  #       'candle_acc_trade_volume', 
  #       # 'diff_opening_price',
  #       # 'diff_high_price',
  #       # 'diff_mid_price',
  #       # 'diff_low_price', 
  #       # 'diff_candle_acc_trade_volume',
  #       'timedelta_after',
  #     ]],
  #     data[['best_profit_rate']]

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet


In [76]:
# Parameters
TARGET_COLUMN = 'best_profit_rate'
INPUT_LENGTH = 48  # Number of past time steps to use as input
OUTPUT_LENGTH = 12  # Number of future time steps to predict
BATCH_SIZE = 32
EPOCHS = 10
LEARNING_RATE = 5e-4

In [77]:
# Preprocess data
train_dataset, val_dataset, scaler_features, scaler_target = preprocess_data(
  data,
  feature_columns,
  TARGET_COLUMN,
  60,
  10
)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [78]:
# Load Autoformer Model
configuration = AutoformerConfig(
    input_size=len(feature_columns),  # Number of input features
    prediction_length=OUTPUT_LENGTH,
    context_length=INPUT_LENGTH,
    encoder_layers=3,
    decoder_layers=3,
)
model = AutoformerModel(configuration)

In [79]:
# Optimizer and Loss Function
device = torch.device(DEVICE)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.MSELoss()

# Training Loop
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    for x, y, mask in train_loader:
        x, y, mask = x.to(device), y.to(device), mask.to(device)

        optimizer.zero_grad()
        outputs = model(
            # x.unsqueeze(-1),
            past_values=x,
            past_time_features=torch.zeros_like(x),  # Placeholder for time-based features
            past_observed_mask=mask,  # Mask indicating observed data
        ).logits
        loss = criterion(outputs.squeeze(-1), y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x.unsqueeze(-1)).logits
            loss = criterion(outputs.squeeze(-1), y)
            val_loss += loss.item()

    print(f"Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss / len(train_loader):.4f}, Val Loss: {val_loss / len(val_loader):.4f}")

ValueError: input length 48 and time feature lengths 53 does not match

In [None]:
# Save the model
model.save_pretrained('./autoformer_model')