In [1]:
import os
import sys
import math
import time
import asyncio
from datetime import datetime, timedelta
from pathlib import Path
from dataclasses import dataclass, field

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# from transformers import AutoformerConfig, AutoformerModel

from typing import Optional

from modeling_DLinear.models.DLinear import Model as DLinearModel
from modeling_DLinear.utils.tools import EarlyStopping

from upbit import UpbitCandles
from utils.datetime import kst_time

In [2]:
if torch.backends.mps.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [208]:
DATETIME_COLUMN = 'candle_date_time_kst'
TARGET_COLUMN = 'best_profit_rate'

CHECKPOINT_PATH = '../model_checkpoints/simple_time_features'
if not os.path.exists(CHECKPOINT_PATH):
  os.makedirs(CHECKPOINT_PATH)

TRAIN_DATA_FILE_NAME = 'IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet'
TEST_DATA_FILE_NAME = 'IOTA_1s_2000_2025-01-12T23:21:27+09:00.parquet_20250112232127.parquet'

# Parameters
# INPUT_LENGTH = 60  # Number of past time steps to use as input
# OUTPUT_LENGTH = 12  # Number of future time steps to predict
# BATCH_SIZE = 32
# LEARNING_RATE = 5e-4
# EPOCHS = 10

SEQUENCE_LENGTH = 24 * 4 * 4
PREDICTION_LENGTH = 24 * 4
LABEL_LENGTH = 24 * 4

BATCH_SIZE = 128
NUM_BATCHES_PER_EPOCH = 100
EPOCHS = 50
LEARNING_RATE = 5e-4
SCALING = 'std'

In [209]:
class DataUtils:
  
  default_path = os.path.join(Path(os.getcwd()).parent, 'data')
  
  @staticmethod
  def load_parquet(file_name: str, file_dir: Optional[str] = None):
    if not file_dir:
        file_dir = DataUtils.default_path
        
    path = os.path.join(file_dir, file_name)

    if not os.path.exists(path) or file_name.split('.')[-1] != 'parquet':
        return

    print(f'Loading parquet file from: {path}')

    return pd.read_parquet(path)
  

In [210]:
# Custom Dataset for Multivariate Time Series
class TimeSeriesDataset(Dataset):
    
    def __init__(self, features, time_features, sequence_length, label_length, prediction_length):
        self.features = features
        # self.target = target.flatten()
        self.time_features = time_features
        # self._make_time_features()
        self.sequence_length = sequence_length
        self.label_length = label_length
        self.prediction_length = prediction_length

    def __len__(self):
        return len(self.features) - self.sequence_length - self.prediction_length + 1

    def __getitem__(self, idx):
        start_x = idx
        end_x = idx + self.sequence_length
        start_y = end_x - self.label_length
        end_y = end_x + self.prediction_length
        
        x = self.features[start_x : end_x]
        # mark_x = self.time_features[start_x : end_x].values
        
        y = self.features[start_y : end_y]
        # mark_y = self.time_features[start_y : end_y].values
        
        return {
            'x': torch.tensor(x, dtype=torch.float32),
            'y': torch.tensor(y, dtype=torch.float32),
            # 'mark_x': torch.tensor(mark_x, dtype=torch.float32),
            # 'mark_y': torch.tensor(mark_y, dtype=torch.float32),
        }
        
    def _make_time_features(self): 
        print(self.features, DATETIME_COLUMN)
        df_stamp = pd.to_datetime(self.features[DATETIME_COLUMN])
        df_stamp['month'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.month, 1)
        df_stamp['day'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.day, 1)
        df_stamp['weekday'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.weekday(), 1)
        df_stamp['hour'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.hour, 1)
        df_stamp['minute'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.minute, 1)
        df_stamp['second'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.second, 1)
        # self.time_features = df_stamp.drop([DATETIME_COLUMN], 1).values

In [211]:
def generate_time_features(data):
    # Extract time-related features from the timestamp column
    timestamps = pd.to_datetime(data[DATETIME_COLUMN])
    time_features = pd.DataFrame({
        # "second_sin": np.sin(2 * np.pi * timestamps.dt.second / 24),
        # "second_cos": np.cos(2 * np.pi * timestamps.dt.second / 24),
        # "minute_sin": np.sin(2 * np.pi * timestamps.dt.minute / 24),
        # "minute_cos": np.cos(2 * np.pi * timestamps.dt.minute / 24),
        # "hour_sin": np.sin(2 * np.pi * timestamps.dt.hour / 24),
        # "hour_cos": np.cos(2 * np.pi * timestamps.dt.hour / 24),
        # "day_sin": np.sin(2 * np.pi * timestamps.dt.day / timestamps.dt.days_in_month),
        # "day_cos": np.cos(2 * np.pi * timestamps.dt.day / timestamps.dt.days_in_month),
        # "month_sin": np.sin(2 * np.pi * timestamps.dt.month / 12),
        # "month_cos": np.cos(2 * np.pi * timestamps.dt.month / 12),
        'month': timestamps.dt.month,
        'day': timestamps.dt.day,
        'weekday': timestamps.dt.weekday,
        'hour': timestamps.dt.hour,
        'minute': timestamps.dt.minute,
        'second': timestamps.dt.second,
    })
    return time_features

def preprocess_data(data, feature_columns, sequence_length, label_length, prediction_length, test_size=0.2):
    # Sort by timestamp if necessary
    data = data.sort_values(DATETIME_COLUMN)

    # Normalize the features and target
    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()

    df_features = data[feature_columns].dropna()
    
    features = df_features.drop(columns=[DATETIME_COLUMN, TARGET_COLUMN]).values
    # features = data[feature_columns].values
    target = df_features[TARGET_COLUMN].values.reshape(-1, 1)

    features_normalized = scaler_features.fit_transform(features)
    target_normalized = scaler_target.fit_transform(target)
    data_normalized = np.concatenate([features_normalized, target_normalized], axis=1)
    
    time_features = generate_time_features(df_features)
    NUM_TIME_FEATURES = time_features.shape[1]
    
    if test_size == 1:
        test_dataset = TimeSeriesDataset(
            data_normalized,
            # target_normalized[~isnan.any(axis=1)],
            time_features,
            sequence_length=sequence_length,
            prediction_length=prediction_length,
            label_length=label_length,
        )
        return None, test_dataset, scaler_features, scaler_target, NUM_TIME_FEATURES
    
    (
        train_features, 
        val_features, 
        # train_target, 
        # val_target, 
        train_time_features, 
        val_time_features
    ) = (
        train_test_split(
            data_normalized, 
            # target_normalized[~isnan.any(axis=1)], 
            time_features, 
            test_size=test_size, 
            shuffle=False
        )
    )
    
    train_dataset = TimeSeriesDataset(
        train_features,
        # train_target,
        train_time_features,
        sequence_length=sequence_length,
        prediction_length=prediction_length,
        label_length=label_length,
    )
    val_dataset = TimeSeriesDataset(
        val_features,
        # val_target,
        val_time_features,
        sequence_length=sequence_length,
        prediction_length=prediction_length,
        label_length=label_length,
    )

    return train_dataset, val_dataset, scaler_features, scaler_target, NUM_TIME_FEATURES

# def inverse_transform(data, predictions, feature_columns, scaler):
#     features_dict = dict([key, data[key]] for key in feature_columns if key != TARGET_COLUMN)
#     df = pd.DataFrame({
#         **features_dict,
#         TARGET_COLUMN: predictions,
#     })
#     return pd.DataFrame(scaler.inverse_transform(df), columns=df.columns)

In [212]:
data = DataUtils.load_parquet(TRAIN_DATA_FILE_NAME)
feature_columns = [
    # 'best_profit_rate', 
    'variance', 
    'worst_profit_rate_before', 
    'opening_price', 
    'high_price', 
    'mid_price', 
    'low_price', 
    'candle_acc_trade_volume', 
    # 'diff_opening_price',
    # 'diff_high_price',
    # 'diff_mid_price',
    # 'diff_low_price', 
    # 'diff_candle_acc_trade_volume',
    'timedelta_after',
    DATETIME_COLUMN,
    # 'best_profit_rate',
    TARGET_COLUMN,
]
# dataset = preprocess_data(data, feature_columns, 'best_profit_rate', 60, 10)
  # data[[
  #       'variance', 
  #       # 'best_profit_rate_before',
  #       'worst_profit_rate_before', 
  #       'opening_price', 
  #       'high_price', 
  #       'mid_price', 
  #       'low_price', 
  #       'candle_acc_trade_volume', 
  #       # 'diff_opening_price',
  #       # 'diff_high_price',
  #       # 'diff_mid_price',
  #       # 'diff_low_price', 
  #       # 'diff_candle_acc_trade_volume',
  #       'timedelta_after',
  #     ]],
  #     data[['best_profit_rate']]

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet


In [213]:
train_dataset, valid_dataset, _, _, NUM_TIME_FEATURES = preprocess_data(
  data,
  feature_columns,
  sequence_length=SEQUENCE_LENGTH,
  prediction_length=PREDICTION_LENGTH,
  label_length=LABEL_LENGTH,
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [214]:

@dataclass
class DLinearConfig:
  individual: bool = False
  seq_len: int = SEQUENCE_LENGTH
  pred_len: int = PREDICTION_LENGTH
  label_len: int = LABEL_LENGTH
  embed_type: int = 0
  enc_in: int = 9
  dec_in: int = 9
  c_out: int = 9
  d_model: int = 512
  n_heads: int = 8
  e_layers: int = 2
  d_layers: int = 1
  d_ff: int = 2048
  moving_avg: int = 25
  factor: int = 1
  distill: bool = True
  dropout: float = 0.1
  activation: str = 'gelu'
  output_attention: bool = False
  embed: str = 'timeF'
  do_predict: bool = False #whether to predict unseen future data
  freq: str = 'ex' 
  

configs = DLinearConfig()
model = DLinearModel(configs)

f_dim = -1
device = torch.device(DEVICE)

model.to(device)

Model(
  (decompsition): series_decomp(
    (moving_avg): moving_avg(
      (avg): AvgPool1d(kernel_size=(25,), stride=(1,), padding=(0,))
    )
  )
  (Linear_Seasonal): Linear(in_features=384, out_features=96, bias=True)
  (Linear_Trend): Linear(in_features=384, out_features=96, bias=True)
  (Linear_Decoder): Linear(in_features=384, out_features=96, bias=True)
)

In [215]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.MSELoss()

early_stopping = EarlyStopping(patience=5, verbose=True)

best_valid_loss = np.inf
learning_rate = LEARNING_RATE

for epoch in range(EPOCHS):
    train_losses = []
    valid_losses = []
    
    model.train()
    
    for i, batch in enumerate(train_loader):
        print(f'\r[EPOCH {epoch + 1}] {i + 1}/{len(train_loader)}', end=' ')
        
        def _get(key):
            return batch[key].to(device)
        
        x = _get('x')
        y = _get('y')
        
        optimizer.zero_grad()
        outputs = model(x)
        
        outputs = outputs[:, -PREDICTION_LENGTH:, f_dim:]
        y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
        
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())
        
    # print()
    
    model.eval()
    
    with torch.no_grad():
        for i, batch in enumerate(valid_loader):
            def _get(key):
                return batch[key].to(device)
            
            x = _get('x')
            y = _get('y')
            
            optimizer.zero_grad()
            outputs = model(x)
            
            outputs = outputs[:, -PREDICTION_LENGTH:, f_dim:]
            y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
            
            loss = criterion(outputs.detach().cpu(), y.detach().cpu())
            valid_losses.append(loss)
            
    train_loss = np.average(train_losses)
    valid_loss = np.average(valid_losses)
    
    print(f'- Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), '../model_checkpoints/DLinear.pth')
    #     print('(Best model)')
    # else:
    #     print()
       
    early_stopping(valid_loss, model, CHECKPOINT_PATH)
     
    if early_stopping.early_stop:
        print('Early stopping')
        break
        
    prev_learning_rate = learning_rate
    learning_rate *= (0.5 ** ((epoch - 1) // 1))
    print(f'Learning rate: {prev_learning_rate} --> {learning_rate}')

[EPOCH 1] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0000
Validation loss decreased (inf --> 0.000036).  Saving model ...
Learning rate: 0.0005 --> 0.001
[EPOCH 2] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0000
EarlyStopping counter: 1 out of 5
Learning rate: 0.001 --> 0.001
[EPOCH 3] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0000
EarlyStopping counter: 2 out of 5
Learning rate: 0.001 --> 0.0005
[EPOCH 4] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0000
EarlyStopping counter: 3 out of 5
Learning rate: 0.0005 --> 0.000125
[EPOCH 5] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0000
EarlyStopping counter: 4 out of 5
Learning rate: 0.000125 --> 1.5625e-05
[EPOCH 6] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0000
EarlyStopping counter: 5 out of 5
Early stopping


In [216]:
class Tester:
  
  def __init__(self, model):
    self.model = model
    # self.data = None
    self.load_data()

  # def _get_candles_from_upbit(self, count, to):
  #   num_batches = count // 200 + math.ceil(count / 200)
    
  #   data = []
    
  #   for i in range(num_batches):
  #     print(f'\r{i + 1}/{num_batches}', end="")

  #     # url = f"{base_url(market, candle_unit)}?market={market}&count={count}&to={str(to).split('.')[:-1][0]}"
  #     # response = requests.get(url, headers=headers)
  #     # data += json.loads(response.text)
  #     _count = min(count - len(data), 200)
  #     data += UpbitCandles.get_candles('KRW-IOTA', 'second', to, _count)
  #     print(data)
  #     # to -= delta_to * count
  #     last_datetime = datetime.strptime(data[-1]['candle_date_time_kst'], '%Y-%m-%dT%H:%M:%S')
  #     to = kst_time(last_datetime - timedelta(seconds=1))

  #     time.sleep(0.1)

  #   print()

  #   self.data = pd.DataFrame(data=data)

  # def get_data(self):
  #   # while True:
  #   self._get_candles_from_upbit(count=2000, to='2024-10-15T18:19:37+09:00')
  #   self.dataset, _, _ = preprocess_data(
  #     data=data,
  #     feature_columns=feature_columns,
  #     target_column=TARGET_COLUMN,
  #     sequence_length=SEQUENCE_LENGTH,
  #     label_length=PREDICTION_LENGTH,
  #     prediction_length=LABEL_LENGTH, 
  #     test_size=1
  #   )
    
  def load_data(self):
    self.data = DataUtils.load_parquet(TEST_DATA_FILE_NAME)
    _, self.dataset, _, self.scaler_target, _ = preprocess_data(
      data=self.data,
      feature_columns=feature_columns,
      # target_column=TARGET_COLUMN,
      sequence_length=SEQUENCE_LENGTH,
      label_length=PREDICTION_LENGTH,
      prediction_length=LABEL_LENGTH, 
      test_size=1
    )
    
  def data_loader(self):
    return DataLoader(self.dataset, batch_size=1, shuffle=False, drop_last=True)
  
  def test(self):
    self.model.eval()
    
    test_losses = []
    
    with torch.no_grad():
      for i, batch in enumerate(self.data_loader()):
        def _get(key):
            return batch[key].to(device)
          
        x = _get('x')
        y = _get('y')
        
        optimizer.zero_grad()
        outputs = model(x)
        
        outputs = outputs[:, -PREDICTION_LENGTH:, f_dim:]
        y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
        
        loss = criterion(outputs.detach().cpu(), y.detach().cpu())
        test_losses.append(loss)
      
    print(f'TEST LOSS: {np.average(test_losses)}')
    return np.average(test_losses)

  def test_sample(self):
    self.model.eval()
    
    output_list = []
    truth_list = []
    
    with torch.no_grad():
      for i, batch in enumerate(self.data_loader()):
        def _get(key):
            return batch[key].to(device)
            
        x = _get('x')
        y = _get('y')
        
        outputs = model(x)[:, -PREDICTION_LENGTH:, f_dim:]
        y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
        
        output_list += list(outputs.detach().cpu().flatten().numpy())
        truth_list += list(y.detach().cpu().flatten().numpy())
      
    out = self.scaler_target.inverse_transform(np.array(output_list).reshape(-1, 1))
    truth = self.scaler_target.inverse_transform(np.array(truth_list).reshape(-1, 1))
    
    return pd.DataFrame({
      'output': out.flatten(),
      'truth': truth.flatten(),
      'diff': out.flatten() - truth.flatten(),
    })

In [217]:
model.load_state_dict(torch.load(f'{CHECKPOINT_PATH}/checkpoint.pth'))
tester = Tester(model)

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1s_2000_2025-01-12T23:21:27+09:00.parquet_20250112232127.parquet


In [218]:
tester.test()
df = tester.test_sample()

TEST LOSS: 0.04067673906683922


In [219]:
df

Unnamed: 0,output,truth,diff
0,0.000125,0.000199,-0.000074
1,-0.000264,-0.000099,-0.000165
2,-0.000032,0.000298,-0.000330
3,-0.000194,0.000298,-0.000492
4,-0.000092,0.000298,-0.000390
...,...,...,...
134491,0.000915,0.002168,-0.001253
134492,0.001017,0.003354,-0.002338
134493,0.000956,0.002761,-0.001805
134494,0.001043,0.003750,-0.002708


In [220]:
thresholds = [2e-4, 1e-4, 5e-5, 1e-5]
abs_diff = df['diff'].abs()

for threshold in thresholds:
    print(f'{threshold}: {len(abs_diff[abs_diff <= threshold]) / len(abs_diff)}')

0.0002: 0.07012847965738758
0.0001: 0.03499732334047109
5e-05: 0.017814656197953842
1e-05: 0.003643231025458006
