In [43]:

import os
import sys
import math
import time
import asyncio
from datetime import datetime, timedelta
from pathlib import Path
from dataclasses import dataclass, field

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
# from transformers import AutoformerConfig, AutoformerModel

import plotly.express as px
import plotly.graph_objects as go

from typing import Optional

from modeling_DLinear.models.DLinear import Model as DLinearModel
from modeling_DLinear.utils.tools import EarlyStopping

from upbit import UpbitCandles
from utils.datetime import kst_time


In [2]:
if torch.backends.mps.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [3]:
DATETIME_COLUMN = 'candle_date_time_kst'
TARGET_COLUMN = 'best_profit_rate'

CHECKPOINT_PATH = '../model_checkpoints/simple_time_features'
if not os.path.exists(CHECKPOINT_PATH):
  os.makedirs(CHECKPOINT_PATH)

TRAIN_DATA_FILE_NAME = 'IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet'
TEST_DATA_FILE_NAME = 'IOTA_1s_2000_2025-01-12T23:21:27+09:00.parquet_20250112232127.parquet'

# Parameters
# INPUT_LENGTH = 60  # Number of past time steps to use as input
# OUTPUT_LENGTH = 12  # Number of future time steps to predict
# BATCH_SIZE = 32
# LEARNING_RATE = 5e-4
# EPOCHS = 10

SEQUENCE_LENGTH = 24 * 4 * 4
PREDICTION_LENGTH = 24 * 4
LABEL_LENGTH = 24 * 4

BATCH_SIZE = 128
NUM_BATCHES_PER_EPOCH = 100
EPOCHS = 50
LEARNING_RATE = 1e-3
SCALING = 'std'

In [4]:
class DataUtils:
  
  default_path = os.path.join(Path(os.getcwd()).parent, 'data')
  
  @staticmethod
  def load_parquet(file_name: str, file_dir: Optional[str] = None):
    if not file_dir:
        file_dir = DataUtils.default_path
        
    path = os.path.join(file_dir, file_name)

    if not os.path.exists(path) or file_name.split('.')[-1] != 'parquet':
        return

    print(f'Loading parquet file from: {path}')

    return pd.read_parquet(path)
  

In [5]:
# Custom Dataset for Multivariate Time Series
class TimeSeriesDataset(Dataset):

    def __init__(self, features, time_features, sequence_length, label_length, prediction_length):
        self.features = features
        # self.target = target.flatten()
        self.time_features = time_features
        # self._make_time_features()
        self.sequence_length = sequence_length
        self.label_length = label_length
        self.prediction_length = prediction_length

    def __len__(self):
        return len(self.features) - self.sequence_length - self.prediction_length + 1

    def __getitem__(self, idx):
        start_x = idx
        end_x = idx + self.sequence_length
        start_y = end_x - self.label_length
        end_y = end_x + self.prediction_length
        
        x = self.features[start_x : end_x]
        # mark_x = self.time_features[start_x : end_x].values
        
        y = self.features[start_y : end_y]
        # mark_y = self.time_features[start_y : end_y].values
        
        return {
            'x': torch.tensor(x, dtype=torch.float32),
            'y': torch.tensor(y, dtype=torch.float32),
            # 'mark_x': torch.tensor(mark_x, dtype=torch.float32),
            # 'mark_y': torch.tensor(mark_y, dtype=torch.float32),
        }

    def _make_time_features(self): 
        # print(self.features, DATETIME_COLUMN)
        df_stamp = pd.to_datetime(self.features[DATETIME_COLUMN])
        df_stamp['month'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.month, 1)
        df_stamp['day'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.day, 1)
        df_stamp['weekday'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.weekday(), 1)
        df_stamp['hour'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.hour, 1)
        df_stamp['minute'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.minute, 1)
        df_stamp['second'] = df_stamp[DATETIME_COLUMN].apply(lambda row: row.second, 1)
        # self.time_features = df_stamp.drop([DATETIME_COLUMN], 1).values

In [6]:
def generate_time_features(data):
    # Extract time-related features from the timestamp column
    timestamps = pd.to_datetime(data[DATETIME_COLUMN])
    time_features = pd.DataFrame({
        # "second_sin": np.sin(2 * np.pi * timestamps.dt.second / 24),
        # "second_cos": np.cos(2 * np.pi * timestamps.dt.second / 24),
        # "minute_sin": np.sin(2 * np.pi * timestamps.dt.minute / 24),
        # "minute_cos": np.cos(2 * np.pi * timestamps.dt.minute / 24),
        # "hour_sin": np.sin(2 * np.pi * timestamps.dt.hour / 24),
        # "hour_cos": np.cos(2 * np.pi * timestamps.dt.hour / 24),
        # "day_sin": np.sin(2 * np.pi * timestamps.dt.day / timestamps.dt.days_in_month),
        # "day_cos": np.cos(2 * np.pi * timestamps.dt.day / timestamps.dt.days_in_month),
        # "month_sin": np.sin(2 * np.pi * timestamps.dt.month / 12),
        # "month_cos": np.cos(2 * np.pi * timestamps.dt.month / 12),
        'month': timestamps.dt.month,
        'day': timestamps.dt.day,
        'weekday': timestamps.dt.weekday,
        'hour': timestamps.dt.hour,
        'minute': timestamps.dt.minute,
        'second': timestamps.dt.second,
    })
    return time_features

def preprocess_data(data, feature_columns, sequence_length, label_length, prediction_length, test_size=0.2):
    # Sort by timestamp if necessary
    data = data.sort_values(DATETIME_COLUMN)

    # Normalize the features and target
    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()

    df_features = data[feature_columns].dropna()
    
    features = df_features.drop(columns=[DATETIME_COLUMN, TARGET_COLUMN]).values
    # features = data[feature_columns].values
    target = df_features[TARGET_COLUMN].values.reshape(-1, 1)

    features_normalized = scaler_features.fit_transform(features)
    target_normalized = scaler_target.fit_transform(target)
    data_normalized = np.concatenate([features_normalized, target_normalized], axis=1)
    
    time_features = generate_time_features(df_features)
    NUM_TIME_FEATURES = time_features.shape[1]
    
    if test_size == 1:
        test_dataset = TimeSeriesDataset(
            data_normalized,
            # target_normalized[~isnan.any(axis=1)],
            time_features,
            sequence_length=sequence_length,
            prediction_length=prediction_length,
            label_length=label_length,
        )
        return None, test_dataset, scaler_features, scaler_target, NUM_TIME_FEATURES
    
    (
        train_features, 
        val_features, 
        # train_target, 
        # val_target, 
        train_time_features, 
        val_time_features
    ) = (
        train_test_split(
            data_normalized, 
            # target_normalized[~isnan.any(axis=1)], 
            time_features, 
            test_size=test_size, 
            shuffle=False
        )
    )
    
    train_dataset = TimeSeriesDataset(
        train_features,
        # train_target,
        train_time_features,
        sequence_length=sequence_length,
        prediction_length=prediction_length,
        label_length=label_length,
    )
    val_dataset = TimeSeriesDataset(
        val_features,
        # val_target,
        val_time_features,
        sequence_length=sequence_length,
        prediction_length=prediction_length,
        label_length=label_length,
    )

    return train_dataset, val_dataset, scaler_features, scaler_target, NUM_TIME_FEATURES

# def inverse_transform(data, predictions, feature_columns, scaler):
#     features_dict = dict([key, data[key]] for key in feature_columns if key != TARGET_COLUMN)
#     df = pd.DataFrame({
#         **features_dict,
#         TARGET_COLUMN: predictions,
#     })
#     return pd.DataFrame(scaler.inverse_transform(df), columns=df.columns)

In [7]:
data = DataUtils.load_parquet(TRAIN_DATA_FILE_NAME)
feature_columns = [
    # 'best_profit_rate', 
    'variance', 
    'worst_profit_rate_before', 
    'opening_price', 
    'high_price', 
    'mid_price', 
    'low_price', 
    'candle_acc_trade_volume', 
    # 'diff_opening_price',
    # 'diff_high_price',
    # 'diff_mid_price',
    # 'diff_low_price', 
    # 'diff_candle_acc_trade_volume',
    'timedelta_after',
    DATETIME_COLUMN,
    # 'best_profit_rate',
    TARGET_COLUMN,
]
# dataset = preprocess_data(data, feature_columns, 'best_profit_rate', 60, 10)
  # data[[
  #       'variance', 
  #       # 'best_profit_rate_before',
  #       'worst_profit_rate_before', 
  #       'opening_price', 
  #       'high_price', 
  #       'mid_price', 
  #       'low_price', 
  #       'candle_acc_trade_volume', 
  #       # 'diff_opening_price',
  #       # 'diff_high_price',
  #       # 'diff_mid_price',
  #       # 'diff_low_price', 
  #       # 'diff_candle_acc_trade_volume',
  #       'timedelta_after',
  #     ]],
  #     data[['best_profit_rate']]

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1s_2000000_2025-01-05T18:19:37+09:00.parquet_20250105181937.parquet


In [8]:
train_dataset, valid_dataset, _, _, NUM_TIME_FEATURES = preprocess_data(
  data,
  feature_columns,
  sequence_length=SEQUENCE_LENGTH,
  prediction_length=PREDICTION_LENGTH,
  label_length=LABEL_LENGTH,
)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [25]:

@dataclass
class DLinearConfig:
  individual: bool = False
  seq_len: int = SEQUENCE_LENGTH
  pred_len: int = PREDICTION_LENGTH
  label_len: int = LABEL_LENGTH
  embed_type: int = 0
  enc_in: int = 9
  dec_in: int = 9
  c_out: int = 9
  d_model: int = 1024
  n_heads: int = 12
  e_layers: int = 12
  d_layers: int = 12
  d_ff: int = 2048
  moving_avg: int = 25
  factor: int = 1
  distill: bool = True
  dropout: float = 0.1
  activation: str = 'gelu'
  output_attention: bool = False
  embed: str = 'timeF'
  do_predict: bool = False #whether to predict unseen future data
  freq: str = 'ex' 
  

configs = DLinearConfig()
model = DLinearModel(configs)

f_dim = 0
device = torch.device(DEVICE)

model.to(device)

Model(
  (decompsition): series_decomp(
    (moving_avg): moving_avg(
      (avg): AvgPool1d(kernel_size=(25,), stride=(1,), padding=(0,))
    )
  )
  (Linear_Seasonal): Linear(in_features=384, out_features=96, bias=True)
  (Linear_Trend): Linear(in_features=384, out_features=96, bias=True)
  (Linear_Decoder): Linear(in_features=384, out_features=96, bias=True)
)

In [26]:
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.MSELoss()

early_stopping = EarlyStopping(patience=5, verbose=True)

In [None]:
best_valid_loss = np.inf
learning_rate = LEARNING_RATE

for epoch in range(EPOCHS):
    # train_losses = []
    # valid_losses = []
    train_loss = 0
    valid_loss = 0
    
    model.train()
    
    for i, batch in enumerate(train_loader):
        print(f'\r[EPOCH {epoch + 1}] {i + 1}/{len(train_loader)}', end=' ')
        
        def _get(key):
            return batch[key].to(device)
        
        x = _get('x')
        y = _get('y')
        
        optimizer.zero_grad()
        outputs = model(x)
        
        outputs = outputs[:, -PREDICTION_LENGTH:, f_dim:]
        y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
        
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        # train_losses.append(loss.item())
        train_loss += loss.item()
        
    model.eval()
    
    with torch.no_grad():
        for i, batch in enumerate(valid_loader):
            def _get(key):
                return batch[key].to(device)
            
            x = _get('x')
            y = _get('y')
            
            optimizer.zero_grad()
            outputs = model(x)
            
            outputs = outputs[:, -PREDICTION_LENGTH:, f_dim:]
            y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
            
            loss = criterion(outputs.detach().cpu(), y.detach().cpu())
            # valid_losses.append(loss)
            valid_loss += loss.item()
            
    # train_loss = np.average(train_losses)
    # valid_loss = np.average(valid_losses)
    train_loss /= len(train_loader)
    valid_loss /= len(valid_loader)
    
    print(f'- Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}')
    
    # if valid_loss < best_valid_loss:
    #     best_valid_loss = valid_loss
    #     torch.save(model.state_dict(), '../model_checkpoints/DLinear.pth')
    #     print('(Best model)')
    # else:
    #     print()
       
    early_stopping(valid_loss, model, CHECKPOINT_PATH)
     
    if early_stopping.early_stop:
        print('Early stopping')
        break
        
    prev_learning_rate = learning_rate
    learning_rate *= (0.75 ** ((epoch - 1) // 1))
    print(f'Learning rate: {prev_learning_rate} --> {learning_rate}')

[EPOCH 1] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
Validation loss decreased (inf --> 0.000076).  Saving model ...
Learning rate: 0.0005 --> 0.001
[EPOCH 2] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
Validation loss decreased (0.000076 --> 0.000060).  Saving model ...
Learning rate: 0.001 --> 0.001
[EPOCH 3] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
EarlyStopping counter: 1 out of 5
Learning rate: 0.001 --> 0.0005
[EPOCH 4] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
EarlyStopping counter: 2 out of 5
Learning rate: 0.0005 --> 0.000125
[EPOCH 5] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
Validation loss decreased (0.000060 --> 0.000056).  Saving model ...
Learning rate: 0.000125 --> 1.5625e-05
[EPOCH 6] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
EarlyStopping counter: 1 out of 5
Learning rate: 1.5625e-05 --> 9.765625e-07
[EPOCH 7] 5898/5898 - Train Loss: 0.0001, Valid Loss: 0.0001
EarlyStopping counter: 2 out of 5
Learning rate: 9.765625

In [28]:
class Tester:
  
  def __init__(self, model):
    self.model = model
    # self.data = None
    self.load_data()
    
  def load_data(self):
    self.data = DataUtils.load_parquet(TEST_DATA_FILE_NAME)
    _, self.dataset, _, self.scaler_target, _ = preprocess_data(
      data=self.data,
      feature_columns=feature_columns,
      # target_column=TARGET_COLUMN,
      sequence_length=SEQUENCE_LENGTH,
      label_length=PREDICTION_LENGTH,
      prediction_length=LABEL_LENGTH, 
      test_size=1
    )
    
  def data_loader(self):
    return DataLoader(self.dataset, batch_size=1, shuffle=False, drop_last=False)
  
  def test(self):
    self.model.eval()
    
    test_losses = []
    
    with torch.no_grad():
      for i, batch in enumerate(self.data_loader()):
        def _get(key):
            return batch[key].to(device)
          
        x = _get('x')
        y = _get('y')
        
        optimizer.zero_grad()
        outputs = model(x)
        
        # outputs = outputs[:, -PREDICTION_LENGTH:, f_dim:]
        # y = y[:, -PREDICTION_LENGTH:, f_dim:].to(device)
        outputs = outputs[:, 0, f_dim:]
        y = y[:, 0, f_dim:].to(device)
        
        loss = criterion(outputs.detach().cpu(), y.detach().cpu())
        test_losses.append(loss)
      
    print(f'TEST LOSS: {np.average(test_losses)}')
    return np.average(test_losses)

  def test_sample(self):
    self.model.eval()
    
    output_list = []
    truth_list = []
    
    with torch.no_grad():
      for i, batch in enumerate(self.data_loader()):
        def _get(key):
            return batch[key].to(device)
            
        x = _get('x')
        y = _get('y')
        
        outputs = model(x)[:, 0, f_dim:]
        y = y[:, 0, f_dim:].to(device)
        
        output_list += list(outputs.detach().cpu().flatten().numpy())
        truth_list += list(y.detach().cpu().flatten().numpy())
      
    out = self.scaler_target.inverse_transform(np.array(output_list).reshape(-1, 1))
    truth = self.scaler_target.inverse_transform(np.array(truth_list).reshape(-1, 1))
    
    print(out.shape, truth.shape)
    
    return pd.DataFrame({
      # DATETIME_COLUMN: self.data[DATETIME_COLUMN],
      'output': out.flatten(),
      'truth': truth.flatten(),
      'diff': out.flatten() - truth.flatten(),
    })

In [29]:
model.load_state_dict(torch.load(f'{CHECKPOINT_PATH}/checkpoint.pth'))
tester = Tester(model)

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1s_2000_2025-01-12T23:21:27+09:00.parquet_20250112232127.parquet


In [30]:
tester.test()
df = tester.test_sample()

TEST LOSS: 0.049236007034778595
(12609, 1) (12609, 1)


In [31]:
df['abs_diff'] = df['diff'].abs()

In [32]:
_cat = pd.cut(df['output'], bins=[-np.inf, 0.003, np.inf], labels=['down', 'up'])
pd.DataFrame({
  'up': [_cat[_cat == 'up'].count()],
  # 'stay': [_cat[_cat == 'stay'].count()],
  'down': [_cat[_cat == 'down'].count()],
})

Unnamed: 0,up,down
0,6346,6263


In [33]:
df.loc[df['abs_diff'] == df['abs_diff'].max()]

Unnamed: 0,output,truth,diff,abs_diff
11139,-0.001729,0.010568,-0.012297,0.012297


In [34]:
df['diff'].describe()

count    12609.000000
mean         0.000257
std          0.002769
min         -0.012297
25%         -0.001241
50%          0.000106
75%          0.001757
max          0.011300
Name: diff, dtype: float64

In [35]:
df.sort_values('output', ascending=False)

Unnamed: 0,output,truth,diff,abs_diff
7266,0.010398,0.007771,0.002628,0.002628
7239,0.010387,0.007099,0.003287,0.003287
7265,0.010369,0.007771,0.002599,0.002599
7240,0.010347,0.007160,0.003187,0.003187
7238,0.010335,0.007099,0.003235,0.003235
...,...,...,...,...
11634,-0.002095,-0.001963,-0.000132,0.000132
4614,-0.002096,-0.001857,-0.000239,0.000239
12561,-0.002144,-0.001964,-0.000180,0.000180
6909,-0.002154,-0.001962,-0.000192,0.000192


In [36]:
def truth_vs_output(df, fig=None):
  if fig is None:
    fig = go.Figure()
    
  fig.add_trace(go.Scatter(x=df.index, y=df['truth'], mode='lines', name='Truth'))
  fig.add_trace(go.Scatter(x=df.index, y=df['output'], mode='lines', name='Output'))
  
  return fig

In [37]:
truth_vs_output(df)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [38]:
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=df.index, y=df['diff'], mode='lines', name='Differences'))
  fig

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [39]:
expect_to_buy = df.loc[df['output'] >= 0.003]
expect_to_buy[expect_to_buy['truth'] >= 0.003].values.shape[0] / len(expect_to_buy)

0.7259691144027735

In [40]:
expect_to_buy = df.loc[df['output'] >= 0.002]
expect_to_buy[expect_to_buy['truth'] >= 0.002].values.shape[0] / len(expect_to_buy)

0.7455820855254283

In [41]:
expect_to_buy = df.loc[df['output'] >= 0.0025]
expect_to_buy[expect_to_buy['truth'] >= 0.0025].values.shape[0] / len(expect_to_buy)

0.7253705318221447

In [42]:
expect_to_buy = df.loc[df['output'] >= 0.005]
expect_to_buy[expect_to_buy['truth'] >= 0.005].values.shape[0] / len(expect_to_buy)

0.7048579285059579

In [220]:
thresholds = [2e-4, 1e-4, 5e-5, 1e-5]
abs_diff = df['diff'].abs()

for threshold in thresholds:
    print(f'{threshold}: {len(abs_diff[abs_diff <= threshold]) / len(abs_diff)}')

0.0002: 0.07012847965738758
0.0001: 0.03499732334047109
5e-05: 0.017814656197953842
1e-05: 0.003643231025458006


In [265]:
model

Model(
  (decompsition): series_decomp(
    (moving_avg): moving_avg(
      (avg): AvgPool1d(kernel_size=(25,), stride=(1,), padding=(0,))
    )
  )
  (Linear_Seasonal): Linear(in_features=384, out_features=96, bias=True)
  (Linear_Trend): Linear(in_features=384, out_features=96, bias=True)
  (Linear_Decoder): Linear(in_features=384, out_features=96, bias=True)
)