In [46]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from dataclasses import dataclass
from typing import Optional

import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import root_mean_squared_error

In [39]:
DATETIME_COLUMN = 'candle_date_time_kst'
# TARGET_COLUMN = 'win_or_lose'

# CHECKPOINT_PATH = '../model_checkpoints/simple_time_features'
CHECKPOINT_PATH = '../model_checkpoints/xgboost_sec'
if not os.path.exists(CHECKPOINT_PATH):
  os.makedirs(CHECKPOINT_PATH)

TRAIN_DATA_FILE_NAME = 'IOTA_1m_3000000_2025-01-14T23:04:51+09:00.parquet_20250114230451.parquet'
# TEST_DATA_FILE_NAME = 'IOTA_1s_2000_2025-01-12T23:21:27+09:00.parquet_20250112232127.parquet'

# Parameters
# INPUT_LENGTH = 60  # Number of past time steps to use as input
# OUTPUT_LENGTH = 12  # Number of future time steps to predict
# BATCH_SIZE = 32
# LEARNING_RATE = 5e-4
# EPOCHS = 10

SEQUENCE_LENGTH = 24 * 4 * 4
PREDICTION_LENGTH = 24 * 4
LABEL_LENGTH = 24 * 4

BATCH_SIZE = 128
NUM_BATCHES_PER_EPOCH = 100
EPOCHS = 50
LEARNING_RATE = 5e-4
SCALING = 'std'

In [40]:
class DataUtils:
  
  default_path = os.path.join(Path(os.getcwd()).parent, 'data')
  
  @staticmethod
  def load_parquet(file_name: str, file_dir: Optional[str] = None):
    if not file_dir:
        file_dir = DataUtils.default_path
        
    path = os.path.join(file_dir, file_name)

    if not os.path.exists(path) or file_name.split('.')[-1] != 'parquet':
        return

    print(f'Loading parquet file from: {path}')

    return pd.read_parquet(path)
  
  @staticmethod
  def feature_engineering(df: Optional[pd.DataFrame]) -> pd.DataFrame:
    if df is None:
      return pd.DataFrame()

    df['return_1m'] = df['mid_price'].pct_change(1)
    df['return_5m'] = df['mid_price'].pct_change(5)
    df['return_10m'] = df['mid_price'].pct_change(10)

    # 2. 이동평균 Feature
    df['ma_5'] = df['mid_price'].rolling(window=5).mean()
    df['ma_10'] = df['mid_price'].rolling(window=10).mean()
    df['ma_30'] = df['mid_price'].rolling(window=30).mean()

    # 3. 이동 표준편차 Feature (변동성)
    df['std_5'] = df['mid_price'].rolling(window=5).std()
    df['std_10'] = df['mid_price'].rolling(window=10).std()

    # 4. 거래량 Feature
    df['volume_change_1m'] = df['candle_acc_trade_volume'].pct_change(1)
    df['volume_ma_5'] = df['candle_acc_trade_volume'].rolling(5).mean()
    df['volume_ratio'] = df['candle_acc_trade_volume'] / (df['volume_ma_5'] + 1e-9)

    # 5. 가격 구조 Feature
    df['high_low_spread'] = df['high_price'] - df['low_price']
    df['is_bullish'] = (df['mid_price'] > df['opening_price']).astype(int)
    df['body_size'] = np.abs(df['opening_price'] - df['mid_price'])
    df['body_to_range'] = df['body_size'] / (df['high_price'] - df['low_price'] + 1e-9)

    # 6. 기술적 지표 Feature

    # Relative Strength Index (RSI)
    window_length = 14
    delta = df['mid_price'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window_length).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window_length).mean()
    RS = gain / (loss + 1e-9)
    df['RSI'] = 100 - (100 / (1 + RS))

    # MACD
    ema12 = df['mid_price'].ewm(span=12, adjust=False).mean()
    ema26 = df['mid_price'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema12 - ema26
    df['MACD_signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # 7. 시간 정보 Feature (timestamp 있어야 가능)
    # 예: df['timestamp'] 가 datetime 타입이라고 가정
    if 'timestamp' in df.columns:
      df['timestamp'] = pd.to_datetime(df['timestamp'])
      df['hour'] = df['timestamp'].dt.hour
      df['minute'] = df['timestamp'].dt.minute
      df['dayofweek'] = df['timestamp'].dt.dayofweek

    # 8. Target 생성 (10분 후 수익률)
    prediction_horizon = 10
    df['target_return'] = (df['mid_price'].shift(-prediction_horizon) - df['mid_price']) / df['mid_price']

    # 9. NaN 제거
    df = df.dropna().reset_index(drop=True)
    
    return df

In [41]:
data = DataUtils.load_parquet(TRAIN_DATA_FILE_NAME)

Loading parquet file from: /Users/minjiwon/upbase-data-server/data/IOTA_1m_3000000_2025-01-14T23:04:51+09:00.parquet_20250114230451.parquet


In [47]:
data_pp = DataUtils.feature_engineering(data)

features = [
    'worst_profit_rate_before',
    'opening_price', 'high_price', 'low_price', 'mid_price',
    'candle_acc_trade_volume',
    'return_1m', 'return_5m', 'return_10m',
    'ma_5', 'ma_10', 'ma_30',
    'std_5', 'std_10',
    'volume_change_1m'
]

# 1. feature/target 설정
X = data_pp[features]
y = data_pp['target_return']

# 2. 시계열 Train/Validation/Test Split
train_ratio = 0.7
valid_ratio = 0.15
test_ratio = 0.15

# 분할 인덱스 계산
n_total = len(data_pp)
n_train = int(n_total * train_ratio)
n_valid = int(n_total * valid_ratio)
n_test = n_total - n_train - n_valid

# 데이터 분할
X_train = X.iloc[:n_train]
y_train = y.iloc[:n_train]

X_valid = X.iloc[n_train:n_train+n_valid]
y_valid = y.iloc[n_train:n_train+n_valid]

X_test = X.iloc[n_train+n_valid:]
y_test = y.iloc[n_train+n_valid:]

# 1. DMatrix 변환
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)
dtest = xgb.DMatrix(X_test, label=y_test)

# 2. 파라미터 설정
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 5,
    'learning_rate': 0.03,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'tree_method': 'hist',
    'seed': 42
}

# 3. 학습
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=100
)

# 4. 예측 및 평가
y_pred_test = model.predict(dtest)
test_rmse = root_mean_squared_error(y_test, y_pred_test)
print(f'✅ 최종 Test RMSE: {test_rmse:.6f}')

[0]	train-rmse:0.00889	valid-rmse:0.00551
[56]	train-rmse:0.00871	valid-rmse:0.00555
✅ 최종 Test RMSE: 0.006597
