In [94]:
! pip install ta



In [95]:
# Data preparation
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

import ta
from ta import add_all_ta_features
from ta.momentum import RSIIndicator, StochasticOscillator, WilliamsRIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.trend import MACD, ADXIndicator

from tqdm import tqdm

# For modeling
import math
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
from torch import optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import Dataset, DataLoader, Subset
import torch.nn.functional as F
# from pytorch_optimizer import SAM

# Prepare data

In [96]:
# 1. Data loading function
def load_data(file_path):
    """Read a CSV of price bars and return it ordered by timestamp.

    The file must provide ``Date`` and ``Time`` text columns. They are joined
    with a space, parsed into a single ``Datetime`` column, and then removed.

    Args:
        file_path: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        DataFrame sorted by ``Datetime``; the original row labels are kept.
    """
    raw = pd.read_csv(file_path)
    timestamp_text = raw['Date'] + ' ' + raw['Time']
    raw['Datetime'] = pd.to_datetime(timestamp_text)
    ordered = raw.sort_values('Datetime')
    return ordered.drop(['Date', 'Time'], axis=1)

# Directory holding the labelled splits; change this single constant to relocate the data.
DATA_DIR = "/kaggle/input/xauusd1m"

train_df = load_data(os.path.join(DATA_DIR, "dynamic_labeled_train.csv"))
val_df = load_data(os.path.join(DATA_DIR, "dynamic_labeled_dev.csv"))
test_df = load_data(os.path.join(DATA_DIR, "dynamic_labeled_test.csv"))

# Optional filter kept for reference: restricts the training rows to the years 2018-2020.
# train_df = train_df.loc[train_df['Datetime'].dt.year.isin(range(2018, 2021))]

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3446549 entries, 0 to 3446548
Data columns (total 7 columns):
 #   Column    Dtype         
---  ------    -----         
 0   Open      float64       
 1   High      float64       
 2   Low       float64       
 3   Close     float64       
 4   Volume    int64         
 5   Label     object        
 6   Datetime  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(1), object(1)
memory usage: 184.1+ MB


In [97]:
# Integer class id assigned to each textual trading signal.
label_mapping = dict(BUY=0, SELL=1, HOLD=2)


def map_label(x):
    """Return the class id for a known label; any other value is passed through unchanged."""
    return label_mapping.get(x, x)

# Encode the textual labels of every split in place.
for _split_df in (train_df, val_df, test_df):
    _split_df['Label'] = _split_df['Label'].map(map_label)

In [98]:
def add_technical_indicators(df):
    """Append technical-indicator columns computed with the ``ta`` package.

    Reads the ``High``, ``Low``, ``Close`` and ``Volume`` columns. Except for the
    explicit ``window=20`` on SMA/EMA, every indicator is built with the
    library's default parameters (no window arguments are passed).

    The columns are written onto ``df`` itself, so the caller's frame is
    modified. The order in which columns are added here determines the order of
    the feature columns derived from the frame later in the notebook.

    Args:
        df: DataFrame with the OHLCV columns listed above.

    Returns:
        The frame with the new columns and a fresh 0..n-1 index.
    """
    print("Thêm các chỉ báo kỹ thuật...")
    # Momentum
    df['RSI'] = ta.momentum.RSIIndicator(df['Close']).rsi()
    df['Momentum'] = ta.momentum.ROCIndicator(df['Close']).roc()
    # NOTE(review): the column is named 'CMO' but is filled by `ta.momentum.kama`
    # (presumably Kaufman's Adaptive Moving Average) — confirm the intended indicator
    # before renaming, since downstream feature names depend on this label.
    df['CMO'] = ta.momentum.kama(df['Close'])
    df['Williams_%R'] = ta.momentum.WilliamsRIndicator(df['High'], df['Low'], df['Close']).williams_r()
    # Volatility
    df['ATR'] = ta.volatility.AverageTrueRange(df['High'], df['Low'], df['Close']).average_true_range()
    bb = ta.volatility.BollingerBands(df['Close'])
    df['BB_Mid'] = bb.bollinger_mavg()
    df['BB_Upper'] = bb.bollinger_hband()
    df['BB_Lower'] = bb.bollinger_lband()
    df['BB_Bandwidth'] = bb.bollinger_wband()
    keltner = ta.volatility.KeltnerChannel(df['High'], df['Low'], df['Close'])
    df['KC_High'] = keltner.keltner_channel_hband()
    df['KC_Low'] = keltner.keltner_channel_lband()
    donchian = ta.volatility.DonchianChannel(df['High'], df['Low'], df['Close'])
    df['DC_High'] = donchian.donchian_channel_hband()
    df['DC_Low'] = donchian.donchian_channel_lband()
    # Trend
    df['SMA_20'] = ta.trend.SMAIndicator(df['Close'], window=20).sma_indicator()
    df['EMA_20'] = ta.trend.EMAIndicator(df['Close'], window=20).ema_indicator()
    df['DPO'] = ta.trend.DPOIndicator(df['Close']).dpo()
    macd = ta.trend.MACD(df['Close'])
    df['MACD'] = macd.macd()
    df['MACD_Hist'] = macd.macd_diff()
    df['Mass_Index'] = ta.trend.mass_index(df['High'], df['Low'])
    # Volume
    df['AD'] = ta.volume.AccDistIndexIndicator(df['High'], df['Low'], df['Close'], df['Volume']).acc_dist_index()
    df['CMF'] = ta.volume.ChaikinMoneyFlowIndicator(df['High'], df['Low'], df['Close'], df['Volume']).chaikin_money_flow()
    df['Force_Index'] = ta.volume.ForceIndexIndicator(df['Close'], df['Volume']).force_index()
    df['MFI'] = ta.volume.MFIIndicator(df['High'], df['Low'], df['Close'], df['Volume']).money_flow_index()
    df['OBV'] = ta.volume.OnBalanceVolumeIndicator(df['Close'], df['Volume']).on_balance_volume()

    print("Hoàn thành thêm chỉ báo.")
    return df.reset_index(drop=True)

# Enrich each split with the indicator columns. The function writes onto the frame it
# receives and returns a re-indexed frame, which is rebound to the same name.
train_df = add_technical_indicators(train_df)
val_df = add_technical_indicators(val_df)
test_df = add_technical_indicators(test_df)

train_df.info()

Thêm các chỉ báo kỹ thuật...
Hoàn thành thêm chỉ báo.
Thêm các chỉ báo kỹ thuật...


  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)
  return op(a, b)


Hoàn thành thêm chỉ báo.
Thêm các chỉ báo kỹ thuật...
Hoàn thành thêm chỉ báo.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3446549 entries, 0 to 3446548
Data columns (total 31 columns):
 #   Column        Dtype         
---  ------        -----         
 0   Open          float64       
 1   High          float64       
 2   Low           float64       
 3   Close         float64       
 4   Volume        int64         
 5   Label         int64         
 6   Datetime      datetime64[ns]
 7   RSI           float64       
 8   Momentum      float64       
 9   CMO           float64       
 10  Williams_%R   float64       
 11  ATR           float64       
 12  BB_Mid        float64       
 13  BB_Upper      float64       
 14  BB_Lower      float64       
 15  BB_Bandwidth  float64       
 16  KC_High       float64       
 17  KC_Low        float64       
 18  DC_High       float64       
 19  DC_Low        float64       
 20  SMA_20        float64       
 21  EMA_20        float64

In [99]:
def add_basic_features(df):
    """Convert OHLC prices to percentage changes and add rolling and time features.

    Steps, applied to ``df`` itself (the caller's frame is modified):

    * Each of ``Open``/``High``/``Low``/``Close`` that is present is replaced by
      its row-over-row percentage change (first row set to 0).
    * ``Cum_Return``: 20-row rolling sum of the converted ``Close`` column.
    * ``Cum_Turnover``: 20-row rolling sum of ``Volume``.
    * ``Hour``, ``Day_Of_Week``, ``Minute_Of_Day``: calendar fields of
      ``Datetime`` scaled to the range [0, 1].

    Missing inputs are tolerated: a rolling feature is only added when its
    source column exists, so a frame with a partial set of OHLC columns no
    longer raises ``KeyError``.

    Args:
        df: DataFrame, optionally containing OHLC, ``Volume`` and ``Datetime``.

    Returns:
        The frame with the new columns and a fresh 0..n-1 index.
    """
    print("Áp dụng tiền xử lý...")
    # Percentage change for whichever OHLC columns are available.
    cols_to_pct = ['Open', 'High', 'Low', 'Close']
    existing_cols = [col for col in cols_to_pct if col in df.columns]
    if existing_cols:
        print(f"Tính phần trăm thay đổi cho: {existing_cols}")
        df[existing_cols] = df[existing_cols].pct_change().fillna(0) * 100
        # The rolling features are only defined when their source column exists.
        if 'Close' in df.columns:
            df['Cum_Return'] = df['Close'].rolling(window=20).sum()
        if 'Volume' in df.columns:
            df['Cum_Turnover'] = df['Volume'].rolling(window=20).sum()
    else:
        print("Cảnh báo: Không tìm thấy cột OHLC để tính pct_change.")

    # Time-of-day / day-of-week features, each divided by its maximum value.
    if 'Datetime' in df.columns:
        print("Thêm đặc trưng thời gian (hour, day_of_week, minute_of_day, index)...")
        df['Hour'] = df['Datetime'].dt.hour / 23.0
        df['Day_Of_Week'] = df['Datetime'].dt.dayofweek / 6.0
        df['Minute_Of_Day'] = (df['Datetime'].dt.hour * 60 + df['Datetime'].dt.minute) / 1439.0
    else:
        print("Cảnh báo: Không thể tạo đặc trưng thời gian do thiếu cột 'Datetime'.")

    print("Hoàn thành tiền xử lý.")
    return df.reset_index(drop=True)

# Apply the percentage-change / rolling / time features to each split. This runs after the
# indicator cell, so the indicators above were computed on the original price levels.
train_df = add_basic_features(train_df)
val_df = add_basic_features(val_df)
test_df = add_basic_features(test_df)
train_df.info()

Áp dụng tiền xử lý...
Tính phần trăm thay đổi cho: ['Open', 'High', 'Low', 'Close']
Thêm đặc trưng thời gian (hour, day_of_week, minute_of_day, index)...
Hoàn thành tiền xử lý.
Áp dụng tiền xử lý...
Tính phần trăm thay đổi cho: ['Open', 'High', 'Low', 'Close']
Thêm đặc trưng thời gian (hour, day_of_week, minute_of_day, index)...
Hoàn thành tiền xử lý.
Áp dụng tiền xử lý...
Tính phần trăm thay đổi cho: ['Open', 'High', 'Low', 'Close']
Thêm đặc trưng thời gian (hour, day_of_week, minute_of_day, index)...
Hoàn thành tiền xử lý.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3446549 entries, 0 to 3446548
Data columns (total 36 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Open           float64       
 1   High           float64       
 2   Low            float64       
 3   Close          float64       
 4   Volume         int64         
 5   Label          int64         
 6   Datetime       datetime64[ns]
 7   RSI            float64       
 8   M

In [100]:
def drop_na_cols(df, threshold=0.01):
    """Drop, in place, every column whose missing-value count exceeds a share of the rows.

    A column is removed when its number of NaN entries is strictly greater than
    ``len(df) * threshold``. Infinite values are not counted as missing here.

    Args:
        df: DataFrame to prune; it is modified in place.
        threshold: Maximum tolerated fraction of missing rows per column.

    Returns:
        The same ``df`` object, for call chaining.
    """
    # One vectorised pass over the whole frame instead of slicing it column by column.
    na_counts = df.isna().sum()
    too_sparse = (na_counts > len(df) * threshold).to_numpy()
    cnt = int(too_sparse.sum())
    if cnt:
        df.drop(columns=df.columns[too_sparse].unique(), inplace=True)
    print(f'Deleted {cnt} cols')
    return df

# 6. Missing-value handling function
def handle_missing_data(df, threshold=0.01):
    """Remove sparse columns and fill the remaining gaps without looking ahead.

    Processing order:

    1. ``drop_na_cols`` removes columns whose NaN share exceeds ``threshold``
       (this modifies the caller's frame).
    2. ``+inf`` / ``-inf`` are turned into NaN, also in place.
    3. Gaps are forward-filled, then back-filled, then any row that is still
       incomplete is dropped.

    Args:
        df: DataFrame ordered in time.
        threshold: Passed through to ``drop_na_cols``.

    Returns:
        A DataFrame without NaN or infinite values.
    """
    df = drop_na_cols(df, threshold)
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Forward-fill first so an interior gap reuses the last *past* observation.
    # Back-fill is only a fallback for leading NaNs, which have no earlier value;
    # back-filling first would copy future values into earlier rows.
    df = df.ffill().bfill()
    df = df.dropna()

    return df

# train_df
# Clean every split with the default 1% missing-value threshold for dropping columns.
train_df = handle_missing_data(train_df)
val_df = handle_missing_data(val_df)
test_df = handle_missing_data(test_df)
train_df.info()

Deleted 0 cols
Deleted 0 cols
Deleted 0 cols
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3446549 entries, 0 to 3446548
Data columns (total 36 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Open           float64       
 1   High           float64       
 2   Low            float64       
 3   Close          float64       
 4   Volume         int64         
 5   Label          int64         
 6   Datetime       datetime64[ns]
 7   RSI            float64       
 8   Momentum       float64       
 9   CMO            float64       
 10  Williams_%R    float64       
 11  ATR            float64       
 12  BB_Mid         float64       
 13  BB_Upper       float64       
 14  BB_Lower       float64       
 15  BB_Bandwidth   float64       
 16  KC_High        float64       
 17  KC_Low         float64       
 18  DC_High        float64       
 19  DC_Low         float64       
 20  SMA_20         float64       
 21  EMA_20         float64       
 2

In [101]:
def normalize_by_blocks(data, block_size):
    """Standardise the rows of ``data`` independently within consecutive row blocks.

    The input is cast to float32 and cut into chunks of ``block_size`` rows (the
    last chunk may be shorter). Inside a chunk every column is standardised with
    ``StandardScaler``; a column that is constant within the chunk, or a chunk of
    a single row, is only mean-centred. Any NaN or infinity left in a normalised
    chunk is replaced by 0.

    NOTE(review): each row is scaled with the statistics of its whole chunk,
    which includes rows that come after it in the chunk.

    Args:
        data: DataFrame or NumPy array of numeric values.
        block_size: Number of rows per chunk.

    Returns:
        A DataFrame with the input's columns and index when a DataFrame was
        given, otherwise a float32 array of the same shape.
    """
    print(f"Áp dụng normalize_by_blocks với block_size={block_size}...")
    if isinstance(data, pd.DataFrame):
        columns, index = data.columns, data.index
        values = data.values.astype('float32')
    else:
        columns = index = None
        values = data.astype('float32')

    result = np.zeros_like(values)
    total_rows = len(values)
    num_blocks = 0

    for start_idx in range(0, total_rows, block_size):
        end_idx = min(start_idx + block_size, total_rows)
        block = values[start_idx:end_idx]
        if block.shape[0] == 0:
            continue

        if block.shape[0] == 1:
            # A single row has no spread: centring is all that can be done.
            normalized_block = block - np.mean(block, axis=0)
        else:
            std_devs = np.std(block, axis=0)
            constant_cols = std_devs == 0
            if not np.any(constant_cols):
                normalized_block = StandardScaler().fit_transform(block)
            else:
                varying_cols = std_devs != 0
                normalized_block = np.zeros_like(block)
                if np.any(varying_cols):
                    varying_part = block[:, varying_cols]
                    normalized_block[:, varying_cols] = StandardScaler().fit(varying_part).transform(varying_part)
                constant_part = block[:, constant_cols]
                normalized_block[:, constant_cols] = constant_part - np.mean(constant_part, axis=0)

        if not np.isfinite(normalized_block).all():
            normalized_block = np.nan_to_num(normalized_block, nan=0.0, posinf=0.0, neginf=0.0)

        result[start_idx:end_idx] = normalized_block
        num_blocks += 1

    print(f"Hoàn thành normalize_by_blocks. Đã xử lý {num_blocks} khối.")
    if columns is None or index is None:
        return result
    return pd.DataFrame(result, columns=columns, index=index)

# Định nghĩa các cột đặc trưng
# Every column except the target and the raw timestamp is used as a model input.
_non_feature_cols = ('Label', 'Datetime')
feature_cols = [col for col in train_df.columns if col not in _non_feature_cols]
print(f"Sử dụng {len(feature_cols)} cột đặc trưng: {feature_cols}")

# Standardise each split on its own, in consecutive blocks of BLOCK_SIZE rows.
# NOTE(review): the time columns (Hour, Day_Of_Week, Minute_Of_Day) are part of
# feature_cols, so they are block-normalised too, and the dataset class later reads
# 'Minute_Of_Day' from these normalised frames — confirm this is intended.
BLOCK_SIZE = 128
for _frame in (train_df, val_df, test_df):
    _frame[feature_cols] = normalize_by_blocks(_frame[feature_cols], BLOCK_SIZE)



Sử dụng 34 cột đặc trưng: ['Open', 'High', 'Low', 'Close', 'Volume', 'RSI', 'Momentum', 'CMO', 'Williams_%R', 'ATR', 'BB_Mid', 'BB_Upper', 'BB_Lower', 'BB_Bandwidth', 'KC_High', 'KC_Low', 'DC_High', 'DC_Low', 'SMA_20', 'EMA_20', 'DPO', 'MACD', 'MACD_Hist', 'Mass_Index', 'AD', 'CMF', 'Force_Index', 'MFI', 'OBV', 'Cum_Return', 'Cum_Turnover', 'Hour', 'Day_Of_Week', 'Minute_Of_Day']
Áp dụng normalize_by_blocks với block_size=128...
Hoàn thành normalize_by_blocks. Đã xử lý 26927 khối.
Áp dụng normalize_by_blocks với block_size=128...
Hoàn thành normalize_by_blocks. Đã xử lý 2743 khối.
Áp dụng normalize_by_blocks với block_size=128...
Hoàn thành normalize_by_blocks. Đã xử lý 8808 khối.


In [102]:
train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime', 'RSI',
       'Momentum', 'CMO', 'Williams_%R', 'ATR', 'BB_Mid', 'BB_Upper',
       'BB_Lower', 'BB_Bandwidth', 'KC_High', 'KC_Low', 'DC_High', 'DC_Low',
       'SMA_20', 'EMA_20', 'DPO', 'MACD', 'MACD_Hist', 'Mass_Index', 'AD',
       'CMF', 'Force_Index', 'MFI', 'OBV', 'Cum_Return', 'Cum_Turnover',
       'Hour', 'Day_Of_Week', 'Minute_Of_Day'],
      dtype='object')

In [103]:
class PreprocessedStockDataset(Dataset):
    """Sliding-window dataset over an already preprocessed price frame.

    Item ``i`` consists of the feature rows ``i .. i + sequence_length - 1``,
    the matching time inputs, and the label stored at row
    ``i + sequence_length`` (the row immediately after the window).

    Args:
        df: Source frame.
        sequence_length: Number of rows per window.
        feature_cols: Candidate feature names; names absent from ``df`` are ignored.
        use_time2vec: When true and a numeric ``Minute_Of_Day`` column exists,
            it is used as the time input; otherwise the time input is all zeros.

    Raises:
        ValueError: If ``df`` does not have more rows than ``sequence_length``.
    """

    def __init__(self, df, sequence_length, feature_cols, use_time2vec=True):
        self.sequence_length = sequence_length
        self.feature_cols = [col for col in feature_cols if col in df.columns]
        self.use_time2vec = use_time2vec
        self.has_time_input = False

        print(f"Khởi tạo PreprocessedStockDataset với {len(self.feature_cols)} đặc trưng.")

        # Time input: zeros unless a usable minute-of-day column is requested and present.
        self.time_features = np.zeros((len(df), 1), dtype=np.float32)
        if self.use_time2vec:
            minute_usable = "Minute_Of_Day" in df.columns and pd.api.types.is_numeric_dtype(df['Minute_Of_Day'])
            if minute_usable:
                print("Sử dụng 'minute_of_day' cho Time2Vec.")
                self.time_features = df["Minute_Of_Day"].values[:, np.newaxis].astype(np.float32)
                self.has_time_input = True
            else:
                print("Cảnh báo: Không tìm thấy 'minute_of_day'. Time2Vec sẽ dùng giá trị 0.")

        self.features = df[self.feature_cols].values.astype(np.float32)

        # Labels: fall back to dummy zeros when the column is missing or not numeric.
        label_usable = 'Label' in df.columns and pd.api.types.is_numeric_dtype(df['Label'])
        if not label_usable:
            print("Cảnh báo: Không tìm thấy cột 'Label'. Tạo nhãn giả (0).")
            self.labels = np.zeros(len(df), dtype=np.int64)
        else:
            self.labels = df['Label'].values.astype(np.int64)
            print(f"Kích thước - Features: {self.features.shape}, Labels: {self.labels.shape}, Time: {self.time_features.shape}")

        if len(self.features) <= self.sequence_length:
            raise ValueError(f"Độ dài DataFrame ({len(self.features)}) phải lớn hơn sequence_length ({self.sequence_length}).")

    def __len__(self):
        """Number of complete windows that still have a label row after them."""
        return len(self.features) - self.sequence_length

    def __getitem__(self, idx):
        """Return ``(features, time_input, label)`` tensors for window ``idx``."""
        stop = idx + self.sequence_length
        window = slice(idx, stop)
        feature_tensor = torch.from_numpy(self.features[window]).float()
        time_tensor = torch.from_numpy(self.time_features[window]).float()
        target = torch.tensor(self.labels[stop]).long()
        return feature_tensor, time_tensor, target

# Tạo Dataset
# Windowing configuration shared by all three splits.
USE_TIME2VEC = True
SEQ_LEN = 128
print("\n--- Tạo Dataset và DataLoader ---")
train_dataset, val_dataset, test_dataset = (
    PreprocessedStockDataset(_split, SEQ_LEN, feature_cols, use_time2vec=USE_TIME2VEC)
    for _split in (train_df, val_df, test_df)
)

# Only the training loader shuffles; evaluation loaders use a doubled batch size.
BATCH_SIZE = 256
_loader_opts = dict(num_workers=2, pin_memory=torch.cuda.is_available())
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2, shuffle=False, **_loader_opts)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False, **_loader_opts)


--- Tạo Dataset và DataLoader ---
Khởi tạo PreprocessedStockDataset với 34 đặc trưng.
Sử dụng 'minute_of_day' cho Time2Vec.
Kích thước - Features: (3446549, 34), Labels: (3446549,), Time: (3446549, 1)
Khởi tạo PreprocessedStockDataset với 34 đặc trưng.
Sử dụng 'minute_of_day' cho Time2Vec.
Kích thước - Features: (350984, 34), Labels: (350984,), Time: (350984, 1)
Khởi tạo PreprocessedStockDataset với 34 đặc trưng.
Sử dụng 'minute_of_day' cho Time2Vec.
Kích thước - Features: (1127367, 34), Labels: (1127367,), Time: (1127367, 1)


In [104]:
# Kiểm tra
# Sanity check: draw one training batch and report the tensor shapes.
print("Number of batches:", len(train_loader))
sample_batch = next(iter(train_loader))
_captions = ("Batch input shape:", "Batch times shape:", "Batch labels shape:")
for _caption, _part in zip(_captions, sample_batch):
    print(_caption, _part.shape)
_first_window = sample_batch[0][0]
print("\nExample input shape for Transformer:", _first_window.shape)
print(_first_window)

Number of batches: 13463
Batch input shape: torch.Size([256, 128, 34])
Batch times shape: torch.Size([256, 128, 1])
Batch labels shape: torch.Size([256])

Example input shape for Transformer: torch.Size([128, 34])
tensor([[ 0.6224,  0.8578,  0.7255,  ...,  0.9812,  0.0000,  0.7443],
        [ 0.5433,  0.2644, -0.0107,  ...,  0.9812,  0.0000,  0.7713],
        [-0.3728, -0.6069, -0.3159,  ...,  0.9812,  0.0000,  0.7984],
        ...,
        [-0.9408, -0.0384, -0.0298,  ...,  0.6713,  0.0000,  0.6631],
        [ 0.5465, -0.0837,  0.2351,  ...,  0.6713,  0.0000,  0.6901],
        [-0.2076, -0.5815, -2.3689,  ...,  0.6713,  0.0000,  0.7172]])


# Modeling

In [105]:
class InceptionModule(nn.Module):
    """Four parallel 1-D convolution branches whose outputs are stacked on the channel axis.

    Branches: kernel sizes 1, 3 and 5 (all with 'same' padding) plus a
    max-pool (kernel 3, stride 1) followed by a kernel-1 convolution. Each
    branch emits 32 channels, so the module outputs 128 channels and keeps the
    sequence length.

    Args:
        in_channels: Number of input channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        branch_width = 32
        self.branch1 = nn.Conv1d(in_channels, branch_width, kernel_size=1, padding='same')
        self.branch3 = nn.Conv1d(in_channels, branch_width, kernel_size=3, padding='same')
        self.branch5 = nn.Conv1d(in_channels, branch_width, kernel_size=5, padding='same')
        self.branch_pool = nn.Sequential(
            nn.MaxPool1d(kernel_size=3, stride=1, padding=1),
            nn.Conv1d(in_channels, branch_width, kernel_size=1),
        )

    def forward(self, x):
        """Apply every branch to ``x`` ([batch, channels, length]) and concatenate on dim 1."""
        branches = (self.branch1, self.branch3, self.branch5, self.branch_pool)
        return torch.cat([branch(x) for branch in branches], dim=1)

class Time2Vec(nn.Module):
    """Time embedding made of one linear component and ``kernel_dim - 1`` further components.

    The published Time2Vec formulation passes the non-linear components through
    a periodic function (sine). This implementation historically applied no
    activation at all, which makes the whole embedding linear. To stay
    compatible with checkpoints trained that way, the activation is optional
    and off by default; pass ``periodic_activation=torch.sin`` for the
    canonical behaviour when training a new model.

    Args:
        time_dim: Size of the last axis of the time input.
        kernel_dim: Total embedding size (linear + periodic components).
        periodic_activation: Optional callable applied to the periodic
            components; ``None`` keeps them linear (previous behaviour).
    """

    def __init__(self, time_dim, kernel_dim=32, periodic_activation=None):
        super().__init__()
        self.linear = nn.Linear(time_dim, 1)
        self.periodic = nn.Linear(time_dim, kernel_dim - 1)
        # Plain callable, not a sub-module: it adds no parameters, so state_dict keys are unchanged.
        self.periodic_activation = periodic_activation

    def forward(self, x):
        """Embed ``x`` ([..., time_dim]) into [..., kernel_dim]."""
        linear = self.linear(x)
        periodic = self.periodic(x)
        if self.periodic_activation is not None:
            periodic = self.periodic_activation(periodic)
        return torch.cat([linear, periodic], dim=-1)

class CrossAttentionFusion(nn.Module):
    """Single-query scaled dot-product attention from a CNN summary onto a token sequence.

    The CNN vector is projected to one query; keys and values are projections
    of the transformer tokens. The output is the attention-weighted sum of the
    values.

    Args:
        cnn_dim: Size of the CNN feature vector.
        transformer_dim: Size of each transformer token.
    """

    def __init__(self, cnn_dim, transformer_dim):
        super().__init__()
        self.query = nn.Linear(cnn_dim, transformer_dim)
        self.key = nn.Linear(transformer_dim, transformer_dim)
        self.value = nn.Linear(transformer_dim, transformer_dim)

    def forward(self, cnn_features, transformer_features):
        """Fuse ``cnn_features`` [batch, cnn_dim] with ``transformer_features`` [batch, seq_len, dim]."""
        query = self.query(cnn_features).unsqueeze(1)   # [batch, 1, dim]
        keys = self.key(transformer_features)           # [batch, seq_len, dim]
        values = self.value(transformer_features)       # [batch, seq_len, dim]

        scale = keys.size(-1) ** 0.5
        scores = torch.matmul(query, keys.transpose(-2, -1)) / scale   # [batch, 1, seq_len]
        weights = scores.softmax(dim=-1)

        pooled = torch.bmm(weights, values)             # [batch, 1, dim]
        return pooled.squeeze(1)

class HighwayNetwork(nn.Module):
    """Gated blend of two equally sized vectors.

    A sigmoid gate computed from ``fused`` decides, per element, how much of
    ``fused`` versus ``transformer`` reaches the output.

    Args:
        d_model: Size of both inputs and of the output.
    """

    def __init__(self, d_model):
        super().__init__()
        self.gate = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.Sigmoid(),
        )

    def forward(self, fused, transformer):
        """Return ``g * fused + (1 - g) * transformer`` with ``g = gate(fused)``."""
        carry = self.gate(fused)
        from_fused = carry * fused
        from_transformer = (1 - carry) * transformer
        return from_fused + from_transformer

class EnhancedHybridModel(nn.Module):
    """Classifier combining an Inception-style CNN branch with a transformer branch.

    * CNN branch: two ``InceptionModule`` stages with a stride-2 max-pool in
      between, averaged over the sequence axis into a 128-dim vector.
    * Transformer branch: a ``Time2Vec`` embedding of the time input and a
      linear projection of the features, each ``d_model // 2`` wide, are
      concatenated and passed through a transformer encoder.
    * Fusion: the CNN vector attends over the encoder tokens
      (``CrossAttentionFusion``) and the result is gated against the mean
      encoder token (``HighwayNetwork``) before the classification head.

    Args:
        num_features: Number of input features per time step.
        time_dim: Size of the time input's last axis.
        num_classes: Number of output classes.
        d_model: Transformer width.
        nhead: Number of attention heads.
        dim_feedforward: Hidden size of the encoder feed-forward layers.
        num_layers: Number of encoder layers.
    """

    def __init__(self, num_features, time_dim, num_classes=3, d_model=512, nhead=16, dim_feedforward=1024, num_layers=6):
        super().__init__()
        half_dim = d_model // 2
        cnn_channels = 128  # four Inception branches of 32 channels each

        # 1. Convolutional (Inception) branch
        self.inception = nn.Sequential(
            InceptionModule(num_features),
            nn.ReLU(),
            nn.MaxPool1d(2),
            InceptionModule(cnn_channels),
            nn.ReLU(),
        )

        # 2. Transformer branch: time embedding + feature projection, each half of d_model
        self.time2vec = Time2Vec(time_dim, half_dim)
        self.transformer_proj = nn.Linear(num_features, half_dim)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, batch_first=True
            ),
            num_layers=num_layers,
        )

        # 3. Fusion of the two branches
        self.cross_attention = CrossAttentionFusion(cnn_channels, d_model)
        self.highway = HighwayNetwork(d_model)

        # 4. Classification head
        self.classifier = nn.Sequential(
            nn.Linear(d_model, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, x, time_feature):
        """Return class logits for ``x`` [batch, seq_len, features] and ``time_feature`` [batch, seq_len, time_dim]."""
        # CNN path works channels-first; average over the (halved) sequence axis.
        channels_first = x.permute(0, 2, 1)
        cnn_summary = self.inception(channels_first).mean(dim=-1)      # [batch, 128]

        # Transformer path on the concatenated time / feature embeddings.
        tokens = torch.cat([self.time2vec(time_feature), self.transformer_proj(x)], dim=-1)
        encoded = self.transformer(tokens)                             # [batch, seq_len, d_model]

        # Fuse, gate against the mean token, classify.
        attended = self.cross_attention(cnn_summary, encoded)          # [batch, d_model]
        merged = self.highway(attended, encoded.mean(dim=1))           # [batch, d_model]
        return self.classifier(merged)

# The feature count is read from the last axis of the sampled input batch.
N_FEATURES = sample_batch[0].size(-1)
model = EnhancedHybridModel(
    num_features=N_FEATURES,
    time_dim=1,
    num_classes=3,
    d_model=128,
    nhead=8,
    dim_feedforward=256,
    num_layers=3,
)

In [None]:
# print(model(sample_batch[0], sample_batch[1]).shape, sample_batch[2].shape)
# from torchinfo import summary
# print(summary(model, (BATCH_SIZE, SEQ_LEN, N_FEATURES)))

# Training

In [None]:
class EarlyStopping:
    """Track the validation loss, checkpoint the best model, and signal when to stop.

    Call the instance once per epoch with the model and its validation loss.
    A loss counts as an improvement when it is at least ``min_delta`` below
    the best loss seen so far; the very first finite loss always counts. Each
    improvement saves ``model.state_dict()`` to ``save_path`` — including the
    first epoch, so a checkpoint exists even if later epochs never improve.
    A NaN loss is never an improvement.

    Args:
        patience: Number of consecutive non-improving epochs tolerated before
            ``early_stop`` becomes true.
        min_delta: Minimum decrease of the loss that counts as an improvement.
        save_path: File the best ``state_dict`` is written to.

    Attributes (read by the training loop):
        early_stop: True once ``patience`` epochs in a row failed to improve.
        best_loss: Lowest validation loss recorded, or None before the first call.
        counter: Current number of consecutive non-improving epochs.
    """

    def __init__(self, patience=3, min_delta=0.001, save_path='best_model.pth'):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.save_path = save_path

    def __call__(self, model, val_loss):
        is_nan = val_loss != val_loss  # NaN is the only value not equal to itself
        improved = not is_nan and (
            self.best_loss is None or val_loss <= self.best_loss - self.min_delta
        )
        if improved:
            self.best_loss = val_loss
            self.counter = 0
            torch.save(model.state_dict(), self.save_path)
            return

        self.counter += 1
        print(f'EarlyStopping counter: {self.counter}/{self.patience}')
        if self.counter >= self.patience:
            self.early_stop = True
            
def eval_model(model, val_loader, criterion, device):
    """Evaluate ``model`` on ``val_loader`` without gradient tracking.

    Args:
        model: Network called as ``model(inputs, times)``.
        val_loader: Iterable of ``(inputs, times, labels)`` batches.
        criterion: Loss function applied to ``(outputs, labels)``.
        device: Device the model and every batch are moved to.

    Returns:
        ``(mean_batch_loss, accuracy_percent)`` — the loss is averaged over
        batches, the accuracy over samples.
    """
    model.to(device)
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    progress = tqdm(val_loader, unit='batch', desc='\tEvaluating: ')
    with torch.no_grad():
        for batch_no, (inputs, times, labels) in enumerate(progress, start=1):
            inputs = inputs.to(device)
            times = times.to(device)
            labels = labels.to(device)

            outputs = model(inputs, times)
            loss_sum += criterion(outputs, labels).item()

            predicted = outputs.data.max(dim=-1).indices
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
            progress.set_postfix(loss=(loss_sum / batch_no))

    return loss_sum / len(val_loader), 100 * n_correct / n_seen

def train_model(model, train_loader, criterion, optimizer, device):
    """Run one training epoch.

    For every batch: zero the gradients, run the forward pass, back-propagate,
    clip the global gradient norm to 1.0, and take an optimizer step.

    Args:
        model: Network called as ``model(inputs, times)``.
        train_loader: Iterable of ``(inputs, times, labels)`` batches.
        criterion: Loss function applied to ``(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the model and every batch are moved to.

    Returns:
        ``(mean_batch_loss, accuracy_percent)`` over the epoch.
    """
    model.to(device)
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    progress = tqdm(train_loader, unit='batch', desc=f'\tTraining: ')
    for batch_no, (inputs, times, labels) in enumerate(progress, start=1):
        inputs = inputs.to(device)
        times = times.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs, times)
        loss = criterion(outputs, labels)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        loss_sum += loss.item()
        predicted = outputs.data.max(dim=-1).indices
        n_seen += labels.size(0)
        n_correct += (predicted == labels).sum().item()
        progress.set_postfix(loss=(loss_sum / batch_no))

    return loss_sum / len(train_loader), 100 * n_correct / n_seen

In [None]:
# Train on the GPU when one is available.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Unweighted cross-entropy; the learning rate is reduced 10x after 2 epochs without
# validation-loss improvement.
# NOTE(review): `verbose` is reported as deprecated for LR schedulers in recent PyTorch
# releases — confirm against the installed version.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True)

# NOTE(review): PATIENCE equals NUM_EPOCHS, and the early-stopping counter can grow by at
# most one per epoch after the first, so early stopping cannot trigger within this run.
NUM_EPOCHS = 10
PATIENCE = 10
MIN_DELTA = 0.0001
SAVE_PATH = 'best_model.pth'
torch.cuda.empty_cache()

# model = nn.DataParallel(model)
    
# Per-epoch history, consumed by the plotting cell.
train_losses = []
train_accs = []
val_losses = []
val_accs = []

early_stopping = EarlyStopping(patience=PATIENCE, min_delta=MIN_DELTA, save_path=SAVE_PATH)
for epoch in range(NUM_EPOCHS):
    print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}]')
    
    train_loss, train_acc = train_model(model, train_loader, criterion, optimizer, DEVICE)
    val_loss, val_acc = eval_model(model, val_loader, criterion, DEVICE)
    
    print(f'\tTrain Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.2f}%')
    print(f'\tVal Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.2f}%')
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    val_losses.append(val_loss)
    val_accs.append(val_acc)
    
    # The plateau scheduler is driven by the validation loss.
    scheduler.step(val_loss)
    
    # Check early stopping (also the point where the best weights are written to SAVE_PATH).
    early_stopping(model, val_loss)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break
    print('===================================================')

# Evaluate

In [None]:
import matplotlib.pyplot as plt
def plot_training_results(train_losses, train_accs, val_losses, val_accs):
    """Plot training and validation curves side by side (loss on the left, accuracy on the right).

    Parameters:
    - train_losses: per-epoch training loss values
    - train_accs: per-epoch training accuracy values (percent)
    - val_losses: per-epoch validation loss values
    - val_accs: per-epoch validation accuracy values (percent)
    """
    epochs = range(1, len(train_losses) + 1)

    # One row, two panels.
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 8))

    # Left panel: loss curves.
    loss_ax.plot(epochs, train_losses, 'b-', label='Train Loss')
    loss_ax.plot(epochs, val_losses, 'r-', label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Loss')
    loss_ax.legend()
    loss_ax.grid(True)

    # Right panel: accuracy curves.
    acc_ax.plot(epochs, train_accs, 'g-', label='Train Accuracy')
    acc_ax.plot(epochs, val_accs, 'm-', label='Validation Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.set_title('Accuracy')
    acc_ax.legend()
    acc_ax.grid(True)

    # Tighten the layout and display.
    fig.tight_layout()
    plt.show()

# Draw the curves from the histories collected by the training loop.
plot_training_results(train_losses, train_accs, val_losses, val_accs)

In [None]:
def test_model(model, data_loader, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Print the confusion matrix and classification report of ``model`` on ``data_loader``.

    The class ids 0/1/2 (BUY/SELL/HOLD) are passed explicitly to the metric
    functions, so the matrix is always 3x3 and the report always has three
    rows, even when a class never occurs in the labels or predictions.

    Args:
        model: Network called as ``model(inputs, times)`` returning class logits.
        data_loader: Iterable of ``(inputs, times, labels)`` batches.
        device: Device used for inference.
    """
    model.to(device)
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        loop = tqdm(data_loader, unit='batch', desc=f'\tEvaluate metrics: ')
        for inputs, times, labels in loop:
            inputs, times = inputs.to(device), times.to(device)

            outputs = model(inputs, times)  # expected shape: [batch, num_classes]
            preds = torch.argmax(outputs, dim=-1)

            all_preds.extend(preds.cpu().tolist())
            # Labels are only needed on the host, so they are not moved to the device.
            all_labels.extend(labels.tolist())

    target_names = ['BUY', 'SELL', 'HOLD']
    class_ids = list(range(len(target_names)))
    cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
    print('Confusion matrix:')
    print(pd.DataFrame(cm, columns=target_names, index=target_names))
    print("Classification Report:")
    print(classification_report(all_labels, all_preds, labels=class_ids, target_names=target_names, digits=4, zero_division=0))

In [106]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained weights attached as a Kaggle input (this rebinds the SAVE_PATH used during training).
SAVE_PATH = '/kaggle/input/inception-transformers-trading/pytorch/default/1/best_model (2).pth'
# map_location lets a checkpoint saved on a GPU load on a CPU-only host as well.
model.load_state_dict(torch.load(SAVE_PATH, map_location=DEVICE, weights_only=True))

<All keys matched successfully>

In [None]:
# Metrics on the training split (train_loader iterates in shuffled order).
test_model(model, train_loader, DEVICE)

In [None]:
# Metrics on the validation split.
test_model(model, val_loader, DEVICE)

In [None]:
# Metrics on the held-out test split.
test_model(model, test_loader, DEVICE)

In [108]:
class PreprocessedStockWithDatetimeDataset(Dataset):
    """Sliding-window dataset that also returns the timestamp of each label row.

    Item ``i`` consists of the feature rows ``i .. i + sequence_length - 1``,
    the matching time inputs, the label stored at row ``i + sequence_length``,
    and that row's timestamp packed as the integer ``YYMMDDHHMM``.

    The timestamps are encoded with vectorised integer arithmetic; the result
    is the same integer that ``int(ts.strftime("%y%m%d%H%M"))`` yields per row.

    Args:
        df: Source frame.
        sequence_length: Number of rows per window.
        feature_cols: Candidate feature names; names absent from ``df`` are ignored.
        use_time2vec: When true and a numeric ``Minute_Of_Day`` column exists,
            it is used as the time input; otherwise the time input is all zeros.

    Raises:
        ValueError: If ``Datetime`` holds values that cannot be parsed, or if
            ``df`` does not have more rows than ``sequence_length``.
    """

    def __init__(self, df, sequence_length, feature_cols, use_time2vec=True):
        self.sequence_length = sequence_length
        self.feature_cols = [col for col in feature_cols if col in df.columns]
        self.use_time2vec = use_time2vec
        self.has_time_input = False

        print(f"Khởi tạo PreprocessedStockDataset với {len(self.feature_cols)} đặc trưng.")
        if self.use_time2vec:
            if "Minute_Of_Day" in df.columns and pd.api.types.is_numeric_dtype(df['Minute_Of_Day']):
                print("Sử dụng 'minute_of_day' cho Time2Vec.")
                self.time_features = df["Minute_Of_Day"].values[:, np.newaxis].astype(np.float32)
                self.has_time_input = True
            else:
                print("Cảnh báo: Không tìm thấy 'minute_of_day'. Time2Vec sẽ dùng giá trị 0.")
                self.time_features = np.zeros((len(df), 1), dtype=np.float32)
        else:
            self.time_features = np.zeros((len(df), 1), dtype=np.float32)

        self.features = df[self.feature_cols].values.astype(np.float32)
        self.labels = df['Label'].values.astype(np.int64) if 'Label' in df.columns else np.zeros(len(df), dtype=np.int64)

        if 'Datetime' in df.columns:
            parsed = pd.to_datetime(df['Datetime'], format="%Y-%m-%d %H:%M:%S", errors='coerce')
            if parsed.isna().any():
                # Fail with a clear message instead of crashing on NaT further down.
                raise ValueError("Column 'Datetime' contains values that could not be parsed as '%Y-%m-%d %H:%M:%S'.")
            parts = parsed.dt
            # Cast each field to int64 before scaling: the raw calendar fields may be
            # 32-bit, and YY * 10**8 does not fit in 32 bits.
            self.datetimes = (
                (parts.year.to_numpy(dtype=np.int64) % 100) * 10**8
                + parts.month.to_numpy(dtype=np.int64) * 10**6
                + parts.day.to_numpy(dtype=np.int64) * 10**4
                + parts.hour.to_numpy(dtype=np.int64) * 100
                + parts.minute.to_numpy(dtype=np.int64)
            )
        else:
            self.datetimes = np.zeros(len(df), dtype=np.int64)

        if len(self.features) <= self.sequence_length:
            raise ValueError(f"Độ dài DataFrame ({len(self.features)}) phải lớn hơn sequence_length ({self.sequence_length}).")

    def __len__(self):
        """Number of complete windows that still have a label row after them."""
        return len(self.features) - self.sequence_length

    def __getitem__(self, idx):
        """Return ``(features, time_input, label, encoded_datetime)`` tensors for window ``idx``."""
        end_idx = idx + self.sequence_length
        feat_seq = self.features[idx:end_idx]
        time_seq = self.time_features[idx:end_idx]
        label = self.labels[end_idx]
        datetime_encoded = self.datetimes[end_idx]
        return (
            torch.from_numpy(feat_seq).float(),
            torch.from_numpy(time_seq).float(),
            torch.tensor(label).long(),
            torch.tensor(datetime_encoded).long()
        )



def encode_datetime(dt):
    """Pack a timestamp into the integer ``YYMMDDHHMM`` (two-digit year, minute resolution).

    Leading zeros are lost in the integer form, e.g. 2009-01-01 12:30 becomes 901011230.
    """
    return int(dt.strftime("%y%m%d%H%M"))

def decode_datetime(encoded_dt):
    """Turn a ``yymmddHHMM`` integer code back into a ``datetime``.

    This is the inverse of ``encode_datetime``.

    Args:
        encoded_dt: The code as an int or a float (floats are rounded to the
            nearest integer first).

    Returns:
        datetime.datetime with the decoded year (two-digit, per ``%y``),
        month, day, hour and minute; seconds are zero.

    Raises:
        ValueError: If the digits do not form a valid ``%y%m%d%H%M`` timestamp.
    """
    # Local import so the helper works regardless of what the notebook imported.
    from datetime import datetime

    decoded_int = int(round(encoded_dt))
    # The integer form drops leading zeros (years 2000-2009 encode to 9 digits
    # or fewer). Without re-padding, strptime still matches the shorter string
    # but assigns the digits to the wrong fields, silently yielding a wrong
    # date, so restore the fixed 10-digit layout first.
    decoded_str = str(decoded_int).zfill(10)
    return datetime.strptime(decoded_str, "%y%m%d%H%M")

def get_prediction(model, inputs, times, device):
    """Run one inference pass and return the winning class and its probability.

    The model is moved to ``device`` and switched to eval mode (it is left in
    eval mode afterwards). It is called as ``model(inputs, times)`` under
    ``torch.no_grad()``; a softmax over the last dimension of its output gives
    class probabilities.

    Returns:
        (preds, probs) as NumPy arrays: the arg-max class index and the
        corresponding softmax probability along the last dimension.
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        logits = model(inputs.to(device), times.to(device))
        class_probs = torch.softmax(logits, dim=-1)
        top_prob, top_class = class_probs.max(dim=-1)

    return top_class.cpu().numpy(), top_prob.cpu().numpy()

def get_accuracy_per_batch(model, dataloader, device):
    """Compute accuracy figures separately for every batch of ``dataloader``.

    Each batch must begin with ``(inputs, times, labels)``. Any further
    elements (for example the encoded datetimes produced by datasets that
    return a fourth item) are ignored, so 3- and 4-element batches both work.

    The per-class figures are one-vs-rest agreement rates: a sample counts as
    correct for class ``c`` when ``label == c`` and ``pred == c`` are both
    true or both false, so true negatives count as hits. Class ids are
    0 = BUY, 1 = SELL, 2 = HOLD.

    Args:
        model: Model passed through to ``get_prediction``.
        dataloader: Iterable of batches as described above.
        device: Device passed through to ``get_prediction``.

    Returns:
        dict with keys 'BUY ACCURACY', 'SELL ACCURACY', 'HOLD ACCURACY' and
        'OVERALL ACCURACY', each a list holding one value per batch.
    """
    class_ids = {
        'BUY ACCURACY': 0,
        'SELL ACCURACY': 1,
        'HOLD ACCURACY': 2,
    }
    results = {name: [] for name in class_ids}
    results['OVERALL ACCURACY'] = []

    loop = tqdm(dataloader, desc='Testing', unit='batch')
    for batch in loop:
        # Only the first three elements are needed; tolerate extra ones.
        inputs, times, labels = batch[:3]
        labels = labels.cpu().numpy()
        preds, _ = get_prediction(model, inputs, times, device)
        batch_size = len(labels)

        for name, class_id in class_ids.items():
            agreement = (labels == class_id) == (preds == class_id)
            results[name].append(agreement.sum() / batch_size)
        results['OVERALL ACCURACY'].append((labels == preds).sum() / batch_size)

    return results

def convert2df(results, save_path):
    """Build a DataFrame from ``results``, write it to CSV, and return it.

    Args:
        results: Data accepted by ``pd.DataFrame`` (here: a dict of
            equal-length lists, one per column).
        save_path: Destination of the CSV file; the index is not written.

    Returns:
        The DataFrame that was saved.
    """
    frame = pd.DataFrame(results)
    frame.to_csv(save_path, index=False)
    return frame

def generate_predictions(model, dataloader, device):
    """Collect per-sample predictions for every batch of ``dataloader``.

    Each batch must unpack into ``(inputs, times, labels, datetimes)``, where
    ``datetimes`` holds integer codes that ``decode_datetime`` can decode.

    Returns:
        dict of four equally long lists:
        'DATETIME' (decoded datetime objects), 'GROUNDTRUTH' (labels),
        'PREDICTION' (predicted class ids) and 'PROBS' (probability of the
        predicted class).
    """
    stamps, truths, guesses, confidences = [], [], [], []

    for inputs, times, labels, datetimes in tqdm(dataloader, desc='Testing', unit='batch'):
        truth_np = labels.cpu().numpy()
        stamp_np = datetimes.cpu().numpy()
        batch_preds, batch_probs = get_prediction(model, inputs, times, device)

        # Turn each encoded timestamp back into a datetime object.
        stamps.extend(decode_datetime(float(code)) for code in stamp_np)
        truths.extend(truth_np)
        guesses.extend(batch_preds)
        confidences.extend(batch_probs)

    return {
        'DATETIME': stamps,
        'GROUNDTRUTH': truths,
        'PREDICTION': guesses,
        'PROBS': confidences,
    }


In [24]:
# Per-batch accuracy for each split, written to a CSV in the working directory
# and kept as a DataFrame. `model`, `DEVICE` and the three loaders are not
# defined in this cell and must already exist in the kernel.
train_results_df = convert2df(get_accuracy_per_batch(model, train_loader, DEVICE), 'train_acc_per_batch.csv')
val_results_df = convert2df(get_accuracy_per_batch(model, val_loader, DEVICE), 'val_acc_per_batch.csv')
test_results_df = convert2df(get_accuracy_per_batch(model, test_loader, DEVICE), 'test_acc_per_batch.csv')

Testing: 100%|██████████| 13463/13463 [04:06<00:00, 54.65batch/s]
Testing: 100%|██████████| 686/686 [00:22<00:00, 29.91batch/s]
Testing: 100%|██████████| 2202/2202 [01:13<00:00, 30.14batch/s]


In [110]:
# Create the datasets
USE_TIME2VEC = True
SEQ_LEN = 128  # handed to the dataset constructor as its second positional argument
print("\n--- Tạo Dataset và DataLoader ---")
# `train_df`, `val_df`, `test_df` and `feature_cols` come from earlier cells.
train_dataset = PreprocessedStockWithDatetimeDataset(train_df, SEQ_LEN, feature_cols, use_time2vec=USE_TIME2VEC)
val_dataset = PreprocessedStockWithDatetimeDataset(val_df, SEQ_LEN, feature_cols, use_time2vec=USE_TIME2VEC)
test_dataset = PreprocessedStockWithDatetimeDataset(test_df, SEQ_LEN, feature_cols, use_time2vec=USE_TIME2VEC)

# shuffle=False keeps samples in dataset order; this cell rebinds
# train_loader / val_loader / test_loader for every cell that runs after it.
BATCH_SIZE = 256
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Validation and test loaders use twice the training batch size.
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE * 2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE * 2, shuffle=False)


# Disabled debug snippet: decode and print the datetimes of the first batch.
# sample = next(iter(train_loader))
# datetimes = sample[3].cpu().numpy()
# for dt in datetimes:
#     print(decode_datetime(dt))

# Run inference on every split and save one CSV of predictions per split.
# `model` and `DEVICE` are not defined in this cell and must already exist.
train_predictions = convert2df(generate_predictions(model, train_loader, DEVICE), 'train_predictions.csv')
val_predictions = convert2df(generate_predictions(model, val_loader, DEVICE), 'val_predictions.csv')
test_predictions = convert2df(generate_predictions(model, test_loader, DEVICE), 'test_predictions.csv')


--- Tạo Dataset và DataLoader ---
Khởi tạo PreprocessedStockDataset với 34 đặc trưng.
Sử dụng 'minute_of_day' cho Time2Vec.
Khởi tạo PreprocessedStockDataset với 34 đặc trưng.
Sử dụng 'minute_of_day' cho Time2Vec.
Khởi tạo PreprocessedStockDataset với 34 đặc trưng.
Sử dụng 'minute_of_day' cho Time2Vec.


Testing: 100%|██████████| 13463/13463 [06:00<00:00, 37.29batch/s]
Testing: 100%|██████████| 686/686 [00:35<00:00, 19.31batch/s]
Testing: 100%|██████████| 2202/2202 [01:54<00:00, 19.24batch/s]
