# 1 - Import requirements

In [1]:
# ! pip install ta

In [2]:
import os
import pandas as pd
import numpy as np

from ta import add_all_ta_features
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tqdm import tqdm

# 2 - Prepare data

In [3]:
# 1. Data loading helper
def load_data(file_path):
    """Read a CSV and return it ordered by a combined 'Datetime' column.

    The 'Date' and 'Time' columns are concatenated with a single space,
    parsed with ``pd.to_datetime`` into a new 'Datetime' column, and then
    removed from the result.

    Args:
        file_path: Passed straight to ``pd.read_csv``.

    Returns:
        DataFrame sorted ascending by 'Datetime', without 'Date' and 'Time'.
        The original row index is kept (it is not reset after sorting).
    """
    raw = pd.read_csv(file_path)
    timestamp_text = raw['Date'] + ' ' + raw['Time']
    raw['Datetime'] = pd.to_datetime(timestamp_text)
    ordered = raw.sort_values('Datetime')
    return ordered.drop(['Date', 'Time'], axis=1)

# Load the three labeled splits.
train_df = load_data("data/dynamic_labeled_train.csv")
val_df = load_data("data/dynamic_labeled_dev.csv")
test_df = load_data("data/dynamic_labeled_test.csv")

# Keep only training rows from 2018-2020 (range's upper bound is exclusive).
# .copy() detaches the filtered rows from the unfiltered frame, so the column
# assignments made in later cells act on an independent DataFrame instead of
# a flagged slice (which triggers SettingWithCopyWarning on pandas versions
# that do not use copy-on-write).
train_df = train_df.loc[train_df['Datetime'].dt.year.isin(range(2018, 2021))].copy()

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime'], dtype='object')

In [4]:
# Class distribution of the training labels (number of rows per label value).
train_df['Label'].value_counts()

Label
HOLD    514037
BUY     321644
SELL    214760
Name: count, dtype: int64

In [5]:
# Integer class ids used to encode the string values of the 'Label' column.
label_mapping = dict(BUY=0, SELL=1, HOLD=2)

def map_label(x):
    """Return the integer id for a known label; any other value is returned unchanged."""
    return label_mapping.get(x, x)

# Encode the string labels as integers in all three splits. Values that are
# not keys of label_mapping are passed through unchanged by map_label.
train_df['Label'] = train_df['Label'].map(map_label)
val_df['Label'] = val_df['Label'].map(map_label)
test_df['Label'] = test_df['Label'].map(map_label)

In [6]:
# 2. Basic price features
def add_basic_features(df):
    """Add simple per-bar price features to ``df`` in place and return it.

    Columns written (in this order):
        Price_Spread: High - Low
        Price_Change: Close - Open
        Body_Ratio:   (Close - Open) / (Price_Spread + 1e-8); the small
                      constant avoids division by zero when High == Low.
        Log_Return:   log(Close / previous Close); NaN for the first row.

    Returns:
        The same DataFrame object that was passed in.
    """
    open_price = df['Open']
    close_price = df['Close']

    spread = df['High'] - df['Low']
    body = close_price - open_price
    previous_close = close_price.shift(1)

    df['Price_Spread'] = spread
    df['Price_Change'] = body
    df['Body_Ratio'] = body / (spread + 1e-8)
    df['Log_Return'] = np.log(close_price / previous_close)
    return df

# Rebind each split explicitly (same pattern as the other feature cells) so
# this cell does not depend on add_basic_features mutating its argument in
# place; rebinding a loop variable would not update the three frames.
train_df = add_basic_features(train_df)
val_df = add_basic_features(val_df)
test_df = add_basic_features(test_df)

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return'],
      dtype='object')

In [7]:
# 3. Technical-indicator features
def add_technical_indicators(df):
    """Append RSI, MACD and Bollinger Band columns computed from 'Close'.

    Uses the `ta` library: RSIIndicator with window=14, MACD with the
    library's default arguments, and BollingerBands with window=20 and
    window_dev=2.

    Returns:
        A new DataFrame made of ``df`` followed by the columns 'RSI_14',
        'MACD', 'MACD_Signal', 'BB_Upper' and 'BB_Lower' (in that order).
    """
    close = df['Close']

    rsi_calc = RSIIndicator(close=close, window=14)
    macd_calc = MACD(close=close)
    bands = BollingerBands(close=close, window=20, window_dev=2)

    indicators = pd.DataFrame({
        'RSI_14': rsi_calc.rsi(),
        'MACD': macd_calc.macd(),
        'MACD_Signal': macd_calc.macd_signal(),
        'BB_Upper': bands.bollinger_hband(),
        'BB_Lower': bands.bollinger_lband(),
    })
    return pd.concat([df, indicators], axis=1)

# add_technical_indicators returns a new frame (pd.concat), so each split
# must be rebound to the result.
train_df = add_technical_indicators(train_df)
val_df = add_technical_indicators(val_df)
test_df = add_technical_indicators(test_df)

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower'],
      dtype='object')

In [8]:
# 4. Statistical features
def add_statistical_features(df, window=20):
    """Append lagged closes and rolling statistics to a copy of ``df``.

    The input frame is not modified; all new columns are built separately and
    attached with a single ``pd.concat``.

    Args:
        df: DataFrame with 'Close', 'High' and 'Low' columns.
        window: Number of rows in each rolling window.

    Returns:
        A new DataFrame made of ``df`` followed by, in this order:
        'Close_Lag_1', 'Close_Lag_3', 'Close_Lag_5' (Close shifted by 1/3/5
        rows), 'Rolling_Mean' and 'Rolling_Std' of Close, 'Rolling_Max' of
        High, and 'Rolling_Min' of Low. Leading rows without enough history
        are NaN.
    """
    close = df['Close']

    # Lag columns go first so the resulting column order matches the layout
    # produced when they were assigned onto the frame before the concat.
    new_cols = {f'Close_Lag_{lag}': close.shift(lag) for lag in (1, 3, 5)}
    new_cols.update({
        'Rolling_Mean': close.rolling(window).mean(),
        'Rolling_Std': close.rolling(window).std(),
        'Rolling_Max': df['High'].rolling(window).max(),
        'Rolling_Min': df['Low'].rolling(window).min(),
    })

    return pd.concat([df, pd.DataFrame(new_cols)], axis=1)

# Add lag and rolling-window columns (default window=20) to every split;
# the function returns a new frame, so each name is rebound.
train_df = add_statistical_features(train_df)
val_df = add_statistical_features(val_df)
test_df = add_statistical_features(test_df)

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'Close_Lag_1',
       'Close_Lag_3', 'Close_Lag_5', 'Rolling_Mean', 'Rolling_Std',
       'Rolling_Max', 'Rolling_Min'],
      dtype='object')

In [9]:
# 5. Time-based features
def add_time_features(df):
    """Append hour / day-of-week columns and their cyclical encodings.

    Reads the 'Datetime' column and returns a new DataFrame made of ``df``
    followed by, in this order:
        Hour, DayOfWeek (0 = Monday),
        Hour_sin, Hour_cos  - sine/cosine of the hour on a 24-step circle,
        Day_sin, Day_cos    - sine/cosine of the weekday on a 7-step circle.
    """
    stamps = df['Datetime'].dt
    hour = stamps.hour
    weekday = stamps.dayofweek  # 0=Monday

    # Cyclical encoding so that 23h/0h and Sunday/Monday end up adjacent.
    time_features = pd.DataFrame({
        'Hour': hour,
        'DayOfWeek': weekday,
        'Hour_sin': np.sin(2 * np.pi * hour/24),
        'Hour_cos': np.cos(2 * np.pi * hour/24),
        'Day_sin': np.sin(2 * np.pi * weekday/7),
        'Day_cos': np.cos(2 * np.pi * weekday/7),
    })

    return pd.concat([df, time_features], axis=1)

# Add hour / weekday columns and their sin/cos encodings to every split.
train_df = add_time_features(train_df)
val_df = add_time_features(val_df)
test_df = add_time_features(test_df)

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'Close_Lag_1',
       'Close_Lag_3', 'Close_Lag_5', 'Rolling_Mean', 'Rolling_Std',
       'Rolling_Max', 'Rolling_Min', 'Hour', 'DayOfWeek', 'Hour_sin',
       'Hour_cos', 'Day_sin', 'Day_cos'],
      dtype='object')

In [10]:
def drop_na_cols(df, threshold=0.01):
    """Return ``df`` without the columns that contain too many missing values.

    A column is removed when its NaN count is strictly greater than
    ``len(df) * threshold``. The input frame is left untouched and a new
    DataFrame is returned, so callers must use the return value.

    Args:
        df: DataFrame to filter.
        threshold: Maximum tolerated fraction of missing values per column.

    Returns:
        A new DataFrame holding only the columns within the threshold, in
        their original order.
    """
    na_counts = df.isna().sum()
    # Positional boolean mask: also behaves correctly if column names repeat.
    keep = (na_counts <= len(df) * threshold).to_numpy()
    # .copy() yields an independent frame rather than a flagged slice of df.
    return df.loc[:, keep].copy()

# 6. Missing-value handling
def handle_missing_data(df, threshold=0.01):
    """Drop sparse columns, then drop every row that still contains a NaN.

    Columns are first filtered with ``drop_na_cols(df, threshold)``; the
    remaining NaN rows (e.g. the warm-up rows produced by rolling windows and
    indicators) are removed with ``dropna``.

    Returns:
        The filtered DataFrame.
    """
    pruned = drop_na_cols(df, threshold)
    return pruned.dropna()

# Each split is rebound explicitly: handle_missing_data returns a new frame
# (dropna), so the loop form below would only rebind the loop variable and
# leave train_df / val_df / test_df unchanged.
# for df in [train_df, val_df, test_df]:
#     df = handle_missing_data(df)

# train_df
train_df = handle_missing_data(train_df)
val_df = handle_missing_data(val_df)
test_df = handle_missing_data(test_df)
train_df

Unnamed: 0,Open,High,Low,Close,Volume,Label,Datetime,Price_Spread,Price_Change,Body_Ratio,...,Rolling_Mean,Rolling_Std,Rolling_Max,Rolling_Min,Hour,DayOfWeek,Hour_sin,Hour_cos,Day_sin,Day_cos
2396141,1309.41,1309.41,1309.29,1309.37,98,1,2018-01-02 08:33:00,0.12,-0.04,-0.333333,...,1309.2365,0.187961,1309.69,1308.79,8,1,0.866025,-0.500000,0.781831,0.623490
2396142,1309.37,1309.40,1309.27,1309.29,103,2,2018-01-02 08:34:00,0.13,-0.08,-0.615385,...,1309.2270,0.179622,1309.62,1308.79,8,1,0.866025,-0.500000,0.781831,0.623490
2396143,1309.28,1309.31,1309.09,1309.12,80,2,2018-01-02 08:35:00,0.22,-0.16,-0.727273,...,1309.2085,0.169900,1309.62,1308.79,8,1,0.866025,-0.500000,0.781831,0.623490
2396144,1309.11,1309.24,1309.09,1309.17,102,1,2018-01-02 08:36:00,0.15,0.06,0.400000,...,1309.1980,0.165167,1309.62,1308.79,8,1,0.866025,-0.500000,0.781831,0.623490
2396145,1309.17,1309.19,1308.79,1308.80,82,2,2018-01-02 08:37:00,0.40,-0.37,-0.925000,...,1309.1650,0.175664,1309.62,1308.79,8,1,0.866025,-0.500000,0.781831,0.623490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446544,1891.01,1892.01,1890.74,1892.01,112,0,2020-12-31 19:13:00,1.27,1.00,0.787402,...,1893.5010,1.209710,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969
3446545,1892.01,1892.64,1892.01,1892.28,77,0,2020-12-31 19:14:00,0.63,0.27,0.428571,...,1893.4400,1.240140,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969
3446546,1892.19,1892.21,1891.90,1892.02,68,0,2020-12-31 19:15:00,0.31,-0.17,-0.548387,...,1893.3585,1.278578,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969
3446547,1892.02,1892.06,1891.43,1891.43,75,0,2020-12-31 19:16:00,0.63,-0.59,-0.936508,...,1893.2280,1.337211,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969


In [11]:
# 7. Feature standardisation (the scaler is fitted on the training split only)
def scale_features(train_df, val_df, test_df):
    """Standardise the feature columns using statistics from ``train_df``.

    Every column of ``train_df`` except 'Label' and 'Datetime' is treated as
    a feature. One StandardScaler is fitted on the training features and the
    same fitted scaler is applied to the validation and test features.

    Returns:
        (scaled_train_df, scaled_val_df, scaled_test_df): new DataFrames that
        keep each split's index, hold the scaled feature columns first, and
        have the unscaled 'Label' and 'Datetime' columns appended last.
    """
    splits = (train_df, val_df, test_df)
    feature_columns = [name for name in train_df.columns if name not in ['Label', 'Datetime']]

    # Fit on train, then reuse the fitted scaler for the other splits.
    scaler = StandardScaler()
    scaled_arrays = [scaler.fit_transform(train_df[feature_columns])]
    scaled_arrays += [scaler.transform(split[feature_columns]) for split in (val_df, test_df)]

    rebuilt = []
    for split, values in zip(splits, scaled_arrays):
        frame = pd.DataFrame(values, columns=feature_columns, index=split.index)
        # Re-attach the non-feature columns positionally.
        frame['Label'] = split['Label'].values
        frame['Datetime'] = split['Datetime'].values
        rebuilt.append(frame)

    return tuple(rebuilt)

# Replace the three splits with their standardised versions; after this cell
# 'Label' and 'Datetime' are the last two columns of each frame.
train_df, val_df, test_df = scale_features(train_df, val_df, test_df)
train_df

Unnamed: 0,Open,High,Low,Close,Volume,Price_Spread,Price_Change,Body_Ratio,Log_Return,RSI_14,...,Rolling_Max,Rolling_Min,Hour,DayOfWeek,Hour_sin,Hour_cos,Day_sin,Day_cos,Label,Datetime
2396141,-0.712700,-0.713310,-0.712598,-0.712866,0.494859,-0.610265,-0.095840,-0.554125,-0.059719,0.029206,...,-0.714585,-0.712152,-0.441101,-0.702881,1.170249,-0.643938,0.831741,0.894966,1,2018-01-02 08:33:00
2396142,-0.712869,-0.713352,-0.712682,-0.713203,0.576712,-0.590408,-0.192304,-1.026242,-0.234871,-0.237589,...,-0.714879,-0.712152,-0.441101,-0.702881,1.170249,-0.643938,0.831741,0.894966,2,2018-01-02 08:34:00
2396143,-0.713247,-0.713730,-0.713440,-0.713919,0.200190,-0.411691,-0.385233,-1.213528,-0.497642,-0.741384,...,-0.714879,-0.712152,-0.441101,-0.702881,1.170249,-0.643938,0.831741,0.894966,2,2018-01-02 08:35:00
2396144,-0.713963,-0.714025,-0.713440,-0.713708,0.560341,-0.550693,0.145322,0.673379,0.144640,-0.552951,...,-0.714879,-0.712152,-0.441101,-0.702881,1.170249,-0.643938,0.831741,0.894966,1,2018-01-02 08:36:00
2396145,-0.713711,-0.714235,-0.714704,-0.715266,0.232931,-0.054259,-0.891673,-1.544498,-1.081710,-1.425979,...,-0.714879,-0.712152,-0.441101,-0.702881,1.170249,-0.643938,0.831741,0.894966,2,2018-01-02 08:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446544,1.735681,1.738290,1.736161,1.739889,0.724046,1.673331,2.412240,1.321839,2.019343,-1.125665,...,1.744823,1.742692,1.227247,0.695809,-1.380261,0.450675,0.156717,-1.065932,0,2020-12-31 19:13:00
3446545,1.739891,1.740941,1.741509,1.741026,0.151079,0.402460,0.651761,0.721204,0.544062,-0.845462,...,1.744823,1.742692,1.227247,0.695809,-1.380261,0.450675,0.156717,-1.065932,0,2020-12-31 19:14:00
3446546,1.740649,1.739131,1.741046,1.739931,0.003744,-0.232975,-0.409349,-0.914097,-0.526538,-1.023903,...,1.744823,1.742692,1.227247,0.695809,-1.380261,0.450675,0.156717,-1.065932,0,2020-12-31 19:15:00
3446547,1.739933,1.738500,1.739067,1.737448,0.118338,0.402460,-1.422228,-1.563761,-1.193404,-1.392451,...,1.744823,1.742692,1.227247,0.695809,-1.380261,0.450675,0.156717,-1.065932,0,2020-12-31 19:16:00


In [37]:
def _read_checkpoint(idx_file):
    """Return the start index stored in ``idx_file`` (0 if missing or empty)."""
    if not os.path.exists(idx_file):
        return 0
    with open(idx_file, 'r') as f:
        return int(f.read().strip() or 0)


def create_sequences_sequential(X, y, sequence_length, save_path, idx_file, stride=1):
    """
    Build sliding-window sequences one sample at a time and store them in
    np.memmap files, so the full (n_samples, sequence_length, n_features)
    tensor never has to be held in memory.

    Window k starts at row ``k * stride`` and covers ``sequence_length`` rows
    of ``X``; its label is ``y['Label']`` at the row immediately after the
    window (``start + sequence_length``).

    Files written under ``save_path``:
        sequences.dat  float32 memmap, shape (n_samples, sequence_length, n_features)
        labels.dat     int64 memmap, shape (n_samples,)
        shape.txt      n_samples, sequence_length, n_features (one per line)

    Checkpointing: after each sample the start index of the next window is
    written to ``idx_file``. If that file exists on entry and both data files
    already have the expected size, they are reopened in 'r+' mode and
    processing resumes there, so earlier samples are preserved. Otherwise the
    files are recreated with 'w+' and processing starts at 0. ``idx_file`` is
    removed once the run completes, so a later call starts fresh.

    Args:
        X: 2-D array of features, one row per time step.
        y: Object whose ``y['Label'].values`` aligns row-for-row with ``X``.
        sequence_length: Number of rows per window.
        save_path: Output directory (created if needed).
        idx_file: Path of the checkpoint file.
        stride: Step between consecutive window starts.

    Returns:
        ``(n_samples, n_features)`` on success. If any exception occurs it is
        printed and ``(checkpoint_index, n_features)`` is returned instead,
        where ``n_features`` is None if it could not be determined.
    """
    n_features = None  # bound up front so the error path can always return it
    try:
        last_start = len(X) - sequence_length  # exclusive bound for window starts
        n_samples = int(np.ceil(last_start / stride))
        n_features = X.shape[1]
        if n_samples <= 0:
            raise ValueError("Input array too short for given sequence_length")

        os.makedirs(save_path, exist_ok=True)
        sequences_file = f'{save_path}/sequences.dat'
        labels_file = f'{save_path}/labels.dat'
        shape_file = f'{save_path}/shape.txt'

        seq_shape = (n_samples, sequence_length, n_features)
        seq_bytes = n_samples * sequence_length * n_features * np.dtype(np.float32).itemsize
        label_bytes = n_samples * np.dtype(np.int64).itemsize

        # Resume only when a checkpoint exists AND the data files on disk have
        # exactly the size this call would produce.
        start_idx = _read_checkpoint(idx_file)
        resumable = (
            start_idx > 0
            and os.path.exists(sequences_file)
            and os.path.getsize(sequences_file) == seq_bytes
            and os.path.exists(labels_file)
            and os.path.getsize(labels_file) == label_bytes
        )
        if resumable:
            # 'r+' keeps existing contents; 'w+' would overwrite the files.
            mode = 'r+'
            # Snap to a stride boundary so slots and windows stay aligned even
            # if the checkpoint holds an index that is not a multiple of stride.
            start_idx -= start_idx % stride
        else:
            mode = 'w+'
            start_idx = 0
            if os.path.exists(shape_file):
                os.remove(shape_file)

        sequences = np.memmap(sequences_file, dtype=np.float32, mode=mode, shape=seq_shape)
        labels = np.memmap(labels_file, dtype=np.int64, mode=mode, shape=(n_samples,))

        label_values = y['Label'].values  # hoisted: constant across iterations
        for i in tqdm(range(start_idx, last_start, stride)):
            slot = i // stride
            sequences[slot] = X[i:i + sequence_length]
            labels[slot] = label_values[i + sequence_length]

            # Checkpoint the NEXT window start (a multiple of stride).
            with open(idx_file, 'w') as f:
                f.write(str(i + stride))

        # Make sure everything is on disk before declaring success.
        sequences.flush()
        labels.flush()

        with open(shape_file, 'w') as f:
            f.write(f"{n_samples}\n{sequence_length}\n{n_features}")

        # Finished: drop the checkpoint so a later run does not "resume" a
        # completed job.
        if os.path.exists(idx_file):
            os.remove(idx_file)

        print(f"Sequences saved to {save_path}, shape: {sequences.shape}")
        del sequences, labels  # release the memory maps
        return n_samples, n_features
    except Exception as e:
        # Best-effort behaviour: report the problem and hand back the resume
        # position instead of raising.
        print(e)
        return _read_checkpoint(idx_file), n_features

In [38]:
# Output root and window length (rows per sequence) for the memmap datasets.
save_path = 'data'
sequence_length = 128

# Feature matrix = every column except the target and the timestamp.
feature_columns = [col for col in train_df.columns if col not in ['Label', 'Datetime']]
X_train = train_df[feature_columns].values
X_val = val_df[feature_columns].values
X_test = test_df[feature_columns].values

# One output directory per split.
train_path = f'{save_path}/train'
val_path = f'{save_path}/val'
test_path = f'{save_path}/test' 

# Per-split checkpoint files used by create_sequences_sequential.
train_idx_file = f'{save_path}/train_idx.txt'
val_idx_file = f'{save_path}/val_idx.txt'
test_idx_file = f'{save_path}/test_idx.txt'

# The full DataFrame is passed as `y`; the function reads its 'Label' column.
# stride=10 means a new window starts every 10 rows.
print("Processing train data...")
n_train_samples, n_features = create_sequences_sequential(X_train, train_df, sequence_length, train_path, train_idx_file, stride=10)
print("Processing validation data...")
n_val_samples, _ = create_sequences_sequential(X_val, val_df, sequence_length, val_path, val_idx_file, stride=10)
print("Processing test data...")
n_test_samples, _ = create_sequences_sequential(X_test, test_df, sequence_length, test_path, test_idx_file, stride=10)

Processing train data...


100%|██████████| 105028/105028 [00:30<00:00, 3430.26it/s]


Sequences saved to data/train, shape: (105028, 128, 27)
Processing validation data...


100%|██████████| 35083/35083 [00:09<00:00, 3560.97it/s]


Sequences saved to data/val, shape: (35083, 128, 27)
Processing test data...


100%|██████████| 112721/112721 [00:32<00:00, 3505.60it/s]


Sequences saved to data/test, shape: (112721, 128, 27)


In [34]:
# Sanity check: count the window start positions for the test split with a
# step of 10; the result should equal the number of test sequences written.
# NOTE(review): this cell's execution count is lower than the cells above it,
# i.e. it was run out of order; it relies on `sequence_length` from the
# previous cell.
print(len(test_df))
array = np.arange(0, (len(test_df) - sequence_length), 10)
array.shape

1127334


(112721,)