# 1 - Import requirements

In [None]:
! pip install ta

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [None]:
import os
import pandas as pd
import numpy as np

from ta import add_all_ta_features
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import MACD
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tqdm import tqdm

# 2 - Prepare data

In [None]:
# 1. Data loading helper
def load_data(file_path):
    """Read a CSV file and return it ordered chronologically.

    The 'Date' and 'Time' columns are joined with a space, parsed into a
    single 'Datetime' column, and then removed. Rows are sorted by
    'Datetime'; the original row index is kept (not reset).

    Args:
        file_path: Path of the CSV file; it must contain 'Date' and 'Time'
            columns.

    Returns:
        A DataFrame sorted by 'Datetime', without 'Date' and 'Time'.
    """
    frame = pd.read_csv(file_path)
    # Build one timestamp from the separate date and time columns.
    timestamp_text = frame['Date'] + ' ' + frame['Time']
    frame['Datetime'] = pd.to_datetime(timestamp_text)
    ordered = frame.sort_values('Datetime')
    return ordered.drop(['Date', 'Time'], axis=1)

# Load the train, dev (validation) and test CSV files from the local data/ folder.
train_df = load_data("data/dynamic_labeled_train.csv")
val_df = load_data("data/dynamic_labeled_dev.csv")
test_df = load_data("data/dynamic_labeled_test.csv")

# Quick sanity check of the columns that survived loading.
train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime'], dtype='object')

In [None]:
# Number of training rows per label value.
train_df['Label'].value_counts()

Label
HOLD    1610302
BUY     1101095
SELL     735152
Name: count, dtype: int64

In [None]:
# Integer class id assigned to each label name.
label_mapping = dict(BUY=0, SELL=1, HOLD=2)


def map_label(x):
    """Return the integer id for a known label name.

    Any value that is not a key of ``label_mapping`` (for example an id that
    has already been encoded) is returned unchanged.
    """
    return label_mapping.get(x, x)

# Encode the 'Label' column of every split with map_label.
train_df['Label'] = train_df['Label'].map(map_label)
val_df['Label'] = val_df['Label'].map(map_label)
test_df['Label'] = test_df['Label'].map(map_label)

In [None]:
# 2. Basic per-bar price features
def add_basic_features(df):
    """Add simple per-row price features to ``df`` in place and return it.

    Columns written (existing columns with the same name are overwritten):
        Price_Spread: High - Low.
        Price_Change: Close - Open.
        Body_Ratio:   Price_Change / (Price_Spread + 1e-8); the small
                      constant avoids division by zero when High == Low.
        Log_Return:   log(Close / previous row's Close); NaN on the first row.

    Args:
        df: DataFrame with 'Open', 'High', 'Low' and 'Close' columns.

    Returns:
        The same DataFrame object that was passed in.
    """
    close = df['Close']
    spread = df['High'] - df['Low']
    change = close - df['Open']

    df['Price_Spread'] = spread
    df['Price_Change'] = change
    df['Body_Ratio'] = change / (spread + 1e-8)
    df['Log_Return'] = np.log(close / close.shift(1))
    return df

# Reassign each split explicitly, like the other feature stages do. Rebinding
# a loop variable (`for df in [...]: df = f(df)`) never updates train_df /
# val_df / test_df themselves and would silently do nothing if the helper
# returned a new frame instead of mutating its argument.
train_df = add_basic_features(train_df)
val_df = add_basic_features(val_df)
test_df = add_basic_features(test_df)

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return'],
      dtype='object')

In [None]:
# 3. Technical indicators
def add_technical_indicators(df):
    """Return a copy of ``df`` extended with indicators computed from 'Close'.

    Columns added, all produced with the `ta` library:
        RSI_14:              RSIIndicator with window=14.
        MACD, MACD_Signal:   MACD line and signal line, library-default windows.
        BB_Upper, BB_Lower:  Bollinger bands with window=20, window_dev=2.

    `add_all_ta_features` is deliberately not used; only the indicators above
    are computed.

    The call is idempotent: if ``df`` already contains any of these columns
    (for example because the notebook cell is run twice) they are replaced
    rather than appended again under a duplicate name.

    Args:
        df: DataFrame with a 'Close' column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    close = df['Close']
    macd = MACD(close=close)
    bands = BollingerBands(close=close, window=20, window_dev=2)

    indicators = pd.DataFrame({
        'RSI_14': RSIIndicator(close=close, window=14).rsi(),
        'MACD': macd.macd(),
        'MACD_Signal': macd.macd_signal(),
        'BB_Upper': bands.bollinger_hband(),
        'BB_Lower': bands.bollinger_lband(),
    })

    # Drop stale copies first so a re-run cannot create duplicate column labels.
    stale = [name for name in indicators.columns if name in df.columns]
    base = df.drop(columns=stale) if stale else df
    return pd.concat([base, indicators], axis=1)

# Indicators are computed independently on each split.
train_df = add_technical_indicators(train_df)
val_df = add_technical_indicators(val_df)
test_df = add_technical_indicators(test_df)

# Confirm the indicator columns were appended.
train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower'],
      dtype='object')

In [None]:
# 4. Rolling statistical features
def add_statistical_features(df, window=20):
    """Return a copy of ``df`` extended with rolling-window statistics.

    Columns added:
        Rolling_Mean: rolling mean of 'Close'.
        Rolling_Std:  rolling standard deviation of 'Close'.
        Rolling_Max:  rolling maximum of 'High'.
        Rolling_Min:  rolling minimum of 'Low'.

    The first ``window - 1`` rows of each new column are NaN because the
    window is not yet full.

    The call is idempotent: columns with these names that already exist in
    ``df`` (e.g. from a previous run of the cell) are replaced instead of
    being appended a second time under a duplicate label.

    Args:
        df: DataFrame with 'Close', 'High' and 'Low' columns.
        window: Rolling window length in rows.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    close = df['Close']
    rolling_cols = pd.DataFrame({
        'Rolling_Mean': close.rolling(window).mean(),
        'Rolling_Std': close.rolling(window).std(),
        'Rolling_Max': df['High'].rolling(window).max(),
        'Rolling_Min': df['Low'].rolling(window).min(),
    })

    # Drop stale copies first so a re-run cannot create duplicate column labels.
    stale = [name for name in rolling_cols.columns if name in df.columns]
    base = df.drop(columns=stale) if stale else df
    return pd.concat([base, rolling_cols], axis=1)

# Rolling statistics use the default 20-row window on every split.
train_df = add_statistical_features(train_df)
val_df = add_statistical_features(val_df)
test_df = add_statistical_features(test_df)

# Confirm the rolling columns were appended.
train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'Rolling_Mean',
       'Rolling_Std', 'Rolling_Max', 'Rolling_Min'],
      dtype='object')

In [None]:
# 5. Time-based features
def add_time_features(df):
    """Return a copy of ``df`` extended with calendar features from 'Datetime'.

    Columns added:
        Hour:               hour of day (0-23).
        DayOfWeek:          day of week, 0=Monday.
        Hour_sin, Hour_cos: cyclical encoding of Hour with period 24.
        Day_sin, Day_cos:   cyclical encoding of DayOfWeek with period 7.

    The call is idempotent: columns with these names that already exist in
    ``df`` (e.g. from a previous run of the cell) are replaced instead of
    being appended a second time under a duplicate label.

    Args:
        df: DataFrame with a datetime-typed 'Datetime' column.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    hour = df['Datetime'].dt.hour
    day_of_week = df['Datetime'].dt.dayofweek  # 0=Monday

    # Sine/cosine pairs keep the end of a cycle adjacent to its start
    # (23h next to 0h, Sunday next to Monday).
    hour_angle = 2 * np.pi * hour / 24
    day_angle = 2 * np.pi * day_of_week / 7

    time_cols = pd.DataFrame({
        'Hour': hour,
        'DayOfWeek': day_of_week,
        'Hour_sin': np.sin(hour_angle),
        'Hour_cos': np.cos(hour_angle),
        'Day_sin': np.sin(day_angle),
        'Day_cos': np.cos(day_angle),
    })

    # Drop stale copies first so a re-run cannot create duplicate column labels.
    stale = [name for name in time_cols.columns if name in df.columns]
    base = df.drop(columns=stale) if stale else df
    return pd.concat([base, time_cols], axis=1)

# Calendar features are derived from each split's own 'Datetime' column.
train_df = add_time_features(train_df)
val_df = add_time_features(val_df)
test_df = add_time_features(test_df)

# Confirm the time columns were appended.
train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return', 'RSI_14',
       'MACD', 'MACD_Signal', 'BB_Upper', 'BB_Lower', 'Rolling_Mean',
       'Rolling_Std', 'Rolling_Max', 'Rolling_Min', 'Hour', 'DayOfWeek',
       'Hour_sin', 'Hour_cos', 'Day_sin', 'Day_cos'],
      dtype='object')

In [None]:
def drop_na_cols(df, threshold=0.01):
    """Remove, in place, every column whose NaN count is too high.

    A column is dropped when its number of NaN values is strictly greater
    than ``len(df) * threshold``.

    Args:
        df: DataFrame to prune; it is modified in place.
        threshold: Maximum tolerated fraction of NaN values per column.

    Returns:
        The same DataFrame object that was passed in.
    """
    limit = len(df) * threshold
    sparse_cols = [name for name in df.columns if df[name].isna().sum() > limit]
    df.drop(columns=sparse_cols, inplace=True)
    return df

# 6. Missing-value handling
def handle_missing_data(df, threshold=0.01):
    """Drop sparse columns, then drop every row that still contains a NaN.

    Columns are pruned by ``drop_na_cols`` (which modifies ``df`` in place);
    the remaining rows with NaN values, such as those produced by indicator
    windows, are removed rather than forward-filled.

    Args:
        df: DataFrame to clean.
        threshold: Passed through to ``drop_na_cols``.

    Returns:
        A new DataFrame without NaN values.
    """
    pruned = drop_na_cols(df, threshold)
    return pruned.dropna()

# A `for df in [train_df, val_df, test_df]: df = handle_missing_data(df)` loop
# does not work here: handle_missing_data returns a new frame from dropna(),
# so each split has to be reassigned explicitly.

train_df = handle_missing_data(train_df)
val_df = handle_missing_data(val_df)
test_df = handle_missing_data(test_df)
train_df

Unnamed: 0,Open,High,Low,Close,Volume,Label,Datetime,Price_Spread,Price_Change,Body_Ratio,...,Rolling_Mean,Rolling_Std,Rolling_Max,Rolling_Min,Hour,DayOfWeek,Hour_sin,Hour_cos,Day_sin,Day_cos
33,1408.40,1408.87,1393.00,1395.30,3869,0,2011-01-04 13:00:00,15.87,-13.10,-0.825457,...,1412.9990,5.890067,1422.67,1393.00,13,1,-0.258819,-0.965926,0.781831,0.623490
34,1395.33,1397.10,1385.37,1387.66,3990,0,2011-01-04 14:00:00,11.73,-7.67,-0.653879,...,1411.4550,8.022496,1422.67,1385.37,14,1,-0.500000,-0.866025,0.781831,0.623490
35,1387.64,1388.98,1381.63,1382.92,4405,0,2011-01-04 15:00:00,7.35,-4.72,-0.642177,...,1409.7075,10.091339,1422.67,1381.63,15,1,-0.707107,-0.707107,0.781831,0.623490
36,1382.95,1385.64,1382.54,1384.64,3066,1,2011-01-04 16:00:00,3.10,1.69,0.545161,...,1407.9535,11.242463,1422.67,1381.63,16,1,-0.866025,-0.500000,0.781831,0.623490
37,1384.71,1385.57,1374.80,1376.25,3885,2,2011-01-04 17:00:00,10.77,-8.46,-0.785515,...,1405.7625,12.904016,1420.10,1374.80,17,1,-0.965926,-0.258819,0.781831,0.623490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446544,1891.01,1892.01,1890.74,1892.01,112,0,2020-12-31 19:13:00,1.27,1.00,0.787402,...,1893.5010,1.209710,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969
3446545,1892.01,1892.64,1892.01,1892.28,77,0,2020-12-31 19:14:00,0.63,0.27,0.428571,...,1893.4400,1.240140,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969
3446546,1892.19,1892.21,1891.90,1892.02,68,0,2020-12-31 19:15:00,0.31,-0.17,-0.548387,...,1893.3585,1.278578,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969
3446547,1892.02,1892.06,1891.43,1891.43,75,0,2020-12-31 19:16:00,0.63,-0.59,-0.936508,...,1893.2280,1.337211,1894.99,1890.74,19,3,-0.965926,0.258819,0.433884,-0.900969


In [None]:
# # 7. Feature scaling helper (scaler is fit on train only, then applied to val and test)
# def scale_features(train_df, val_df, test_df):
#     # Select the columns to scale (everything except Label and Datetime)
#     feature_columns = [col for col in train_df.columns if col not in ['Label', 'Datetime']]
    
#     # Fit the scaler on train, then transform val and test with it
#     scaler = StandardScaler()
#     train_scaled = scaler.fit_transform(train_df[feature_columns])
#     val_scaled = scaler.transform(val_df[feature_columns])
#     test_scaled = scaler.transform(test_df[feature_columns])
    
#     # Build new DataFrames from the scaled arrays
#     scaled_train_df = pd.DataFrame(train_scaled, columns=feature_columns, index=train_df.index)
#     scaled_val_df = pd.DataFrame(val_scaled, columns=feature_columns, index=val_df.index)
#     scaled_test_df = pd.DataFrame(test_scaled, columns=feature_columns, index=test_df.index)
    
#     # Re-attach the non-feature columns
#     for scaled_df, df in zip([scaled_train_df, scaled_val_df, scaled_test_df], [train_df, val_df, test_df]):
#         scaled_df['Label'] = df['Label'].values
#         scaled_df['Datetime'] = df['Datetime'].values
    
#     return scaled_train_df, scaled_val_df, scaled_test_df

# train_df, val_df, test_df = scale_features(train_df, val_df, test_df)
# train_df

In [None]:
def _read_checkpoint(idx_file):
    """Return the row index stored in ``idx_file``; 0 if missing or unreadable."""
    try:
        with open(idx_file, 'r') as f:
            return max(int(f.read().strip() or 0), 0)
    except (OSError, ValueError):
        return 0


def _write_checkpoint(idx_file, next_row):
    """Persist the index of the next output row that still has to be written."""
    with open(idx_file, 'w') as f:
        f.write(str(next_row))


def create_sequences_sequential(X, y, sequence_length, save_path, idx_file, strike=1,
                                checkpoint_every=10_000):
    """Write sliding-window sequences and their labels to memory-mapped files.

    Output row ``r`` holds ``X[r*strike : r*strike + sequence_length]`` and is
    labelled with ``y['Label']`` at position ``r*strike + sequence_length``
    (the row right after the window). With the default ``strike=1`` this is
    one sequence per input row, ``len(X) - sequence_length`` in total.

    Files written inside ``save_path``:
        sequences.dat: float32 memmap of shape
                       (n_samples, sequence_length, n_features).
        labels.dat:    int64 memmap of shape (n_samples,).
        shape.txt:     n_samples, sequence_length and n_features, one per
                       line; written only when the run completes.

    Resuming: ``idx_file`` stores the index of the next output row to write.
    If it exists and both data files already have exactly the expected size,
    the files are reopened without truncation and writing continues from that
    row. Otherwise everything is rebuilt from row 0. Delete ``idx_file`` to
    force a rebuild (e.g. after the input data changed).

    Args:
        X: 2-D array of features, one row per time step.
        y: Object whose ``y['Label'].values`` gives one integer label per row
            of ``X`` (e.g. the DataFrame ``X`` was taken from).
        sequence_length: Number of consecutive rows per sequence.
        save_path: Output directory; created if missing.
        idx_file: Path of the checkpoint file.
        strike: Step, in input rows, between the starts of two consecutive
            sequences. Must be >= 1.
        checkpoint_every: Flush the memmaps and update ``idx_file`` after this
            many output rows.

    Returns:
        ``(n_samples, n_features)`` on success. If anything fails, the error
        is printed and ``(rows_checkpointed_so_far, n_features)`` is returned
        instead; ``n_features`` is None when it could not be determined.
    """
    n_features = None  # keeps the error path below safe if we fail very early
    try:
        if strike < 1:
            raise ValueError("strike must be a positive integer")
        n_windows = len(X) - sequence_length
        n_features = X.shape[1]
        if n_windows <= 0:
            raise ValueError("Input array too short for given sequence_length")

        # Only every `strike`-th window is materialised, so size the output
        # for exactly those rows instead of leaving zero-filled gaps.
        n_samples = (n_windows + strike - 1) // strike
        checkpoint_every = max(1, int(checkpoint_every))
        label_values = y['Label'].values  # hoisted: invariant across the loop

        # Prepare the memmap files
        os.makedirs(save_path, exist_ok=True)
        sequences_file = f'{save_path}/sequences.dat'
        labels_file = f'{save_path}/labels.dat'
        shape_file = f'{save_path}/shape.txt'

        seq_bytes = (n_samples * sequence_length * n_features
                     * np.dtype(np.float32).itemsize)
        label_bytes = n_samples * np.dtype(np.int64).itemsize

        # Resume only when the checkpoint is plausible and both files have the
        # exact size this call would create.
        start_idx = _read_checkpoint(idx_file)
        resuming = (
            0 < start_idx <= n_samples
            and os.path.exists(sequences_file)
            and os.path.getsize(sequences_file) == seq_bytes
            and os.path.exists(labels_file)
            and os.path.getsize(labels_file) == label_bytes
        )
        if not resuming:
            start_idx = 0
            for stale in (sequences_file, labels_file, shape_file):
                if os.path.exists(stale):
                    os.remove(stale)
            # Reset the checkpoint so a crash before the first flush cannot
            # leave a stale index pointing into freshly zeroed files.
            _write_checkpoint(idx_file, 0)

        # 'w+' truncates the file; 'r+' keeps the rows written by an earlier run.
        mode = 'r+' if resuming else 'w+'
        sequences = np.memmap(sequences_file, dtype=np.float32, mode=mode,
                              shape=(n_samples, sequence_length, n_features))
        labels = np.memmap(labels_file, dtype=np.int64, mode=mode, shape=(n_samples,))

        for row in tqdm(range(start_idx, n_samples)):
            src = row * strike
            sequences[row] = X[src:src + sequence_length]
            labels[row] = label_values[src + sequence_length]

            # Checkpoint only after a flush, so the stored index never points
            # past data that has actually been handed to the OS.
            if (row + 1) % checkpoint_every == 0:
                sequences.flush()
                labels.flush()
                _write_checkpoint(idx_file, row + 1)

        sequences.flush()
        labels.flush()
        _write_checkpoint(idx_file, n_samples)

        # Store the shape so readers can reopen the raw memmap files
        with open(shape_file, 'w') as f:
            f.write(f"{n_samples}\n{sequence_length}\n{n_features}")

        print(f"Sequences saved to {save_path}, shape: {sequences.shape}")
        return n_samples, n_features
    except Exception as e:
        # Best effort: report the problem and tell the caller how far the last
        # durable checkpoint got, so a later call can resume from there.
        print(e)
        return _read_checkpoint(idx_file), n_features

In [None]:
# save_path = 'data'
# sequence_length = 60

# feature_columns = [col for col in train_df.columns if col not in ['Label', 'Datetime']]
# X_train = train_df[feature_columns].values
# X_val = val_df[feature_columns].values
# X_test = test_df[feature_columns].values

# train_path = f'{save_path}/train'
# val_path = f'{save_path}/val'
# test_path = f'{save_path}/test' 

# train_idx_file = f'{save_path}/train_idx.txt'
# val_idx_file = f'{save_path}/val_idx.txt'
# test_idx_file = f'{save_path}/test_idx.txt'

# print("Processing train data...")
# n_train_samples, n_features = create_sequences_sequential(X_train, train_df, sequence_length, train_path, train_idx_file)
# print("Processing validation data...")
# n_val_samples, _ = create_sequences_sequential(X_val, val_df, sequence_length, val_path, val_idx_file)
# print("Processing test data...")
# n_test_samples, _ = create_sequences_sequential(X_test, test_df, sequence_length, test_path, test_idx_file)

Processing train data...


100%|██████████| 3446456/3446456 [16:32<00:00, 3473.38it/s]


Sequences saved to data/train, shape: (3446456, 60, 24)
Processing validation data...


100%|██████████| 350891/350891 [01:40<00:00, 3480.69it/s]


Sequences saved to data/val, shape: (350891, 60, 24)
Processing test data...


100%|██████████| 1127274/1127274 [05:19<00:00, 3523.26it/s]


Sequences saved to data/test, shape: (1127274, 60, 24)
