# 1 - Import requirements

In [17]:
# ! pip install ta

In [1]:
import os
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler

from ta import add_all_ta_features
from ta.momentum import RSIIndicator, StochasticOscillator, WilliamsRIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.trend import MACD, ADXIndicator

from tqdm import tqdm

# 2 - Prepare data

In [2]:
# 1. Data loading helper
def load_data(file_path):
    """Read a CSV and return it ordered by a combined ``Datetime`` column.

    The text columns ``Date`` and ``Time`` are joined with a single space,
    parsed into a new ``Datetime`` column and then dropped. Rows are sorted by
    ``Datetime``; the row index produced by ``read_csv`` is kept (not reset).
    """
    frame = pd.read_csv(file_path)
    timestamp_text = frame['Date'] + ' ' + frame['Time']
    frame['Datetime'] = pd.to_datetime(timestamp_text)
    ordered = frame.sort_values('Datetime')
    return ordered.drop(['Date', 'Time'], axis=1)

train_df = load_data("data/dynamic_labeled_train.csv")
val_df = load_data("data/dynamic_labeled_dev.csv")
test_df = load_data("data/dynamic_labeled_test.csv")

# Keep only the 2018-2020 rows for training. The explicit .copy() makes train_df an
# independent frame, so the column assignments in the following cells do not write
# into a slice of the loaded frame (which pandas reports as SettingWithCopyWarning).
train_df = train_df.loc[train_df['Datetime'].dt.year.isin(range(2018, 2021))].copy()

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime'], dtype='object')

In [3]:
# Number of training rows per label value (class balance check).
train_df['Label'].value_counts()

Label
HOLD    514037
BUY     321644
SELL    214760
Name: count, dtype: int64

In [4]:
# Integer class id assigned to each label string.
label_mapping = dict(BUY=0, SELL=1, HOLD=2)


def map_label(x):
    """Return the class id for a known label; any other value is returned unchanged."""
    return label_mapping.get(x, x)

# Encode the label column of every split with the shared mapping.
for split_df in (train_df, val_df, test_df):
    split_df['Label'] = split_df['Label'].map(map_label)

In [6]:
# 2. Basic per-row price features
def add_basic_features(df):
    """Add simple price-derived columns to ``df`` in place and return it.

    Columns added, in order: ``Price_Spread`` (High - Low), ``Price_Change``
    (Close - Open), ``Body_Ratio`` (price change divided by spread + 1e-8),
    ``Log_Return`` (log of Close over the previous row's Close) and
    ``Cumulative_Return_5D`` (compounded return over the last 5 rows).
    """
    close = df['Close']
    spread = df['High'] - df['Low']
    change = close - df['Open']

    df['Price_Spread'] = spread
    df['Price_Change'] = change
    # The small constant keeps the ratio finite when High == Low.
    df['Body_Ratio'] = change / (spread + 1e-8)
    df['Log_Return'] = np.log(close / close.shift(1))

    # exp(sum of the last 5 log returns) - 1 is the compounded return over those rows.
    five_row_log_sum = df['Log_Return'].rolling(5).sum()
    df['Cumulative_Return_5D'] = np.exp(five_row_log_sum) - 1

    return df

# add_basic_features writes the new columns into the frame it receives and returns
# that same frame. Assign the result explicitly (as the later feature cells do)
# instead of rebinding a throwaway loop variable, which had no effect of its own.
train_df = add_basic_features(train_df)
val_df = add_basic_features(val_df)
test_df = add_basic_features(test_df)

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return',
       'Cumulative_Return_5D'],
      dtype='object')

In [7]:
# 3. Technical indicators
def add_technical_indicators(df):
    """Append technical-indicator columns computed from the OHLCV columns of ``df``.

    The columns assigned before the ``pd.concat`` call (``VW_Momentum`` through
    ``ATR``) are written directly into the frame that was passed in. The concat
    then produces a new frame, which receives the remaining columns and is the
    value returned.
    """
    # Uses the 'ta' library; the all-in-one helper below is left disabled:
    # df = add_all_ta_features(
    #     df, open="Open", high="High", low="Low", close="Close", volume="Volume"
    # )
    
    # Volume-weighted momentum: 5-row rolling sum of Volume * (change in Close)
    df['VW_Momentum'] = (df['Volume'] * (df['Close'] - df['Close'].shift(1))).rolling(5).sum()

    # On-Balance Volume (OBV): running sum of Volume signed by the direction of the Close change
    df['OBV'] = (np.sign(df['Close'].diff()) * df['Volume']).cumsum()
    
    # Stochastic Oscillator (window of 14 rows)
    stoch = StochasticOscillator(high=df['High'], low=df['Low'], close=df['Close'], window=14)
    df['Stoch_%K'] = stoch.stoch()
    df['Stoch_%D'] = stoch.stoch_signal()

    # Williams %R (look-back of 14 rows)
    williams = WilliamsRIndicator(high=df['High'], low=df['Low'], close=df['Close'], lbp=14)
    df['Williams_%R'] = williams.williams_r()

    # Average True Range (ATR, window of 14 rows)
    atr = AverageTrueRange(high=df['High'], low=df['Low'], close=df['Close'], window=14)
    df['ATR'] = atr.average_true_range()
    
    indic_cols = {}
    # Further indicators are collected in a dict and attached with a single concat
    rsi = RSIIndicator(close=df['Close'], window=14)
    indic_cols['RSI_14'] = rsi.rsi()
    
    # MACD is built with the library's default windows (none are passed here)
    macd = MACD(close=df['Close'])
    indic_cols['MACD'] = macd.macd()
    indic_cols['MACD_Signal'] = macd.macd_signal()
    
    bb = BollingerBands(close=df['Close'], window=20, window_dev=2)
    indic_cols['BB_Upper'] = bb.bollinger_hband()
    indic_cols['BB_Lower'] = bb.bollinger_lband()
    
    # From here on `df` is a new frame; the caller's frame gets no further columns
    df = pd.concat([df, pd.DataFrame(indic_cols)], axis=1)
    df['MACD_Histogram'] = df['MACD'] - df['MACD_Signal']
    
    # Interaction term: RSI multiplied by the MACD histogram
    df['RSI_MACD_Interaction'] = df['RSI_14'] * df['MACD_Histogram']

    # Volume-price correlation over a 10-row rolling window
    df['Volume_Price_Correlation'] = df['Volume'].rolling(10).corr(df['Close'])
    
    # ADX, used to flag whether the market is trending or ranging
    adx = ADXIndicator(high=df['High'], low=df['Low'], close=df['Close'], window=14)
    df['ADX'] = adx.adx()
    df['Market_Trend'] = (df['ADX'] > 25).astype(int)  # 1 when ADX > 25 (treated as trending), else 0
    
    return df

# Attach the technical indicators to every split.
train_df, val_df, test_df = [
    add_technical_indicators(split) for split in (train_df, val_df, test_df)
]

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return',
       'Cumulative_Return_5D', 'VW_Momentum', 'OBV', 'Stoch_%K', 'Stoch_%D',
       'Williams_%R', 'ATR', 'RSI_14', 'MACD', 'MACD_Signal', 'BB_Upper',
       'BB_Lower', 'MACD_Histogram', 'RSI_MACD_Interaction',
       'Volume_Price_Correlation', 'ADX', 'Market_Trend'],
      dtype='object')

In [8]:
# 4. Rolling statistical features
def add_statistical_features(df, window=20):
    """Return a new frame with rolling statistics over ``window`` rows appended.

    Added columns, in order: ``Rolling_Mean`` and ``Rolling_Std`` of Close,
    ``Rolling_Max`` of High, ``Rolling_Min`` of Low, ``Volatility_Clustering``,
    ``Rolling_Skew``, ``Rolling_Kurtosis``, ``Rolling_Q80``, ``Rolling_Q20`` and
    ``Quantile_Spread``. The frame passed in is not modified.
    """
    close_window = df['Close'].rolling(window)
    base_stats = pd.DataFrame({
        'Rolling_Mean': close_window.mean(),
        'Rolling_Std': close_window.std(),
        'Rolling_Max': df['High'].rolling(window).max(),
        'Rolling_Min': df['Low'].rolling(window).min(),
    })
    enriched = pd.concat([df, base_stats], axis=1)

    # Short-term volatility compression/expansion: current rolling std over its value 5 rows earlier
    rolling_std = enriched['Rolling_Std']
    enriched['Volatility_Clustering'] = rolling_std / rolling_std.shift(5)

    enriched['Rolling_Skew'] = close_window.skew()
    enriched['Rolling_Kurtosis'] = close_window.kurt()

    upper_q = close_window.quantile(0.8)
    lower_q = close_window.quantile(0.2)
    enriched['Rolling_Q80'] = upper_q
    enriched['Rolling_Q20'] = lower_q
    enriched['Quantile_Spread'] = upper_q - lower_q  # dispersion between the two quantiles

    return enriched

# Attach the rolling statistics (default window) to every split.
train_df, val_df, test_df = [
    add_statistical_features(split) for split in (train_df, val_df, test_df)
]

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return',
       'Cumulative_Return_5D', 'VW_Momentum', 'OBV', 'Stoch_%K', 'Stoch_%D',
       'Williams_%R', 'ATR', 'RSI_14', 'MACD', 'MACD_Signal', 'BB_Upper',
       'BB_Lower', 'MACD_Histogram', 'RSI_MACD_Interaction',
       'Volume_Price_Correlation', 'ADX', 'Market_Trend', 'Rolling_Mean',
       'Rolling_Std', 'Rolling_Max', 'Rolling_Min', 'Volatility_Clustering',
       'Rolling_Skew', 'Rolling_Kurtosis', 'Rolling_Q80', 'Rolling_Q20',
       'Quantile_Spread'],
      dtype='object')

In [9]:
# 5. Time-based features
def add_time_features(df):
    """Return a new frame with hour / day-of-week columns derived from ``Datetime``.

    Adds the integer columns ``Hour`` and ``DayOfWeek`` plus their cyclical
    sine/cosine encodings (period 24 for the hour, 7 for the day of week).
    The frame passed in is not modified.
    """
    hour = df['Datetime'].dt.hour
    weekday = df['Datetime'].dt.dayofweek  # 0=Monday
    two_pi = 2 * np.pi

    # Cyclical encoding so that hour 23 sits next to hour 0 and Sunday next to Monday.
    calendar = pd.DataFrame({
        'Hour': hour,
        'DayOfWeek': weekday,
        'Hour_sin': np.sin(two_pi * hour / 24),
        'Hour_cos': np.cos(two_pi * hour / 24),
        'Day_sin': np.sin(two_pi * weekday / 7),
        'Day_cos': np.cos(two_pi * weekday / 7),
    })

    return pd.concat([df, calendar], axis=1)

# Attach the time-based features to every split.
train_df, val_df, test_df = [
    add_time_features(split) for split in (train_df, val_df, test_df)
]

train_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return',
       'Cumulative_Return_5D', 'VW_Momentum', 'OBV', 'Stoch_%K', 'Stoch_%D',
       'Williams_%R', 'ATR', 'RSI_14', 'MACD', 'MACD_Signal', 'BB_Upper',
       'BB_Lower', 'MACD_Histogram', 'RSI_MACD_Interaction',
       'Volume_Price_Correlation', 'ADX', 'Market_Trend', 'Rolling_Mean',
       'Rolling_Std', 'Rolling_Max', 'Rolling_Min', 'Volatility_Clustering',
       'Rolling_Skew', 'Rolling_Kurtosis', 'Rolling_Q80', 'Rolling_Q20',
       'Quantile_Spread', 'Hour', 'DayOfWeek', 'Hour_sin', 'Hour_cos',
       'Day_sin', 'Day_cos'],
      dtype='object')

In [27]:
def drop_na_cols(df, threshold=0.01):
    """Drop, in place, every column whose NaN count exceeds ``threshold * len(df)``.

    Prints how many columns were removed and returns the same (mutated) frame.
    """
    na_limit = len(df) * threshold
    too_sparse = [col for col in df.columns if df[col].isna().sum() > na_limit]
    df.drop(columns=too_sparse, inplace=True)
    print(f'Deleted {len(too_sparse)} cols')
    return df

# 6. Missing-value handling
def handle_missing_data(df, threshold=0.01):
    """Drop sparse columns, turn +/-inf into NaN, then drop every row containing NaN.

    The first two steps modify the frame that was passed in; the returned frame
    is the new object produced by ``dropna``.
    """
    # NOTE(review): the column filter runs before +/-inf is converted to NaN, so
    # infinite values do not count toward the threshold -- confirm this is intended.
    pruned = drop_na_cols(df, threshold)
    pruned.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Remove the rows left incomplete by the indicator warm-up windows (and by the inf replacement).
    return pruned.dropna()

# for df in [train_df, val_df, test_df]:
#     df = handle_missing_data(df)

# train_df
# Clean every split with the default NaN threshold.
train_df, val_df, test_df = [
    handle_missing_data(split) for split in (train_df, val_df, test_df)
]
train_df.columns

Deleted 0 cols
Deleted 0 cols
Deleted 0 cols


Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Label', 'Datetime',
       'Price_Spread', 'Price_Change', 'Body_Ratio', 'Log_Return',
       'Cumulative_Return_5D', 'VW_Momentum', 'OBV', 'Stoch_%K', 'Stoch_%D',
       'Williams_%R', 'ATR', 'RSI_14', 'MACD', 'MACD_Signal', 'BB_Upper',
       'BB_Lower', 'MACD_Histogram', 'RSI_MACD_Interaction',
       'Volume_Price_Correlation', 'ADX', 'Market_Trend', 'Rolling_Mean',
       'Rolling_Std', 'Rolling_Max', 'Rolling_Min', 'Volatility_Clustering',
       'Rolling_Skew', 'Rolling_Kurtosis', 'Rolling_Q80', 'Rolling_Q20',
       'Quantile_Spread', 'Hour', 'DayOfWeek', 'Hour_sin', 'Hour_cos',
       'Day_sin', 'Day_cos'],
      dtype='object')

In [30]:
# 7. Feature standardisation (scaler fitted on train, then applied to every split)
def scale_features(train_df, val_df, test_df, excepts=['Label', 'Datetime'], scaler_path='data/scaler.pkl'):
    """Standardise the feature columns of the three splits and persist the scaler.

    Every column of ``train_df`` not listed in ``excepts`` is treated as a
    feature. A ``StandardScaler`` is fitted on the training features only and
    then used to transform the validation and test features. The ``excepts``
    columns are copied back unscaled as the last columns of each result.

    Args:
        train_df, val_df, test_df: frames sharing the same feature columns.
        excepts: column names to leave unscaled (e.g. target and timestamp).
        scaler_path: file the fitted scaler is pickled to.

    Returns:
        Tuple ``(scaled_train_df, scaled_val_df, scaled_test_df)``; each keeps
        the row index of its input frame.
    """
    # Columns to scale: everything except the pass-through columns
    feature_columns = [col for col in train_df.columns if col not in excepts]
    
    # Fit on train only, so no statistics from val/test enter the scaler
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_df[feature_columns])
    val_scaled = scaler.transform(val_df[feature_columns])
    test_scaled = scaler.transform(test_df[feature_columns])
    
    # Rebuild frames from the scaled arrays
    scaled_train_df = pd.DataFrame(train_scaled, columns=feature_columns, index=train_df.index)
    scaled_val_df = pd.DataFrame(val_scaled, columns=feature_columns, index=val_df.index)
    scaled_test_df = pd.DataFrame(test_scaled, columns=feature_columns, index=test_df.index)
    
    # Re-attach the pass-through columns (positionally, via .values)
    for scaled_df, df in zip([scaled_train_df, scaled_val_df, scaled_test_df], [train_df, val_df, test_df]):
        for col in excepts:
            scaled_df[col] = df[col].values
    
    # Context manager closes the file even if pickling fails (the handle was previously never closed)
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    
    return scaled_train_df, scaled_val_df, scaled_test_df

# Standardise the classification splits and save the fitted scaler next to the data.
train_df, val_df, test_df = scale_features(
    train_df,
    val_df,
    test_df,
    excepts=['Label', 'Datetime'],
    scaler_path='data/classfication_scaler.pkl',
)
train_df

Unnamed: 0,Open,High,Low,Close,Volume,Price_Spread,Price_Change,Body_Ratio,Log_Return,Cumulative_Return_5D,VW_Momentum,OBV,Stoch_%K,Stoch_%D,Williams_%R,ATR,RSI_14,MACD,MACD_Signal,BB_Upper,BB_Lower,MACD_Histogram,RSI_MACD_Interaction,Volume_Price_Correlation,ADX,Market_Trend,Rolling_Mean,Rolling_Std,Rolling_Max,Rolling_Min,Volatility_Clustering,Rolling_Skew,Rolling_Kurtosis,Rolling_Q80,Rolling_Q20,Quantile_Spread,Hour,DayOfWeek,Hour_sin,Hour_cos,Day_sin,Day_cos,Label,Datetime
2396141,-0.712705,-0.713314,-0.712603,-0.712871,0.494826,-0.610331,-0.095836,-0.554122,-0.059718,0.242306,0.120225,-0.577757,0.634333,0.728754,0.634333,-0.528046,0.029220,-0.135452,-0.260792,-0.714778,-0.712049,0.363982,0.245423,0.588673,0.218295,1.189041,-0.713422,-0.590735,-0.714590,-0.712157,-0.971608,-0.274949,-0.472147,-0.713984,-0.712778,-0.552486,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,1,2018-01-02 08:33:00
2396142,-0.712873,-0.713356,-0.712687,-0.713208,0.576678,-0.590471,-0.192314,-1.026234,-0.234867,0.119446,0.069247,-0.578294,0.316972,0.600076,0.316972,-0.544858,-0.237574,-0.126870,-0.235594,-0.714887,-0.712021,0.313907,0.178203,-0.097271,0.094308,1.189041,-0.713462,-0.606078,-0.714884,-0.712157,-0.919277,-0.271900,-0.346332,-0.714076,-0.712778,-0.574571,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,2,2018-01-02 08:34:00
2396143,-0.713252,-0.713735,-0.713445,-0.713923,0.200156,-0.411739,-0.385271,-1.213518,-0.497634,-0.235459,-0.071475,-0.578711,-0.357421,0.214043,-0.357421,-0.544176,-0.741369,-0.149608,-0.220267,-0.715044,-0.712019,0.190936,0.044544,0.804433,-0.052720,-0.841014,-0.713540,-0.623967,-0.714884,-0.712157,-0.837325,-0.128141,-0.262117,-0.714085,-0.712778,-0.576579,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,2,2018-01-02 08:35:00
2396144,-0.713968,-0.714029,-0.713445,-0.713713,0.560308,-0.550753,0.145360,0.673368,0.144638,-0.290039,-0.097356,-0.578179,-0.403612,-0.160043,-0.403612,-0.556215,-0.552936,-0.156758,-0.209525,-0.715127,-0.712024,0.133825,0.004254,0.630140,-0.189245,-0.841014,-0.713584,-0.632678,-0.714884,-0.712157,-0.626491,0.076823,-0.105941,-0.714118,-0.712778,-0.584610,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,1,2018-01-02 08:36:00
2396145,-0.713715,-0.714240,-0.714708,-0.715270,0.232897,-0.054273,-0.891783,-1.544484,-1.081693,-0.808693,-0.273371,-0.578606,-1.626866,-0.860622,-1.626866,-0.522134,-1.425963,-0.227726,-0.216012,-0.715180,-0.712250,-0.080799,-0.161435,1.237873,-0.182121,-0.841014,-0.713723,-0.613361,-0.714884,-0.712157,-0.487304,-0.291167,0.139512,-0.714396,-0.712821,-0.640826,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,2,2018-01-02 08:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446544,1.735689,1.738298,1.736169,1.739897,0.724013,1.673478,2.412602,1.321821,2.019309,-1.183364,-0.628034,2.253417,-0.682623,-1.251874,-0.682623,0.536540,-1.125649,-1.356892,-0.689335,1.748847,1.743563,-2.343039,-1.791346,-1.049998,0.279462,1.189041,1.746224,1.289358,1.744831,1.742700,2.875561,-1.583226,0.127446,1.747086,1.745507,1.025596,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,0,2020-12-31 19:13:00
3446545,1.739899,1.740949,1.741518,1.741034,0.151045,0.402489,0.651872,0.721193,0.544053,-0.418873,-0.305863,2.253818,-0.473444,-0.957817,-0.473444,0.534208,-0.845447,-1.325432,-0.833120,1.748840,1.743056,-1.791151,-1.505844,-1.415363,0.217948,1.189041,1.745968,1.345351,1.744831,1.742700,2.375891,-1.303921,-0.202342,1.747086,1.744024,1.378958,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,0,2020-12-31 19:14:00
3446546,1.740657,1.739139,1.741054,1.739940,0.003710,-0.233006,-0.409391,-0.914090,-0.526529,0.714759,0.535670,2.253463,-0.674875,-0.659889,-0.674875,0.486781,-1.023888,-1.332898,-0.949735,1.748812,1.742396,-1.452682,-1.184607,-1.674262,0.174227,1.189041,1.745625,1.416079,1.744831,1.742700,1.020119,-0.987346,-0.520754,1.747086,1.743147,1.587762,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,0,2020-12-31 19:15:00
3446547,1.739941,1.738508,1.739075,1.737456,0.118304,0.402489,-1.422414,-1.563746,-1.193385,0.138332,0.278818,2.253073,-1.131969,-0.821838,-1.131969,0.488003,-1.392436,-1.430580,-1.063784,1.748744,1.741364,-1.420719,-1.045322,-1.432637,0.187915,1.189041,1.745075,1.523968,1.744831,1.742700,0.585217,-0.642874,-0.784960,1.747086,1.742625,1.712242,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,0,2020-12-31 19:16:00


In [31]:
def create_sequences_sequential(X, y, sequence_length, save_path, idx_file, target='Label',stride=1,
                                label_dtype=np.int64):
    """
    Build sliding-window sequences one window at a time and store them in memmaps.

    Window ``k`` holds rows ``[k*stride, k*stride + sequence_length)`` of ``X`` and
    is labelled with ``y[target]`` at row ``k*stride + sequence_length`` (the row
    right after the window).

    Files written under ``save_path``:
        sequences.dat  float32, shape (n_samples, sequence_length, n_features)
        labels.dat     ``label_dtype``, shape (n_samples,)
        shape.txt      n_samples, sequence_length, n_features (one per line)

    ``idx_file`` stores the next row offset to process. When it exists and the
    data files already have the expected size, the run resumes from that offset
    and keeps the windows written so far; otherwise everything is rebuilt from
    offset 0. Delete ``idx_file`` to force a rebuild.

    Args:
        X: 2-D array of features, one row per time step.
        y: frame holding the ``target`` column, row-aligned with ``X``.
        sequence_length: number of rows per window.
        save_path: output directory (created if missing).
        idx_file: path of the resume checkpoint.
        target: name of the label column in ``y``.
        stride: row step between consecutive windows.
        label_dtype: dtype of labels.dat. The default (int64) matches the
            original on-disk format; pass a float dtype for regression targets,
            which an integer dtype would truncate.

    Returns:
        ``(n_samples, n_features)`` on success. If anything fails the error is
        printed and ``(resume_offset, n_features)`` is returned instead, where
        ``n_features`` is ``None`` if it could not be determined.
    """
    # Bound before the try block so the handler can always return it.
    n_features = None
    # Flush + checkpoint interval (in windows); keeps the checkpoint consistent
    # with what is actually on disk without rewriting it on every iteration.
    checkpoint_every = 10_000
    try:
        n_samples = int(np.ceil((len(X) - sequence_length) / stride))
        n_features = X.shape[1]
        if n_samples <= 0:
            raise ValueError("Input array too short for given sequence_length")

        targets = y[target].values
        label_dtype = np.dtype(label_dtype)
        if np.issubdtype(targets.dtype, np.floating) and not np.issubdtype(label_dtype, np.floating):
            print(f"Warning: target '{target}' is floating point but labels are stored as {label_dtype}; "
                  f"values will be truncated. Pass label_dtype=np.float32 to keep them.")

        # Prepare the memmap files
        os.makedirs(save_path, exist_ok=True)
        sequences_file = f'{save_path}/sequences.dat'
        labels_file = f'{save_path}/labels.dat'
        shape_file = f'{save_path}/shape.txt'

        seq_shape = (n_samples, sequence_length, n_features)
        seq_bytes = n_samples * sequence_length * n_features * np.dtype(np.float32).itemsize
        label_bytes = n_samples * label_dtype.itemsize

        # Read the resume offset
        start_idx = 0
        if os.path.exists(idx_file):
            with open(idx_file, 'r') as f:
                start_idx = int(f.read().strip() or 0)

        can_resume = (
            start_idx > 0
            and os.path.exists(sequences_file) and os.path.getsize(sequences_file) == seq_bytes
            and os.path.exists(labels_file) and os.path.getsize(labels_file) == label_bytes
        )
        if can_resume:
            # 'r+' keeps the windows already written ('w+' would zero the files).
            mode = 'r+'
            # Snap to the stride grid so window k always starts at row k*stride
            # (checkpoints written by earlier versions stored i + 1).
            start_idx = ((start_idx + stride - 1) // stride) * stride
        else:
            mode = 'w+'
            start_idx = 0
            # A shape file left by a previous run would describe data that is about to be replaced.
            if os.path.exists(shape_file):
                os.remove(shape_file)

        # Open the memmaps with their full shape
        sequences = np.memmap(sequences_file, dtype=np.float32, mode=mode, shape=seq_shape)
        labels = np.memmap(labels_file, dtype=label_dtype, mode=mode, shape=(n_samples,))

        # Write windows starting at start_idx
        for i in tqdm(range(start_idx, (len(X) - sequence_length), stride)):
            k = i // stride
            sequences[k] = X[i:i+sequence_length]
            labels[k] = targets[i + sequence_length]

            # Periodically persist the data first, then the offset of the next window
            if (k + 1) % checkpoint_every == 0:
                sequences.flush()
                labels.flush()
                with open(idx_file, 'w') as f:
                    f.write(str(i + stride))

        # Flush so that everything is on disk before the run is marked complete
        sequences.flush()
        labels.flush()

        # Save the shape
        with open(shape_file, 'w') as f:
            f.write(f"{n_samples}\n{sequence_length}\n{n_features}")

        # Final checkpoint: one stride past the last window, i.e. nothing left to do
        with open(idx_file, 'w') as f:
            f.write(str(n_samples * stride))

        print(f"Sequences saved to {save_path}, shape: {sequences.shape}")
        return n_samples, n_features
    except Exception as e:
        # Best-effort behaviour kept from the original: report the error and hand back the resume offset.
        print(e)
        start_idx = 0
        if os.path.exists(idx_file):
            with open(idx_file, 'r') as f:
                start_idx = int(f.read().strip() or 0)
        return start_idx, n_features

In [32]:
save_path = 'data/classification'
sequence_length = 128

# Model inputs: every remaining column except the timestamp, the label and the raw OHLCV columns.
non_feature_cols = {'Datetime', 'Label', 'Open', 'High', 'Low', 'Close', 'Volume'}
feature_columns = [col for col in train_df.columns if col not in non_feature_cols]
X_train, X_val, X_test = (
    split[feature_columns].values for split in (train_df, val_df, test_df)
)

# Output directory and resume-checkpoint file for each split.
train_path, val_path, test_path = (
    f'{save_path}/{split_name}' for split_name in ('train', 'val', 'test')
)
train_idx_file, val_idx_file, test_idx_file = (
    f'{save_path}/{split_name}_idx.txt' for split_name in ('train', 'val', 'test')
)

target = 'Label'
# print("Processing train data...")
# n_train_samples, n_features = create_sequences_sequential(X_train, train_df, sequence_length, train_path, train_idx_file, target=target, stride=10)
# print("Processing validation data...")
# n_val_samples, _ = create_sequences_sequential(X_val, val_df, sequence_length, val_path, val_idx_file, target=target, stride=10)
# print("Processing test data...")
# n_test_samples, _ = create_sequences_sequential(X_test, test_df, sequence_length, test_path, test_idx_file, target=target, stride=10)

Processing train data...


100%|██████████| 105025/105025 [00:29<00:00, 3555.77it/s]


Sequences saved to data/classification/train, shape: (105025, 128, 37)
Processing validation data...


100%|██████████| 35083/35083 [00:09<00:00, 3654.55it/s]


Sequences saved to data/classification/val, shape: (35083, 128, 37)
Processing test data...


100%|██████████| 112721/112721 [00:31<00:00, 3601.55it/s]


Sequences saved to data/classification/test, shape: (112721, 128, 37)


# 3 - Prepare regression data

In [34]:
print('Load data')
train_df = load_data("data/regression_train.csv")
val_df = load_data("data/regression_dev.csv")
test_df = load_data("data/regression_test.csv")

# Keep only the 2018-2020 rows for training. .copy() makes the filtered frame
# independent, so the in-place column assignments below do not write into a slice
# of the loaded frame (which pandas reports as SettingWithCopyWarning).
train_df = train_df.loc[train_df['Datetime'].dt.year.isin(range(2018, 2021))].copy()

# add_basic_features mutates and returns its argument; assign the result explicitly
# like the other steps instead of rebinding a loop variable.
train_df = add_basic_features(train_df)
val_df = add_basic_features(val_df)
test_df = add_basic_features(test_df)

print('add_technical_indicators')
train_df = add_technical_indicators(train_df)
val_df = add_technical_indicators(val_df)
test_df = add_technical_indicators(test_df)

print('add_statistical_features')
train_df = add_statistical_features(train_df)
val_df = add_statistical_features(val_df)
test_df = add_statistical_features(test_df)

print('add_time_features')
train_df = add_time_features(train_df)
val_df = add_time_features(val_df)
test_df = add_time_features(test_df)

print('handle_missing_data')
train_df = handle_missing_data(train_df)
val_df = handle_missing_data(val_df)
test_df = handle_missing_data(test_df)

print('scale_features')
train_df, val_df, test_df = scale_features(train_df, val_df, test_df, excepts=['Price_t_plus_30', 'Datetime'], scaler_path='data/regression_scaler_x.pkl')

print('scale_outputs')
# The target gets its own scaler, fitted on the training target only.
scaler = StandardScaler()
train_df['Price_t_plus_30'] = scaler.fit_transform(train_df[['Price_t_plus_30']])
val_df['Price_t_plus_30'] = scaler.transform(val_df[['Price_t_plus_30']])
test_df['Price_t_plus_30'] = scaler.transform(test_df[['Price_t_plus_30']])

# Context manager closes the file handle (it was previously left open).
with open('data/regression_scaler_y.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

Load data
add_technical_indicators
add_statistical_features
add_time_features
handle_missing_data
Deleted 0 cols
Deleted 0 cols
Deleted 0 cols
scale_features
scale_outputs


In [35]:
# Display the scaled regression training frame (features plus the scaled Price_t_plus_30 target).
train_df

Unnamed: 0,Open,High,Low,Close,Volume,Price_Spread,Price_Change,Body_Ratio,Log_Return,Cumulative_Return_5D,VW_Momentum,OBV,Stoch_%K,Stoch_%D,Williams_%R,ATR,RSI_14,MACD,MACD_Signal,BB_Upper,BB_Lower,MACD_Histogram,RSI_MACD_Interaction,Volume_Price_Correlation,ADX,Market_Trend,Rolling_Mean,Rolling_Std,Rolling_Max,Rolling_Min,Volatility_Clustering,Rolling_Skew,Rolling_Kurtosis,Rolling_Q80,Rolling_Q20,Quantile_Spread,Hour,DayOfWeek,Hour_sin,Hour_cos,Day_sin,Day_cos,Price_t_plus_30,Datetime
2396141,-0.712705,-0.713314,-0.712603,-0.712871,0.494826,-0.610331,-0.095836,-0.554122,-0.059718,0.242306,0.120225,-0.577757,0.634333,0.728754,0.634333,-0.528046,0.029220,-0.135452,-0.260792,-0.714778,-0.712049,0.363982,0.245423,0.588673,0.218295,1.189041,-0.713422,-0.590735,-0.714590,-0.712157,-0.971608,-0.274949,-0.472147,-0.713984,-0.712778,-0.552486,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,-0.716536,2018-01-02 08:33:00
2396142,-0.712873,-0.713356,-0.712687,-0.713208,0.576678,-0.590471,-0.192314,-1.026234,-0.234867,0.119446,0.069247,-0.578294,0.316972,0.600076,0.316972,-0.544858,-0.237574,-0.126870,-0.235594,-0.714887,-0.712021,0.313907,0.178203,-0.097271,0.094308,1.189041,-0.713462,-0.606078,-0.714884,-0.712157,-0.919277,-0.271900,-0.346332,-0.714076,-0.712778,-0.574571,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,-0.715315,2018-01-02 08:34:00
2396143,-0.713252,-0.713735,-0.713445,-0.713923,0.200156,-0.411739,-0.385271,-1.213518,-0.497634,-0.235459,-0.071475,-0.578711,-0.357421,0.214043,-0.357421,-0.544176,-0.741369,-0.149608,-0.220267,-0.715044,-0.712019,0.190936,0.044544,0.804433,-0.052720,-0.841014,-0.713540,-0.623967,-0.714884,-0.712157,-0.837325,-0.128141,-0.262117,-0.714085,-0.712778,-0.576579,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,-0.715820,2018-01-02 08:35:00
2396144,-0.713968,-0.714029,-0.713445,-0.713713,0.560308,-0.550753,0.145360,0.673368,0.144638,-0.290039,-0.097356,-0.578179,-0.403612,-0.160043,-0.403612,-0.556215,-0.552936,-0.156758,-0.209525,-0.715127,-0.712024,0.133825,0.004254,0.630140,-0.189245,-0.841014,-0.713584,-0.632678,-0.714884,-0.712157,-0.626491,0.076823,-0.105941,-0.714118,-0.712778,-0.584610,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,-0.716872,2018-01-02 08:36:00
2396145,-0.713715,-0.714240,-0.714708,-0.715270,0.232897,-0.054273,-0.891783,-1.544484,-1.081693,-0.808693,-0.273371,-0.578606,-1.626866,-0.860622,-1.626866,-0.522134,-1.425963,-0.227726,-0.216012,-0.715180,-0.712250,-0.080799,-0.161435,1.237873,-0.182121,-0.841014,-0.713723,-0.613361,-0.714884,-0.712157,-0.487304,-0.291167,0.139512,-0.714396,-0.712821,-0.640826,-0.441128,-0.702877,1.170257,-0.643908,0.831735,0.894962,-0.716115,2018-01-02 08:37:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3446544,1.735689,1.738298,1.736169,1.739897,0.724013,1.673478,2.412602,1.321821,2.019309,-1.183364,-0.628034,2.253417,-0.682623,-1.251874,-0.682623,0.536540,-1.125649,-1.356892,-0.689335,1.748847,1.743563,-2.343039,-1.791346,-1.049998,0.279462,1.189041,1.746224,1.289358,1.744831,1.742700,2.875561,-1.583226,0.127446,1.747086,1.745507,1.025596,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,1.744606,2020-12-31 19:13:00
3446545,1.739899,1.740949,1.741518,1.741034,0.151045,0.402489,0.651872,0.721193,0.544053,-0.418873,-0.305863,2.253818,-0.473444,-0.957817,-0.473444,0.534208,-0.845447,-1.325432,-0.833120,1.748840,1.743056,-1.791151,-1.505844,-1.415363,0.217948,1.189041,1.745968,1.345351,1.744831,1.742700,2.375891,-1.303921,-0.202342,1.747086,1.744024,1.378958,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,1.746163,2020-12-31 19:14:00
3446546,1.740657,1.739139,1.741054,1.739940,0.003710,-0.233006,-0.409391,-0.914090,-0.526529,0.714759,0.535670,2.253463,-0.674875,-0.659889,-0.674875,0.486781,-1.023888,-1.332898,-0.949735,1.748812,1.742396,-1.452682,-1.184607,-1.674262,0.174227,1.189041,1.745625,1.416079,1.744831,1.742700,1.020119,-0.987346,-0.520754,1.747086,1.743147,1.587762,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,1.745743,2020-12-31 19:15:00
3446547,1.739941,1.738508,1.739075,1.737456,0.118304,0.402489,-1.422414,-1.563746,-1.193385,0.138332,0.278818,2.253073,-1.131969,-0.821838,-1.131969,0.488003,-1.392436,-1.430580,-1.063784,1.748744,1.741364,-1.420719,-1.045322,-1.432637,0.187915,1.189041,1.745075,1.523968,1.744831,1.742700,0.585217,-0.642874,-0.784960,1.747086,1.742625,1.712242,1.227246,0.695816,-1.380247,0.450710,0.156709,-1.065939,1.745743,2020-12-31 19:16:00


In [None]:
save_path = 'data/regression'
sequence_length = 128

# Model inputs: every remaining column except the timestamp, the target and the raw OHLCV columns.
non_feature_cols = {'Datetime', 'Price_t_plus_30', 'Open', 'High', 'Low', 'Close', 'Volume'}
feature_columns = [col for col in train_df.columns if col not in non_feature_cols]
X_train, X_val, X_test = (
    split[feature_columns].values for split in (train_df, val_df, test_df)
)

# Output directory and resume-checkpoint file for each split.
train_path, val_path, test_path = (
    f'{save_path}/{split_name}' for split_name in ('train', 'val', 'test')
)
train_idx_file, val_idx_file, test_idx_file = (
    f'{save_path}/{split_name}_idx.txt' for split_name in ('train', 'val', 'test')
)

# target='Price_t_plus_30'
# print("Processing train data...")
# n_train_samples, n_features = create_sequences_sequential(X_train, train_df, sequence_length, train_path, train_idx_file, target=target, stride=10)
# print("Processing validation data...")
# n_val_samples, _ = create_sequences_sequential(X_val, val_df, sequence_length, val_path, val_idx_file, target=target, stride=10)
# print("Processing test data...")
# n_test_samples, _ = create_sequences_sequential(X_test, test_df, sequence_length, test_path, test_idx_file, target=target, stride=10)

Processing train data...


100%|██████████| 105025/105025 [00:29<00:00, 3620.55it/s]


Sequences saved to data/regression/train, shape: (105025, 128, 37)
Processing validation data...


100%|██████████| 35083/35083 [00:09<00:00, 3612.86it/s]


Sequences saved to data/regression/val, shape: (35083, 128, 37)
Processing test data...


100%|██████████| 112721/112721 [00:35<00:00, 3200.60it/s]


Sequences saved to data/regression/test, shape: (112721, 128, 37)
