This notebook builds the feature set used by the XGBoost and LSTM models.
It expects the cleaned OHLCV data produced by Notebook 01, which is loaded below as `ohlc_clean` from `data/processed/ohlc_clean.csv`.

In [13]:
# Cell 0 — Load the cleaned OHLCV dataset from Notebook 01
import numpy as np
import pandas as pd

# Read the processed candles; the 'Open time' column is parsed as datetimes
# and becomes the index of the frame.
ohlc_clean = pd.read_csv(
    "data/processed/ohlc_clean.csv",
    index_col='Open time',
    parse_dates=['Open time'],
)

# Show the dimensions, then preview the first rows as the cell output.
print("Loaded cleaned OHLCV data:", ohlc_clean.shape)
ohlc_clean.head()


Loaded cleaned OHLCV data: (69332, 5)


Unnamed: 0_level_0,open,high,low,close,volume
Open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-01 00:00:00+00:00,13715.65,13715.65,13400.01,13529.01,443.356199
2018-01-01 01:00:00+00:00,13528.99,13595.89,13155.38,13203.06,383.697006
2018-01-01 02:00:00+00:00,13203.0,13418.43,13200.0,13330.18,429.064572
2018-01-01 03:00:00+00:00,13330.26,13611.27,13290.0,13410.03,420.08703
2018-01-01 04:00:00+00:00,13434.98,13623.29,13322.15,13601.01,340.807329


Technical Indicators (SMA, EMA, RSI)

In [14]:
def add_indicators(df, ma_windows=(10, 20), rsi_period=14):
    """Return a copy of ``df`` with moving-average and RSI columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``close`` column. The input frame is not modified.
    ma_windows : iterable of int, default (10, 20)
        Window lengths. For each window ``w`` a simple moving average
        ``sma_<w>`` and an exponential moving average ``ema_<w>``
        (``span=w, adjust=False``) are added. All SMA columns are added
        before the EMA columns.
    rsi_period : int, default 14
        Look-back length of the RSI column ``rsi_<rsi_period>``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the indicator columns appended. With the default
        arguments these are ``sma_10, sma_20, ema_10, ema_20, rsi_14``.

    Notes
    -----
    * The RSI averages gains and losses with a plain rolling mean, not
      Wilder's exponential smoothing.
    * The SMA and RSI columns are NaN until their window is filled.
    * A window with no losses yields RSI 100. A window with neither gains nor
      losses (flat prices) is 0/0 in the raw formula; it is reported as the
      neutral value 50 rather than NaN so that a later ``dropna()`` does not
      silently discard those rows.
    """
    df = df.copy()
    close = df['close']

    # Moving averages: SMAs first, then EMAs, to keep a stable column order.
    for window in ma_windows:
        df[f'sma_{window}'] = close.rolling(window).mean()
    for window in ma_windows:
        df[f'ema_{window}'] = close.ewm(span=window, adjust=False).mean()

    # RSI: split the close-to-close change into its up and down parts.
    delta = close.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(rsi_period).mean()
    avg_loss = loss.rolling(rsi_period).mean()

    rs = avg_gain / avg_loss
    rsi_col = f'rsi_{rsi_period}'
    df[rsi_col] = 100 - (100 / (1 + rs))

    # Flat window: both averages are zero, so rs is 0/0 = NaN. Warm-up rows
    # are unaffected because NaN == 0 is False.
    flat_window = (avg_gain == 0) & (avg_loss == 0)
    df.loc[flat_window, rsi_col] = 50.0

    return df


Volatility Features

In [15]:

def add_volatility(df, window=24):
    """Return a copy of ``df`` with return and rolling-volatility columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``close`` column. The input frame is not modified.
    window : int, default 24
        Number of rows in the rolling standard deviation of ``returns``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with three columns appended:

        * ``returns`` - simple row-over-row change of ``close``.
        * ``log_returns`` - first difference of ``log(close)``.
        * ``volatility_<window>h`` - rolling standard deviation of
          ``returns``; ``volatility_24h`` with the default window. The ``h``
          suffix is kept from the original column name and is only accurate
          when each row is one hour.

    Notes
    -----
    ``pct_change`` is called with ``fill_method=None`` so that a missing
    ``close`` produces NaN in ``returns`` exactly as it does in
    ``log_returns``, instead of being forward-filled by the deprecated
    implicit default.
    """
    df = df.copy()
    close = df['close']

    df['returns'] = close.pct_change(fill_method=None)
    df['log_returns'] = np.log(close).diff()
    df[f'volatility_{window}h'] = df['returns'].rolling(window).std()
    return df


Lag Features for XGBoost

In [16]:

def add_lags(df, n_lags=12):
    """Return a copy of ``df`` with lagged ``close`` columns appended.

    Columns ``lag_1`` .. ``lag_<n_lags>`` are added in that order, where
    ``lag_k`` holds the ``close`` value from ``k`` rows earlier. The first
    ``k`` rows of ``lag_k`` are NaN. The input frame is not modified.
    """
    close = df['close']
    lagged = {f'lag_{step}': close.shift(step) for step in range(1, n_lags + 1)}
    # assign() works on a copy and appends the new columns in dict order.
    return df.assign(**lagged)


Trend Label (target for classification)

In [17]:
# Trend Label (UP = 1, DOWN = 0)
def add_trend_label(df, horizon=3):
    """Return a copy of ``df`` with the classification target appended.

    Two columns are added, in this order:

    * ``future_close`` - the ``close`` value ``horizon`` rows ahead.
    * ``trend_label`` - 1 when ``future_close`` is strictly greater than the
      current ``close``, otherwise 0 (an unchanged price counts as 0).

    The last ``horizon`` rows have no future value: their ``future_close`` is
    NaN and, because a comparison with NaN is False, their ``trend_label``
    is 0. ``future_close`` is look-ahead information and must not be used as
    a model input. The input frame is not modified.
    """
    close_ahead = df['close'].shift(-horizon)
    moved_up = close_ahead.gt(df['close'])
    return df.assign(future_close=close_ahead, trend_label=moved_up.astype(int))

Build Full Feature Set

In [18]:
def build_features(df, n_lags=12, horizon=3):
    """Run the full feature pipeline and drop incomplete rows.

    The steps are applied in order: ``add_indicators``, ``add_volatility``,
    ``add_lags`` and ``add_trend_label``. Every row that contains a NaN in
    any column is then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input candles; the helper functions read its ``close`` column.
    n_lags : int, default 12
        Forwarded to ``add_lags``.
    horizon : int, default 3
        Forwarded to ``add_trend_label``.

    Returns
    -------
    pandas.DataFrame
        Feature table with the target column ``trend_label``.

    Notes
    -----
    The result still contains ``future_close``, which is the close price
    ``horizon`` rows ahead. It is look-ahead information: exclude it from
    the model inputs before training.
    """
    df = add_indicators(df)
    df = add_volatility(df)
    df = add_lags(df, n_lags=n_lags)
    df = add_trend_label(df, horizon=horizon)
    # Removes the indicator warm-up rows and the last `horizon` rows, whose
    # future_close is NaN.
    return df.dropna()

Run feature engineering

In [19]:
# Apply the whole feature pipeline to the cleaned candles loaded in Cell 0
features = build_features(df=ohlc_clean)

# Report the resulting dimensions, then preview the first rows
print("Feature set shape:", features.shape)
features.head()


Feature set shape: (69305, 27)


Unnamed: 0_level_0,open,high,low,close,volume,sma_10,sma_20,ema_10,ema_20,rsi_14,...,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,future_close,trend_label
Open time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 00:00:00+00:00,13382.16,13850.0,13382.16,13750.01,466.596114,13312.463,13367.137,13405.411963,13381.724521,54.624776,...,13240.37,13135.0,13022.0,13018.0,13247.0,13211.39,13017.0,13172.42,13353.78,0
2018-01-02 01:00:00+00:00,13769.98,13800.0,13590.0,13600.0,517.461925,13347.763,13369.1875,13440.791606,13402.512662,60.862247,...,13399.24,13240.37,13135.0,13022.0,13018.0,13247.0,13211.39,13017.0,13127.31,0
2018-01-02 02:00:00+00:00,13611.93,13678.33,13398.0,13522.0,714.534781,13398.163,13356.267,13455.556769,13413.892408,59.839231,...,13481.01,13399.24,13240.37,13135.0,13022.0,13018.0,13247.0,13211.39,13166.95,0
2018-01-02 03:00:00+00:00,13520.32,13539.98,13231.96,13353.78,666.940106,13431.341,13345.4385,13437.051902,13408.167417,59.411153,...,13452.0,13481.01,13399.24,13240.37,13135.0,13022.0,13018.0,13247.0,13255.98,0
2018-01-02 04:00:00+00:00,13353.78,13480.84,12890.02,13127.31,992.418927,13430.572,13326.8045,13380.735192,13381.419091,47.691809,...,13380.0,13452.0,13481.01,13399.24,13240.37,13135.0,13022.0,13018.0,13343.0,1
