In [2]:
%load_ext autoreload
%autoreload 2

In [21]:
import pandas as pd
import numpy as np
import mplfinance as mpf
from matplotlib import pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
# sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [6]:
df = pd.read_csv('../data/binance/hbar.csv', parse_dates=['time_stamp'])

In [8]:
df

Unnamed: 0,time_stamp,open,high,low,close,volume,close_time,quote_volume,trades,taker_base_volume,taker_quote_volume,ignore
0,2023-12-18 09:00:00,0.07570,0.07570,0.07380,0.07470,15300374.0,1702893599999,1.140326e+06,3957,6753460.0,503090.67190,0
1,2023-12-18 10:00:00,0.07470,0.07490,0.07230,0.07340,30523118.0,1702897199999,2.241224e+06,5720,12988980.0,954068.83820,0
2,2023-12-18 11:00:00,0.07340,0.07470,0.07330,0.07450,9499019.0,1702900799999,7.035924e+05,2084,4561700.0,338001.14630,0
3,2023-12-18 12:00:00,0.07450,0.07480,0.07360,0.07370,5979821.0,1702904399999,4.434351e+05,1677,2098387.0,155645.49590,0
4,2023-12-18 13:00:00,0.07380,0.07590,0.07360,0.07570,12358946.0,1702907999999,9.272564e+05,2841,8168082.0,612573.56890,0
...,...,...,...,...,...,...,...,...,...,...,...,...
17515,2025-12-17 04:00:00,0.11379,0.11393,0.11313,0.11326,1535636.0,1765947599999,1.741385e+05,2510,847253.0,96072.97671,0
17516,2025-12-17 05:00:00,0.11327,0.11360,0.11275,0.11343,2758941.0,1765951199999,3.121800e+05,4170,1403068.0,158787.56053,0
17517,2025-12-17 06:00:00,0.11343,0.11377,0.11298,0.11298,1716344.0,1765954799999,1.947132e+05,2733,924557.0,104926.27277,0
17518,2025-12-17 07:00:00,0.11299,0.11319,0.11244,0.11256,5090034.0,1765958399999,5.737895e+05,4629,3299682.0,372008.65484,0


### Predict Bullish Day

In [166]:
df_daily = df.resample('D', on='time_stamp').agg({
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume': 'sum',
    'quote_volume': 'sum',
    'taker_base_volume': 'sum',
    'taker_quote_volume': 'sum',
    'trades': 'sum'
}).reset_index()

In [167]:
df_daily['target'] = df_daily['close'].shift(-1)

In [168]:
df_daily['target_class'] = np.where(df_daily['target'] > df_daily['close'], 1, 0)

In [169]:
df_daily['target_class'].value_counts()

target_class
0    387
1    344
Name: count, dtype: int64

In [170]:
df_daily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   time_stamp          731 non-null    datetime64[ns]
 1   open                731 non-null    float64       
 2   high                731 non-null    float64       
 3   low                 731 non-null    float64       
 4   close               731 non-null    float64       
 5   volume              731 non-null    float64       
 6   quote_volume        731 non-null    float64       
 7   taker_base_volume   731 non-null    float64       
 8   taker_quote_volume  731 non-null    float64       
 9   trades              731 non-null    int64         
 10  target              730 non-null    float64       
 11  target_class        731 non-null    int64         
dtypes: datetime64[ns](1), float64(9), int64(2)
memory usage: 68.7 KB


#### Feature Engineering

In [None]:
df_daily['yesterdat_bullish_bearish'] = np.where(df_daily['close'] > df_daily['open'], 1, 0)
df_daily['volume_change_pct'] = df_daily['volume'].pct_change()
df_daily['log_return'] = np.log(df_daily['close'] / df_daily['close'].shift(1))
df_daily['log_high_low'] = np.log(df_daily['high'] / df_daily['low'])
df_daily['true_range_1'] = df_daily['high'] - df_daily['low']
df_daily['true_range_2'] = abs(df_daily['high'] - df_daily['open'])
df_daily['true_range_3'] = abs(df_daily['low'] - df_daily['open'])

# lags
for lag in range(1, 4):
    df_daily[f'volume_change_pct_lag_{lag}'] = df_daily['volume_change_pct'].shift(lag)
    df_daily[f'true_range_1_lag_{lag}'] = df_daily['true_range_1'].shift(lag)
    df_daily[f'true_range_2_lag_{lag}'] = df_daily['true_range_2'].shift(lag)
    df_daily[f'true_range_3_lag_{lag}'] = df_daily['true_range_3'].shift(lag)
    df_daily[f'yesterdat_bullish_bearish_lag_{lag}'] = df_daily['yesterdat_bullish_bearish'].shift(lag)
    df_daily[f'open_lag_{lag}'] = df_daily['open'].shift(lag)
    df_daily[f'high_lag_{lag}'] = df_daily['high'].shift(lag)
    df_daily[f'low_lag_{lag}'] = df_daily['low'].shift(lag)

In [None]:
# ---------- PRICE RETURNS ----------
# Short-term momentum
df_daily['return_1'] = df_daily['close'].pct_change(1)
df_daily['return_5'] = df_daily['close'].pct_change(5)
df_daily['return_10'] = df_daily['close'].pct_change(10)

# ---------- MOVING AVERAGES ----------
# Trend direction indicators
df_daily['sma_5'] = df_daily['close'].rolling(window=5).mean()
df_daily['sma_10'] = df_daily['close'].rolling(window=10).mean()
df_daily['sma_20'] = df_daily['close'].rolling(window=20).mean()
df_daily['ema_10'] = df_daily['close'].ewm(span=10, adjust=False).mean()
df_daily['ema_20'] = df_daily['close'].ewm(span=20, adjust=False).mean()

In [None]:
# ---------- MOMENTUM INDICATORS ----------
# Relative Strength Index
delta = df_daily['close'].diff()
gain = np.where(delta > 0, delta, 0)
loss = np.where(delta < 0, -delta, 0)
avg_gain = pd.Series(gain).rolling(window=14).mean()
avg_loss = pd.Series(loss).rolling(window=14).mean()
rs = avg_gain / avg_loss
df_daily['rsi_14'] = 100 - (100 / (1 + rs))

# Moving Average Convergence Divergence
ema_12 = df_daily['close'].ewm(span=12, adjust=False).mean()
ema_26 = df_daily['close'].ewm(span=26, adjust=False).mean()
df_daily['macd'] = ema_12 - ema_26
df_daily['macd_signal'] = df_daily['macd'].ewm(span=9, adjust=False).mean()

# Rate of Change
df_daily['roc_10'] = df_daily['close'].pct_change(10)

# ---------- VOLATILITY INDICATORS ----------
# Average True Range
high_low = df_daily['high'] - df_daily['low']
high_close = np.abs(df_daily['high'] - df_daily['close'].shift())
low_close = np.abs(df_daily['low'] - df_daily['close'].shift())
df_daily['tr'] = high_low.combine(high_close, max).combine(low_close, max)
df_daily['atr_14'] = df_daily['tr'].rolling(window=14).mean()

# Bollinger Bands
df_daily['bb_middle'] = df_daily['close'].rolling(20).mean()
df_daily['bb_std'] = df_daily['close'].rolling(20).std()
df_daily['bb_upper'] = df_daily['bb_middle'] + 2 * df_daily['bb_std']
df_daily['bb_lower'] = df_daily['bb_middle'] - 2 * df_daily['bb_std']

# ---------- VOLUME INDICATORS ----------
# Volume moving average
df_daily['vol_sma_20'] = df_daily['volume'].rolling(window=20).mean()

In [None]:
# ---------- 1. Price Crosses Above EMA 20 ----------
df_daily['buy_ema20'] = np.where(df_daily['close'] > df_daily['ema_20'], 1, 0)

# ---------- 2. RSI Oversold Reversal (RSI < 30 and rising) ----------
df_daily['rsi_prev'] = df_daily['rsi_14'].shift(1)
df_daily['buy_rsi'] = np.where((df_daily['rsi_prev'] < 30) & (df_daily['rsi_14'] > df_daily['rsi_prev']), 1, 0)

# ---------- 3. MACD Bullish Crossover ----------
df_daily['macd_prev'] = df_daily['macd'].shift(1)
df_daily['macd_signal_prev'] = df_daily['macd_signal'].shift(1)
df_daily['buy_macd'] = np.where((df_daily['macd_prev'] < df_daily['macd_signal_prev']) & 
                                (df_daily['macd'] > df_daily['macd_signal']), 1, 0)

# ---------- 4. Bullish Engulfing Candlestick ----------
df_daily['buy_engulfing'] = np.where(
    (df_daily['close'] > df_daily['open']) &
    (df_daily['close'].shift(1) < df_daily['open'].shift(1)) &
    (df_daily['close'] > df_daily['open'].shift(1)) &
    (df_daily['open'] < df_daily['close'].shift(1)),
    1, 0
)

# ---------- 5. Breakout from Recent High (e.g., last 20 bars) ----------
df_daily['rolling_high20'] = df_daily['close'].rolling(20).max().shift(1)
df_daily['buy_breakout'] = np.where(df_daily['close'] > df_daily['rolling_high20'], 1, 0)

# ---------- 6. High Volume Up Move ----------
df_daily['vol_ma20'] = df_daily['volume'].rolling(20).mean()
df_daily['buy_volume'] = np.where((df_daily['close'] > df_daily['close'].shift(1)) & 
                                 (df_daily['volume'] > df_daily['vol_ma20']), 1, 0)

# ---------- 7. Bollinger Band Lower Bounce ----------
df_daily['buy_bb'] = np.where((df_daily['close'] < df_daily['bb_lower']) & 
                              (df_daily['close'] > df_daily['close'].shift(1)), 1, 0)

# ---------- 8. Gap Up Confirmation ----------
df_daily['gap_up'] = df_daily['open'] > df_daily['close'].shift(1)
df_daily['buy_gap'] = np.where(df_daily['gap_up'] & (df_daily['close'] > df_daily['open']), 1, 0)

# ---------- 9. Trendline Support Bounce ----------
# Approximate using 20-period SMA as dynamic support
df_daily['buy_trendline'] = np.where((df_daily['close'].shift(1) < df_daily['sma_20'].shift(1)) & 
                                    (df_daily['close'] > df_daily['sma_20']), 1, 0)

# ---------- 10. Pivot Point Support Bounce ----------
# Simple pivot calculation: pivot = (high + low + close)/3
df_daily['pivot'] = (df_daily['high'] + df_daily['low'] + df_daily['close']) / 3
df_daily['buy_pivot'] = np.where((df_daily['close'].shift(1) < df_daily['pivot'].shift(1)) &
                                 (df_daily['close'] > df_daily['pivot']), 1, 0)


In [176]:
df_daily = df_daily.dropna().reset_index(drop=True)

In [177]:
df_daily.shape

(692, 31)

In [178]:
#df_daily.yesterdat_bullish_bearish = df_daily.yesterdat_bullish_bearish.astype('category')

#### Model

In [179]:
X_train, X_test, y_train, y_test = train_test_split(df_daily.drop(['time_stamp', 'target', 'target_class'], axis=1), 
                                                    df_daily['target_class'], 
                                                    test_size=0.3, 
                                                    random_state=18)

In [180]:
scaling_needed = True
if scaling_needed:  
    # columns to scale
    cols_to_scale = df_daily.drop(['time_stamp', 'target', 'target_class'], axis=1).select_dtypes(include=['float64', 'int64']).columns

    # sclaler
    scaler = StandardScaler()

    # fit scaler on training data
    X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

    # transform test data
    X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

In [193]:
model = HistGradientBoostingClassifier(random_state=18,
                                       max_iter=400,
                                       learning_rate=0.1,
                                       validation_fraction=0.2)
model.fit(X_train, y_train)

0,1,2
,"loss  loss: {'log_loss'}, default='log_loss' The loss function to use in the boosting process. For binary classification problems, 'log_loss' is also known as logistic loss, binomial deviance or binary crossentropy. Internally, the model fits one tree per boosting iteration and uses the logistic sigmoid function (expit) as inverse link function to compute the predicted positive class probability. For multiclass classification problems, 'log_loss' is also known as multinomial deviance or categorical crossentropy. Internally, the model fits one tree per boosting iteration and per class and uses the softmax function as inverse link function to compute the predicted probabilities of the classes.",'log_loss'
,"learning_rate  learning_rate: float, default=0.1 The learning rate, also known as *shrinkage*. This is used as a multiplicative factor for the leaves values. Use ``1`` for no shrinkage.",0.1
,"max_iter  max_iter: int, default=100 The maximum number of iterations of the boosting process, i.e. the maximum number of trees for binary classification. For multiclass classification, `n_classes` trees per iteration are built.",400
,"max_leaf_nodes  max_leaf_nodes: int or None, default=31 The maximum number of leaves for each tree. Must be strictly greater than 1. If None, there is no maximum limit.",31
,"max_depth  max_depth: int or None, default=None The maximum depth of each tree. The depth of a tree is the number of edges to go from the root to the deepest leaf. Depth isn't constrained by default.",
,"min_samples_leaf  min_samples_leaf: int, default=20 The minimum number of samples per leaf. For small datasets with less than a few hundred samples, it is recommended to lower this value since only very shallow trees would be built.",20
,"l2_regularization  l2_regularization: float, default=0 The L2 regularization parameter penalizing leaves with small hessians. Use ``0`` for no regularization (default).",0.0
,"max_features  max_features: float, default=1.0 Proportion of randomly chosen features in each and every node split. This is a form of regularization, smaller values make the trees weaker learners and might prevent overfitting. If interaction constraints from `interaction_cst` are present, only allowed features are taken into account for the subsampling. .. versionadded:: 1.4",1.0
,"max_bins  max_bins: int, default=255 The maximum number of bins to use for non-missing values. Before training, each feature of the input array `X` is binned into integer-valued bins, which allows for a much faster training stage. Features with a small number of unique values may use less than ``max_bins`` bins. In addition to the ``max_bins`` bins, one more bin is always reserved for missing values. Must be no larger than 255.",255
,"categorical_features  categorical_features: array-like of {bool, int, str} of shape (n_features) or shape (n_categorical_features,), default='from_dtype' Indicates the categorical features. - None : no feature will be considered categorical. - boolean array-like : boolean mask indicating categorical features. - integer array-like : integer indices indicating categorical  features. - str array-like: names of categorical features (assuming the training  data has feature names). - `""from_dtype""`: dataframe columns with dtype ""category"" are  considered to be categorical features. The input must be an object  exposing a ``__dataframe__`` method such as pandas or polars  DataFrames to use this feature. For each categorical feature, there must be at most `max_bins` unique categories. Negative values for categorical features encoded as numeric dtypes are treated as missing values. All categorical values are converted to floating point numbers. This means that categorical values of 1.0 and 1 are treated as the same category. Read more in the :ref:`User Guide `. .. versionadded:: 0.24 .. versionchanged:: 1.2  Added support for feature names. .. versionchanged:: 1.4  Added `""from_dtype""` option. .. versionchanged:: 1.6  The default value changed from `None` to `""from_dtype""`.",'from_dtype'


In [194]:
y_pred = model.predict(X_test)

In [195]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.53      0.56       119
           1       0.44      0.49      0.47        89

    accuracy                           0.51       208
   macro avg       0.51      0.51      0.51       208
weighted avg       0.52      0.51      0.52       208



In [196]:
confusion_matrix(y_test, y_pred)

array([[63, 56],
       [45, 44]])