# Rolling Calibration + Validation — Data Prep

Purpose: resolve the specific run by name, then use local run artifacts via analysis/mlflow_file_fetch (no MLflow artifact downloads) and fetch OOS predictions from DuckDB for downstream rolling-window calibration/validation.

In [32]:
# moved: plotting cell relocated to bottom


In [33]:
# Config
EXPERIMENT_NAME = "lgbm-btcusdt-1h-202002-202508-binary4u2d-24h-rolling-window-13feature-12m"
TARGET_RUN_NAME = "run_20251102_231428_lgbm_y_tp_before_sl_u0.04_d0.02_24h_binary"
PREDICTIONS_DB = "/Volumes/Extreme SSD/trading_data/cex/db/binance_btcusdt_perp_prediction_classifier.duckdb"

import os
import sys
from pathlib import Path
import pandas as pd
# Ensure repo root on sys.path for 'analysis' package imports
_cwd = Path.cwd()
_cands = [_cwd, _cwd.parent, _cwd.parent.parent]
for _cand in _cands:
    if (_cand / 'analysis' / 'mlflow_file_fetch.py').exists():
        if str(_cand) not in sys.path:
            sys.path.insert(0, str(_cand))
        break
# Ensure 'feature_engineering' module local imports (targets/utils) work when importing build_targets
for _cand in _cands:
    _fe = _cand / 'feature_engineering'
    if (_fe / 'build_targets.py').exists():
        if str(_fe) not in sys.path:
            sys.path.insert(0, str(_fe))
        break
from analysis.mlflow_file_fetch import (
    set_tracking_uri,
    load_local_artifacts_from_experiment,
    load_booster_local,
    expected_model_path,
    read_oos_predictions_for_model,
)

# Set or confirm MLflow tracking URI (used only for run/registry metadata)
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000")
print("Tracking URI:", set_tracking_uri(MLFLOW_TRACKING_URI))

Tracking URI: http://127.0.0.1:5000


In [34]:
# Resolve run and load local artifacts
arts = load_local_artifacts_from_experiment(EXPERIMENT_NAME, TARGET_RUN_NAME)
print('Run ID:', arts.run_id)
print('Run dir:', arts.run_dir)
print('Model file:', arts.model_file)

# Booster (from local model.txt)
booster = load_booster_local(arts.run_dir)
print('Booster features:', len(booster.feature_name()))

# Training predictions and metadata
pred_train_df = arts.pred_train
pcfg = arts.pipeline_config
metrics = arts.metrics
fi_df = arts.feature_importance
print('pred_train:', 'yes' if pred_train_df is not None else 'no',
      '| pcfg:', 'yes' if pcfg else 'no',
      '| metrics:', 'yes' if metrics else 'no',
      '| fi:', 'yes' if fi_df is not None else 'no')
if pred_train_df is not None:
    display(pred_train_df.head())

Run ID: 7d9d87bad07d4f05b3ffe0dd38b55bb5
Run dir: /Volumes/Extreme SSD/trading_data/cex/models/binance_btcusdt_perp_1h_original/run_20251102_231428_lgbm_y_tp_before_sl_u0.04_d0.02_24h_binary
Model file: /Volumes/Extreme SSD/trading_data/cex/models/binance_btcusdt_perp_1h_original/run_20251102_231428_lgbm_y_tp_before_sl_u0.04_d0.02_24h_binary/model.txt
Booster features: 13
pred_train: yes | pcfg: yes | metrics: yes | fi: yes


Unnamed: 0,timestamp,y_true,y_pred
0,2023-04-26 00:00:00,1.0,0.1362
1,2023-04-26 01:00:00,1.0,0.1362
2,2023-04-26 02:00:00,1.0,0.1362
3,2023-04-26 03:00:00,1.0,0.1362
4,2023-04-26 04:00:00,1.0,0.134459


In [35]:
# OOS predictions for this model from DuckDB
model_path = expected_model_path(arts.run_dir)
oos_df = read_oos_predictions_for_model(PREDICTIONS_DB, model_path)
print('OOS predictions rows:', len(oos_df))
display(oos_df.head())

OOS predictions rows: 5349


Unnamed: 0,ts,y_pred,model_path,feature_key,created_at
0,2025-04-01 00:00:00,0.112009,/Volumes/Extreme SSD/trading_data/cex/models/b...,binance_btcusdt_perp_1h__backfill_v1,2025-11-09 22:03:25.615
1,2025-04-01 01:00:00,0.112009,/Volumes/Extreme SSD/trading_data/cex/models/b...,binance_btcusdt_perp_1h__backfill_v1,2025-11-09 22:03:25.633
2,2025-04-01 02:00:00,0.112009,/Volumes/Extreme SSD/trading_data/cex/models/b...,binance_btcusdt_perp_1h__backfill_v1,2025-11-09 22:03:25.641
3,2025-04-01 03:00:00,0.112009,/Volumes/Extreme SSD/trading_data/cex/models/b...,binance_btcusdt_perp_1h__backfill_v1,2025-11-09 22:03:25.645
4,2025-04-01 04:00:00,0.112009,/Volumes/Extreme SSD/trading_data/cex/models/b...,binance_btcusdt_perp_1h__backfill_v1,2025-11-09 22:03:25.649


In [36]:
# Load OHLCV from DuckDB and build the y_tp_before_sl_u0.04_d0.02_24h target
OHLCV_DB = "/Volumes/Extreme SSD/trading_data/cex/db/binance_btcusdt_perp_ohlcv.duckdb"
OHLCV_TABLE = "ohlcv_btcusdt_1h"

from run.data_loader import load_ohlcv_duckdb
from feature_engineering.targets import TargetGenerationConfig, generate_targets_for_row, extract_forward_window
try:
    from feature_engineering.build_targets import _horizon_labels_for_freq
except Exception:
    def _horizon_labels_for_freq(freq, horizons_bars):
        f = str(freq).upper()
        labels = {}
        if f.endswith("H"):
            for h in horizons_bars:
                labels[h] = f"{h}h"
        elif f in ("T", "MIN", "MINUTE") or f.endswith("T"):
            for h in horizons_bars:
                labels[h] = f"{h}min"
        elif f.endswith("D"):
            for h in horizons_bars:
                labels[h] = f"{h}d"
        else:
            for h in horizons_bars:
                labels[h] = f"{h}b"
        return labels
import numpy as np

df_ohlcv = load_ohlcv_duckdb(OHLCV_DB, table=OHLCV_TABLE)
print('OHLCV rows:', len(df_ohlcv), 'range:', df_ohlcv['timestamp'].min(), '->', df_ohlcv['timestamp'].max())

# Configure generation for 24 bars (1H * 24 = 24h) and TP/SL 0.04/0.02
H = 24
h_labels = _horizon_labels_for_freq('1H', [H])
cfg = TargetGenerationConfig(
    horizons_bars=[H],
    barrier_pairs=[(0.04, 0.02)],
    tie_policy='conservative',
    horizon_labels=h_labels,
    include_returns=False,
    include_mfe_mae=False,
    include_barriers=True,
    log_returns=True,
)

# Build forward-window DataFrame once for convenience (no timestamp col)
ohlcv_core = df_ohlcv[['open','high','low','close','volume']].copy()

# Expected output key
target_key = 'y_tp_before_sl_u0.04_d0.02_24h'
vals = np.full(len(ohlcv_core), np.nan, dtype=float)

for i in range(len(ohlcv_core)):
    entry_price = float(ohlcv_core['close'].iloc[i])
    fwd = extract_forward_window(ohlcv_core, i, H)
    res = generate_targets_for_row(fwd, entry_price, cfg)
    v = res.get(target_key, np.nan)
    vals[i] = v

df_ohlcv[target_key] = vals
display(df_ohlcv[["timestamp", target_key]].head(30))
print('Non-null target rows:', int(df_ohlcv[target_key].notna().sum()))

OHLCV rows: 51357 range: 2020-01-01 00:00:00 -> 2025-11-09 20:00:00


Unnamed: 0,timestamp,y_tp_before_sl_u0.04_d0.02_24h
0,2020-01-01 00:00:00,0.0
1,2020-01-01 01:00:00,0.0
2,2020-01-01 02:00:00,0.0
3,2020-01-01 03:00:00,0.0
4,2020-01-01 04:00:00,0.0
5,2020-01-01 05:00:00,0.0
6,2020-01-01 06:00:00,0.0
7,2020-01-01 07:00:00,0.0
8,2020-01-01 08:00:00,0.0
9,2020-01-01 09:00:00,0.0


Non-null target rows: 51333


In [37]:
# Merge pred_train + OOS with targets in [2023-04-26 00:00:00, 2025-10-31 23:00:00] (prefer pred_train on overlaps)
from analysis.calval_utils import merge_predictions_with_targets
pred_union, targets_window, df_calval = merge_predictions_with_targets(
    pred_train_df,
    oos_df if 'oos_df' in locals() else None,
    df_ohlcv,
    target_key,
    start='2023-04-26 00:00:00',
    end='2025-10-31 23:00:00',
)
print('pred_union:', len(pred_union), 'targets:', len(targets_window), 'merged:', len(df_calval))
display(df_calval.head(20))


pred_union: 22078 targets: 22080 merged: 22078


Unnamed: 0,timestamp,y_pred,y_true_train,source,y_true
0,2023-04-26 00:00:00,0.1362,1.0,pred_train,1.0
1,2023-04-26 01:00:00,0.1362,1.0,pred_train,1.0
2,2023-04-26 02:00:00,0.1362,1.0,pred_train,1.0
3,2023-04-26 03:00:00,0.1362,1.0,pred_train,1.0
4,2023-04-26 04:00:00,0.134459,1.0,pred_train,1.0
5,2023-04-26 05:00:00,0.134459,1.0,pred_train,1.0
6,2023-04-26 06:00:00,0.1362,1.0,pred_train,1.0
7,2023-04-26 07:00:00,0.1362,1.0,pred_train,1.0
8,2023-04-26 08:00:00,0.1362,1.0,pred_train,1.0
9,2023-04-26 09:00:00,0.1362,1.0,pred_train,1.0


In [43]:
# Rolling monthly calibration using helpers (verbose logging inside the loop)
import pandas as pd
import importlib, analysis.calval_utils as cal_utils
importlib.reload(cal_utils)

assert 'df_calval' in locals(), 'df_calval not found; run merge cell first'

FIRST_MONTH = pd.Timestamp('2025-05-01')
CAL_MONTHS = 12
N_BINS = 20
CAL_METHOD = 'platt'

dfc = df_calval.copy()[['timestamp','y_pred','y_true']]
df_roll, df_roll_cal = cal_utils.rolling_calibrate_and_bin_monthly(
    dfc, first_month=FIRST_MONTH, lookback_months=CAL_MONTHS, method=CAL_METHOD, n_bins=N_BINS, verbose=True, return_cal=True
)
print('Rolling calibration months:', (0 if df_roll.empty else df_roll["month"].nunique()))
print('Calibration windows returned:', (0 if df_roll_cal.empty else df_roll_cal["month"].nunique()))
display(df_roll.head())


[roll] month=2025-05 cal=(2024-05-01 00:00:00 .. 2025-04-30 23:00:00) cal_rows=8758 mon_rows=744
  edges[19]: [0.0186, 0.0221, 0.0263, 0.0321, 0.0361, 0.0427, 0.0479, 0.0533, 0.0583,
 0.0649, 0.0718, 0.0833, 0.0961, 0.1101, 0.137 , 0.1734, 0.2228, 0.286 ,
 0.4809]
  val bin counts: {1: 40, 2: 24, 3: 0, 4: 37, 5: 99, 6: 49, 7: 15, 8: 13, 9: 2, 10: 2, 11: 14, 12: 20, 13: 6, 14: 67, 15: 62, 16: 71, 17: 83, 18: 3, 19: 89, 20: 48}
  val p_cal range: min=0.013091 max=0.678380 rows=744
  cal bin counts: {1: 430, 2: 433, 3: 447, 4: 436, 5: 444, 6: 435, 7: 434, 8: 438, 9: 441, 10: 441, 11: 438, 12: 438, 13: 434, 14: 440, 15: 438, 16: 418, 17: 456, 18: 436, 19: 437, 20: 444}
  cal p_cal range: min=0.011966 max=0.774801 rows=8758
[roll] month=2025-06 cal=(2024-06-01 00:00:00 .. 2025-05-31 23:00:00) cal_rows=8758 mon_rows=720
  edges[19]: [0.0206, 0.025 , 0.0293, 0.0354, 0.0389, 0.0442, 0.0496, 0.0554, 0.062 ,
 0.068 , 0.0767, 0.0876, 0.0989, 0.1127, 0.1371, 0.1658, 0.2031, 0.2648,
 0.4194]
  val 

Unnamed: 0,timestamp,y_pred,y_true,p_cal,q_bin,month,cal_window_start,cal_window_end
0,2025-05-01 00:00:00,0.103746,0.0,0.03477,5,2025-05,2024-05-01,2025-04-30 23:00:00
1,2025-05-01 01:00:00,0.103746,0.0,0.03477,5,2025-05,2024-05-01,2025-04-30 23:00:00
2,2025-05-01 02:00:00,0.103746,0.0,0.03477,5,2025-05,2024-05-01,2025-04-30 23:00:00
3,2025-05-01 03:00:00,0.103746,0.0,0.03477,5,2025-05,2024-05-01,2025-04-30 23:00:00
4,2025-05-01 04:00:00,0.103746,0.0,0.03477,5,2025-05,2024-05-01,2025-04-30 23:00:00


In [44]:
# Bin distribution per month (counts and percentages)
import importlib, analysis.calval_utils as cal_utils
importlib.reload(cal_utils)

assert 'df_roll' in locals() and not df_roll.empty, 'df_roll is empty or missing'
# Validation month distribution
dist_counts_val, dist_pct_val = cal_utils.bin_distribution_by_month(df_roll, n_bins=(N_BINS if 'N_BINS' in locals() else None))
print('Validation month — bin counts per month:')
display(dist_counts_val)
print('Validation month — bin percentages per month:')
display(dist_pct_val.round(4))

# Calibration-window (12-month) distribution used to derive edges
if 'df_roll_cal' in locals() and not df_roll_cal.empty:
    dist_counts_cal, dist_pct_cal = cal_utils.bin_distribution_by_month(df_roll_cal, n_bins=(N_BINS if 'N_BINS' in locals() else None))
    print('Calibration window — bin counts per month:')
    display(dist_counts_cal)
    print('Calibration window — bin percentages per month:')
    display(dist_pct_cal.round(4))
else:
    print('No calibration-window rows returned to summarize.')


Validation month — bin counts per month:


q_bin,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2025-05,40,24,0,37,99,49,15,13,2,2,14,20,6,67,62,71,83,3,89,48
2025-06,92,136,91,22,33,63,71,23,19,3,15,19,39,24,34,0,17,7,12,0
2025-07,56,28,40,51,49,31,0,2,6,43,62,24,44,22,84,14,4,28,123,33
2025-08,186,54,73,31,152,14,35,59,15,0,9,0,5,0,24,3,34,26,24,0
2025-09,24,72,33,16,28,48,27,24,27,78,52,58,84,12,17,28,36,24,8,24
2025-10,0,4,17,0,14,9,15,15,46,20,35,22,128,58,97,52,64,8,80,60


Validation month — bin percentages per month:


q_bin,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2025-05,0.0538,0.0323,0.0,0.0497,0.1331,0.0659,0.0202,0.0175,0.0027,0.0027,0.0188,0.0269,0.0081,0.0901,0.0833,0.0954,0.1116,0.004,0.1196,0.0645
2025-06,0.1278,0.1889,0.1264,0.0306,0.0458,0.0875,0.0986,0.0319,0.0264,0.0042,0.0208,0.0264,0.0542,0.0333,0.0472,0.0,0.0236,0.0097,0.0167,0.0
2025-07,0.0753,0.0376,0.0538,0.0685,0.0659,0.0417,0.0,0.0027,0.0081,0.0578,0.0833,0.0323,0.0591,0.0296,0.1129,0.0188,0.0054,0.0376,0.1653,0.0444
2025-08,0.25,0.0726,0.0981,0.0417,0.2043,0.0188,0.047,0.0793,0.0202,0.0,0.0121,0.0,0.0067,0.0,0.0323,0.004,0.0457,0.0349,0.0323,0.0
2025-09,0.0333,0.1,0.0458,0.0222,0.0389,0.0667,0.0375,0.0333,0.0375,0.1083,0.0722,0.0806,0.1167,0.0167,0.0236,0.0389,0.05,0.0333,0.0111,0.0333
2025-10,0.0,0.0054,0.0228,0.0,0.0188,0.0121,0.0202,0.0202,0.0618,0.0269,0.047,0.0296,0.172,0.078,0.1304,0.0699,0.086,0.0108,0.1075,0.0806


Calibration window — bin counts per month:


q_bin,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2025-05,430,433,447,436,444,435,434,438,441,441,438,438,434,440,438,418,456,436,437,444
2025-06,430,435,439,444,438,433,446,435,440,438,439,438,437,436,439,424,450,441,432,444
2025-07,434,442,428,434,444,437,446,437,433,439,442,429,441,440,433,420,459,441,435,444
2025-08,433,436,415,464,434,446,431,443,439,428,446,438,423,443,449,436,440,432,443,439
2025-09,438,427,435,451,439,435,431,445,439,423,451,440,428,443,420,459,434,434,439,447
2025-10,435,435,444,426,450,435,440,427,442,442,441,438,438,427,428,454,437,439,429,451


Calibration window — bin percentages per month:


q_bin,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2025-05,0.0491,0.0494,0.051,0.0498,0.0507,0.0497,0.0496,0.05,0.0504,0.0504,0.05,0.05,0.0496,0.0502,0.05,0.0477,0.0521,0.0498,0.0499,0.0507
2025-06,0.0491,0.0497,0.0501,0.0507,0.05,0.0494,0.0509,0.0497,0.0502,0.05,0.0501,0.05,0.0499,0.0498,0.0501,0.0484,0.0514,0.0504,0.0493,0.0507
2025-07,0.0496,0.0505,0.0489,0.0496,0.0507,0.0499,0.0509,0.0499,0.0494,0.0501,0.0505,0.049,0.0504,0.0502,0.0494,0.048,0.0524,0.0504,0.0497,0.0507
2025-08,0.0494,0.0498,0.0474,0.053,0.0496,0.0509,0.0492,0.0506,0.0501,0.0489,0.0509,0.05,0.0483,0.0506,0.0513,0.0498,0.0502,0.0493,0.0506,0.0501
2025-09,0.05,0.0488,0.0497,0.0515,0.0501,0.0497,0.0492,0.0508,0.0501,0.0483,0.0515,0.0502,0.0489,0.0506,0.048,0.0524,0.0496,0.0496,0.0501,0.051
2025-10,0.0497,0.0497,0.0507,0.0486,0.0514,0.0497,0.0502,0.0488,0.0505,0.0505,0.0504,0.05,0.05,0.0488,0.0489,0.0518,0.0499,0.0501,0.049,0.0515


In [47]:
# Simulated returns: Long at bin 10 per validation month
# Rules: evaluate t+1..t+24 (24 bars). TP=+4% if high hits first; SL=-2% if low hits first;
# otherwise natural close at t+24 using close/open ratio (t+24 close / t+1 open - 1).
import pandas as pd
import numpy as np
from feature_engineering.targets import compute_barrier_outcomes

assert 'df_roll' in locals() and 'df_ohlcv' in locals(), 'Expected df_roll and df_ohlcv from previous cells'

TP = 0.04
SL = 0.02
H = 24
TOP_BIN = 19

# Normalize timestamps and build index mapping into OHLCV
ohlcv = df_ohlcv.copy().sort_values('timestamp').reset_index(drop=True)
ohlcv['timestamp'] = pd.to_datetime(ohlcv['timestamp'])
ts_to_idx = {t: i for i, t in enumerate(ohlcv['timestamp'])}

val = df_roll[df_roll['q_bin'] == TOP_BIN].copy()
val['timestamp'] = pd.to_datetime(val['timestamp'])

rows = []
skipped = 0
for ts, mon in zip(val['timestamp'], val['month']):
    i = ts_to_idx.get(ts)
    if i is None or i + H >= len(ohlcv):
        skipped += 1
        continue
    fwd = ohlcv.iloc[i+1:i+1+H]  # t+1 .. t+24
    entry = float(fwd['open'].iloc[0])
    if not np.isfinite(entry) or entry <= 0:
        skipped += 1
        continue
    out = compute_barrier_outcomes(
        forward_high=fwd['high'], forward_low=fwd['low'], entry_price=entry,
        up_pct=TP, down_pct=SL, horizon_bars=H, horizon_label='24h', tie_policy='conservative',
        forward_open=fwd['open']
    )
    # ternary: +1 TP first, -1 SL first, 0 natural close
    ternary = None
    for k in out.keys():
        if k.startswith('y_tb_label_u') and k.endswith('_24h'):
            ternary = float(out[k])
            break
    # natural close return
    close_t24 = float(fwd['close'].iloc[-1])
    ret_nc = (close_t24 / entry) - 1.0 if (np.isfinite(close_t24) and close_t24 > 0) else np.nan
    if ternary == 1.0:
        scen = 'TP'
        ret_hybrid = TP
    elif ternary == -1.0:
        scen = 'SL'
        ret_hybrid = -SL
    else:
        scen = 'NC'
        ret_hybrid = ret_nc
    rows.append({
        'timestamp': ts, 'month': mon, 'scenario': scen,
        'ret_hybrid': ret_hybrid, 'ret_natural': ret_nc
    })

res = pd.DataFrame(rows)
print(f'Sim rows: {len(res)} (skipped={skipped}) across months:', (sorted(res['month'].unique()) if not res.empty else []))
display(res.head())

# Per-month counts
if not res.empty:
    counts = res.pivot_table(index='month', columns='scenario', values='timestamp', aggfunc='count', fill_value=0)
    counts = counts.assign(total=counts.sum(axis=1))
    print('Per-month scenario counts (bin=TOP_BIN):')
    display(counts)

    # Aggregated returns (hybrid: TP/SL/Natural)
    agg_h = res.groupby('month', as_index=False)['ret_hybrid'].agg(total_return=lambda s: float(np.nansum(s)), mean_return=lambda s: float(np.nanmean(s)))
    print('Per-month hybrid returns (TP/SL/Natural):')
    display(agg_h)

    # Aggregated returns (natural-close only)
    agg_nc = res.groupby('month', as_index=False)['ret_natural'].agg(total_return=lambda s: float(np.nansum(s)), mean_return=lambda s: float(np.nanmean(s)))
    print('Per-month natural-close returns (ignore TP/SL):')
    display(agg_nc)

    # Overall aggregates
    print('Overall hybrid total:', float(np.nansum(res['ret_hybrid'])))
    print('Overall natural-close total:', float(np.nansum(res['ret_natural'])))


Sim rows: 336 (skipped=0) across months: ['2025-05', '2025-06', '2025-07', '2025-08', '2025-09', '2025-10']


Unnamed: 0,timestamp,month,scenario,ret_hybrid,ret_natural
0,2025-05-05 00:00:00,2025-05,NC,-0.001072,-0.001072
1,2025-05-05 01:00:00,2025-05,NC,0.00399,0.00399
2,2025-05-05 02:00:00,2025-05,NC,0.000969,0.000969
3,2025-05-05 03:00:00,2025-05,NC,0.004437,0.004437
4,2025-05-05 04:00:00,2025-05,NC,0.001754,0.001754


Per-month scenario counts (bin=TOP_BIN):


scenario,NC,SL,TP,total
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-05,67,7,15,89
2025-06,0,12,0,12
2025-07,118,1,4,123
2025-08,4,12,8,24
2025-09,8,0,0,8
2025-10,50,22,8,80


Per-month hybrid returns (TP/SL/Natural):


Unnamed: 0,month,total_return,mean_return
0,2025-05,1.183595,0.013299
1,2025-06,-0.24,-0.02
2,2025-07,0.751312,0.006108
3,2025-08,0.114002,0.00475
4,2025-09,0.102498,0.012812
5,2025-10,0.46141,0.005768


Per-month natural-close returns (ignore TP/SL):


Unnamed: 0,month,total_return,mean_return
0,2025-05,1.396935,0.015696
1,2025-06,-0.212442,-0.017704
2,2025-07,0.778234,0.006327
3,2025-08,-0.102845,-0.004285
4,2025-09,0.102498,0.012812
5,2025-10,0.449599,0.00562


Overall hybrid total: 2.3728170845601073
Overall natural-close total: 2.411977419238283


In [41]:
# Simulated returns: Short at bin 1 per validation month
# Rules: evaluate t+1..t+24 (24 bars). TP for short = -2% (down barrier first); SL for short = +4% (up barrier first).
# Otherwise natural close at t+24 using short return (entry/close - 1).
import pandas as pd
import numpy as np
from feature_engineering.targets import compute_barrier_outcomes

assert 'df_roll' in locals() and 'df_ohlcv' in locals(), 'Expected df_roll and df_ohlcv from previous cells'

UP_PCT = 0.04   # adverse move for short (SL)
DOWN_PCT = 0.02 # favorable move for short (TP)
H = 24
BOTTOM_BIN = 1

ohlcv = df_ohlcv.copy().sort_values('timestamp').reset_index(drop=True)
ohlcv['timestamp'] = pd.to_datetime(ohlcv['timestamp'])
ts_to_idx = {t: i for i, t in enumerate(ohlcv['timestamp'])}

val_s = df_roll[df_roll['q_bin'] == BOTTOM_BIN].copy()
val_s['timestamp'] = pd.to_datetime(val_s['timestamp'])

rows_s = []
skipped_s = 0
for ts, mon in zip(val_s['timestamp'], val_s['month']):
    i = ts_to_idx.get(ts)
    if i is None or i + H >= len(ohlcv):
        skipped_s += 1
        continue
    fwd = ohlcv.iloc[i+1:i+1+H]
    entry = float(fwd['open'].iloc[0])
    if not np.isfinite(entry) or entry <= 0:
        skipped_s += 1
        continue
    out = compute_barrier_outcomes(
        forward_high=fwd['high'], forward_low=fwd['low'], entry_price=entry,
        up_pct=UP_PCT, down_pct=DOWN_PCT, horizon_bars=H, horizon_label='24h', tie_policy='conservative',
        forward_open=fwd['open']
    )
    ternary = None
    for k in out.keys():
        if k.startswith('y_tb_label_u') and k.endswith('_24h'):
            ternary = float(out[k])
            break
    close_t24 = float(fwd['close'].iloc[-1])
    ret_nc_short = (entry / close_t24) - 1.0 if (np.isfinite(close_t24) and close_t24 > 0) else np.nan
    if ternary == -1.0:
        scen = 'TP'
        ret_hybrid = DOWN_PCT
    elif ternary == 1.0:
        scen = 'SL'
        ret_hybrid = -UP_PCT
    else:
        scen = 'NC'
        ret_hybrid = ret_nc_short
    rows_s.append({
        'timestamp': ts, 'month': mon, 'scenario': scen,
        'ret_hybrid': ret_hybrid, 'ret_natural': ret_nc_short
    })

res_s = pd.DataFrame(rows_s)
print(f'Short sim rows: {len(res_s)} (skipped={skipped_s}) across months:', (sorted(res_s['month'].unique()) if not res_s.empty else []))
display(res_s.head())

# Per-month counts (short)
if not res_s.empty:
    counts_s = res_s.pivot_table(index='month', columns='scenario', values='timestamp', aggfunc='count', fill_value=0)
    counts_s = counts_s.assign(total=counts_s.sum(axis=1))
    print('Per-month scenario counts (short, bin=1):')
    display(counts_s)

    # Aggregated returns (hybrid)
    agg_h_s = res_s.groupby('month', as_index=False)['ret_hybrid'].agg(total_return=lambda s: float(np.nansum(s)), mean_return=lambda s: float(np.nanmean(s)))
    print('Per-month short hybrid returns (TP/SL/Natural):')
    display(agg_h_s)

    # Aggregated returns (natural-close only)
    agg_nc_s = res_s.groupby('month', as_index=False)['ret_natural'].agg(total_return=lambda s: float(np.nansum(s)), mean_return=lambda s: float(np.nanmean(s)))
    print('Per-month short natural-close returns (ignore TP/SL):')
    display(agg_nc_s)

    # Overall aggregates
    print('Overall short hybrid total:', float(np.nansum(res_s['ret_hybrid'])))
    print('Overall short natural-close total:', float(np.nansum(res_s['ret_natural'])))


Short sim rows: 716 (skipped=0) across months: ['2025-05', '2025-06', '2025-07', '2025-08', '2025-09', '2025-10']


Unnamed: 0,timestamp,month,scenario,ret_hybrid,ret_natural
0,2025-05-29 00:00:00,2025-05,TP,0.02,0.031797
1,2025-05-29 01:00:00,2025-05,TP,0.02,0.023495
2,2025-05-29 02:00:00,2025-05,TP,0.02,0.022943
3,2025-05-29 03:00:00,2025-05,TP,0.02,0.019039
4,2025-05-29 04:00:00,2025-05,TP,0.02,0.013682


Per-month scenario counts (short, bin=1):


scenario,NC,SL,TP,total
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-05,29,0,35,64
2025-06,176,5,47,228
2025-07,54,0,30,84
2025-08,142,0,98,240
2025-09,94,0,2,96
2025-10,1,0,3,4


Per-month short hybrid returns (TP/SL/Natural):


Unnamed: 0,month,total_return,mean_return
0,2025-05,0.611983,0.009562
1,2025-06,0.151828,0.000666
2,2025-07,0.538449,0.00641
3,2025-08,1.816716,0.00757
4,2025-09,-0.249927,-0.002603
5,2025-10,0.066632,0.016658


Per-month short natural-close returns (ignore TP/SL):


Unnamed: 0,month,total_return,mean_return
0,2025-05,0.637763,0.009965
1,2025-06,-0.053061,-0.000233
2,2025-07,0.529449,0.006303
3,2025-08,1.991512,0.008298
4,2025-09,-0.255618,-0.002663
5,2025-10,0.046722,0.011681


Overall short hybrid total: 2.935681323691862
Overall short natural-close total: 2.896768231660129


In [42]:
# Plot candlestick with short signals (bin 1) from 2025-05-01 to 2025-10-31
import pandas as pd
import plotly.graph_objects as go

assert 'df_ohlcv' in locals() and 'df_roll' in locals(), 'Expected df_ohlcv and df_roll'

START = pd.Timestamp('2025-05-01 00:00:00')
END = pd.Timestamp('2025-10-31 23:00:00')

dfp = df_ohlcv.copy().sort_values('timestamp')
dfp['timestamp'] = pd.to_datetime(dfp['timestamp'])
dfp = dfp[(dfp['timestamp'] >= START) & (dfp['timestamp'] <= END)].copy()

sig = df_roll[(df_roll['q_bin'] == 1)].copy()
sig['timestamp'] = pd.to_datetime(sig['timestamp'])
sig = sig[(sig['timestamp'] >= START) & (sig['timestamp'] <= END)].copy()

fig = go.Figure()
fig.add_trace(go.Candlestick(
    x=dfp['timestamp'], open=dfp['open'], high=dfp['high'], low=dfp['low'], close=dfp['close'],
    name='OHLCV'
))

if not sig.empty:
    # Place markers slightly above high to be visible
    price_map = dfp.set_index('timestamp')['high']
    y_sig = price_map.reindex(sig['timestamp']).fillna(method='ffill').values * 1.001
    fig.add_trace(go.Scatter(
        x=sig['timestamp'], y=y_sig, mode='markers',
        marker=dict(symbol='triangle-down', color='red', size=9, line=dict(width=0)),
        name='Short signal (bin 1)'
    ))

fig.update_layout(
    title='Short Signals on Candlestick (2025-05-01 to 2025-10-31)',
    xaxis_title='Time', yaxis_title='Price',
    xaxis_rangeslider_visible=False,
    template='plotly_white',
    legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='right', x=1)
)
fig.show()



Series.fillna with 'method' is deprecated and will raise in a future version. Use obj.ffill() or obj.bfill() instead.

