In [1]:
import pandas as pd
import numpy as np

import tubesml as tml

from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor

import optuna
from optuna.samplers import TPESampler

import lightgbm as lgb
import xgboost as xgb

from sklearn.pipeline import Pipeline

from src.model_validation import TSCrossValidate, summary_evaluation, fold_evaluation
from src.model_helpers import DailyModel
from src.sharpe import score_sharpe
from src.features import FeatureEng

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
df = pd.read_csv("data_raw/train.csv")

df = df[df["date_id"] > 1000].copy()
df.head()

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,E10,E11,E12,E13,E14,E15,E16,E17,E18,E19,E2,E20,E3,E4,E5,E6,E7,E8,E9,I1,I2,I3,I4,I5,I6,I7,I8,I9,M1,M10,M11,M12,M13,M14,M15,M16,M17,M18,M2,M3,M4,M5,M6,M7,M8,M9,P1,P10,P11,P12,P13,P2,P3,P4,P5,P6,P7,P8,P9,S1,S10,S11,S12,S2,S3,S4,S5,S6,S7,S8,S9,V1,V10,V11,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns
1001,1001,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.003361,0.00012,0.002932
1002,1002,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.005496,0.00012,0.005066
1003,1003,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001342,0.00012,0.000912
1004,1004,1,1,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.00335,0.000121,-0.003781
1005,1005,0,0,0,1,0,-1,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.006725,0.000122,0.006293


In [3]:
TARGET = "market_forward_excess_returns"
DROP = ['is_scored', 'forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
FEATURES = [c for c in df.columns if c not in DROP]

In [4]:
ts_folds = TimeSeriesSplit(n_splits=10, test_size=180)

In [5]:
processing = Pipeline([("imputer", tml.DfImputer(strategy="constant", fill_value=0)), ("scaler", tml.DfScaler())])

In [6]:
def add_streak_features(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Add rolling features counting consecutive positive/negative streaks.
    
    Args:
        df: DataFrame with the column to analyze
        column: Name of the column to track streaks for
    
    Returns:
        DataFrame with added streak columns
    """
    df = df.copy()

    df["temp_col"] = df[column].shift()
    
    # Create binary indicators for positive/negative
    df['_is_positive'] = (df["temp_col"] > 0).astype(int)
    df['_is_negative'] = (df["temp_col"] < 0).astype(int)
    
    # Create streak groups (changes when sign changes)
    df['_pos_streak_group'] = (df['_is_positive'] != df['_is_positive'].shift()).cumsum()
    df['_neg_streak_group'] = (df['_is_negative'] != df['_is_negative'].shift()).cumsum()
    
    # Count consecutive occurrences within each group
    df[f'{column}_positive_streak'] = df.groupby('_pos_streak_group')['_is_positive'].cumsum() * df['_is_positive']
    df[f'{column}_negative_streak'] = df.groupby('_neg_streak_group')['_is_negative'].cumsum() * df['_is_negative']
    
    # Clean up temporary columns
    df = df.drop(columns=['_is_positive', '_is_negative', '_pos_streak_group', '_neg_streak_group', "temp_col"])
    
    return df


def make_lags_train(data):
    df = data.copy()

    targets = ['forward_returns', 'risk_free_rate']

    for col in targets:
        for lag in [1]: # 5, 22
            df[f"{col}_lag_{lag}"] = df[col].shift(lag)
        
        for w in [5, 22, 220]:
            df[f"{col}_mean_{w}"] = df[col].shift(1).rolling(w, min_periods=w).mean()
            df[f"{col}_std_{w}"] = df[col].shift(1).rolling(w, min_periods=w).std()

        if col == "forward_returns":
            df = add_streak_features(df, col)

    return df

# Light GBM

In [10]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "D" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

In [11]:
sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

Number of finished trials: 500
Best trial: {'strategy': 'mean', 'sample_weight': True, 'conversion': 977, 'max_depth': 45, 'num_leaves': 13, 'reg_lambda': 95.91787591296908, 'reg_alpha': 88.85659291036258, 'colsample_bytree': 0.7317388602272695, 'subsample': 0.4972520454947898, 'min_child_weight': 48.10381288954999}


In [12]:
study.trials_dataframe().sort_values('value', ascending=False).head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
341,341,0.858813,2025-12-04 00:58:18.394186,2025-12-04 00:59:02.796613,0 days 00:00:44.402427,0.731739,977,45,48.103813,13,88.856593,95.917876,True,mean,0.497252,COMPLETE
188,188,0.856878,2025-12-04 00:50:06.784474,2025-12-04 00:50:44.860092,0 days 00:00:38.075618,0.665068,973,55,47.77278,13,97.693737,94.488591,True,mean,0.818128,COMPLETE
186,186,0.855532,2025-12-04 00:50:04.443308,2025-12-04 00:50:38.242008,0 days 00:00:33.798700,0.655215,971,54,43.103651,13,97.746186,93.964592,True,mean,0.731044,COMPLETE
264,264,0.853579,2025-12-04 00:54:21.364071,2025-12-04 00:55:04.345532,0 days 00:00:42.981461,0.715037,933,97,111.310391,13,88.537642,93.26473,True,mean,0.520766,COMPLETE
216,216,0.851814,2025-12-04 00:51:59.811131,2025-12-04 00:52:37.099030,0 days 00:00:37.287899,0.706377,947,71,51.38059,14,93.716429,90.290955,True,mean,0.512803,COMPLETE
354,354,0.851754,2025-12-04 00:58:59.468588,2025-12-04 00:59:36.053262,0 days 00:00:36.584674,0.693754,984,43,13.406253,13,88.837356,83.145063,True,constant,0.572934,COMPLETE
271,271,0.851374,2025-12-04 00:54:31.355737,2025-12-04 00:55:06.033596,0 days 00:00:34.677859,0.719429,934,68,46.177885,13,91.48249,96.831105,True,mean,0.523274,COMPLETE
200,200,0.851088,2025-12-04 00:50:44.860667,2025-12-04 00:51:20.124669,0 days 00:00:35.264002,0.721979,960,54,42.398623,14,85.200685,94.793301,True,mean,0.694606,COMPLETE
462,462,0.850641,2025-12-04 01:05:04.247808,2025-12-04 01:05:41.125359,0 days 00:00:36.877551,0.686487,930,77,214.925548,14,86.845965,89.249453,True,mean,0.814756,COMPLETE
220,220,0.850591,2025-12-04 00:52:18.862353,2025-12-04 00:52:55.969902,0 days 00:00:37.107549,0.710144,944,80,51.042548,14,93.710959,90.805465,True,mean,0.505869,COMPLETE


In [13]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "E" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'mean', 'sample_weight': True, 'conversion': 948, 'max_depth': 77, 'num_leaves': 10, 'reg_lambda': 88.44074015259685, 'reg_alpha': 22.26835880842902, 'colsample_bytree': 0.6975179006773923, 'subsample': 0.4274959577464371, 'min_child_weight': 14.520552701824453}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
186,186,1.073307,2025-12-04 01:32:39.845699,2025-12-04 01:33:30.012155,0 days 00:00:50.166456,0.697518,948,77,14.520553,10,22.268359,88.44074,True,mean,0.427496,COMPLETE
272,272,1.019621,2025-12-04 01:42:57.124669,2025-12-04 01:43:51.813999,0 days 00:00:54.689330,0.877088,846,39,270.981809,17,22.05509,30.500496,True,constant,0.438601,COMPLETE
239,239,0.977982,2025-12-04 01:39:52.480579,2025-12-04 01:41:02.695340,0 days 00:01:10.214761,0.703725,919,48,98.342205,20,15.104428,21.353985,True,constant,0.479441,COMPLETE
331,331,0.967142,2025-12-04 01:46:28.280571,2025-12-04 01:47:22.116532,0 days 00:00:53.835961,0.827494,962,36,270.365642,17,12.035476,32.063068,True,median,0.403415,COMPLETE
249,249,0.954075,2025-12-04 01:40:41.413208,2025-12-04 01:41:24.463232,0 days 00:00:43.050024,0.701521,946,99,273.735281,14,20.192388,30.687329,True,mean,0.43801,COMPLETE
57,57,0.934656,2025-12-04 01:14:54.372436,2025-12-04 01:16:04.766891,0 days 00:01:10.394455,0.720358,663,89,2.656532,21,19.560068,89.296623,True,constant,0.460892,COMPLETE
94,94,0.927765,2025-12-04 01:18:44.121823,2025-12-04 01:19:35.119535,0 days 00:00:50.997712,0.808168,606,180,11.084153,17,14.865103,86.471261,True,mean,0.642592,COMPLETE
396,396,0.927433,2025-12-04 01:49:34.535040,2025-12-04 01:50:22.944744,0 days 00:00:48.409704,0.538127,750,52,293.969292,14,25.029081,28.867803,True,mean,0.508569,COMPLETE
320,320,0.925272,2025-12-04 01:45:59.130800,2025-12-04 01:46:51.737188,0 days 00:00:52.606388,0.815,918,40,261.29859,14,28.935835,35.091249,True,constant,0.418796,COMPLETE
245,245,0.917228,2025-12-04 01:40:23.822329,2025-12-04 01:41:02.099512,0 days 00:00:38.277183,0.609311,946,48,280.136088,14,20.310488,30.146779,True,mean,0.400498,COMPLETE


In [14]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "I" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'median', 'sample_weight': True, 'conversion': 851, 'max_depth': 78, 'num_leaves': 200, 'reg_lambda': 12.515396975424965, 'reg_alpha': 7.856598478172739, 'colsample_bytree': 0.7986284144927324, 'subsample': 0.7136521722331859, 'min_child_weight': 207.7254497724693}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
392,392,1.068119,2025-12-04 03:12:51.683528,2025-12-04 03:18:09.764997,0 days 00:05:18.081469,0.798628,851,78,207.72545,200,7.856598,12.515397,True,median,0.713652,COMPLETE
389,389,1.037753,2025-12-04 03:12:00.935219,2025-12-04 03:17:05.692167,0 days 00:05:04.756948,0.800859,797,76,162.600544,200,7.525169,11.747582,True,median,0.579877,COMPLETE
327,327,1.016354,2025-12-04 02:57:49.305681,2025-12-04 03:02:17.845431,0 days 00:04:28.539750,0.819047,944,69,165.995259,200,14.145013,10.476922,True,median,0.562455,COMPLETE
311,311,1.009934,2025-12-04 02:53:21.703354,2025-12-04 02:57:59.642801,0 days 00:04:37.939447,0.832747,850,68,203.174432,200,1.108399,10.70646,True,median,0.568353,COMPLETE
213,213,1.009377,2025-12-04 02:31:19.307405,2025-12-04 02:36:02.285277,0 days 00:04:42.977872,0.803056,919,149,129.981665,195,13.424405,90.316805,True,median,0.446144,COMPLETE
409,409,1.003069,2025-12-04 03:16:06.447562,2025-12-04 03:20:31.760850,0 days 00:04:25.313288,0.808719,849,73,216.287485,190,2.243115,11.086477,True,median,0.567303,COMPLETE
499,499,1.000876,2025-12-04 03:36:31.333922,2025-12-04 03:40:38.677767,0 days 00:04:07.343845,0.794728,948,45,212.99788,197,0.038251,18.864298,True,median,0.579541,COMPLETE
446,446,1.000332,2025-12-04 03:24:47.900162,2025-12-04 03:28:56.646212,0 days 00:04:08.746050,0.759824,979,55,154.577575,196,5.577463,13.666693,True,median,0.568087,COMPLETE
441,441,0.999469,2025-12-04 03:23:04.110746,2025-12-04 03:27:07.083174,0 days 00:04:02.972428,0.761956,979,81,219.813588,196,5.94328,14.604586,True,median,0.566594,COMPLETE
261,261,0.998726,2025-12-04 02:42:48.191435,2025-12-04 02:47:02.272215,0 days 00:04:14.080780,0.830344,926,70,175.780897,193,9.563183,86.761401,True,median,0.620106,COMPLETE


In [15]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "M" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'median', 'sample_weight': True, 'conversion': 402, 'max_depth': 300, 'num_leaves': 123, 'reg_lambda': 63.58233484304232, 'reg_alpha': 12.03458587840111, 'colsample_bytree': 0.6587163635705163, 'subsample': 0.6781891884372988, 'min_child_weight': 259.89397693443004}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
311,311,0.935271,2025-12-04 04:25:54.051409,2025-12-04 04:29:10.497486,0 days 00:03:16.446077,0.658716,402,300,259.893977,123,12.034586,63.582335,True,median,0.678189,COMPLETE
434,434,0.928777,2025-12-04 04:50:30.321842,2025-12-04 04:53:36.026619,0 days 00:03:05.704777,0.62062,426,271,255.683256,95,50.491293,64.587534,True,median,0.754862,COMPLETE
268,268,0.912022,2025-12-04 04:17:36.827464,2025-12-04 04:20:36.708533,0 days 00:02:59.881069,0.608704,455,286,125.936191,119,42.133055,71.512689,True,median,0.468036,COMPLETE
419,419,0.909329,2025-12-04 04:47:26.679445,2025-12-04 04:50:49.440243,0 days 00:03:22.760798,0.630556,399,274,266.215148,111,40.863121,70.275416,True,median,0.721657,COMPLETE
429,429,0.90117,2025-12-04 04:49:20.036939,2025-12-04 04:51:55.800986,0 days 00:02:35.764047,0.630433,560,271,282.439619,96,50.451375,69.723599,True,median,0.719678,COMPLETE
406,406,0.900508,2025-12-04 04:45:21.284346,2025-12-04 04:48:17.164537,0 days 00:02:55.880191,0.674294,377,257,273.109945,102,6.273617,67.109417,True,median,0.731147,COMPLETE
425,425,0.897898,2025-12-04 04:48:17.165215,2025-12-04 04:51:19.356133,0 days 00:03:02.190918,0.626658,334,269,279.432774,96,46.934406,70.173896,True,median,0.725011,COMPLETE
333,333,0.897867,2025-12-04 04:31:25.629118,2025-12-04 04:34:19.168509,0 days 00:02:53.539391,0.680512,435,245,246.77328,117,18.392095,77.310584,True,median,0.470701,COMPLETE
382,382,0.89701,2025-12-04 04:40:55.667588,2025-12-04 04:44:12.806831,0 days 00:03:17.139243,0.649963,411,259,267.187472,107,22.944537,67.179683,True,median,0.709221,COMPLETE
316,316,0.896202,2025-12-04 04:27:45.564448,2025-12-04 04:31:11.360229,0 days 00:03:25.795781,0.652763,433,248,259.108808,123,12.107186,68.646704,True,median,0.62857,COMPLETE


In [16]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "P" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'mean', 'sample_weight': True, 'conversion': 971, 'max_depth': 66, 'num_leaves': 191, 'reg_lambda': 97.94625550671162, 'reg_alpha': 39.400812997904815, 'colsample_bytree': 0.5734790801883719, 'subsample': 0.9800511912010585, 'min_child_weight': 76.42118832946905}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
361,361,1.187933,2025-12-04 06:16:35.816091,2025-12-04 06:21:42.155474,0 days 00:05:06.339383,0.573479,971,66,76.421188,191,39.400813,97.946256,True,mean,0.980051,COMPLETE
416,416,1.181564,2025-12-04 06:32:44.070295,2025-12-04 06:37:43.101260,0 days 00:04:59.030965,0.591283,977,27,52.780518,189,53.097545,92.138452,True,mean,0.990111,COMPLETE
284,284,1.14894,2025-12-04 05:55:46.797376,2025-12-04 06:00:32.905018,0 days 00:04:46.107642,0.593272,982,73,59.645959,176,38.081754,86.416816,True,mean,0.981479,COMPLETE
297,297,1.135414,2025-12-04 05:59:29.895251,2025-12-04 06:04:31.534825,0 days 00:05:01.639574,0.606235,945,34,94.518328,187,32.236706,80.503595,True,mean,0.711573,COMPLETE
304,304,1.13398,2025-12-04 06:01:27.455289,2025-12-04 06:05:51.240621,0 days 00:04:23.785332,0.606253,948,35,90.009815,176,43.601441,88.477246,True,mean,0.97019,COMPLETE
366,366,1.131495,2025-12-04 06:18:11.386559,2025-12-04 06:23:35.804422,0 days 00:05:24.417863,0.598102,998,43,71.701933,200,40.707578,98.482395,True,mean,0.978554,COMPLETE
483,483,1.130341,2025-12-04 06:49:29.787793,2025-12-04 06:54:27.312326,0 days 00:04:57.524533,0.590806,970,26,140.49059,188,54.191113,86.524192,True,mean,0.722615,COMPLETE
372,372,1.129734,2025-12-04 06:19:30.648420,2025-12-04 06:24:39.603665,0 days 00:05:08.955245,0.595064,997,43,60.090304,195,39.864748,88.332539,True,mean,0.978153,COMPLETE
359,359,1.126903,2025-12-04 06:16:11.579033,2025-12-04 06:21:39.930808,0 days 00:05:28.351775,0.574686,968,66,75.187154,192,50.969013,87.93253,True,mean,0.978432,COMPLETE
495,495,1.125887,2025-12-04 06:53:22.873302,2025-12-04 06:58:34.015606,0 days 00:05:11.142304,0.592555,979,28,63.496823,187,54.25554,28.473075,True,mean,0.967529,COMPLETE


In [17]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "S" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'mean', 'sample_weight': True, 'conversion': 886, 'max_depth': 172, 'num_leaves': 84, 'reg_lambda': 27.767882579137407, 'reg_alpha': 16.994806178840328, 'colsample_bytree': 0.9321590234306153, 'subsample': 0.5938689386193434, 'min_child_weight': 61.98360009747156}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
475,475,1.220089,2025-12-04 08:06:49.099183,2025-12-04 08:09:33.588686,0 days 00:02:44.489503,0.932159,886,172,61.9836,84,16.994806,27.767883,True,mean,0.593869,COMPLETE
375,375,1.217112,2025-12-04 07:52:13.131850,2025-12-04 07:54:46.565425,0 days 00:02:33.433575,0.951863,879,221,143.758146,87,15.984092,36.331427,True,mean,0.847782,COMPLETE
456,456,1.20942,2025-12-04 08:04:17.010588,2025-12-04 08:07:05.040885,0 days 00:02:48.030297,0.93496,886,142,67.728477,90,11.656691,29.902396,True,mean,0.971779,COMPLETE
420,420,1.204283,2025-12-04 07:58:32.746920,2025-12-04 08:01:13.080002,0 days 00:02:40.333082,0.94164,923,187,20.665811,96,13.849454,33.600828,True,mean,0.951405,COMPLETE
472,472,1.202513,2025-12-04 08:06:44.449571,2025-12-04 08:09:22.291283,0 days 00:02:37.841712,0.933969,887,171,65.669615,89,13.164764,26.052667,True,mean,0.661947,COMPLETE
439,439,1.202306,2025-12-04 08:01:22.548818,2025-12-04 08:04:29.929695,0 days 00:03:07.380877,0.947734,917,173,14.247201,96,15.314598,31.380725,True,mean,0.959372,COMPLETE
426,426,1.201366,2025-12-04 07:59:17.362005,2025-12-04 08:02:20.186330,0 days 00:03:02.824325,0.9363,914,196,148.339214,94,14.119278,55.379641,True,mean,0.450457,COMPLETE
338,338,1.195458,2025-12-04 07:46:28.597147,2025-12-04 07:49:00.172478,0 days 00:02:31.575331,0.924283,908,184,127.208136,86,16.88216,36.787492,True,mean,0.858388,COMPLETE
441,441,1.191555,2025-12-04 08:01:38.530514,2025-12-04 08:04:26.198783,0 days 00:02:47.668269,0.951414,942,152,14.526111,96,16.546067,30.991131,True,mean,0.958297,COMPLETE
299,299,1.19096,2025-12-04 07:41:02.478660,2025-12-04 07:43:53.020665,0 days 00:02:50.542005,0.943569,950,263,120.408295,93,20.621812,56.60841,True,mean,0.905397,COMPLETE


In [7]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        # ("fe", FeatureEng(u1=param["u1"], u2=param["u2"], add_ts=param["add_ts"],
        #                                      riskadj_m4=param["riskadj_m4"], quant_p11=param["quant_p11"], fed_model=param["fed_model"],
        #                                      glob_vol=param["glob_vol"], relmom=param["relmom"], mom_div=param["mom_div"],
        #                                      mompersistence=param["mompersistence"], momregime=param["mom_regime"], vol_spread=param["vol_spread"],
        #                                      price_mom=param["price_mom"], rate_sens=param["rate_sens"])),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = data.copy()

    FEATURES = [c for c in train.columns if c not in DROP and "V" in c] + ["date_id"]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

[I 2025-12-04 19:54:28,075] A new study created in memory with name: no-name-684dcbcf-10d4-4078-9f57-16d355234b82


Number of finished trials: 500
Best trial: {'strategy': 'mean', 'sample_weight': True, 'conversion': 865, 'max_depth': 9, 'num_leaves': 54, 'reg_lambda': 91.2184754250311, 'reg_alpha': 55.719133239841426, 'colsample_bytree': 0.9998612904387537, 'subsample': 0.55911961339005, 'min_child_weight': 235.04440328216708}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
197,197,1.216493,2025-12-04 20:19:14.807537,2025-12-04 20:20:38.893065,0 days 00:01:24.085528,0.999861,865,9,235.044403,54,55.719133,91.218475,True,mean,0.55912,COMPLETE
188,188,1.17863,2025-12-04 20:18:45.577119,2025-12-04 20:20:07.694216,0 days 00:01:22.117097,0.989426,870,10,256.041195,49,23.981953,80.316513,True,mean,0.556426,COMPLETE
223,223,1.177034,2025-12-04 20:21:02.364885,2025-12-04 20:22:31.657051,0 days 00:01:29.292166,0.971283,921,9,271.072291,64,25.33511,90.565643,True,mean,0.596087,COMPLETE
210,210,1.172398,2025-12-04 20:20:12.010771,2025-12-04 20:21:15.597251,0 days 00:01:03.586480,0.99913,934,8,274.644774,34,24.689299,74.17545,True,mean,0.588136,COMPLETE
371,371,1.158018,2025-12-04 20:32:07.603598,2025-12-04 20:33:29.332784,0 days 00:01:21.729186,0.968814,950,9,245.579057,53,30.191239,93.058273,True,mean,0.591536,COMPLETE
205,205,1.157174,2025-12-04 20:20:05.981786,2025-12-04 20:21:18.533799,0 days 00:01:12.552013,0.999326,936,9,268.894559,55,24.075241,91.438639,True,mean,0.586908,COMPLETE
326,326,1.148158,2025-12-04 20:29:09.526859,2025-12-04 20:30:11.843697,0 days 00:01:02.316838,0.987763,922,7,250.664241,32,27.77201,93.399323,True,mean,0.515473,COMPLETE
189,189,1.145604,2025-12-04 20:18:45.892084,2025-12-04 20:20:05.981132,0 days 00:01:20.089048,0.99142,864,9,235.413564,56,23.98652,73.23125,True,mean,0.588112,COMPLETE
225,225,1.140004,2025-12-04 20:21:05.427301,2025-12-04 20:22:26.493229,0 days 00:01:21.065928,0.96842,926,9,267.961415,64,54.411909,92.619707,True,mean,0.540365,COMPLETE
266,266,1.135197,2025-12-04 20:24:28.112170,2025-12-04 20:25:42.567849,0 days 00:01:14.455679,0.978944,882,10,228.444632,36,25.828425,95.820812,True,mean,0.565317,COMPLETE


In [10]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([("fe", FeatureEng()),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = processing.fit_transform(data)

    FEATURES = ["U1", "U2", 'Quant_RiskAdj_M4', 'Quant_Regime_P11', "Quant_FedModel_P11", "Quant_Global_Vol",
                "Quant_RelMom_M4", "Quant_RelMom_M1", "Quant_MomDiv_Tech", "Quant_MomDiv_Value", 
                "Quant_M4_Persistence", "Quant_M1_Persistence", "Quant_VolRegime_High", 
                "Quant_VolRegime_Low", "Quant_Price_Mom_Align", "Quant_Price_Mom_Align_Broad",
                "Quant_Rate_Sensitivity_P11", "Quant_Vol_Rate_Response"] + ["date_id"] + [c for c in train if "cos" in c or "sin" in c]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([#("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=model, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'mean', 'sample_weight': True, 'conversion': 507, 'max_depth': 99, 'num_leaves': 64, 'reg_lambda': 53.22502274170274, 'reg_alpha': 4.485851482007155, 'colsample_bytree': 0.560497110851551, 'subsample': 0.565643038496917, 'min_child_weight': 41.021978797144754}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
495,495,0.905361,2025-12-05 00:01:51.569932,2025-12-05 00:03:37.093684,0 days 00:01:45.523752,0.560497,507,99,41.021979,64,4.485851,53.225023,True,mean,0.565643,COMPLETE
265,265,0.894444,2025-12-04 23:13:00.555200,2025-12-04 23:16:10.424355,0 days 00:03:09.869155,0.543197,578,151,21.389867,64,3.355881,17.541165,True,mean,0.569627,COMPLETE
352,352,0.887414,2025-12-04 23:29:51.066459,2025-12-04 23:33:08.842805,0 days 00:03:17.776346,0.538779,516,151,27.505004,61,2.819412,27.488866,True,mean,0.601627,COMPLETE
494,494,0.885114,2025-12-05 00:01:49.375445,2025-12-05 00:03:46.066461,0 days 00:01:56.691016,0.561379,597,101,42.639291,60,4.083431,54.546044,True,mean,0.563636,COMPLETE
473,473,0.883508,2025-12-04 23:59:29.077190,2025-12-05 00:01:33.531157,0 days 00:02:04.453967,0.558298,562,109,49.727445,73,3.429481,24.860104,True,mean,0.617599,COMPLETE
278,278,0.880563,2025-12-04 23:15:32.832404,2025-12-04 23:18:39.986880,0 days 00:03:07.154476,0.550108,478,166,22.587937,64,2.2204,10.19621,True,mean,0.445638,COMPLETE
490,490,0.876947,2025-12-05 00:01:38.459753,2025-12-05 00:03:26.299239,0 days 00:01:47.839486,0.564504,517,100,117.5643,60,3.75986,52.745898,True,mean,0.551441,COMPLETE
403,403,0.87627,2025-12-04 23:41:48.368132,2025-12-04 23:44:47.142947,0 days 00:02:58.774815,0.559996,579,134,206.18843,63,50.775394,19.898107,True,mean,0.484631,COMPLETE
334,334,0.875661,2025-12-04 23:26:11.447300,2025-12-04 23:29:03.912451,0 days 00:02:52.465151,0.53789,529,158,32.099472,61,49.910411,57.643916,True,mean,0.604016,COMPLETE
387,387,0.874181,2025-12-04 23:38:07.227668,2025-12-04 23:44:05.191234,0 days 00:05:57.963566,0.521146,504,134,33.87639,116,45.275899,60.351715,True,mean,0.49151,COMPLETE


In [11]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        #("fe", FeatureEng()),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = make_lags_train(data)

    FEATURES = ["date_id"] + [c for c in train if "lag" in c or "streak" in c or "_mean_" in c or "_std_" in c]  
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

Number of finished trials: 500
Best trial: {'strategy': 'median', 'sample_weight': True, 'conversion': 898, 'max_depth': 200, 'num_leaves': 199, 'reg_lambda': 13.258372165224355, 'reg_alpha': 18.46506844268196, 'colsample_bytree': 0.3588344073217721, 'subsample': 0.41336388316055794, 'min_child_weight': 218.579265072281}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
336,336,0.968649,2025-12-05 02:06:19.506372,2025-12-05 02:11:10.817557,0 days 00:04:51.311185,0.358834,898,200,218.579265,199,18.465068,13.258372,True,median,0.413364,COMPLETE
316,316,0.958442,2025-12-05 01:59:16.567470,2025-12-05 02:04:30.611924,0 days 00:05:14.044454,0.355071,931,181,215.977859,195,24.437716,8.263306,True,constant,0.570666,COMPLETE
306,306,0.950501,2025-12-05 01:56:45.744211,2025-12-05 02:02:19.437613,0 days 00:05:33.693402,0.354367,936,189,116.477762,193,22.663909,11.925228,True,constant,0.408867,COMPLETE
303,303,0.949998,2025-12-05 01:55:48.788071,2025-12-05 02:01:05.735919,0 days 00:05:16.947848,0.320803,933,182,217.751356,183,30.7399,11.862422,True,constant,0.412758,COMPLETE
327,327,0.945568,2025-12-05 02:03:02.071919,2025-12-05 02:07:53.797799,0 days 00:04:51.725880,0.322536,952,178,216.403416,192,33.395228,10.0414,True,constant,0.591223,COMPLETE
485,485,0.945531,2025-12-05 02:48:47.705844,2025-12-05 02:53:32.419135,0 days 00:04:44.713291,0.333816,957,171,7.408961,187,22.756772,0.983088,True,constant,0.411498,COMPLETE
272,272,0.944657,2025-12-05 01:45:18.655733,2025-12-05 01:50:50.513508,0 days 00:05:31.857775,0.315522,959,181,231.916196,189,28.444672,89.99572,True,constant,0.606132,COMPLETE
275,275,0.942661,2025-12-05 01:46:32.978034,2025-12-05 01:52:03.563414,0 days 00:05:30.585380,0.3166,935,177,96.761082,189,29.963458,87.607475,True,constant,0.606365,COMPLETE
443,443,0.942236,2025-12-05 02:37:28.759030,2025-12-05 02:42:09.778348,0 days 00:04:41.019318,0.344611,987,178,216.87813,191,24.108981,11.762639,True,constant,0.587122,COMPLETE
396,396,0.9422,2025-12-05 02:23:50.104157,2025-12-05 02:28:42.742400,0 days 00:04:52.638243,0.326602,972,174,197.644375,177,12.482513,91.071116,True,constant,0.41568,COMPLETE


In [7]:
def objective(trial, data=df, target=df[TARGET]):
    param = {
        "strategy": trial.suggest_categorical("strategy", ["constant", "mean", "median"]),
        "sample_weight": trial.suggest_categorical("sample_weight", [True, False]),
        "conversion": trial.suggest_int("conversion", 1, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 300),
        "num_leaves": trial.suggest_int("num_leaves", 10, 200),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 100.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 100.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'subsample': trial.suggest_float('subsample', 0.4, 1),
        'min_child_weight': trial.suggest_float('min_child_weight', 1e-3 , 300),
        # "u1": trial.suggest_categorical("u1", [True, False]),
        # "u2": trial.suggest_categorical("u2", [True, False]),
        # "add_ts": trial.suggest_categorical("add_ts", [True, False]),
        # "riskadj_m4": trial.suggest_categorical("riskadj_m4", [True, False]),
        # "quant_p11": trial.suggest_categorical("quant_p11", [True, False]),
        # "fed_model": trial.suggest_categorical("fed_model", [True, False]),
        # "glob_vol": trial.suggest_categorical("glob_vol", [True, False]),
        # "relmom": trial.suggest_categorical("relmom", [True, False]),
        # "mom_div": trial.suggest_categorical("mom_div", [True, False]),
        # "mompersistence": trial.suggest_categorical("mompersistence", [True, False]),
        # "mom_regime": trial.suggest_categorical("mom_regime", [True, False]),
        # "vol_spread": trial.suggest_categorical("vol_spread", [True, False]),
        # "price_mom": trial.suggest_categorical("price_mom", [True, False]),
        # "rate_sens": trial.suggest_categorical("rate_sens", [True, False]),
    }
    processing = Pipeline([
        #("fe", FeatureEng()),
                            ("imputer", tml.DfImputer(strategy=param["strategy"], fill_value=0)),
                            # ("scaler", tml.DfScaler())
                            ])
    
    train = make_lags_train(data)

    FEATURES = ["date_id"] + [c for c in train if "lag" in c or "streak" in c or "_mean_" in c or "_std_" in c] + [c for c in train if "sin" in c or "cos" in c] + [c for c in train.columns if c not in DROP and c.startswith("P")] + [c for c in train.columns if c not in DROP and c.startswith("V")] + [c for c in train.columns if c not in DROP and c.startswith("U")]
    
    model = lgb.LGBMRegressor(random_state=34, n_jobs=-1, verbose=-1, n_estimators=10000,
                              learning_rate=0.01,
                             colsample_bytree=param["colsample_bytree"],
                             min_child_weight=param['min_child_weight'],
                             reg_lambda=param['reg_lambda'],
                             reg_alpha=param['reg_alpha'],
                             subsample=param['subsample'],
                             num_leaves=param["num_leaves"],
                             max_depth=param['max_depth'],
                             eval_metric="rmse")

    pipe = Pipeline([("processing", processing),
                    ("model", model)])
    
    callbacks = [lgb.early_stopping(100, verbose=0)]
    
    if param["sample_weight"]:
        fit_params = {"sample_weight": "date_id", "callbacks":callbacks, "eval_metric": "rmse"}
    else:
        fit_params = {"callbacks":callbacks, "eval_metric": "rmse"}

    cvscore = TSCrossValidate(data=train[FEATURES], target=target, cv=ts_folds, estimator=pipe, fit_params=fit_params, early_stopping=True)
    oof, res = cvscore.score()

    sub = res["folds_eval"].copy()
    sub["prediction"] = np.clip(sub["predictions"] * param["conversion"] + 1, 0, 2)
    sharpe = score_sharpe(solution=df[df["date_id"] >= sub["date_id"].min()].reset_index(drop=True), submission=sub, row_id_column_name='')
    
    return sharpe

sampler = TPESampler(seed=645)  # Make the sampler behave in a deterministic way.

study = optuna.create_study(direction='maximize', sampler=sampler)
optuna.logging.set_verbosity(optuna.logging.WARNING)
study.optimize(objective, n_trials=500, n_jobs=-1)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

study.trials_dataframe().sort_values('value', ascending=False).head(10)

[I 2025-12-05 18:55:13,013] A new study created in memory with name: no-name-1aaa1fad-5d69-450c-9926-b75574914e08


Number of finished trials: 500
Best trial: {'strategy': 'constant', 'sample_weight': True, 'conversion': 998, 'max_depth': 82, 'num_leaves': 143, 'reg_lambda': 27.955483425287007, 'reg_alpha': 29.906784166671272, 'colsample_bytree': 0.47637248296830637, 'subsample': 0.9509772198758214, 'min_child_weight': 131.71331565352557}


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_colsample_bytree,params_conversion,params_max_depth,params_min_child_weight,params_num_leaves,params_reg_alpha,params_reg_lambda,params_sample_weight,params_strategy,params_subsample,state
272,272,1.130271,2025-12-05 19:52:10.158131,2025-12-05 19:55:25.474272,0 days 00:03:15.316141,0.476372,998,82,131.713316,143,29.906784,27.955483,True,constant,0.950977,COMPLETE
305,305,1.00282,2025-12-05 19:58:51.590617,2025-12-05 20:01:58.637195,0 days 00:03:07.046578,0.455001,897,139,124.854186,139,26.389167,38.935239,True,constant,0.964116,COMPLETE
411,411,0.997866,2025-12-05 20:20:25.571503,2025-12-05 20:23:57.786715,0 days 00:03:32.215212,0.443303,940,49,16.711445,144,95.822364,43.328956,True,constant,0.890945,COMPLETE
277,277,0.997225,2025-12-05 19:52:59.969641,2025-12-05 19:56:39.568338,0 days 00:03:39.598697,0.475335,934,30,140.545123,152,15.67869,36.680082,True,constant,0.951006,COMPLETE
227,227,0.994293,2025-12-05 19:42:47.219452,2025-12-05 19:46:36.948263,0 days 00:03:49.728811,0.476283,915,113,12.73952,149,16.539432,37.581953,True,constant,0.78891,COMPLETE
329,329,0.987677,2025-12-05 20:04:29.195412,2025-12-05 20:08:06.429254,0 days 00:03:37.233842,0.495428,852,23,128.984731,146,31.059788,21.28774,True,constant,0.798013,COMPLETE
372,372,0.984422,2025-12-05 20:12:12.907471,2025-12-05 20:16:43.561566,0 days 00:04:30.654095,0.483053,891,36,21.527657,166,73.450475,48.607845,True,constant,0.878621,COMPLETE
304,304,0.981138,2025-12-05 19:58:35.641149,2025-12-05 20:01:43.571489,0 days 00:03:07.930340,0.456143,900,43,119.65084,139,26.76634,24.931593,True,constant,0.963283,COMPLETE
252,252,0.980012,2025-12-05 19:47:49.564596,2025-12-05 19:51:03.087975,0 days 00:03:13.523379,0.459505,959,38,25.50217,151,24.957883,41.049211,True,constant,0.97677,COMPLETE
222,222,0.978955,2025-12-05 19:41:12.188958,2025-12-05 19:44:26.501194,0 days 00:03:14.312236,0.468529,917,113,12.916388,155,20.515896,36.407554,True,constant,0.996309,COMPLETE
