In [1]:
import pandas as pd
import numpy as np

import tubesml as tml

from sklearn.model_selection import TimeSeriesSplit, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge


import lightgbm as lgb

from sklearn.pipeline import Pipeline

from src.model_validation import TSCrossValidate, summary_evaluation, fold_evaluation
from src.model_helpers import DailyModel, ColumnSelector
from src.sharpe import score_sharpe
from src.features import FeatureEng

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — confirm that a blanket filter is really wanted.
warnings.filterwarnings("ignore")

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Read the raw training file and keep only the rows whose date_id is above 1000.
# The trailing .copy() detaches the filtered frame from the one read from disk.
df = (
    pd.read_csv("data_raw/train.csv")
    .loc[lambda frame: frame["date_id"] > 1000]
    .copy()
)
df.head()

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,E10,E11,E12,E13,E14,E15,E16,E17,E18,E19,E2,E20,E3,E4,E5,E6,E7,E8,E9,I1,I2,I3,I4,I5,I6,I7,I8,I9,M1,M10,M11,M12,M13,M14,M15,M16,M17,M18,M2,M3,M4,M5,M6,M7,M8,M9,P1,P10,P11,P12,P13,P2,P3,P4,P5,P6,P7,P8,P9,S1,S10,S11,S12,S2,S3,S4,S5,S6,S7,S8,S9,V1,V10,V11,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns
1001,1001,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.003361,0.00012,0.002932
1002,1002,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.005496,0.00012,0.005066
1003,1003,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001342,0.00012,0.000912
1004,1004,1,1,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.00335,0.000121,-0.003781
1005,1005,0,0,0,1,0,-1,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.006725,0.000122,0.006293


In [3]:
# Column the models are trained to predict.
TARGET = "market_forward_excess_returns"

# Columns excluded from the feature list (includes the target itself).
DROP = [
    'is_scored',
    'forward_returns',
    'risk_free_rate',
    'market_forward_excess_returns',
]

# Every remaining column of the loaded frame, in its original order.
FEATURES = [name for name in df.columns if name not in DROP]

In [4]:
# Chronological splitter: 10 splits, each with a 180-row test window.
ts_folds = TimeSeriesSplit(
    n_splits=10,
    test_size=180,
)

In [5]:
def add_streak_features(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Add features counting consecutive positive/negative streaks.

    The streaks are measured on ``column`` shifted down by one row, so the
    value stored at a row describes the run that ended on the previous row.
    The first row (no previous value) and zeros count as neither positive
    nor negative and therefore reset both streaks to 0.

    All intermediate values are kept in standalone Series, so no scratch
    column is ever written to (or dropped from) the frame: columns the
    caller already has, whatever their names, are left untouched.

    Args:
        df: DataFrame with the column to analyze. It is not modified.
        column: Name of the column to track streaks for.

    Returns:
        A copy of ``df`` with two extra integer columns appended,
        ``{column}_positive_streak`` and ``{column}_negative_streak``.
    """
    df = df.copy()

    # Previous-row value; NaN on the first row compares False to both signs.
    previous = df[column].shift()

    for label, sign_mask in (("positive", previous > 0), ("negative", previous < 0)):
        indicator = sign_mask.astype(int)

        # A new run id starts every time the indicator flips between 0 and 1.
        run_id = (indicator != indicator.shift()).cumsum()

        # Running count inside each run; multiplying by the indicator zeroes
        # the rows that belong to a run of the opposite (or no) sign.
        running_count = indicator.groupby(run_id.to_numpy()).cumsum()
        df[f'{column}_{label}_streak'] = running_count * indicator

    return df


def make_lags_train(data):
    """
    Build lagged and rolling features for the return columns.

    For each of 'forward_returns' and 'risk_free_rate' the following columns
    are appended to a copy of ``data``:

    * ``{col}_lag_{k}`` for k in 1, 5, 22: the column shifted by k rows;
    * ``{col}_mean_{w}`` / ``{col}_std_{w}`` for w in 5, 22, 220: rolling mean
      and standard deviation of the column shifted by one row, requiring a
      full window of w observations.

    For 'forward_returns' the streak columns produced by
    ``add_streak_features`` are appended as well.

    Args:
        data: DataFrame containing 'forward_returns' and 'risk_free_rate'.
            It is not modified.

    Returns:
        The copy of ``data`` with the new columns.
    """
    df = data.copy()

    lag_steps = (1, 5, 22)
    window_sizes = (5, 22, 220)

    for col in ('forward_returns', 'risk_free_rate'):
        series = df[col]

        for lag in lag_steps:
            df[f"{col}_lag_{lag}"] = series.shift(lag)

        # Rolling statistics are taken over the one-row-shifted series, so a
        # row's window never contains that row's own value.
        previous = series.shift(1)
        for w in window_sizes:
            window = previous.rolling(w, min_periods=w)
            df[f"{col}_mean_{w}"] = window.mean()
            df[f"{col}_std_{w}"] = window.std()

        if col == "forward_returns":
            df = add_streak_features(df, col)

    return df

In [20]:
# Append the lag / rolling / streak columns, then apply FeatureEng with its
# default settings.
train = make_lags_train(df)
fe = FeatureEng()
train = fe.fit_transform(train)

# Feature list taken once, from the fully transformed frame, so that it
# contains every column added by the two steps above.
FEATURES = [c for c in train.columns if c not in DROP]
train.head()

Unnamed: 0,date_id,D1,D2,D3,D4,D5,D6,D7,D8,D9,E1,E10,E11,E12,E13,E14,E15,E16,E17,E18,E19,E2,E20,E3,E4,E5,E6,E7,E8,E9,I1,I2,I3,I4,I5,I6,I7,I8,I9,M1,M10,M11,M12,M13,M14,M15,M16,M17,M18,M2,M3,M4,M5,M6,M7,M8,M9,P1,P10,P11,P12,P13,P2,P3,P4,P5,P6,P7,P8,P9,S1,S10,S11,S12,S2,S3,S4,S5,S6,S7,S8,S9,V1,V10,V11,V12,V13,V2,V3,V4,V5,V6,V7,V8,V9,forward_returns,risk_free_rate,market_forward_excess_returns,forward_returns_lag_1,forward_returns_lag_5,forward_returns_lag_22,forward_returns_mean_5,forward_returns_std_5,forward_returns_mean_22,forward_returns_std_22,forward_returns_mean_220,forward_returns_std_220,forward_returns_positive_streak,forward_returns_negative_streak,risk_free_rate_lag_1,risk_free_rate_lag_5,risk_free_rate_lag_22,risk_free_rate_mean_5,risk_free_rate_std_5,risk_free_rate_mean_22,risk_free_rate_std_22,risk_free_rate_mean_220,risk_free_rate_std_220,U1,U2,sin_1_5_,cos_1_5_,sin_2_5_,cos_2_5_,sin_1_22_,cos_1_22_,sin_2_22_,cos_2_22_,sin_3_22_,cos_3_22_,sin_4_22_,cos_4_22_,Quant_RiskAdj_M4,Quant_Regime_P11,Quant_FedModel_P11,Quant_Global_Vol,Quant_RelMom_M4,Quant_RelMom_M1,Quant_MomDiv_Tech,Quant_MomDiv_Value,Quant_M4_Persistence,Quant_M1_Persistence,Quant_VolRegime_High,Quant_VolRegime_Low,Quant_Vol_Spread,Quant_Price_Mom_Align,Quant_Price_Mom_Align_Broad,Quant_Rate_Sensitivity_P11,Quant_Vol_Rate_Response
0,1001,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.003361,0.00012,0.002932,,,,,,,,,,0,0,,,,,,,,,,,,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,,,,,,,,,,,0,0,,,,,
1,1002,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.005496,0.00012,0.005066,0.003361,,,,,,,,,1,0,0.00012,,,,,,,,,,,0.951057,0.309017,0.587785,-0.809017,0.281733,0.959493,0.540641,0.841254,0.75575,0.654861,0.909632,0.415415,,,,,,,,,,,0,0,,,,,
2,1003,0,0,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.001342,0.00012,0.000912,0.005496,,,,,,,,,2,0,0.00012,,,,,,,,,,,0.587785,-0.809017,-0.951057,0.309017,0.540641,0.841254,0.909632,0.415415,0.989821,-0.142315,0.75575,-0.654861,,,,,,,,,,,0,0,,,,,
3,1004,1,1,0,1,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.00335,0.000121,-0.003781,0.001342,,,,,,,,,3,0,0.00012,,,,,,,,,,,-0.587785,-0.809017,0.951057,0.309017,0.75575,0.654861,0.989821,-0.142315,0.540641,-0.841254,-0.281733,-0.959493,,,,,,,,,,,0,0,,,,,
4,1005,0,0,0,1,0,-1,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.006725,0.000122,0.006293,-0.00335,,,,,,,,,0,1,0.000121,,,,,,,,,,,-0.951057,0.309017,-0.587785,-0.809017,0.909632,0.415415,0.75575,-0.654861,-0.281733,-0.959493,-0.989821,-0.142315,,,,,,,,,,,0,0,,,,,


In [8]:
# Base learner for the ("D", "startswith") column pattern.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=13,
    max_depth=45,
    min_child_weight=48.1,
    colsample_bytree=0.731,
    subsample=0.497,
    reg_alpha=88.85,
    reg_lambda=95.91,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Only the column selection is active. Disabled steps kept for reference:
#   ("fe", FeatureEng())
#   ("imputer", tml.DfImputer(strategy="mean", fill_value=0))
#   ("scaler", tml.DfScaler(method="robust"))
processing = Pipeline(steps=[
    ("sel", ColumnSelector(patterns=[("D", "startswith")])),
])

pipe_d = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [9]:
# Base learner for the ("E", "startswith") column pattern.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=10,
    max_depth=77,
    min_child_weight=14.5,
    colsample_bytree=0.697,
    subsample=0.427,
    reg_alpha=22.26,
    reg_lambda=88.44,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Mean imputation runs before the column selection. Disabled steps kept for
# reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="mean", fill_value=0)),
    ("sel", ColumnSelector(patterns=[("E", "startswith")])),
])

pipe_e = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [10]:
# Base learner for the ("I", "startswith") column pattern.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=200,
    max_depth=78,
    min_child_weight=207,
    colsample_bytree=0.798,
    subsample=0.713,
    reg_alpha=7.85,
    reg_lambda=12.5,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Median imputation runs before the column selection. Disabled steps kept
# for reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler(method="robust"))
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="median", fill_value=0)),
    ("sel", ColumnSelector(patterns=[("I", "startswith")])),
])

pipe_i = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [11]:
# Base learner for the ("M", "startswith") column pattern.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=123,
    max_depth=300,
    min_child_weight=259.89,
    colsample_bytree=0.658,
    subsample=0.678,
    reg_alpha=12.03,
    reg_lambda=63.58,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Median imputation runs before the column selection. Disabled steps kept
# for reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="median", fill_value=0)),
    ("sel", ColumnSelector(patterns=[("M", "startswith")])),
])

pipe_m = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [12]:
# Base learner for the ("P", "startswith") column pattern.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=191,
    max_depth=66,
    min_child_weight=76.42,
    colsample_bytree=0.573,
    subsample=0.98,
    reg_alpha=39.4,
    reg_lambda=97.94,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Mean imputation runs before the column selection. Disabled steps kept for
# reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="mean", fill_value=0)),
    ("sel", ColumnSelector(patterns=[("P", "startswith")])),
])

pipe_p = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [13]:
# Base learner for the ("S", "startswith") column pattern.
# NOTE(review): n_estimators is 10000 here while the sibling base models in
# this notebook use 300 — confirm this is intentional.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=10000,
    learning_rate=0.01,
    num_leaves=84,
    max_depth=172,
    min_child_weight=61.98,
    colsample_bytree=0.932,
    subsample=0.593,
    reg_alpha=16.99,
    reg_lambda=27.76,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Mean imputation runs before the column selection. Disabled steps kept for
# reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="mean", fill_value=0)),
    ("sel", ColumnSelector(patterns=[("S", "startswith")])),
])

pipe_s = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [14]:
# Base learner for the ("V", "startswith") column pattern.
# colsample_bytree is not set for this model; a 0.697 value is kept disabled:
#   colsample_bytree=0.697
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=54,
    max_depth=9,
    min_child_weight=235,
    subsample=0.559,
    reg_alpha=55.72,
    reg_lambda=91.21,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Mean imputation runs before the column selection. Disabled steps kept for
# reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="mean", fill_value=0)),
    ("sel", ColumnSelector(patterns=[("V", "startswith")])),
])

pipe_v = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [15]:
# Base learner for an explicit list of engineered columns plus every column
# matching the "cos" / "sin" patterns.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=64,
    max_depth=99,
    min_child_weight=41,
    colsample_bytree=0.56,
    subsample=0.565,
    reg_alpha=4.48,
    reg_lambda=53.22,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Mean imputation runs before the column selection. Disabled steps kept for
# reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="mean", fill_value=0)),
    ("sel", ColumnSelector(
        sel_columns=[
            "U1",
            "U2",
            'Quant_RiskAdj_M4',
            'Quant_Regime_P11',
            "Quant_FedModel_P11",
            "Quant_Global_Vol",
            "Quant_RelMom_M4",
            "Quant_RelMom_M1",
            "Quant_MomDiv_Tech",
            "Quant_MomDiv_Value",
            "Quant_M4_Persistence",
            "Quant_M1_Persistence",
            "Quant_VolRegime_High",
            "Quant_VolRegime_Low",
            "Quant_Price_Mom_Align",
            "Quant_Price_Mom_Align_Broad",
            "Quant_Rate_Sensitivity_P11",
            "Quant_Vol_Rate_Response",
            "date_id",
        ],
        patterns=[("cos", "contains"), ("sin", "contains")],
    )),
])

pipe_feats = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [16]:
# Base learner for the lag / streak / rolling mean / rolling std columns.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=199,
    max_depth=200,
    min_child_weight=218.57,
    colsample_bytree=0.358,
    subsample=0.413,
    reg_alpha=18.46,
    reg_lambda=13.25,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Median imputation runs before the column selection. Disabled steps kept
# for reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="median", fill_value=0)),
    ("sel", ColumnSelector(patterns=[
        ("lag", "contains"),
        ("streak", "contains"),
        ("_mean_", "contains"),
        ("_std_", "contains"),
    ])),
])

pipe_lags = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [17]:
# Base learner without a column selector: its processing runs a FeatureEng
# configured with explicit flags, then constant imputation.
# NOTE(review): `train` is already passed through FeatureEng() earlier in the
# notebook — confirm that applying FeatureEng again inside this pipeline is
# intended.
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=67,
    max_depth=127,
    min_child_weight=299,
    colsample_bytree=0.95,
    subsample=0.788,
    reg_alpha=38,
    reg_lambda=58,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Disabled step kept for reference:
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("fe", FeatureEng(
        u1=True,
        u2=True,
        add_ts=False,
        riskadj_m4=False,
        quant_p11=False,
        fed_model=False,
        glob_vol=False,
        relmom=True,
        mom_div=False,
        mompersistence=False,
        momregime=False,
        vol_spread=True,
        price_mom=True,
        rate_sens=False,
    )),
    ("imputer", tml.DfImputer(strategy="constant", fill_value=0)),
])

pipe_tot = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [18]:
# Base learner for a mix of column patterns: lag / streak / rolling stats,
# sin / cos columns, and the columns starting with "P", "U" or "V".
# NOTE(review): subsample only takes effect when subsample_freq > 0 (the
# LightGBM default is 0) — confirm whether row subsampling is intended.
model = lgb.LGBMRegressor(
    n_estimators=300,
    learning_rate=0.01,
    num_leaves=143,
    max_depth=82,
    min_child_weight=131.7,
    colsample_bytree=0.476,
    subsample=0.95,
    reg_alpha=29.9,
    reg_lambda=27.9,
    random_state=34,
    n_jobs=-1,
    verbose=-1,
)

# Constant (0) imputation runs before the column selection. Disabled steps
# kept for reference:
#   ("fe", FeatureEng())
#   ("scaler", tml.DfScaler())
processing = Pipeline(steps=[
    ("imputer", tml.DfImputer(strategy="constant", fill_value=0)),
    ("sel", ColumnSelector(patterns=[
        ("lag", "contains"),
        ("streak", "contains"),
        ("_mean_", "contains"),
        ("_std_", "contains"),
        ("sin", "contains"),
        ("cos", "contains"),
        ("P", "startswith"),
        ("U", "startswith"),
        ("V", "startswith"),
    ])),
])

pipe_fancy = Pipeline(steps=[
    ("processing", processing),
    ("model", model),
])

In [21]:
# Stack the eleven base pipelines under a Ridge final estimator; the
# time-series splitter is passed as the stacker's cv argument.
stacker = tml.Stacker(
    estimators=[
        ("d", pipe_d),
        ("e", pipe_e),
        ("i", pipe_i),
        ("v", pipe_v),
        ("m", pipe_m),
        ("p", pipe_p),
        ("s", pipe_s),
        ("lags", pipe_lags),
        ("feats", pipe_feats),
        ("fancy", pipe_fancy),
        ("tot", pipe_tot),
    ],
    final_estimator=Ridge(),
    cv=ts_folds,
)

stacker.fit(train[FEATURES], train[TARGET])

In [22]:
# Same eleven base pipelines, stacked with scikit-learn's StackingRegressor
# (Ridge final estimator, cv=5, all cores). This rebinds `stacker`.
stacker = StackingRegressor(
    estimators=[
        ("d", pipe_d),
        ("e", pipe_e),
        ("i", pipe_i),
        ("v", pipe_v),
        ("m", pipe_m),
        ("p", pipe_p),
        ("s", pipe_s),
        ("lags", pipe_lags),
        ("feats", pipe_feats),
        ("fancy", pipe_fancy),
        ("tot", pipe_tot),
    ],
    final_estimator=Ridge(),
    cv=5,
    n_jobs=-1,
)

stacker.fit(train[FEATURES], train[TARGET])

In [None]:
# Inner splitter handed to the stacker.
# NOTE(review): the rows are date-ordered and the features include lags, so a
# shuffled KFold mixes past and future rows inside the stacker — confirm this
# is acceptable.
kfold = KFold(n_splits=5, shuffle=True, random_state=634)

# NOTE(review): RandomForestRegressor() is created without a random_state, so
# repeated runs may not give identical scores — confirm.
stacker = tml.Stacker(
    estimators=[
        ("d", pipe_d),
        ("e", pipe_e),
        ("i", pipe_i),
        ("v", pipe_v),
        ("m", pipe_m),
        ("p", pipe_p),
        ("s", pipe_s),
        ("lags", pipe_lags),
        ("feats", pipe_feats),
        ("fancy", pipe_fancy),
        ("tot", pipe_tot),
    ],
    final_estimator=RandomForestRegressor(),
    cv=kfold,
)

# Outer evaluation over the time-series folds. Disabled options kept for
# reference:
#   shap=True
#   fit_params=fit_params, early_stopping=True
cvscore = TSCrossValidate(
    data=train[FEATURES],
    target=train[TARGET],
    cv=ts_folds,
    estimator=stacker,
    imp_coef=True,
)
oof, res = cvscore.score()

# NOTE(review): `df` (the frame before lag/feature engineering) is passed
# here rather than `train` — confirm that is the frame summary_evaluation
# expects.
summary_evaluation(res["folds_eval"], df, factor=1000)

# Disabled follow-up plots:
# tml.plot_feat_imp(data=res["feat_imp"], n=15, imp="both")

# to_plot = res["feat_imp"].head(6)["Feature"].to_list()
# tml.plot_shap_values(res["shap_values"], features=to_plot)