In [1]:
# libraries and settings
import gc
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge
import lightgbm as lgb
import optuna
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_parquet('../data/input/train.parquet')
df_test = pd.read_parquet('../data/input/test.parquet')

In [3]:
# Clustering the features
X_cols = df_train.columns[df_train.columns.str.startswith('X')]
col2cluster = pd.Series(pd.read_csv('../data/intermediate/clusters.csv', index_col = 0).iloc[:,0])

# Group columns by cluster and take the mean across features inside each cluster
df_reduced = (
    df_train[X_cols].groupby(col2cluster, axis=1)
      .mean()
    #   .sort_index(axis=1) 
)

df_reduced.set_index(df_train.index, inplace = True)
df_reduced = pd.merge(
    df_train[['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume', 'label']],
    df_reduced,
    left_index = True, 
    right_index = True
)



df_reduced_test = (
    df_test[X_cols].groupby(col2cluster, axis=1)
      .mean()
    #   .sort_index(axis=1) 
)

df_reduced_test.set_index(df_test.index, inplace = True)
df_reduced_test = pd.merge(
    df_test[['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']],
    df_reduced_test,
    left_index = True, 
    right_index = True
)


In [4]:
""" **************** ENSEMBLE MODEL ****************** """
class LightGBMTimeSeriesEnsemble:
    def __init__(self, n_models, n_trials, alphas=None, metric='rmse', random_seed=42):
        """
        n_models  : number of base learners (e.g. 50)
        n_trials  : Optuna trials per learner
        alphas    : candidate ridge regularization strengths
        metric    : LightGBM eval metric (for early stopping)
        """
        self.n_models = n_models
        self.n_trials = n_trials
        self.metric = metric
        self.alphas = alphas if alphas is not None else np.logspace(-2, 5, 30)
        self.seed = random_seed

        self.best_params_list = []
        self.best_iterations = []
        self.models = []
        self.ridge = None
        self.best_alpha = None

    def _optuna_objective(self, trial, X_tr, y_tr, X_val, y_val):
        params = {
            'objective': 'regression',
            'metric': 'rmse',  
            'verbosity': -1,
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 16, 64),
            'max_depth': trial.suggest_int('max_depth', 2, 6),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 0.5),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.6),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
            'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
            'num_boost_round': trial.suggest_int('num_boost_round', 200,500),
            'early_stopping_rounds': 50
        }
        model = lgb.train(
            params,
            lgb.Dataset(X_tr, y_tr),
            valid_sets=[lgb.Dataset(X_val, y_val)]
        )
        pred = model.predict(X_val)
        corr, _ = pearsonr(pred, y_val)
        return -corr

    def fit(self, X_train, y_train, X_val, y_val):
        """
        1) Fit each base model on X_train[i::n] versus y_train[i::n] with Optuna tuning.
        Next steps are carried out in the fit_ridge function:
        2) Make predictions of each base model on *full* X_train => (n_samples_train x n_models)
        3) Ridge-fit on (train_preds, y_train)
        4) Select alpha by evaluating ridge on full X_val predictions
        """
        self.models = []
        self.best_params_list = []
        self.best_iterations = []

        
        for i in range(self.n_models):
            print(f'Training model {i+1}/{self.n_models}')
            Xt = X_train.iloc[i::self.n_models]
            yt = y_train.iloc[i::self.n_models]
            Xv = X_val.iloc[i::self.n_models]
            yv = y_val.iloc[i::self.n_models]

            # Tune
            sampler = optuna.samplers.TPESampler(seed=self.seed)
            study = optuna.create_study(direction='minimize', sampler=sampler)
            study.optimize(
                lambda trial: self._optuna_objective(trial, Xt, yt, Xv, yv),
                n_trials=self.n_trials,
            )
            best_params = study.best_params
            best_params['seed'] = self.seed
            

            # Train model with best params
            model = lgb.train(
                {
                **best_params, 
                'objective':'regression',
                'metric':self.metric,
                'verbosity':-1, 
                'early_stopping_rounds': 50
                },
                lgb.Dataset(Xt, yt),
                valid_sets=[lgb.Dataset(Xv, yv)]
            )
            best_params['num_boost_round'] = model.best_iteration
            self.best_params_list.append(best_params)
            self.best_iterations.append(model.best_iteration)
            self.models.append(model)

        self.fit_ridge(X_train, y_train, X_val, y_val, self.alphas)
        
        
    def fit_ridge(self, X_train, y_train, X_val, y_val, alphas):
        """
        1) Make predictions of each base model on *full* X_train => (n_samples_train x n_models)
        2) Ridge-fit on (train_preds, y_train)
        3) Select alpha by evaluating ridge on full X_val predictions
        """
        # --- predictions on full training set ---
        train_stack = np.column_stack([m.predict(X_train, num_iteration=it)
                                       for m, it in zip(self.models, self.best_iterations)])

        # --- search alpha by evaluating on validation set ---
        val_stack = np.column_stack([m.predict(X_val, num_iteration=it)
                                     for m, it in zip(self.models, self.best_iterations)])

        self.alphas = alphas
        best_corr = -np.inf
        for a in self.alphas:
            r = Ridge(alpha=a)
            r.fit(train_stack, y_train)
            corr, _ = pearsonr(r.predict(val_stack), y_val)
            print(f' ridge alpha: {a:.5f}, val corr: {corr:.5f}')
            if corr > best_corr:
                best_corr = corr
                self.best_alpha = a
                self.ridge = r

        print(f'Best ridge alpha: {self.best_alpha:.5f}, val corr: {best_corr:.5f}')
    
    
    def refit_full(self, X_full, y_full):
        """
        Retrain base models and the ridge regression model on full data using best params and alpha.
        """
        self.models = []
        for i, params in enumerate(self.best_params_list):
            print(f'Refitting model {i+1}/{self.n_models} on full data...')
            model = lgb.train(
                {
                **params,  
                'objective':'regression',
                'metric':self.metric,
                'verbosity':-1
                },
                lgb.Dataset(X_full, y_full)
            )
            self.models.append(model)
        
        full_stack = np.column_stack([
            m.predict(X_full, num_iteration=it) for m, it in zip(self.models, self.best_iterations)
        ])
        self.ridge = Ridge(alpha=self.best_alpha)
        self.ridge.fit(full_stack, y_full)

    def predict(self, X):
        """
        Predict stacked output on full dataset X
        """
        base = np.column_stack([m.predict(X, num_iteration=it)
                                for m, it in zip(self.models, self.best_iterations)])
        return self.ridge.predict(base)

In [None]:
df_reduced.sort_index(inplace = True)
n_train = round(df_reduced.shape[0]*0.8)
X_train = df_reduced.iloc[:n_train].drop(columns = ['label'])
X_val = df_reduced.iloc[n_train:].drop(columns = ['label'])
y_train = df_reduced.iloc[:n_train]['label'] 
y_val = df_reduced.iloc[n_train:]['label'] 

ensemble = LightGBMTimeSeriesEnsemble(n_models=15, n_trials=50)
ensemble.fit(X_train, y_train, X_val, y_val)

[I 2025-08-19 15:00:03,975] A new study created in memory with name: no-name-24eaec7b-df24-4606-88fd-532dc2cfc34c


Training model 1/15


[I 2025-08-19 15:00:04,756] Trial 0 finished with value: -0.08720271523373274 and parameters: {'learning_rate': 0.015355286838886862, 'num_leaves': 62, 'max_depth': 5, 'min_data_in_leaf': 64, 'feature_fraction': 0.1624074561769746, 'bagging_fraction': 0.26239780813448105, 'bagging_freq': 1, 'lambda_l1': 4.330880728874676, 'lambda_l2': 3.005575058716044, 'num_boost_round': 413}. Best is trial 0 with value: -0.08720271523373274.
[I 2025-08-19 15:00:06,343] Trial 1 finished with value: -0.10092598658873288 and parameters: {'learning_rate': 0.005318033256270142, 'num_leaves': 63, 'max_depth': 6, 'min_data_in_leaf': 29, 'feature_fraction': 0.17272998688284025, 'bagging_fraction': 0.27336180394137355, 'bagging_freq': 4, 'lambda_l1': 2.6237821581611893, 'lambda_l2': 2.1597250932105787, 'num_boost_round': 287}. Best is trial 1 with value: -0.10092598658873288.
[I 2025-08-19 15:00:07,118] Trial 2 finished with value: -0.09174133779910788 and parameters: {'learning_rate': 0.03126143958203108, 'n

In [34]:
# further fine-tuning the ridge regression regularization parameter
ensemble.fit_ridge(X_train, y_train, X_val, y_val, range(6000, 15000, 500))

 ridge alpha: 6000.00000, val corr: 0.12500
 ridge alpha: 6500.00000, val corr: 0.12505
 ridge alpha: 7000.00000, val corr: 0.12508
 ridge alpha: 7500.00000, val corr: 0.12509
 ridge alpha: 8000.00000, val corr: 0.12510
 ridge alpha: 8500.00000, val corr: 0.12510
 ridge alpha: 9000.00000, val corr: 0.12510
 ridge alpha: 9500.00000, val corr: 0.12510
 ridge alpha: 10000.00000, val corr: 0.12509
 ridge alpha: 10500.00000, val corr: 0.12508
 ridge alpha: 11000.00000, val corr: 0.12507
 ridge alpha: 11500.00000, val corr: 0.12506
 ridge alpha: 12000.00000, val corr: 0.12505
 ridge alpha: 12500.00000, val corr: 0.12503
 ridge alpha: 13000.00000, val corr: 0.12502
 ridge alpha: 13500.00000, val corr: 0.12501
 ridge alpha: 14000.00000, val corr: 0.12500
 ridge alpha: 14500.00000, val corr: 0.12498
Best ridge alpha: 8500.00000, val corr: 0.12510


In [35]:
ensemble.refit_full(df_reduced.drop(columns='label'), df_reduced['label'])

y_test_pred = ensemble.predict(df_reduced_test)
y_test_pred_pd = pd.Series(y_test_pred, name = 'prediction')
y_test_pred_pd.index = df_test.index
y_test_pred_pd.to_csv('../data/output/submission4.csv')

Refitting model 1/20 on full data...
Refitting model 2/20 on full data...
Refitting model 3/20 on full data...
Refitting model 4/20 on full data...
Refitting model 5/20 on full data...
Refitting model 6/20 on full data...
Refitting model 7/20 on full data...
Refitting model 8/20 on full data...
Refitting model 9/20 on full data...
Refitting model 10/20 on full data...
Refitting model 11/20 on full data...
Refitting model 12/20 on full data...
Refitting model 13/20 on full data...
Refitting model 14/20 on full data...
Refitting model 15/20 on full data...
Refitting model 16/20 on full data...
Refitting model 17/20 on full data...
Refitting model 18/20 on full data...
Refitting model 19/20 on full data...
Refitting model 20/20 on full data...
