In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr
import optuna
from optuna.integration import LightGBMPruningCallback
import lightgbm as lgb

In [None]:
class Config:
    """Static settings for the notebook: input paths, feature lists and CV/tuning sizes.

    ``NEW_FEATURES`` and ``FEATURES`` start empty and are filled in at load time
    by ``_add_engineered_features`` and ``load_data`` respectively.
    """

    # --- Input files -------------------------------------------------------
    TRAIN_PATH = "/kaggle/input/drw-crypto-market-prediction/train.parquet"
    TEST_PATH = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    SUBMISSION_PATH = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"

    # --- Feature columns ---------------------------------------------------
    # Columns read straight from the parquet files (order is preserved).
    BASE_FEATURES = [
        "X863", "X856", "X344", "X598", "X862",
        "X385", "X852", "X603", "X860", "X674",
        "X415", "X345", "X137", "X855", "X174",
        "X302", "X178", "X532", "X168", "X612",
        "bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume",
        "X888", "X421", "X333",
    ]
    # Names of engineered columns; set by _add_engineered_features.
    NEW_FEATURES = []
    # BASE_FEATURES + NEW_FEATURES; set by load_data.
    FEATURES = []

    # --- Target / cross-validation / tuning --------------------------------
    LABEL_COLUMN = "label"
    N_FOLDS = 3
    RANDOM_STATE = 42
    OPTUNA_N_TRIALS_XGB = 50   # Optuna trials per XGBoost study
    OPTUNA_N_TRIALS_LGBM = 50  # Optuna trials per LightGBM study

# Fixed (non-tuned) estimator settings. The cross-validation loop merges these
# with the hyperparameters suggested by Optuna for each study.
XGB_PARAMS = dict(
    tree_method="hist",
    device="gpu",
    random_state=Config.RANDOM_STATE,
    n_jobs=-1,
    verbosity=0,
)

LGBM_PARAMS = dict(
    boosting_type="gbdt",
    device="gpu",
    n_jobs=-1,
    verbose=-1,
    random_state=Config.RANDOM_STATE,
)

# One entry per model family: a short name, the estimator class and its fixed settings.
LEARNERS = [
    dict(name="xgb", Estimator=XGBRegressor, params=XGB_PARAMS),
    dict(name="lgbm", Estimator=LGBMRegressor, params=LGBM_PARAMS),
]

# Training-window definitions. "cutoff" is the first training row a slice uses;
# the placeholders (0) are replaced once the training set size is known.
MODEL_SLICES = [
    {"name": slice_label, "cutoff": 0}
    for slice_label in ("full_data", "last_75pct", "last_50pct")
]


def create_time_decay_weights(n: int, decay: float = 0.95) -> np.ndarray:
    """Build per-row sample weights that grow from the first row to the last.

    Row ``i`` gets a raw weight of ``decay ** (1 - i / (n - 1))``, so the first
    row has raw weight ``decay`` and the last row has raw weight ``1``. The
    weights are then rescaled so that they sum to ``n`` (mean weight of 1).

    Args:
        n: Number of rows to weight.
        decay: Raw weight of the oldest row relative to the newest one.

    Returns:
        A float array of length ``n``. An empty array for ``n <= 0`` and
        ``[1.0]`` for ``n == 1``.
    """
    if n <= 0:
        return np.zeros(0, dtype=float)
    if n == 1:
        # A single row cannot be normalized by (n - 1); its mean-one weight is 1.
        return np.ones(1, dtype=float)

    positions = np.arange(n)
    normalized = positions / float(n - 1)
    weights = decay ** (1.0 - normalized)
    return weights * n / weights.sum()

def _add_engineered_features(df):
    df_eng = df.copy()
    epsilon = 1e-6 # To avoid division by zero

    df_eng['bid_ask_qty_ratio'] = df_eng['bid_qty'] / (df_eng['ask_qty'] + epsilon)
    df_eng['buy_sell_qty_ratio'] = df_eng['buy_qty'] / (df_eng['sell_qty'] + epsilon)
    df_eng['qty_imbalance'] = (df_eng['bid_qty'] - df_eng['ask_qty']) - (df_eng['buy_qty'] - df_eng['sell_qty'])
    df_eng['vol_x_bid_qty'] = df_eng['volume'] * df_eng['bid_qty']
    df_eng['vol_x_ask_qty'] = df_eng['volume'] * df_eng['ask_qty']
    
    # Add more features here if needed
    
    # Update Config.NEW_FEATURES with the names of the new columns
    new_feature_names = [col for col in df_eng.columns if col not in df.columns]
    Config.NEW_FEATURES = new_feature_names
    return df_eng

def load_data():
    """Read train/test/submission files and attach engineered features.

    Only ``Config.BASE_FEATURES`` (plus the label for train) are read from the
    parquet files. Both frames get a fresh 0..n-1 index and the engineered
    columns from ``_add_engineered_features``.

    Side effect: ``Config.FEATURES`` is set to base + engineered feature names.

    Returns:
        Tuple ``(train_df, test_df, submission_df)``.
    """
    train_columns = Config.BASE_FEATURES + [Config.LABEL_COLUMN]
    train_raw = pd.read_parquet(Config.TRAIN_PATH, columns=train_columns)
    train_raw = train_raw.reset_index(drop=True)

    test_raw = pd.read_parquet(Config.TEST_PATH, columns=Config.BASE_FEATURES)
    test_raw = test_raw.reset_index(drop=True)

    # Feature engineering (also refreshes Config.NEW_FEATURES).
    train_df = _add_engineered_features(train_raw)
    test_df = _add_engineered_features(test_raw)

    # Full model input = raw columns followed by engineered columns.
    Config.FEATURES = Config.BASE_FEATURES + Config.NEW_FEATURES

    submission_df = pd.read_csv(Config.SUBMISSION_PATH)
    print(f"Loaded train: {train_df.shape}, test: {test_df.shape}, submission: {submission_df.shape}")
    return train_df, test_df, submission_df



In [None]:
############################
# MAIN
############################

train_df, test_df, submission_df = load_data()
n_samples = len(train_df)

# Slice cutoffs: "last_75pct" skips the first quarter of the rows,
# "last_50pct" skips the first half.
MODEL_SLICES[1]["cutoff"] = int(0.25 * n_samples)
MODEL_SLICES[2]["cutoff"] = int(0.50 * n_samples)

# Zero-initialised stores, keyed as [learner name][slice name]:
#   oof_preds  -> one out-of-fold prediction per training row
#   test_preds -> running fold-averaged prediction per test row
oof_preds = {
    spec["name"]: {cfg["name"]: np.zeros(n_samples) for cfg in MODEL_SLICES}
    for spec in LEARNERS
}
test_preds = {
    spec["name"]: {cfg["name"]: np.zeros(len(test_df)) for cfg in MODEL_SLICES}
    for spec in LEARNERS
}

# Sample weights over the whole training set, and contiguous (unshuffled) folds.
full_weights = create_time_decay_weights(n_samples)
kf = KFold(n_splits=Config.N_FOLDS, shuffle=False)

# cross-validation
# For every fold x training slice x learner: tune hyperparameters with Optuna
# (objective = Pearson correlation on the validation fold), refit with the best
# parameters, then store out-of-fold and fold-averaged test predictions.
#
# NOTE(review): hyperparameters are selected on the same validation fold that is
# then used for the OOF predictions, so the OOF Pearson scores are optimistic.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train_df), start=1):
    print(f"\n--- Fold {fold}/{Config.N_FOLDS} ---")
    X_valid_fold = train_df.iloc[valid_idx][Config.FEATURES]
    y_valid_fold = train_df.iloc[valid_idx][Config.LABEL_COLUMN]

    for sl in MODEL_SLICES:
        slice_name = sl["name"]
        cutoff     = sl["cutoff"]
        # Rows from `cutoff` onwards, re-indexed from 0; fold training indices
        # are shifted by `cutoff` so they address rows of `subset`.
        subset     = train_df.iloc[cutoff:].reset_index(drop=True)
        rel_idx    = train_idx[train_idx >= cutoff] - cutoff
        print(f"model setup {slice_name}")
        X_train_slice = subset.iloc[rel_idx][Config.FEATURES]
        y_train_slice = subset.iloc[rel_idx][Config.LABEL_COLUMN]

        # sample weights (recomputed over the slice so they span only its rows)
        if cutoff == 0:
            sw_slice = full_weights[train_idx]
        else:
            sw_total = create_time_decay_weights(len(subset))
            sw_slice = sw_total[rel_idx]

        for learner in LEARNERS:
            name      = learner["name"]
            Estimator = learner["Estimator"]
            base_params = learner["params"].copy()

            def objective(trial):
                """Fit one candidate model and return its validation-fold Pearson r."""
                if name == "xgb":
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
                        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                        'max_depth': trial.suggest_int('max_depth', 3, 10),
                        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                        'gamma': trial.suggest_float('gamma', 0, 5),
                        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
                        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
                        **base_params,
                        # Early stopping is an estimator setting: passing it to
                        # fit() is deprecated and rejected by recent XGBoost.
                        'early_stopping_rounds': 50,
                    }
                    model = Estimator(**params)
                    model.fit(X_train_slice, y_train_slice, sample_weight=sw_slice,
                              eval_set=[(X_valid_fold, y_valid_fold)],
                              verbose=False)
                elif name == "lgbm":
                    params = {
                        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
                        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
                        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
                        'max_depth': trial.suggest_int('max_depth', 3, 12),
                        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
                        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
                        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
                        **base_params
                    }
                    model = Estimator(**params)
                    # No Optuna pruning callback here: the previous one watched
                    # 'l1', which is not among the evaluated metrics (the
                    # regressor's default is 'l2'), and a lower-is-better metric
                    # cannot drive pruning in a study that maximizes Pearson r.
                    model.fit(X_train_slice, y_train_slice, sample_weight=sw_slice,
                              eval_set=[(X_valid_fold, y_valid_fold)],
                              callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])
                else:
                    raise ValueError(f"Unsupported learner: {name!r}")

                preds = model.predict(X_valid_fold)
                score, _ = pearsonr(y_valid_fold, preds)
                return score

            study_name = f"{name}_{slice_name}_fold{fold}"
            study = optuna.create_study(
                direction='maximize',
                study_name=study_name,
                load_if_exists=True,
                # Seeded sampler so a rerun proposes the same trial sequence.
                sampler=optuna.samplers.TPESampler(seed=Config.RANDOM_STATE),
            )
            n_trials = Config.OPTUNA_N_TRIALS_XGB if name == "xgb" else Config.OPTUNA_N_TRIALS_LGBM
            study.optimize(objective, n_trials=n_trials, n_jobs=1) # n_jobs=1 for GPU, can be -1 for CPU

            print(f"Best params for {name} on {slice_name} fold {fold}: {study.best_params}")
            best_params_for_model = {**base_params, **study.best_params}
            model = Estimator(**best_params_for_model)

            # Refit with the best parameters on this fold's training rows of the slice.
            if name=="xgb":
                # No early stopping here: the model trains for the full
                # n_estimators chosen by Optuna.
                model.fit(X_train_slice, y_train_slice, sample_weight=sw_slice, verbose=False)
            else: # For LGBM
                # Same early-stopping setup as during the search.
                model.fit(X_train_slice, y_train_slice, sample_weight=sw_slice,
                          eval_set=[(X_valid_fold, y_valid_fold)],
                          callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
                         )

            # OOF predictions: validation rows inside the slice are predicted by
            # this model; rows before the cutoff reuse the "full_data" model's
            # OOF values (already filled, since that slice is processed first).
            mask = valid_idx >= cutoff
            if mask.any():
                idxs = valid_idx[mask]
                oof_preds[name][slice_name][idxs] = model.predict(
                    train_df.iloc[idxs][Config.FEATURES])
            if cutoff > 0 and (~mask).any():
                oof_preds[name][slice_name][valid_idx[~mask]] = (
                    oof_preds[name]["full_data"][valid_idx[~mask]])

            # test predictions: accumulate each fold's share of the fold average
            test_preds[name][slice_name] += model.predict(test_df[Config.FEATURES]) / Config.N_FOLDS

# Note: Test predictions were previously averaged outside the loop. 
# With Optuna finding best params per fold, it's more common to average predictions from each fold's best model.
# The division by N_FOLDS for test_preds is now inside the learner loop.

# Pearson correlation between the label and each (learner, slice) OOF vector.
pearson_scores = {}
for learner_key, slice_to_oof in oof_preds.items():
    per_slice = {}
    for slice_key, oof_vector in slice_to_oof.items():
        per_slice[slice_key] = pearsonr(train_df[Config.LABEL_COLUMN], oof_vector)[0]
    pearson_scores[learner_key] = per_slice

print("\nPearson scores by learner and slice:")
print(pearson_scores)

# -- Ensemble per learner across slices --
# Slice names must exist before the loop body reads them; they are also
# loop-invariant, so build the list once.
MODEL_SLICES_NAMES = [s['name'] for s in MODEL_SLICES]

learner_ensembles = {}
for learner_name, slice_scores in pearson_scores.items():
    # simple ensemble: unweighted mean over slices
    oof_simple = np.mean(list(oof_preds[learner_name].values()), axis=0)
    test_simple = np.mean([test_preds[learner_name][sn] for sn in MODEL_SLICES_NAMES], axis=0)
    score_simple = pearsonr(train_df[Config.LABEL_COLUMN], oof_simple)[0]

    # weighted ensemble: each slice weighted by its share of the summed
    # per-slice Pearson scores (equal weights if the sum is zero).
    # Computed for reporting only; it is not stored below.
    total_score = sum(slice_scores.values())
    slice_weights = {sn: sc/total_score if total_score else 1/len(slice_scores) for sn, sc in slice_scores.items()}
    oof_weighted = sum(slice_weights[sn] * oof_preds[learner_name][sn]
                       for sn in slice_weights)
    test_weighted = sum(slice_weights[sn] * test_preds[learner_name][sn]
                        for sn in slice_weights)
    score_weighted = pearsonr(train_df[Config.LABEL_COLUMN], oof_weighted)[0]

    print(f"\n{learner_name.upper()} Simple ensemble Pearson:   {score_simple:.4f}")
    print(f"\n{learner_name.upper()} Weighted ensemble Pearson: {score_weighted:.4f}")

    # Only the simple (unweighted) ensemble feeds the final blend.
    learner_ensembles[learner_name] = {
        "oof_simple": oof_simple,
        "test_simple": test_simple
    }





In [None]:
# -- Final ensemble across learners (simple) --
# Unweighted mean of every learner's simple ensemble, for OOF and test alike.
oof_stack = [ens["oof_simple"] for ens in learner_ensembles.values()]
test_stack = [ens["test_simple"] for ens in learner_ensembles.values()]
final_oof = np.mean(oof_stack, axis=0)
final_test = np.mean(test_stack, axis=0)

final_score = pearsonr(train_df[Config.LABEL_COLUMN], final_oof)[0]
print(f"\nFINAL ensemble across learners Pearson: {final_score:.4f}")

# save submission: blended test predictions go in the "prediction" column
submission_df["prediction"] = final_test
submission_df.to_csv("submission.csv", index=False)
print("Wrote submission.csv")