# Importing Required Libraries and Data Loading

In [None]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import shap
from itertools import combinations
import gc

In [None]:
# Let pandas display up to 200 rows and 200 columns before truncating output.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
# Run a garbage-collection pass before the large frames are loaded.
gc.collect()

In [None]:
# Directory that holds the competition parquet files (machine-specific absolute path).
data_path = "/home/max1024/Downloads/drw-crypto-market-prediction/"
# List the directory contents as a quick sanity check.
os.listdir(data_path)

In [None]:
# Load the training set, then show its shape and first rows.
train_df = pd.read_parquet(data_path + "train.parquet")
print(train_df.shape)
train_df.head()

In [None]:
# Load the test set, then show its shape and first rows.
test_df = pd.read_parquet(data_path + "test.parquet")
print(test_df.shape)
test_df.head()

In [None]:
# List the files in ../data (resolved relative to the current working directory).
os.listdir("../data")

In [None]:
# Load a previously saved feature-importance table; its "Feature" column is
# used below to pick which features to keep.
feature_importance_df = pd.read_csv("../data/feature_importance_20250723_2205.csv")
print(feature_importance_df.shape)
feature_importance_df

In [None]:
# Names of the features to keep, in the order they appear in the CSV.
selected_features_list = feature_importance_df["Feature"].tolist()
print(len(selected_features_list))
# Preview the first 50 names.
selected_features_list[:50]

In [None]:
# Split every selected feature name on "_" and keep the tokens containing an
# upper-case "X". These are the names of the raw columns the engineered
# features are built from. Duplicates are kept here.
selected_cols_list = [
    token
    for feature_name in selected_features_list
    for token in feature_name.split("_")
    if "X" in token
]

In [None]:
# Drop duplicate column names; going through a set does not preserve order.
selected_cols_list = list(set(selected_cols_list))
len(selected_cols_list)

In [None]:
# Display the de-duplicated raw column names.
selected_cols_list

In [None]:
# Every train_df column that is not one of the selected raw columns.
cols_to_drop = [col for col in train_df.columns if col not in selected_cols_list]
len(cols_to_drop)

In [None]:
# Keep the target column: take "label" off the drop list. The membership guard
# makes the cell safe to re-run, since list.remove raises ValueError once the
# entry is already gone.
if "label" in cols_to_drop:
    cols_to_drop.remove("label")

In [None]:
# Number of columns that will be dropped, now that "label" is excluded.
len(cols_to_drop)

In [None]:
# Reduce the training frame to the selected raw columns plus "label".
train_df = train_df.drop(columns=cols_to_drop)
print(train_df.shape)
train_df.head()

In [None]:
# Drop the same columns from the test frame (the list was derived from train_df's columns).
test_df = test_df.drop(columns=cols_to_drop)
print(test_df.shape)
test_df.head()

In [None]:
# Raw feature column names: every remaining train_df column except the target.
# Read as a module-level global by row_wise_feat_engi.
feature_list = train_df.drop(columns=["label"]).columns.tolist()
print(len(feature_list))
feature_list

In [None]:
def row_wise_feat_engi(df):
    """Append per-row summary statistics computed over the raw feature columns.

    The statistics are taken across the columns named in the module-level
    ``feature_list``: ``row_mean``, ``row_std``, ``row_max``, ``row_min``,
    ``row_sum`` and 19 row-wise quantiles for q = 0.05, 0.10, ..., 0.95,
    named ``row_<q>_quantile``.

    Args:
        df: DataFrame containing every column listed in ``feature_list``.

    Returns:
        A new DataFrame with ``df``'s columns followed by the new statistic
        columns. ``df`` itself is not modified.
    """
    # Slice the feature columns once and reuse the result; selecting them
    # again for every statistic copied the whole sub-frame each time.
    base = df[feature_list]

    new_features = {
        'row_mean': base.mean(axis=1),
        'row_std': base.std(axis=1),
        'row_max': base.max(axis=1),
        'row_min': base.min(axis=1),
        'row_sum': base.sum(axis=1),
    }

    for i in tqdm(range(19)):
        # round() removes float noise so the column name reads e.g. "row_0.15_quantile".
        nth = round(0.05 + i * 0.05, 2)
        new_features['row_{}_quantile'.format(nth)] = base.quantile(q=nth, axis=1)

    # Build all new columns in one frame and concatenate once, which avoids
    # inserting columns one by one into a wide frame.
    new_feats_df = pd.DataFrame(new_features, index=df.index)

    # pd.concat returns a new object and ``df`` is never mutated, so no
    # defensive copies are needed on the way in or out.
    return pd.concat([df, new_feats_df], axis=1)

In [None]:
# Add the row-wise statistic columns to the training frame.
train_df = row_wise_feat_engi(train_df)
print(train_df.shape)
train_df.head()

In [None]:
# Add the same row-wise statistic columns to the test frame.
test_df = row_wise_feat_engi(test_df)
print(test_df.shape)
test_df.head()

### Nonlinear transformations

In [None]:
# The 20 raw columns that receive nonlinear transforms and pairwise
# interactions in the cells below. Order matters: it fixes the order of the
# (f1, f2) pairs and therefore the names of the interaction columns.
top_20_features_list = [
    'X758', 'X757', 'X272', 'X614', 'X752',
    'X772', 'X753', 'X759', 'X756', 'X27',
    'X605', 'X25', 'X774', 'X332', 'X607',
    'X648', 'X754', 'X329', 'X767', 'X280',
]

In [None]:
def nonlinear_feat_engi(df):
    """Append element-wise nonlinear transforms of the top-20 feature columns.

    For every column ``f`` in the module-level ``top_20_features_list`` the
    following columns are added: ``f_percentile_rank`` (rank within ``df`` as a
    fraction), ``f_square``, ``f_cube``, ``f_sqrt`` and ``f_log1p`` (both taken
    on ``|x|``), ``f_exp``, ``f_sin``, ``f_cos``, ``f_tanh`` and ``f_abs``.

    Args:
        df: DataFrame containing every column in ``top_20_features_list``.

    Returns:
        A new DataFrame with ``df``'s columns followed by the new transform
        columns. ``df`` itself is not modified.
    """
    new_features = {}

    for f in tqdm(top_20_features_list):
        col = df[f]
        # |x| is needed by three transforms; compute it once.
        abs_col = col.abs()

        # Whole-column NumPy/pandas operations replace the per-element Python
        # lambdas that ``Series.apply`` used to call once per row.
        # NOTE(review): vectorised results keep the column's own dtype;
        # assumes the feature columns are float64 -- confirm if that matters.
        new_features["{}_percentile_rank".format(f)] = col.rank(pct=True)
        new_features["{}_square".format(f)] = col ** 2
        new_features["{}_cube".format(f)] = col ** 3
        new_features["{}_sqrt".format(f)] = np.sqrt(abs_col)
        new_features["{}_log1p".format(f)] = np.log1p(abs_col)
        new_features["{}_exp".format(f)] = np.exp(col)
        new_features["{}_sin".format(f)] = np.sin(col)
        new_features["{}_cos".format(f)] = np.cos(col)
        new_features["{}_tanh".format(f)] = np.tanh(col)
        new_features["{}_abs".format(f)] = abs_col

    # Build all new columns in one frame and concatenate once.
    new_feats_df = pd.DataFrame(new_features, index=df.index)

    # pd.concat returns a new object and ``df`` is never mutated, so the
    # defensive copies on the way in and out are unnecessary.
    return pd.concat([df, new_feats_df], axis=1)

In [None]:
# Add the nonlinear transform columns to the training frame.
train_df = nonlinear_feat_engi(train_df)
print(train_df.shape)
train_df.head()

In [None]:
# Add the same nonlinear transform columns to the test frame.
# Note: the percentile-rank columns are ranked within test_df itself.
test_df = nonlinear_feat_engi(test_df)
print(test_df.shape)
test_df.head()

### Interaction feature engineering

In [None]:
def interaction_feat_engi(df):
    """Append pairwise interaction columns for the top-20 features.

    For each unordered pair ``(f1, f2)`` drawn from the module-level
    ``top_20_features_list`` seven columns are added, all prefixed
    ``f1_f2_``: ``prod``, ``sum``, ``diff``, ``ratio`` (``f1 / (f2 + 1e-5)``),
    ``max``, ``min`` and ``absdiff``.

    Args:
        df: DataFrame containing every column in ``top_20_features_list``.

    Returns:
        A new DataFrame with the input's columns followed by the interaction
        columns. The input is not modified.
    """
    frame = df.copy()
    pair_feats = {}

    for f1, f2 in combinations(top_20_features_list, 2):
        left = frame[f1]
        right = frame[f2]
        both = frame[[f1, f2]]
        delta = left - right

        pair_feats[f'{f1}_{f2}_prod'] = left * right
        pair_feats[f'{f1}_{f2}_sum'] = left + right
        pair_feats[f'{f1}_{f2}_diff'] = delta
        # The small constant offsets the denominator away from exact zero.
        pair_feats[f'{f1}_{f2}_ratio'] = left / (right + 1e-5)
        pair_feats[f'{f1}_{f2}_max'] = both.max(axis=1)
        pair_feats[f'{f1}_{f2}_min'] = both.min(axis=1)
        pair_feats[f'{f1}_{f2}_absdiff'] = np.abs(delta)

    # Assemble all interaction columns at once, then join them to the input.
    interactions = pd.DataFrame(pair_feats, index=frame.index)
    combined = pd.concat([frame, interactions], axis=1)

    return combined.copy()

In [None]:
# Add the pairwise interaction columns to the training frame.
train_df = interaction_feat_engi(train_df)
print(train_df.shape)
train_df.head()

In [None]:
# Add the same pairwise interaction columns to the test frame.
test_df = interaction_feat_engi(test_df)
print(test_df.shape)
test_df.head()

In [None]:
# Count the columns (raw and engineered) that are not among the selected features.
# NOTE(review): the cells below select columns by name instead of dropping this
# list, so the value appears to be informational only -- confirm.
cols_to_drop = [col for col in train_df.columns if col not in selected_features_list]
len(cols_to_drop)

In [None]:
# Keep only the selected features, in the order of selected_features_list,
# followed by the target. The tuning objectives rely on this order when they
# take the first ``num_features`` columns.
train_df = train_df[selected_features_list + ["label"]]
print(train_df.shape)
train_df.head()

In [None]:
# Apply the same column selection and order to the test frame.
# NOTE(review): assumes test.parquet carries a "label" column; a KeyError is
# raised here if it does not -- confirm.
test_df = test_df[selected_features_list + ["label"]]
print(test_df.shape)
test_df.head()

In [None]:
# Reclaim memory held by the intermediate frames before tuning starts.
gc.collect()

# Hyperparameter Tuning

### LGB

In [None]:
import optuna
import optunahub
from optuna.visualization import plot_slice, plot_param_importances
# Only show Optuna log messages at WARNING level or above.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
import datetime

# Get the current time in the local timezone
current_time_local = datetime.datetime.now()
# Format to whole seconds explicitly. Slicing the last 7 characters off
# str(datetime) only works when the microsecond part is printed; when
# microsecond == 0 it is omitted and the slice would cut into the time itself.
print(f"Current time (local): {current_time_local:%Y-%m-%d %H:%M:%S}")

In [None]:
def lgb_param_tune(optuna_n_trials):
    """Tune LightGBM hyper-parameters with Optuna using time-series CV.

    Every trial samples ``num_features`` (how many of the leading feature
    columns of ``train_df`` to use) together with a set of LightGBM
    parameters, fits one ``LGBMRegressor`` per ``TimeSeriesSplit`` fold, and
    is scored by the Pearson correlation between ``label`` and the
    out-of-fold predictions.

    Reads the module-level ``train_df`` and ``selected_features_list``. The
    study is persisted to the SQLite storage configured below under a name
    that contains the start time.

    Args:
        optuna_n_trials: Number of Optuna trials to run.

    Returns:
        The ``optuna.Study`` object after optimisation (direction: maximize).
    """
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    base_params = {
        "num_iterations": 50000,
        "early_stopping_round": 1000,
        "device": "gpu",
        "verbosity": -1
    }

    # Loop-invariant across trials: split features and target once instead of
    # re-copying the whole frame at the start of every trial.
    all_features = train_df.drop(columns=["label"])
    y = train_df["label"]

    def objective(trial: optuna.Trial) -> float:
        # The number of leading feature columns is itself a tuned quantity.
        num_features = trial.suggest_int('num_features', 30, len(selected_features_list))
        X = all_features.iloc[:, :num_features]

        params_to_tune = {
            'learning_rate': trial.suggest_float('learning_rate', 3e-4, 9e-2),
            'max_depth': trial.suggest_int('max_depth', 4, 10),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 150),
            'num_leaves': trial.suggest_int('num_leaves', 20, 200),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 0.4),
            'lambda_l1': trial.suggest_float('lambda_l1', 0, 15),
            'lambda_l2': trial.suggest_float('lambda_l2', 0, 200),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1),
        }

        # Rows that never land in a validation fold stay NaN and are masked
        # out before scoring.
        oof_preds = np.full(len(X), np.nan)

        for train_idx, val_idx in tscv.split(X):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            model = LGBMRegressor(**base_params, **params_to_tune)
            model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)])

            oof_preds[val_idx] = model.predict(X_val)

        mask = ~np.isnan(oof_preds)
        return np.corrcoef(y[mask], oof_preds[mask])[0, 1]

    with tqdm(total=optuna_n_trials, desc="Optimizing", unit="trial") as pbar:

        # Advance the progress bar by one after every finished trial.
        def progress_bar_callback(study, trial):
            pbar.update(1)

        current_time_local = datetime.datetime.now()

        study = optuna.create_study(
            direction="maximize",
            sampler=optunahub.load_module("samplers/auto_sampler").AutoSampler(),
            storage="sqlite:////home/max1024/Kaggle/drw2/optuna_study/db.sqlite3",
            # Format the timestamp to whole seconds explicitly; slicing the
            # last 7 characters breaks when microsecond == 0, because
            # str(datetime) then omits the fractional part.
            study_name=f"DRW_LGB_param_tune_{current_time_local:%Y-%m-%d %H:%M:%S}"
        )
        study.optimize(objective, n_trials=optuna_n_trials, callbacks=[progress_bar_callback])

    return study

In [None]:
# Run 1000 LightGBM tuning trials.
lgb_study = lgb_param_tune(1000)

In [None]:
# Parameters of the best LightGBM trial.
lgb_study.best_params

In [None]:
# Best objective value (out-of-fold correlation) reached by the LightGBM study.
lgb_study.best_value

### XGB

In [None]:
def xgb_param_tune(optuna_n_trials):
    """Tune XGBoost hyper-parameters with Optuna using time-series CV.

    Every trial samples ``num_features`` (how many of the leading feature
    columns of ``train_df`` to use) together with a set of XGBoost
    parameters, fits one ``XGBRegressor`` per ``TimeSeriesSplit`` fold, and
    is scored by the Pearson correlation between ``label`` and the
    out-of-fold predictions.

    Reads the module-level ``train_df`` and ``selected_features_list``. The
    study is persisted to the SQLite storage configured below under a name
    that contains the start time.

    Args:
        optuna_n_trials: Number of Optuna trials to run.

    Returns:
        The ``optuna.Study`` object after optimisation (direction: maximize).
    """
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # NOTE(review): 'gpu_hist' is deprecated from XGBoost 2.0 in favour of
    # tree_method='hist' with device='cuda' -- confirm against the installed version.
    base_params = {
        'tree_method': 'gpu_hist',
        'n_estimators': 50000,
        'verbosity': 0,
        'early_stopping_rounds': 1000
    }

    # Loop-invariant across trials: split features and target once instead of
    # re-copying the whole frame at the start of every trial.
    all_features = train_df.drop(columns=["label"])
    y = train_df["label"]

    def objective(trial: optuna.Trial) -> float:
        # The number of leading feature columns is itself a tuned quantity.
        num_features = trial.suggest_int('num_features', 30, len(selected_features_list))
        X = all_features.iloc[:, :num_features]

        params_to_tune = {
            'learning_rate': trial.suggest_float('learning_rate', 3e-4, 9e-2),
            'max_depth': trial.suggest_int('max_depth', 4, 10),
            'subsample': trial.suggest_float('subsample', 0.6, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 200),
            'reg_lambda': trial.suggest_float('reg_lambda', 0, 800),
            'min_child_weight': trial.suggest_float('min_child_weight', 0, 200),
            'min_split_loss': trial.suggest_float('min_split_loss', 0, 200),
        }

        # Rows that never land in a validation fold stay NaN and are masked
        # out before scoring.
        oof_preds = np.full(len(X), np.nan)

        for train_idx, val_idx in tscv.split(X):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            model = XGBRegressor(**base_params, **params_to_tune)
            model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

            oof_preds[val_idx] = model.predict(X_val)

        mask = ~np.isnan(oof_preds)
        return np.corrcoef(y[mask], oof_preds[mask])[0, 1]

    with tqdm(total=optuna_n_trials, desc="Optimizing", unit="trial") as pbar:

        # Advance the progress bar by one after every finished trial.
        def progress_bar_callback(study, trial):
            pbar.update(1)

        current_time_local = datetime.datetime.now()

        study = optuna.create_study(
            direction="maximize",
            sampler=optunahub.load_module("samplers/auto_sampler").AutoSampler(),
            storage="sqlite:////home/max1024/Kaggle/drw2/optuna_study/db.sqlite3",
            # Format the timestamp to whole seconds explicitly; slicing the
            # last 7 characters breaks when microsecond == 0, because
            # str(datetime) then omits the fractional part.
            study_name=f"DRW_XGB_param_tune_{current_time_local:%Y-%m-%d %H:%M:%S}"
        )
        study.optimize(objective, n_trials=optuna_n_trials, callbacks=[progress_bar_callback])

    return study

In [None]:
# Run 1000 XGBoost tuning trials.
xgb_study = xgb_param_tune(1000)

In [None]:
# Parameters of the best XGBoost trial.
xgb_study.best_params

In [None]:
# Best objective value (out-of-fold correlation) reached by the XGBoost study.
xgb_study.best_value

In [None]:
def catboost_param_tune(optuna_n_trials):
    """Tune CatBoost hyper-parameters with Optuna using time-series CV.

    Every trial samples ``num_features`` (how many of the leading feature
    columns of ``train_df`` to use) together with a set of CatBoost
    parameters, fits one ``CatBoostRegressor`` per ``TimeSeriesSplit`` fold,
    and is scored by the Pearson correlation between ``label`` and the
    out-of-fold predictions.

    Reads the module-level ``train_df`` and ``selected_features_list``. The
    study is persisted to the SQLite storage configured below under a name
    that contains the start time.

    Args:
        optuna_n_trials: Number of Optuna trials to run.

    Returns:
        The ``optuna.Study`` object after optimisation (direction: maximize).
    """
    n_splits = 5
    tscv = TimeSeriesSplit(n_splits=n_splits)

    base_params = {
        'iterations': 50000,
        'verbose': 0,
        'task_type': 'GPU',
        'use_best_model': True,
        'early_stopping_rounds': 1000
    }

    # Loop-invariant across trials: split features and target once instead of
    # re-copying the whole frame at the start of every trial.
    all_features = train_df.drop(columns=["label"])
    y = train_df["label"]

    def objective(trial: optuna.Trial) -> float:
        # The number of leading feature columns is itself a tuned quantity.
        num_features = trial.suggest_int('num_features', 30, len(selected_features_list))
        X = all_features.iloc[:, :num_features]

        params_to_tune = {
            'learning_rate': trial.suggest_float('learning_rate', 3e-4, 9e-2),
            'depth': trial.suggest_int('depth', 4, 8),
            'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 800),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 3),
            'random_strength': trial.suggest_float('random_strength', 0, 40)
        }

        # Rows that never land in a validation fold stay NaN and are masked
        # out before scoring.
        oof_preds = np.full(len(X), np.nan)

        for train_idx, val_idx in tscv.split(X):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

            model = CatBoostRegressor(**base_params, **params_to_tune)
            model.fit(X_train, y_train, eval_set=(X_val, y_val))

            oof_preds[val_idx] = model.predict(X_val)

        mask = ~np.isnan(oof_preds)
        return np.corrcoef(y[mask], oof_preds[mask])[0, 1]

    with tqdm(total=optuna_n_trials, desc="Optimizing", unit="trial") as pbar:

        # Advance the progress bar by one after every finished trial.
        def progress_bar_callback(study, trial):
            pbar.update(1)

        current_time_local = datetime.datetime.now()

        study = optuna.create_study(
            direction="maximize",
            sampler=optunahub.load_module("samplers/auto_sampler").AutoSampler(),
            storage="sqlite:////home/max1024/Kaggle/drw2/optuna_study/db.sqlite3",
            # Format the timestamp to whole seconds explicitly; slicing the
            # last 7 characters breaks when microsecond == 0, because
            # str(datetime) then omits the fractional part.
            study_name=f"DRW_Catboost_param_tune_{current_time_local:%Y-%m-%d %H:%M:%S}"
        )
        study.optimize(objective, n_trials=optuna_n_trials, callbacks=[progress_bar_callback])

    return study

In [None]:
# Run 1000 CatBoost tuning trials.
catboost_study = catboost_param_tune(1000)

In [None]:
# Parameters of the best CatBoost trial.
catboost_study.best_params

In [None]:
# Best objective value (out-of-fold correlation) reached by the CatBoost study.
catboost_study.best_value