Feature selection via SHAP. SHAP values play a role similar to the feature-importance scores of LightGBM or XGBoost: train several models, then keep the columns that rank among the top features by importance in almost all of them.

In [None]:
import os
import sys
from pathlib import Path

class Config:
    """Notebook-wide settings: working directories, random seed and column names."""

    # All output directories hang off the directory the kernel was started in.
    project_dir = Path.cwd()
    logs_dir = project_dir / "logs"
    results_dir = project_dir / "results"

    # A working directory whose path contains "rds" uses the data stored under
    # the project directory; any other location uses a fixed local path.
    if "rds" in project_dir.as_posix():
        data_dir = project_dir / "Data/drw-crypto-market-prediction"
    else:
        data_dir = Path("/home/nikita/Data/drw-crypto-market-prediction")

    seed = 42

    # Feature column names; in the visible cells they are only referenced by a
    # commented-out `columns=` argument of the data-loading call.
    FEATURES = [
        "X863", "X856", "X344", "X598", "X862", "X385", "X852",
        "X603", "X860", "X674", "X415", "X345", "X137", "X855",
        "X174", "X302", "X178", "X532", "X168", "X612", "bid_qty",
        "ask_qty", "buy_qty", "sell_qty", "volume", "X888", "X421",
        "X333",
    ]
    # Name of the target column.
    TARGET = "label"
    
# Echo the resolved directories, one per line, so a wrong working directory
# is noticed before any data is loaded.
print(
    Config.project_dir,
    Config.logs_dir,
    Config.results_dir,
    Config.data_dir,
    sep="\n",
)

In [None]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import json
import os
import sys
import gc


import numpy as np
import pandas as pd
import seaborn as sns

# pandas display options: allow up to 1000 columns / rows / characters of width
for _option_name in ('display.max_columns', 'display.max_rows', 'display.width'):
    pd.set_option(_option_name, 1000)

# numpy print options: 3 decimals, no scientific notation, very wide lines
_np_print_options = dict(precision=3, suppress=True, linewidth=1000)
np.set_printoptions(**_np_print_options)


# Load data

In [None]:
# train = reduce_mem_usage(pd.read_parquet(Config.data_dir / "train.parquet"), "train")
# test = reduce_mem_usage(pd.read_parquet(Config.data_dir / "test.parquet"), "test")
# sample = pd.read_csv(Config.data_dir / "sample_submission.csv")

# Read every column of the training table and cast all of them to float32.
train_X = pd.read_parquet(
    Config.data_dir / "train.parquet",
    # columns=Config.FEATURES + [Config.TARGET],  # uncomment to load only the listed features
).astype(np.float32)

# Split the target off so that `train_X` holds feature columns only. The
# column name comes from Config so it is defined in a single place.
train_y = train_X.pop(Config.TARGET)

# Remove constant columns

In [None]:
# Constant features: columns that contain exactly one distinct non-NaN value
_distinct_per_column = train_X.nunique()
constant_cols = _distinct_per_column.index[_distinct_per_column == 1].tolist()

In [None]:
# Show which columns were flagged as constant.
constant_cols

In [None]:
# Remove the constant columns found above; they carry no information.
train_X = train_X.drop(columns=constant_cols)

# add features

In [None]:
def add_features(df, vol_mean=None, eps=1e-6):
    """Add engineered order-book / trade-flow columns to ``df`` in place.

    Reads the columns ``bid_qty``, ``ask_qty``, ``buy_qty``, ``sell_qty`` and
    ``volume`` and writes eleven new columns derived from them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    vol_mean : float, optional
        Mean volume used to scale ``vol_adjusted_pressure``. When omitted it is
        computed from ``df['volume']``, as before. Pass the value computed on
        the training data when transforming another split, so that both
        splits are scaled by the same statistic.
    eps : float, default 1e-6
        Small constant added to every denominator to avoid division by zero.

    Returns
    -------
    None
        The new columns are written directly into ``df``.
    """
    bid, ask = df['bid_qty'], df['ask_qty']
    buy, sell = df['buy_qty'], df['sell_qty']
    vol = df['volume']
    log1p_vol = np.log1p(vol)
    if vol_mean is None:
        vol_mean = vol.mean()

    # Shares of the traded volume on each side; reused by several features.
    buy_intensity = buy / (vol + eps)
    sell_intensity = sell / (vol + eps)
    intensity_gap = buy_intensity - sell_intensity

    # basic features
    df['volume_weighted_sell'] = sell * log1p_vol
    df['volume_weighted_buy'] = buy * log1p_vol
    df['buy_sell_ratio'] = buy / (sell + eps)
    df['selling_pressure'] = sell_intensity

    # more advanced features
    df['effective_spread_proxy'] = np.abs(buy - sell) / (vol + eps)
    df['order_imbalance'] = (bid - ask) / (bid + ask + eps)
    df['flow_imbalance'] = (buy - sell) / (buy + sell + eps)
    df['liquidity_ratio'] = (bid + ask) / (vol + eps)

    # some more advanced features from kaggle
    df['kyle_lambda'] = (
        df['flow_imbalance'] * np.sqrt(df['order_imbalance'].abs()) / (log1p_vol + eps)
    )
    df['vol_adjusted_pressure'] = np.log1p(bid + ask) * np.exp(-vol / (vol_mean + eps))
    # Signed log transform: keeps the direction of the gap, compresses its size.
    df['trade_intensity_asymmetry'] = np.sign(intensity_gap) * np.log1p(np.abs(intensity_gap))
                            
                            
                            
# Add the engineered columns to the training features (train_X is modified in place).
add_features(train_X)

# Pairwise correlation

In [None]:
# Make sure the target is not among the correlated columns. 'label' was already
# popped from train_X when the data was loaded, so errors='ignore' turns this
# into a safeguard rather than a required step.
feature_df = train_X.drop(columns='label', errors='ignore')

# Pairwise correlation between all feature columns (DataFrame.corr uses
# Pearson correlation by default).
correlation_matrix = feature_df.corr()

# Report the matrix size and preview the first rows
print("Correlation matrix shape:", correlation_matrix.shape)
correlation_matrix.head(5)


In [None]:
# Keep only the strict upper triangle (k=1): every unordered pair then appears
# exactly once and the diagonal (self-correlation) is excluded.
mask = np.triu(np.ones(correlation_matrix.shape), k=1)
corr_matrix_masked = correlation_matrix.where(mask.astype(bool))

# Retain pairs whose absolute correlation exceeds 0.98, reshape them into one
# row per pair, and order the rows by signed correlation, largest first.
_above_threshold = corr_matrix_masked.abs() > 0.98
corr_pairs = (
    corr_matrix_masked.where(_above_threshold)
    .stack()
    .reset_index()
    .set_axis(['feature_1', 'feature_2', 'correlation'], axis=1)
    .sort_values(by='correlation', ascending=False)
)

# Show result
print(f"Found {len(corr_pairs)} feature pairs.")
corr_pairs

In [None]:
# Greedy grouping of correlated features, walking the pairs in the order of
# `corr_pairs`. `pairs` maps a representative feature to
# [representative, copy, copy, ...]; `used` holds every feature that has
# already been placed in some group.
pairs = {}
used = set()
_pair_rows = corr_pairs[['feature_1', 'feature_2']].itertuples(index=False, name=None)
for anchor, partner in _pair_rows:
    if partner in used:
        # the second feature already belongs to a group
        continue
    if anchor not in pairs:
        pairs[anchor] = [anchor, partner]
        used.update((anchor, partner))
    elif partner not in pairs[anchor]:
        pairs[anchor].append(partner)
        used.add(partner)
pairs

In [None]:
# In every group the first entry is the representative that is kept; all the
# remaining entries are redundant copies.
copies_to_drop = [feature for group in pairs.values() for feature in group[1:]]
print(f"Number of features to drop: {len(copies_to_drop)}")

# Print the sorted names as a Python list literal that can be pasted into a cell.
_quoted_names = "', '".join(sorted(copies_to_drop))
print(f"['{_quoted_names}']")


In [None]:
# Hard-coded lists of columns to drop, one list per cut-off.
# NOTE(review): the variable names suggest absolute-correlation thresholds
# (drop0999 -> 0.999, drop0995 -> 0.995, ..., drop05 -> 0.5) and the lists were
# presumably pasted from the output of the cell above - confirm before use.
drop0999 = ['X146', 'X104', 'X116', 'X158', 'X110', 'X152', 'X122', 'X164', 'X170', 'X128', 'X134', 'X176', 'X140', 'X182', 'X363', 'X399', 'X357', 'X393', 'X351', 'X405', 'X411', 'X417', 'X423', 'X429', 'X694', 'X691', 'X682', 'X679', 'X670', 'X667', 'X655', 'X300', 'X658', 'X301', 'X53', 'X643', 'X54', 'X646', 'X631', 'X294']
drop0995 = ['X417', 'X423', 'X429', 'X104', 'X146', 'X110', 'X152', 'X393', 'X399', 'X405', 'X411', 'X116', 'X158', 'X122', 'X164', 'X128', 'X170', 'X134', 'X176', 'X140', 'X182', 'X694', 'X691', 'X682', 'X679', 'X670', 'X667', 'X655', 'X300', 'X658', 'X301', 'X53', 'X643', 'X54', 'X646', 'X631', 'X294', 'X295', 'X634', 'X302', 'X48', 'X47', 'X303', 'X55', 'X288', 'X56', 'X289', 'X489', 'X619', 'X490', 'X488', 'X296', 'X42', 'X491', 'X41', 'X297', 'X622', 'X494', 'X492', 'X493', 'X242', 'X50', 'X49', 'X243', 'X283', 'X241', 'X282', 'X244', 'X290', 'X245', 'X246']
drop099 = ['X393', 'X429', 'X423', 'X417', 'X411', 'X405', 'X399', 'X110', 'X152', 'X116', 'X158', 'X122', 'X164', 'X104', 'X146', 'X128', 'X170', 'X182', 'X140', 'X176', 'X134', 'X694', 'X691', 'X682', 'X679', 'X670', 'X667', 'X655', 'X300', 'X658', 'X301', 'X53', 'X643', 'X54', 'X646', 'X631', 'X294', 'X295', 'X634', 'X302', 'X48', 'X47', 'X303', 'X55', 'X288', 'X56', 'X289', 'X489', 'X619', 'X490', 'X488', 'X296', 'X42', 'X491', 'X41', 'X297', 'X622', 'X494', 'X492', 'X493', 'X242', 'X50', 'X49', 'X243', 'X283', 'X241', 'X282', 'X244', 'X290', 'X245', 'X246', 'X8', 'X291', 'X247', 'X886', 'X36', 'X191', 'X44', 'X696', 'X43', 'X435', 'X35', 'X887', 'X255']
drop098 = ['X104', 'X110', 'X116', 'X122', 'X128', 'X134', 'X140', 'X146', 'X152', 'X158', 'X16', 'X164', 'X170', 'X176', 'X182', 'X191', 'X241', 'X242', 'X243', 'X244', 'X245', 'X246', 'X247', 'X255', 'X263', 'X282', 'X283', 'X284', 'X285', 'X288', 'X289', 'X290', 'X291', 'X294', 'X295', 'X296', 'X297', 'X300', 'X301', 'X302', 'X303', 'X35', 'X351', 'X357', 'X36', 'X363', 'X369', 'X375', 'X38', 'X381', 'X387', 'X393', 'X399', 'X405', 'X41', 'X411', 'X417', 'X42', 'X423', 'X429', 'X43', 'X434', 'X435', 'X438', 'X44', 'X47', 'X48', 'X488', 'X489', 'X49', 'X490', 'X491', 'X492', 'X493', 'X494', 'X50', 'X53', 'X54', 'X55', 'X56', 'X619', 'X622', 'X631', 'X634', 'X643', 'X646', 'X655', 'X658', 'X667', 'X669', 'X670', 'X672', 'X679', 'X681', 'X682', 'X684', 'X691', 'X694', 'X696', 'X789', 'X793', 'X797', 'X8', 'X801', 'X805', 'X817', 'X821', 'X825', 'X886', 'X887']
drop095 = ['X399', 'X357', 'X393', 'X351', 'X411', 'X369', 'X405', 'X363', 'X417', 'X375', 'X429', 'X387', 'X110', 'X152', 'X164', 'X122', 'X182', 'X176', 'X339', 'X423', 'X158', 'X116', 'X170', 'X104', 'X146', 'X694', 'X691', 'X682', 'X679', 'X670', 'X667', 'X655', 'X300', 'X658', 'X301', 'X53', 'X643', 'X54', 'X646', 'X631', 'X294', 'X295', 'X634', 'X302', 'X48', 'X47', 'X303', 'X55', 'X288', 'X56', 'X289', 'X489', 'X619', 'X490', 'X488', 'X296', 'X42', 'X491', 'X41', 'X297', 'X622', 'X494', 'X492', 'X493', 'X242', 'X50', 'X49', 'X243', 'X283', 'X241', 'X282', 'X244', 'X290', 'X245', 'X246', 'X8', 'X291', 'X247', 'X886', 'X36', 'X191', 'X44', 'X696', 'X43', 'X435', 'X35', 'X887', 'X255', 'X438', 'X16', 'X285', 'X284', 'X684', 'X434', 'X789', 'X263', 'X793', 'X672', 'X797', 'X817', 'X821', 'X801', 'X38', 'X669', 'X825', 'X805', 'X681', 'X450', 'X753', 'X737', 'X749', 'X741', 'X809', 'X745', 'X449', 'X451', 'X37', 'X733', 'X829', 'X624', 'X453', 'X757', 'X5', 'X452', 'X660', 'X448', 'X719', 'X813', 'X721', 'X723', 'X447', 'X454', 'X205', 'X433', 'X725', 'X204', 'X693', 'X833', 'X455', 'X252', 'X459', 'X203', 'X657', 'X456', 'X837', 'X206', 'X727', 'X207', 'X437', 'X404', 'X841', 'X208', 'X132', 'X457', 'X889', 'X202', 'X209', 'X874', 'X138', 'X180', 'X187', 'X210', 'X410', 'X729', 'X93', 'X13', 'X211', 'X120', 'X385', 'X879', 'X337', 'X6', 'X90', 'X765', 'X761', 'X271', 'X78', 'X343', 'X24', 'X427', 'X769', 'X362', 'X260', 'X325', 'X190', 'X880', 'X212', 'X157', 'X621', 'X368', 'X773', 'X422', 'X605', 'X692', 'X341', 'X96', 'X162', 'X731', 'X428', 'X32', 'X340', 'X367', 'X326', 'X868', 'X409', 'X163', 'X466', 'volume', 'X338', 'X383', 'X425', 'X380', 'X186', 'X344', 'X785', 'X473', 'X75', 'X386', 'X777', 'X135', 'X253', 'X612', 'X842', 'X814', 'X781', 'X882', 'X865', 'X480', 'X786']
drop09 = ['X128', 'X170', 'X333', 'X122', 'X164', 'X327', 'X158', 'X116', 'X387', 'X429', 'X393', 'X140', 'X176', 'X134', 'X381', 'X423', 'X405', 'X399', 'X411', 'X152', 'X315', 'X146', 'X182', 'X417', 'X694', 'X691', 'X682', 'X679', 'X670', 'X667', 'X655', 'X300', 'X658', 'X301', 'X53', 'X643', 'X54', 'X646', 'X631', 'X294', 'X295', 'X634', 'X302', 'X48', 'X47', 'X303', 'X55', 'X288', 'X56', 'X289', 'X489', 'X619', 'X490', 'X488', 'X296', 'X42', 'X491', 'X41', 'X297', 'X622', 'X494', 'X492', 'X493', 'X242', 'X50', 'X49', 'X243', 'X283', 'X241', 'X282', 'X244', 'X290', 'X245', 'X246', 'X8', 'X291', 'X247', 'X886', 'X36', 'X191', 'X44', 'X696', 'X43', 'X435', 'X35', 'X887', 'X255', 'X438', 'X16', 'X285', 'X284', 'X684', 'X434', 'X789', 'X263', 'X793', 'X672', 'X797', 'X817', 'X821', 'X801', 'X38', 'X669', 'X825', 'X805', 'X681', 'X450', 'X753', 'X737', 'X749', 'X741', 'X809', 'X745', 'X449', 'X451', 'X37', 'X733', 'X829', 'X624', 'X453', 'X757', 'X5', 'X452', 'X660', 'X448', 'X719', 'X813', 'X721', 'X723', 'X447', 'X468', 'X454', 'X205', 'X433', 'X725', 'X204', 'X693', 'X833', 'X455', 'X252', 'X459', 'X203', 'X657', 'X456', 'X837', 'X206', 'X727', 'X207', 'X437', 'X404', 'X841', 'X208', 'X132', 'X457', 'X889', 'X202', 'X209', 'X874', 'X138', 'X180', 'X187', 'X210', 'X410', 'X729', 'X93', 'X13', 'X211', 'X120', 'X321', 'X385', 'X879', 'X337', 'X6', 'X90', 'X765', 'X761', 'X271', 'X78', 'X343', 'X24', 'X427', 'X769', 'X362', 'X260', 'X325', 'X190', 'X880', 'X212', 'X157', 'X621', 'X368', 'X773', 'X422', 'X605', 'X692', 'X341', 'X96', 'X162', 'X731', 'X428', 'X32', 'X340', 'X367', 'X326', 'X309', 'X868', 'X409', 'X163', 'X466', 'volume', 'X338', 'X383', 'X425', 'X380', 'X186', 'X344', 'X785', 'X473', 'X75', 'X386', 'X777', 'X135', 'X253', 'X612', 'X842', 'X814', 'X781', 'X882', 'X865', 'X480', 'X786', 'X758', 'X115', 'X113', 'X71', 'X279', 'X21', 'X219', 'X322', 'X155', 'X268', 'X360', 'X177', 'X28', 'X318', 'X426', 'X119', 'X883', 'X275', 'X29', 'X467', 'X602', 
'X161', 'X890', 'X695', 'X77', 'X121', 'X402', 'X173', 'X366', 'X378', 'X382', 'X324', 'X276', 'X358', 'X408', 'X323', 'X175', 'X407', 'X117', 'X179', 'X689', 'X131', 'X384', 'X400', 'X364', 'X436', 'X609', 'X153', 'X365', 'X159', 'X406', 'X462', 'X474', 'X336', 'X424', 'X463', 'X137', 'X342', 'X181', 'X746', 'X89', 'X774', 'X830', 'X194', 'X73', 'X130', 'X166', 'X95', 'X136', 'X178', 'X88', 'X79', 'X802', 'X76', 'X97', 'X160', 'X118', 'X94', 'X133', 'X139', 'X216', 'X227', 'X226', 'X232', 'X220', 'X680', 'X14', 'X653', 'X470', 'X477', 'X250', 'X261', 'X3', 'X471', 'X668', 'X233', 'X223', 'X475', 'X168', 'X518', 'X373', 'X650', 'X415', 'X861', 'X596', 'X854']
drop05 = ['X158', 'X417', 'X164', 'X357', 'X399', 'X393', 'X351', 'X134', 'X176', 'X339', 'X98', 'X411', 'X170', 'X429', 'X146', 'X423', 'X152', 'X405', 'X363', 'X182', 'X345', 'X694', 'X691', 'X682', 'X679', 'X670', 'X667', 'X655', 'X300', 'X658', 'X301', 'X53', 'X643', 'X54', 'X646', 'X631', 'X294', 'X295', 'X634', 'X302', 'X48', 'X47', 'X303', 'X55', 'X288', 'X56', 'X289', 'X489', 'X619', 'X490', 'X488', 'X296', 'X42', 'X491', 'X41', 'X297', 'X622', 'X494', 'X492', 'X493', 'X242', 'X50', 'X49', 'X243', 'X283', 'X241', 'X282', 'X244', 'X290', 'X245', 'X246', 'X8', 'X291', 'X247', 'X886', 'X36', 'X191', 'X44', 'X696', 'X43', 'X435', 'X35', 'X185', 'X887', 'X255', 'X438', 'X16', 'X285', 'X284', 'X684', 'X434', 'X789', 'X263', 'X793', 'X672', 'X797', 'X817', 'X821', 'X801', 'X38', 'X669', 'X825', 'X805', 'X681', 'X450', 'X753', 'X737', 'X749', 'X741', 'X809', 'X745', 'X449', 'X451', 'X37', 'X733', 'X829', 'X624', 'X453', 'X757', 'X5', 'X452', 'X660', 'X448', 'X719', 'X813', 'X721', 'X723', 'X447', 'X468', 'X454', 'X205', 'X433', 'X725', 'X204', 'X693', 'X833', 'X455', 'X252', 'X459', 'X203', 'X657', 'X456', 'X837', 'X206', 'X727', 'X207', 'X437', 'X404', 'X841', 'X208', 'X132', 'X457', 'X889', 'X202', 'X209', 'X874', 'X138', 'X180', 'X187', 'X333', 'X210', 'X410', 'X729', 'X93', 'X327', 'X13', 'X211', 'X120', 'X385', 'X879', 'X337', 'X6', 'X90', 'X765', 'X761', 'X271', 'X78', 'X343', 'X24', 'X427', 'X769', 'X362', 'X392', 'X260', 'X325', 'X190', 'X880', 'X212', 'X157', 'X621', 'X368', 'X773', 'X422', 'X605', 'X692', 'X341', 'X96', 'X162', 'X731', 'X428', 'X32', 'X857', 'X340', 'X367', 'X326', 'X868', 'X272', 'X409', 'X163', 'X466', 'volume', 'X338', 'X383', 'X425', 'X380', 'X186', 'X344', 'X785', 'X473', 'X75', 'X386', 'X777', 'X135', 'X253', 'X612', 'X842', 'X814', 'X781', 'X882', 'X865', 'X25', 'X480', 'X786', 'X758', 'X115', 'X145', 'X113', 'X71', 'X279', 'X21', 'X219', 'X322', 'X155', 'X268', 'X360', 'X177', 'X28', 'X318', 'X426', 'X119', 'X883', 'X275', 'X29', 
'X467', 'X460', 'X602', 'X161', 'X890', 'X695', 'X77', 'X121', 'X402', 'X173', 'X366', 'X378', 'X382', 'X324', 'X276', 'X358', 'X408', 'X323', 'X175', 'X407', 'X117', 'X179', 'X689', 'X131', 'X384', 'X400', 'X364', 'X436', 'X609', 'X153', 'X365', 'X159', 'X406', 'X462', 'X474', 'X336', 'X424', 'X463', 'X137', 'X342', 'X181', 'X861', 'X746', 'X89', 'X774', 'X830', 'X194', 'X73', 'X103', 'X130', 'X166', 'X95', 'X136', 'X178', 'X88', 'X79', 'X802', 'X74', 'X76', 'X97', 'X160', 'X118', 'X94', 'X133', 'X80', 'X139', 'X216', 'X227', 'X226', 'X232', 'X220', 'X680', 'X14', 'X653', 'X470', 'X477', 'X250', 'X261', 'X3', 'X471', 'X668', 'X233', 'X223', 'X475', 'X168', 'X518', 'X373', 'X650', 'X644', 'X415', 'X11', 'X594', 'X148', 'X258', 'X224', 'X516', 'X142', 'X81', 'X314', 'X350', 'X530', 'X371', 'X532', 'X567', 'X565', 'X581', 'X413', 'X108', 'X269', 'X230', 'X328', 'X22', 'X858', 'X266', 'X346', 'X273', 'X63', 'X313', 'X603', 'X19', 'X26', 'X397', 'X579', 'X30', 'X189', 'X588', 'X310', 'X150', 'X228', 'X539', 'X355', 'X66', 'X123', 'X517', 'X610', 'X600', 'X414', 'X778', 'X656', 'X607', 'X586', 'X277', 'X659', 'X834', 'X750', 'X370', 'X105', 'X141', 'X165', 'X412', 'X806', 'X636', 'X642', 'X85', 'X395', 'X214', 'X537', 'X353', 'X311', 'X766', 'X86', 'X64', 'X822', 'X738', 'X735', 'X671', 'X794', 'X791', 'X531', 'X665', 'X629', 'X623', 'X478', 'X566', 'X580', 'X639', 'X633', 'X859', 'X597', 'X587', 'X538', 'X501', 'X533', 'X528', 'X540', 'X508', 'X515', 'X519', 'X514', 'X522', 'X851', 'X849', 'X850', 'X542', 'X866', 'X743', 'X755', 'X759', 'X831', 'X504', 'X775', 'X747', 'X632', 'X569', 'X564', 'X839', 'X783', 'X648', 'X557', 'X521', 'X845', 'X843', 'X502', 'X582', 'X550', 'X787', 'X529', 'X534', 'X799', 'X143', 'X846', 'X847', 'X571', 'X803', 'X553', 'X551', 'X256', 'X589', 'X578', 'X583', 'X536', 'X543', 'X811', 'X568', 'X503', 'X552', 'X815', 'X585', 'X877', 'X9', 'X584', 'X590', 'X620', 'X507', 'X592', 'X591', 'X570', 'X635', 'X500', 'X860', 'X348', 'X823', 'X767', 
'X862', 'X856', 'X390', 'X556', 'X549', 'X593', 'X388', 'X876', 'X596', 'X854']

# Report how many columns each drop list contains.
_drop_lists = (
    ("drop0999", drop0999),
    ("drop0995", drop0995),
    ("drop099", drop099),
    ("drop095", drop095),
    ("drop09", drop09),
    ("drop05", drop05),
)
for _list_name, _columns in _drop_lists:
    print(f"{_list_name} len:", len(_columns))

In [None]:
# TODO check how much cv score lin reg decreases with each drop

In [None]:
# Number of feature columns currently in train_X
train_X.shape[1]


# Final features to drop

In [None]:
# high_corr_drop = ['X39', 'X41', 'X40', 'X42', 'X45', 'X47', 'X46', 'X48', 'X47', 
#                   'X49', 'X48', 'X50', 'X51', 'X53', 'X55', 'X52', 'X54', 'X56', 
#                   'X62', 'X104', 'X146', 'X68', 'X110', 'X152', 'X74', 'X116', 
#                   'X158', 'X80', 'X122', 'X164', 'X86', 'X128', 'X170', 'X92', 
#                   'X134', 'X176', 'X98', 'X140', 'X182', 'X234', 'X241', 'X235', 
#                   'X242', 'X236', 'X243', 'X237', 'X244', 'X238', 'X245', 'X239', 
#                   'X246', 'X280', 'X282', 'X281', 'X283', 'X286', 'X288', 'X287', 
#                   'X289', 'X288', 'X290', 'X292', 'X294', 'X293', 'X295', 'X294', 
#                   'X296', 'X295', 'X297', 'X298', 'X300', 'X302', 'X299', 'X301', 
#                   'X303', 'X309', 'X351', 'X393', 'X315', 'X357', 'X399', 'X321', 
#                   'X363', 'X405', 'X327', 'X369', 'X411', 'X333', 'X375', 'X417', 
#                   'X339', 'X381', 'X423', 'X345', 'X387', 'X429', 'X481', 'X488', 
#                   'X482', 'X489', 'X483', 'X490', 'X484', 'X491', 'X485', 'X492', 
#                   'X486', 'X493', 'X487', 'X494', 'X613', 'X619', 'X616', 'X622', 
#                   'X625', 'X631', 'X628', 'X634', 'X637', 'X643', 'X640', 'X646', 
#                   'X649', 'X655', 'X652', 'X658', 'X661', 'X667', 'X664', 'X670', 
#                   'X673', 'X679', 'X676', 'X682', 'X685', 'X691', 'X688', 'X694']

# constant_cols = ['X697', 'X698', 'X699', 'X700', 'X701', 'X702', 'X703', 'X704', 'X705', 
#                  'X706', 'X707', 'X708', 'X709', 'X710', 'X711', 'X712', 'X713', 'X714', 
#                  'X715', 'X716', 'X717', 'X864', 'X867', 'X869', 'X870', 'X871', 'X872']

# train_X.drop(columns=high_corr_drop + constant_cols, inplace=True)

In [None]:
# train_X.head() 

# cv code

In [None]:
# What do i want to do here? 
from sklearn.base import BaseEstimator
from sklearn.model_selection import KFold


def xgb_params(trial):
    """Suggest one set of XGBoost parameters from an Optuna trial.

    Fixed settings are requested as single-choice categoricals, so they go
    through the trial like every other parameter. The suggestions are made in
    the same order as the keys of the returned dict.
    """
    # fixed
    fixed = {
        name: trial.suggest_categorical(name, [value])
        for name, value in (
            ('objective', 'reg:squarederror'),
            ('tree_method', 'hist'),
            ('device', 'cuda'),
            # ('predictor', 'gpu_predictor'),
            ('random_state', Config.seed),
        )
    }

    # hyperparams
    tuned = {}
    # min == max == 100, so the number of trees is effectively fixed at 100
    tuned['n_estimators'] = trial.suggest_int('n_estimators', 100, 100, step=100)
    tuned['learning_rate'] = trial.suggest_float('learning_rate', 0.005, 0.05, log=True)
    tuned['max_depth'] = trial.suggest_int('max_depth', 3, 7)
    tuned['subsample'] = trial.suggest_float('subsample', 0.01, 0.25, log=True)
    for column_sampling in ('colsample_bytree', 'colsample_bylevel', 'colsample_bynode'):
        tuned[column_sampling] = trial.suggest_float(column_sampling, 0.3, 0.7)
    tuned['lambda'] = trial.suggest_float('lambda', 10, 200, log=True)
    tuned['alpha'] = trial.suggest_float('alpha', 10, 100, log=True)
    tuned['gamma'] = trial.suggest_float('gamma', 0.0, 3.0)
    # other: n_jobs, random_state, verbose, max_leaves, min_child_weight

    return {**fixed, **tuned}



def lgbm_params(trial):
    """Suggest one set of LightGBM parameters from an Optuna trial.

    Fixed settings are requested as single-choice categoricals, so they go
    through the trial like every other parameter.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to draw the values.

    Returns
    -------
    dict
        Keyword arguments for the LightGBM estimator.
    """
    return {
        # fixed
        'objective': trial.suggest_categorical('objective', ['regression']),
        'device': trial.suggest_categorical('device', ['cuda']),
        'random_state': trial.suggest_categorical('random_state', [Config.seed]),
        'verbose': trial.suggest_categorical('verbose', [0]),  # no output
        # LightGBM only performs bagging when the bagging frequency is
        # non-zero (the default is 0). Without this entry the tuned
        # `subsample` below has no effect on training.
        'subsample_freq': trial.suggest_categorical('subsample_freq', [1]),
        # hyperparams
        # min == max == 100, so the number of trees is effectively fixed at 100
        'n_estimators': trial.suggest_int('n_estimators', 100, 100, step=100),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.05, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 128),
        'subsample': trial.suggest_float('subsample', 0.01, 0.25, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 0.7),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.3, 0.7),
        'reg_alpha': trial.suggest_float('reg_alpha', 10, 100, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 10, 200, log=True),
    }# other? goss


def catb_params(trial):
    """Suggest one set of CatBoost parameters from an Optuna trial.

    Fixed settings are requested as single-choice categoricals, so they go
    through the trial like every other parameter. The suggestions are made in
    the same order as the keys of the returned dict.
    """
    def pinned(name, value):
        # a categorical with a single choice always yields that choice
        return trial.suggest_categorical(name, [value])

    params = {}

    # fixed
    # ('loss_function', 'RMSE') is intentionally not set
    params['task_type'] = pinned('task_type', 'GPU')
    params['random_state'] = pinned('random_state', Config.seed)
    params['verbose'] = pinned('verbose', 0)  # no output
    params['leaf_estimation_iterations'] = pinned('leaf_estimation_iterations', 5)  # default 10

    # hyperparams
    # min == max == 100, so the number of trees is effectively fixed at 100
    params['n_estimators'] = trial.suggest_int('n_estimators', 100, 100, step=100)
    params['learning_rate'] = trial.suggest_float('learning_rate', 0.005, 0.05, log=True)
    params['depth'] = trial.suggest_int('depth', 3, 7)
    params['subsample'] = trial.suggest_float('subsample', 0.01, 0.25, log=True)
    # 'colsample_bylevel' (0.05 - 0.25, log): only supported on cpu
    # 'bootstrap_type': ['Bernoulli'] is not set either
    params['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 10, 200, log=True)
    # other? max_bin?

    return params




# I've decided to use naive nested 5-fold CV for the tuning.
# Maybe setting one of the two levels to None could reduce it to a simple k-fold.
# Not sure yet what to do about ensembling, but that is an issue for much later.

import optuna
import logging
from datetime import datetime
import os
import sys

import numpy as np
from scipy.stats import pearsonr
    
def do_opuna_optimization(
    X: np.ndarray,
    y: np.ndarray,
    ModelClass: BaseEstimator,
    params_fn: Callable = xgb_params,
    n_trials: int = 100,
    cv = KFold(n_splits=5, shuffle=False),
):
    """Tune ``ModelClass`` with Optuna, scoring each trial by cross-validated Pearson r.

    Parameters
    ----------
    X, y : np.ndarray
        Feature matrix and target. They are indexed with the integer index
        arrays produced by ``cv.split``.
    ModelClass
        Estimator class (not an instance). It is instantiated as
        ``ModelClass(**params)`` for every fold and must provide ``fit`` and
        ``predict``.
    params_fn : callable
        Maps an Optuna trial to the keyword arguments for ``ModelClass``.
    n_trials : int
        Number of Optuna trials to run.
    cv
        Splitter object with a ``split(X, y)`` method.

    Returns
    -------
    optuna.study.Study
        The finished study; its direction is "maximize".

    Side effects
    ------------
    Creates ``Config.logs_dir / ModelClass.__name__`` if needed and writes a
    timestamped log file there. The root logger level is set to INFO and
    Optuna log propagation is enabled. The file handler is attached to the
    root logger only while the study runs.
    """
    #Configure logging
    timestamp = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    log_dir = Config.logs_dir / ModelClass.__name__
    os.makedirs(log_dir, exist_ok=True)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    file_handler = logging.FileHandler(log_dir / f"optuna{timestamp}.log", mode="w")
    logger.addHandler(file_handler)
    optuna.logging.enable_propagation()

    #optuna objective
    def objective(trial):
        """Return the mean Pearson correlation over all CV folds for this trial."""
        params = params_fn(trial)
        scores = []
        for train_idx, valid_idx in cv.split(X, y):
            X_train, X_valid = X[train_idx], X[valid_idx]
            y_train, y_valid = y[train_idx], y[valid_idx]
            # a fresh model per fold, so nothing carries over between folds
            mdl = ModelClass(**params)
            mdl.fit(X_train, y_train)
            preds = mdl.predict(X_valid)
            rho, _ = pearsonr(y_valid, preds)
            scores.append(rho)
        return np.mean(scores)

    study = optuna.create_study(direction="maximize",
                                sampler=optuna.samplers.TPESampler(seed=Config.seed))
    try:
        study.optimize(objective, n_trials=n_trials)
    finally:
        # Detach and close the handler even if the run is interrupted.
        # Otherwise every call leaves another open file attached to the root
        # logger, and later runs are also written into earlier runs' logs.
        logger.removeHandler(file_handler)
        file_handler.close()

    print("Best trial:", study.best_trial.number)
    # The objective is a Pearson correlation (higher is better), not an RMSE.
    print("Best value (mean CV Pearson r):", study.best_value)
    print("Best params:", study.best_params)
    return study


from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [None]:
# Tune XGBoost: 5 Optuna trials, each scored with unshuffled 5-fold CV on train_X / train_y.
do_opuna_optimization(
    X=train_X.values,
    y=train_y.values,
    ModelClass=XGBRegressor,
    params_fn=xgb_params,
    n_trials=5,
    cv=KFold(n_splits=5, shuffle=False),
)

In [None]:
# Tune LightGBM: 5 Optuna trials, each scored with unshuffled 5-fold CV on train_X / train_y.
do_opuna_optimization(
    X=train_X.values,
    y=train_y.values,
    ModelClass=LGBMRegressor,
    params_fn=lgbm_params,
    n_trials=5,
    cv=KFold(n_splits=5, shuffle=False),
)

In [None]:
# Tune CatBoost: 5 Optuna trials, each scored with unshuffled 5-fold CV on train_X / train_y.
do_opuna_optimization(
    X=train_X.values,
    y=train_y.values,
    ModelClass=CatBoostRegressor,
    params_fn=catb_params,
    n_trials=5,
    cv=KFold(n_splits=5, shuffle=False),
)

# I need to do feature importances

In [None]:

# i need to train a single model and do feature importances

# testing

In [None]:
# Run the stand-alone tuning script for XGBoost only (5 trials, 5 folds, seed 42).
# NOTE(review): --logs_dir and --data_dir are absolute, user-specific paths;
# adjust them before running this cell on another machine.
%run kaggle/do_optuna.py \
    --models "XGBoost" \
    --logs_dir "/rds/general/user/nz423/home/kaggle/logs/" \
    --data_dir "/rds/general/user/nz423/home/Data/drw-crypto-market-prediction/" \
    --n_optuna_trials 5 \
    --kfolds 5 \
    --seed 42

  from .autonotebook import tqdm as notebook_tqdm
  df['volume_weighted_sell'] = sell * log1p_vol
  df['volume_weighted_buy'] = buy * log1p_vol
  df['buy_sell_ratio'] = buy / (sell + EPS)
  df['selling_pressure'] = sell / (vol + EPS)
  df['effective_spread_proxy'] = np.abs(buy - sell) / (vol + EPS)
  df['order_imbalance'] = (bid - ask) / (bid + ask + EPS)
  df['flow_imbalance'] = (buy - sell) / (buy + sell + EPS)
  df['liquidity_ratio'] = (bid + ask) / (vol + EPS)
  df['kyle_lambda'] = df['flow_imbalance'] * np.sqrt(df['order_imbalance'].abs()) / (log1p_vol + EPS)
  df['vol_adjusted_pressure'] =  np.log1p(bid + ask) * np.exp(-vol / (vol.mean() + EPS))
  df['trade_intensity_asymmetry'] = np.sign(buy_intensity - sell_intensity) * \
[I 2025-06-25 21:36:34,747] A new study created in memory with name: no-name-f7eb7958-08de-4317-912b-6f19240a3a17
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_pr

Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x1539d893bf50>>
Traceback (most recent call last):
  File "/rds/general/user/nz423/home/miniforge3/envs/hydraboost/lib/python3.11/site-packages/xgboost/core.py", line 585, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 
