In [1]:
# Constants
# Run-mode switches and experiment settings read by the cells below.
TRAIN = False  # when True, the training cells (data prep, Optuna/MLflow runs, model export) execute
OVERWRITE = False  # when True, training data is read from the raw CSV even if a processed copy exists
DEBUG = False  # when True, training keeps only stock_id 0, 1, 2 and uses 50 estimators

tuning = False  # True: build models from the Optuna search space; False: reuse params of a saved model

VERSION_NB = 4  # logged as the "version_nb" MLflow tag
STATE = 42  # random_state passed to KFold and to the LightGBM static params
N_TRIALS = 1  # number of Optuna trials
download_kaggle_data = False  # when True (outside /kaggle/working), fetch the dataset through get_data
models_dir = "models_8"  # folder trained models are copied to; "_" -> "-" gives the /kaggle/input dataset name

# External general-purpose modules
import gc
import os
import zipfile

import shutil
import warnings
from datetime import datetime
import itertools as itt
from itertools import combinations, product
from warnings import simplefilter

import joblib
import numpy as np
import pandas as pd
import polars as pl
from dotenv import load_dotenv
from joblib import dump
from numba import njit, prange

# Setting pandas options and warning filters
pd.set_option("display.max_columns", None)
# NOTE: the blanket filterwarnings("ignore") below silences every warning
# category, so the two category-specific filters are effectively redundant.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Load environment variables
# (ROOT_PATH is read from the environment in a later cell; as the last
# expression of the cell, the call's return value is shown as the cell output.)
load_dotenv()

False

In [2]:
#from utils import aggregate_feature_importance

In [3]:
path_project_dir = os.getcwd()
if path_project_dir not in ["/kaggle/working", "/content"]:
    path_project_dir = os.getenv("ROOT_PATH")

# Conditional imports and settings based on TRAIN constant
if TRAIN:

    if path_project_dir == '/kaggle/working':
        !pip install loguru mlflow optuna > /dev/null

    # External Libraries
    import lightgbm as lgbm
    import mlflow
    import optuna
    from loguru import logger
    from mlflow.tracking import MlflowClient
    from optuna.integration.mlflow import MLflowCallback
    from sklearn.model_selection import KFold
    from tqdm import tqdm
    from xgboost import XGBRegressor as XGBR
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR

    # Local Modules Imports
    from utils import log_feature_importance, create_model, log_training_details, aggregate_feature_importance, get_data, clean_directory_except_one, experiments_data


    # Logger setup
    logger.add("logs.log", format="{time:YYYY-MM-DD HH:mm} | {level} | {message}")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
    
    # Auto-reload modules - Specific to Jupyter Notebooks
    %load_ext autoreload
    %autoreload 2

    # Initialize MLflow callback
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )


In [4]:
# Build every data / experiment path used by the notebook, depending on
# whether it runs inside /kaggle/working or from a local project checkout.
if path_project_dir == "/kaggle/working":
    path_data_project_dir = "/kaggle/input/optiver-trading-at-the-close"
    path_experiments_storage = os.path.join(path_project_dir, "experiments_storage")

    path_dataset_train_raw = "/kaggle/input/optiver-trading-at-the-close/train.csv"
    path_dataset_test_raw = (
        "/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv"
    )

    path_dataset_processed = "/kaggle/working/processed_data"
    path_dataset_train = os.path.join(path_dataset_processed, "train.csv")
    path_dataset_test = os.path.join(path_dataset_processed, "test.csv")

else:
    name_folder_data_project = "kaggle_optiver_trading_at_the_close"

    path_data_dir = os.path.join(path_project_dir, "data")
    path_data_project_dir = os.path.join(path_data_dir, name_folder_data_project)

    # raw/ and processed/ live under the project data folder
    path_dataset_train_raw = os.path.join(path_data_project_dir, "raw", "train.csv")
    path_dataset_processed = os.path.join(path_data_project_dir, "processed")

    path_config_dir = os.path.join(path_project_dir, "config")
    path_config_train = os.path.join(path_config_dir, "train_config.yaml")

    path_experiments_storage = os.path.join(
        path_data_project_dir, "experiments_storage"
    )

    if download_kaggle_data:
        # get_data is otherwise imported only when TRAIN is True; import it
        # here so the download also works with TRAIN = False.
        from utils import get_data

        dataset_name = "ravi20076/optiver-memoryreduceddatasets"
        kaggle_json_path = os.path.join(path_project_dir, "kaggle.json")
        get_data(
            kaggle_json_path,
            path_data_project_dir,
            dataset_name=dataset_name,
            specific_file=None,
        )

    file_name_df_train = "train.csv"
    file_name_df_test = "test.csv"

    path_dataset_train = os.path.join(path_data_project_dir, file_name_df_train)
    path_dataset_test = os.path.join(path_data_project_dir, file_name_df_test)

if TRAIN:
    # MLflow stores its runs under the experiments folder chosen above.
    mlflow.set_tracking_uri(path_experiments_storage)
    client = MlflowClient()

In [5]:
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute (max - mid) / (mid - min) for each column triplet, row by row.

    Parameters
    ----------
    df_values : 2-D array indexed as ``[row, column]``.
    comb_indices : sequence of ``(a, b, c)`` column-index triplets.

    Returns
    -------
    Array of shape ``(n_rows, n_triplets)``; an entry is NaN when the middle
    value equals the minimum (the ratio's denominator would be zero).
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    out = np.empty((n_rows, n_triplets))

    # Triplets are independent of each other, so they form the parallel loop.
    for t in prange(n_triplets):
        col_a, col_b, col_c = comb_indices[t]
        for r in range(n_rows):
            x = df_values[r, col_a]
            y = df_values[r, col_b]
            z = df_values[r, col_c]
            hi = max(x, y, z)
            lo = min(x, y, z)
            # The middle value is what remains once min and max are removed.
            mid = x + y + z - lo - hi
            if mid != lo:
                out[r, t] = (hi - mid) / (mid - lo)
            else:
                # Prevent division by zero
                out[r, t] = np.nan

    return out


def calculate_triplet_imbalance_numba(price, df):
    """Build the triplet-imbalance feature frame for the columns in ``price``.

    Parameters
    ----------
    price : list of column names; every 3-combination of them yields a feature.
    df : DataFrame containing those columns.

    Returns
    -------
    DataFrame with one ``"{a}_{b}_{c}_imb2"`` column per triplet and a fresh
    default index.
    """
    triplets = list(combinations(price, 3))

    # Positions of each triplet's columns inside the extracted value matrix.
    comb_indices = [
        tuple(price.index(name) for name in triplet) for triplet in triplets
    ]

    # Plain numpy values, as required by the Numba kernel.
    values = df[price].values
    features_array = compute_triplet_imbalance(values, comb_indices)

    column_names = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]
    return pd.DataFrame(features_array, columns=column_names)

In [6]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Columns of dtype ``object`` and the ``"target"`` column are left untouched.
    Signed-integer columns are cast to the smallest of int8/int16/int32/int64
    whose range strictly contains the column's min and max; every other
    column is cast to float32.

    Parameters
    ----------
    df : DataFrame to shrink (modified in place).
    verbose : when truthy, report memory usage before/after. The report goes
        to the notebook-level ``logger`` when one exists, otherwise to
        ``print`` (the logger is only imported in training mode).

    Returns
    -------
    The same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2

    int_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or col == "target":
            continue

        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # First (smallest) integer type whose open range holds the data.
            for int_type in int_types:
                bounds = np.iinfo(int_type)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            # Every non-integer branch of the original stored float32.
            df[col] = df[col].astype(np.float32)

    if verbose:
        # `logger` only exists when the training imports ran; fall back to
        # print so verbose mode never raises NameError.
        log = logger.info if "logger" in globals() else print
        log(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        log(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        log(f"Decreased by {decrease:.2f}%")

    return df

In [7]:
# generate imbalance features
def imbalance_features(df):
    """Add the order-book imbalance feature columns to ``df``.

    Columns are assigned onto ``df`` itself; the returned frame is the result
    of replacing +/-inf with 0. Reads the notebook-level ``weights`` mapping
    (stock_id -> weight) and calls ``calculate_triplet_imbalance_numba``.
    """
    prices = [
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    by_stock = ["stock_id"]
    by_stock_day = ["stock_id", "date_id"]
    lags = [1, 2, 3, 5, 10]

    # V1 -- simple ratios, evaluated in this order
    v1_expressions = {
        "volume": "ask_size + bid_size",
        "mid_price": "(ask_price + bid_price) / 2",
        "liquidity_imbalance": "(bid_size-ask_size)/(bid_size+ask_size)",
        "matched_imbalance": "(imbalance_size-matched_size)/(matched_size+imbalance_size)",
        "size_imbalance": "bid_size / ask_size",
    }
    for name, expression in v1_expressions.items():
        df[name] = df.eval(expression)

    # Pairwise price imbalances
    for left, right in combinations(prices, 2):
        df[f"{left}_{right}_imb"] = df.eval(f"({left} - {right})/({left} + {right})")

    # Triplet imbalances for a price group and for the size group
    for group in (["ask_price", "bid_price", "wap", "reference_price"], sizes):
        triplet_feature = calculate_triplet_imbalance_numba(group, df)
        df[triplet_feature.columns] = triplet_feature.values

    # V2
    bid_px, ask_px = df["bid_price"], df["ask_price"]
    bid_sz, ask_sz = df["bid_size"], df["ask_size"]

    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df["wap_momentum"] = df.groupby("stock_id")["weighted_wap"].pct_change(periods=6)

    imbalance_step = df.groupby(by_stock)["imbalance_size"].diff(periods=1)
    df["imbalance_momentum"] = imbalance_step / df["matched_size"]

    df["price_spread"] = ask_px - bid_px
    df["spread_intensity"] = df.groupby(by_stock)["price_spread"].diff()
    df["price_pressure"] = df["imbalance_size"] * (ask_px - bid_px)
    df["market_urgency"] = df["price_spread"] * df["liquidity_imbalance"]
    df["depth_pressure"] = (ask_sz - bid_sz) * (df["far_price"] - df["near_price"])
    df["spread_depth_ratio"] = (ask_px - bid_px) / (bid_sz + ask_sz)

    def _direction(delta):
        # +1 / -1 / 0; NaN compares False both ways and therefore maps to 0
        if delta > 0:
            return 1
        if delta < 0:
            return -1
        return 0

    # The 5-row difference is taken over the whole frame (not per stock).
    df["mid_price_movement"] = df["mid_price"].diff(periods=5).apply(_direction)

    df["micro_price"] = ((bid_px * ask_sz) + (ask_px * bid_sz)) / (bid_sz + ask_sz)
    df["relative_spread"] = (ask_px - bid_px) / df["wap"]

    # Row-wise statistics across all prices / all sizes
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # V3 -- lagged values and returns within each (stock, day)
    lagged_columns = [
        "matched_size",
        "imbalance_size",
        "reference_price",
        "imbalance_buy_sell_flag",
    ]
    for col, window in product(lagged_columns, lags):
        df[f"{col}_shift_{window}"] = df.groupby(by_stock_day)[col].shift(window)
        df[f"{col}_ret_{window}"] = df.groupby(by_stock_day)[col].pct_change(window)

    # Differences within each (stock, day)
    diffed_columns = [
        "ask_price",
        "bid_price",
        "ask_size",
        "bid_size",
        "wap",
        "near_price",
        "far_price",
    ]
    for col, window in product(diffed_columns, lags):
        df[f"{col}_diff_{window}"] = df.groupby(by_stock_day)[col].diff(window)

    return df.replace([np.inf, -np.inf], 0)


# generate time & stock features
def other_features(df):
    """Add calendar-like and per-stock lookup columns to ``df`` and return it.

    ``dow`` / ``dom`` are ``date_id`` modulo 5 / 20; ``seconds`` / ``minute``
    are the remainder / quotient of ``seconds_in_bucket`` by 60. Each entry
    of the notebook-level ``global_stock_id_feats`` dict is mapped onto
    ``stock_id`` as a ``global_<key>`` column.
    """
    date_id = df["date_id"]
    bucket_seconds = df["seconds_in_bucket"]

    df["dow"] = date_id % 5
    df["dom"] = date_id % 20
    df["seconds"] = bucket_seconds % 60
    df["minute"] = bucket_seconds // 60

    for stat_name in global_stock_id_feats:
        lookup = global_stock_id_feats[stat_name].to_dict()
        df[f"global_{stat_name}"] = df["stock_id"].map(lookup)

    return df


# generate all features
def feat_engineering(df):
    """Run the full feature pipeline on ``df`` and return the feature frame.

    ``row_id`` and ``time_id`` are removed (when present) before the
    imbalance and time/stock features are added. The caller's frame is not
    modified.
    """
    # drop() returns an independent frame, so the column assignments done by
    # the feature functions no longer target a slice of the caller's frame
    # (which raised a SettingWithCopyWarning hidden by the global filter).
    df = df.drop(columns=["row_id", "time_id"], errors="ignore")
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    # Neither feature step adds row_id/time_id back, so no second filter is needed.
    return df

In [8]:
if TRAIN:
    # Make sure the processed-data folder exists.
    if not os.path.exists(path_dataset_processed):
        os.makedirs(path_dataset_processed)

    # Read the raw CSV when forced or when no processed copy is available.
    use_raw = OVERWRITE or not os.path.exists(path_dataset_train)
    df_train_raw = pd.read_csv(path_dataset_train_raw if use_raw else path_dataset_train)

    if DEBUG:
        # Debug runs keep three stocks only.
        df_train_raw = df_train_raw[df_train_raw["stock_id"].isin([0, 1, 2])]

    # Remove rows without a target and renumber the index.
    target_missing = df_train_raw["target"].isna()
    drop_idx = df_train_raw.index[target_missing].to_list()
    df_train = df_train_raw.drop(index=drop_idx).reset_index(drop=True)

    # df_train_raw = df_train_raw.drop(["time_id","row_id"], axis = 1)

In [9]:
# Per-stock weights: the value at position i is the weight applied to
# stock_id i (used for the "stock_weights" / "weighted_wap" features).
weights = [
    0.004,
    0.001,
    0.002,
    0.006,
    0.004,
    0.004,
    0.002,
    0.006,
    0.006,
    0.002,
    0.002,
    0.008,
    0.006,
    0.002,
    0.008,
    0.006,
    0.002,
    0.006,
    0.004,
    0.002,
    0.004,
    0.001,
    0.006,
    0.004,
    0.002,
    0.002,
    0.004,
    0.002,
    0.004,
    0.004,
    0.001,
    0.001,
    0.002,
    0.002,
    0.006,
    0.004,
    0.004,
    0.004,
    0.006,
    0.002,
    0.002,
    0.04,
    0.002,
    0.002,
    0.004,
    0.04,
    0.002,
    0.001,
    0.006,
    0.004,
    0.004,
    0.006,
    0.001,
    0.004,
    0.004,
    0.002,
    0.006,
    0.004,
    0.006,
    0.004,
    0.006,
    0.004,
    0.002,
    0.001,
    0.002,
    0.004,
    0.002,
    0.008,
    0.004,
    0.004,
    0.002,
    0.004,
    0.006,
    0.002,
    0.004,
    0.004,
    0.002,
    0.004,
    0.004,
    0.004,
    0.001,
    0.002,
    0.002,
    0.008,
    0.02,
    0.004,
    0.006,
    0.002,
    0.02,
    0.002,
    0.002,
    0.006,
    0.004,
    0.002,
    0.001,
    0.02,
    0.006,
    0.001,
    0.002,
    0.004,
    0.001,
    0.002,
    0.006,
    0.006,
    0.004,
    0.006,
    0.001,
    0.002,
    0.004,
    0.006,
    0.006,
    0.001,
    0.04,
    0.006,
    0.002,
    0.004,
    0.002,
    0.002,
    0.006,
    0.002,
    0.002,
    0.004,
    0.006,
    0.006,
    0.002,
    0.002,
    0.008,
    0.006,
    0.004,
    0.002,
    0.006,
    0.002,
    0.004,
    0.006,
    0.002,
    0.004,
    0.001,
    0.004,
    0.002,
    0.004,
    0.008,
    0.006,
    0.008,
    0.002,
    0.004,
    0.002,
    0.001,
    0.004,
    0.004,
    0.004,
    0.006,
    0.008,
    0.004,
    0.001,
    0.001,
    0.002,
    0.006,
    0.004,
    0.001,
    0.002,
    0.006,
    0.004,
    0.006,
    0.008,
    0.002,
    0.002,
    0.004,
    0.002,
    0.04,
    0.002,
    0.002,
    0.004,
    0.002,
    0.002,
    0.006,
    0.02,
    0.004,
    0.002,
    0.006,
    0.02,
    0.001,
    0.002,
    0.006,
    0.004,
    0.006,
    0.004,
    0.004,
    0.004,
    0.004,
    0.002,
    0.004,
    0.04,
    0.002,
    0.008,
    0.002,
    0.004,
    0.001,
    0.004,
    0.006,
    0.004,
]

# Rebind the name to a {stock_id: weight} dict so it can be passed to Series.map.
# NOTE: this line must always run together with the list above; re-running it
# alone would enumerate the dict's keys instead of the weights.
weights = {int(k): v for k, v in enumerate(weights)}

In [10]:
if TRAIN:
    # Per-stock statistics of the training data, later mapped onto stock_id
    # by other_features().
    # NOTE(review): "ptp_size" uses bid_size only and "ptp_price" mixes the
    # bid max with the ask min -- kept exactly as is; confirm this is intended.
    by_stock = df_train.groupby("stock_id")
    global_stock_id_feats = {
        "median_size": by_stock["bid_size"].median() + by_stock["ask_size"].median(),
        "std_size": by_stock["bid_size"].std() + by_stock["ask_size"].std(),
        "ptp_size": by_stock["bid_size"].max() - by_stock["bid_size"].min(),
        "median_price": by_stock["bid_price"].median() + by_stock["ask_price"].median(),
        "std_price": by_stock["bid_price"].std() + by_stock["ask_price"].std(),
        "ptp_price": by_stock["bid_price"].max() - by_stock["ask_price"].min(),
    }
    # Release the helper so it does not keep the pre-feature frame alive.
    del by_stock

    df_train_feats = feat_engineering(df_train)
    print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

    # From here on df_train holds the engineered features.
    df_train = df_train_feats.copy()
    
    

In [11]:
def time_series_split(X, n_splits, n_test_splits, embargo_td=2):
    """Yield (train_ids, test_ids) group splits over ``X["factorized"]``.

    The sorted unique values of ``X["factorized"]`` are cut into ``n_splits``
    contiguous folds. Every combination of ``n_test_splits`` folds is used
    once as the test set, starting with the combinations that contain the
    latest folds; the remaining ids form the train set.

    Parameters
    ----------
    X : DataFrame with a ``"factorized"`` column of group ids.
    n_splits : number of contiguous folds (1 <= n_splits <= number of ids).
    n_test_splits : number of folds combined into each test set.
    embargo_td : accepted for interface compatibility; currently unused
        (no purge/embargo is applied).

    Yields
    ------
    (train_ids, test_ids) : two sorted arrays of group ids.

    Raises
    ------
    ValueError : if ``n_splits`` is not between 1 and the number of ids.
    """
    factorized_indices = np.unique(X["factorized"])
    n_groups = len(factorized_indices)

    if not 1 <= n_splits <= n_groups:
        raise ValueError(
            f"n_splits must be between 1 and the number of unique groups "
            f"({n_groups}), got {n_splits}"
        )

    # Fold boundaries are positions into `factorized_indices`, so the split
    # is also correct when the ids are not the contiguous range 0..n-1.
    fold_bounds = [
        (int(chunk[0]), int(chunk[-1]) + 1)
        for chunk in np.array_split(np.arange(n_groups), n_splits)
    ]

    # All fold combinations that will become test sets
    selected_fold_bounds = list(itt.combinations(fold_bounds, n_test_splits))

    # Reverse to start the testing from the most recent part of the dataset
    selected_fold_bounds.reverse()

    for fold_bound_list in selected_fold_bounds:
        in_test = np.zeros(n_groups, dtype=bool)
        for fold_start, fold_end in fold_bound_list:
            in_test[fold_start:fold_end] = True

        # Purge and embargo can be added here if needed

        yield factorized_indices[~in_test], factorized_indices[in_test]

In [12]:
if TRAIN:
    # Order the rows by date and derive a 0-based group id per date.
    col_split = "date_id"
    df_train = df_train.sort_values([col_split]).reset_index(drop=True)
    df_train["factorized"] = pd.factorize(df_train[col_split])[0]

    # The raw date column is not kept as a model feature.
    list_cols_drop = ["date_id"]
    df_train = df_train.drop(list_cols_drop, axis=1)
    

In [13]:
# Search bounds for n_estimators (min == max, i.e. a fixed value): 50 in debug runs, 500 otherwise.
n_estimators_min = n_estimators_max = 50 if DEBUG else 500

In [14]:
# Run configuration
gpu_switch = "OFF"
n_splits = 6
n_test_split = 1
embargo_td = 100

n_repeats = 1
nbrnd_erly_stp = 130

cv_mthd = "KF"

# Cross-Validation Setup
if TRAIN:
    all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=STATE)}
    cv = all_cv[cv_mthd]

    def _int_range(low, high):
        """Search-space entry for an integer hyper-parameter."""
        return {"type": "int", "low": low, "high": high}

    def _float_range(low, high):
        """Search-space entry for a float hyper-parameter."""
        return {"type": "float", "low": low, "high": high}

    # Fixed LightGBM arguments
    lgbmr_static_params = {
        "device": "gpu" if gpu_switch == "ON" else "cpu",
        "objective": "mae",
        "boosting_type": "gbdt",
        "random_state": STATE,
        "n_jobs": 4,
        "verbose": -1,
        "importance_type": "gain",
    }

    # Tunable LightGBM arguments (key order is kept: it is reused when
    # selecting experiment columns)
    lgbmr_dynamic_params = {
        "n_estimators": _int_range(n_estimators_min, n_estimators_max),
        "learning_rate": _float_range(0.005, 0.06),
        "max_depth": _int_range(10, 90),
        "num_leaves": _int_range(20, 90),
        "min_child_samples": _int_range(10, 70),
        "subsample": _float_range(0.7, 1),
        "colsample_bytree": _float_range(1, 1),
        "min_split_gain": _float_range(0, 2),
        "reg_alpha": _float_range(0, 3),
        "reg_lambda": _float_range(0, 3),
    }

    model_params_dict = {
        "LGBMR": {
            "static_params": lgbmr_static_params,
            "dynamic_params": lgbmr_dynamic_params,
        },
    }

    dict_models = {"LGBMR": LGBMR}

    log_model = True

    # One MLflow experiment per notebook run, named after purpose + timestamp
    experiment_date_str = datetime.now().strftime("%Y%m%d_%H_%M_%S")
    experiment_purpose = "optiver_trading_at_the_close"
    experiment_name = f"{experiment_purpose}_{experiment_date_str}"

    mlflow.set_experiment(experiment_name)

In [15]:
def get_params_trained_models(model_path):
    """Load the estimator pickled at ``model_path`` and return its ``get_params()`` result."""
    return joblib.load(model_path).get_params()


if TRAIN:
    # Hyper-parameters reused when `tuning` is False.
    dict_fixed_model_params = get_params_trained_models(
        '/kaggle/input/models-6/LGBMR_0_20231108_235435.pkl'
    )

In [16]:
if TRAIN:
    # Settings bundle handed to run_mlflow_experiment().
    args = dict(
        cv_mthd=cv_mthd,
        experiment_purpose=experiment_purpose,
        experiment_name=experiment_name,
        VERSION_NB=VERSION_NB,
        dict_models=dict_models,
        model_params_dict=model_params_dict,
        logger=logger,
        n_splits=n_splits,
        n_test_split=n_test_split,
        experiment_date_str=experiment_date_str,
        path_experiments_storage=path_experiments_storage,
        target_col="target",
    )

In [17]:
def run_mlflow_experiment(df_train, args, trial=None):
    """Train every configured model over the time-series folds and log to MLflow.

    Parameters
    ----------
    df_train : feature frame containing the target column and a
        ``"factorized"`` group-id column used for splitting.
    args : dict with the keys ``cv_mthd``, ``experiment_purpose``,
        ``experiment_name``, ``VERSION_NB``, ``dict_models``,
        ``model_params_dict``, ``logger``, ``n_splits``, ``n_test_split``,
        ``experiment_date_str``, ``path_experiments_storage``, ``target_col``.
    trial : Optuna trial; when omitted a ``FixedTrial`` with default
        hyper-parameters is used.

    Returns
    -------
    Mean of the per-fold validation ``l1`` scores.

    Raises
    ------
    ValueError : if no fold was evaluated.

    Also reads the notebook-level ``tuning``, ``gpu_switch`` and
    ``dict_fixed_model_params`` names.
    """
    cv_mthd = args["cv_mthd"]
    experiment_purpose = args["experiment_purpose"]
    experiment_name = args["experiment_name"]
    VERSION_NB = args["VERSION_NB"]
    dict_models = args["dict_models"]
    model_params_dict = args["model_params_dict"]
    logger = args["logger"]
    n_splits = args["n_splits"]
    n_test_split = args["n_test_split"]
    experiment_date_str = args["experiment_date_str"]
    path_experiments_storage = args["path_experiments_storage"]
    target_col = args["target_col"]

    if trial is None:
        trial = optuna.trial.FixedTrial(
            {
                "n_estimators": 500,
                "learning_rate": 0.005,
                "max_depth": 10,
                "num_leaves": 20,
                "min_child_samples": 10,
                "subsample": 0.7,
                "colsample_bytree": 1.0,
                "min_split_gain": 0.0,
                "reg_alpha": 0.0,
                "reg_lambda": 0.0,
                "device": "gpu" if gpu_switch == "ON" else "cpu",
            }
        )

    with mlflow.start_run() as run:
        mlflow.log_param("cv_mthd", cv_mthd)
        mlflow.set_tag("experiment_purpose", experiment_purpose)
        mlflow.set_tag("experiment_name", experiment_name)
        mlflow.set_tag("version_nb", VERSION_NB)
        mlflow.set_tag("n_trial", trial.number)

        # The selected feature list does not depend on the fold, so it is
        # read once instead of once per fold.
        # NOTE(review): assumes aggregate_feature_importance only reads the
        # CSV and returns rows ordered by importance -- confirm in utils.
        list_features = list(
            aggregate_feature_importance(
                ['/kaggle/working/feat_impor_optiver_trading_at_the_close_20231109_11_14_13.csv']
            )['feat'][:80]
        )

        score_list = []
        for model_name, model_class in dict_models.items():

            if tuning:
                # Hyper-parameters suggested by the trial
                model = create_model(
                    trial,
                    model_class,
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )
            else:
                # Hyper-parameters copied from a previously trained model
                print(dict_fixed_model_params)
                model = model_class(**dict_fixed_model_params)

            log_training_details(logger, model, trial, model_name)

            for fold_n, (train_indices, test_indices) in enumerate(
                time_series_split(
                    df_train, n_splits=n_splits, n_test_splits=n_test_split
                )
            ):
                with mlflow.start_run(
                    run_name=f"fold_{fold_n+1}", nested=True
                ) as nested_run:
                    mask_train = df_train["factorized"].isin(train_indices)
                    mask_test = df_train["factorized"].isin(test_indices)

                    y_train = df_train.loc[mask_train, target_col]
                    y_val = df_train.loc[mask_test, target_col]
                    X_train = df_train.loc[mask_train].drop(
                        [target_col, "factorized"], axis=1
                    )
                    X_val = df_train.loc[mask_test].drop(
                        [target_col, "factorized"], axis=1
                    )

                    # Logged before feature selection, i.e. for the full matrix
                    mlflow.log_param("train_rows", X_train.shape[0])
                    mlflow.log_param("train_cols", X_train.shape[1])

                    X_train = X_train[list_features]
                    X_val = X_val[list_features]

                    model.fit(
                        X_train,
                        y_train,
                        eval_set=[(X_val, y_val)],
                        eval_metric="mae",
                        callbacks=[
                            lgbm.callback.early_stopping(stopping_rounds=100),
                            lgbm.callback.log_evaluation(period=100),
                        ],
                    )

                    log_feature_importance(
                        trial.number,
                        model,
                        X_train,
                        fold_n,
                        experiment_purpose,
                        experiment_date_str,
                    )

                    fold_score = model.best_score_["valid_0"]["l1"]

                    score_list.append(fold_score)

                    mlflow.log_param("fold_score", fold_score)
                    mlflow.log_param("fold_number", fold_n + 1)
                    mlflow.log_param("model_name", model_name)

                    params_to_log = model.get_params()
                    mlflow.log_params(params_to_log)

                    current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                    model_log_name = f"{model_name}_{trial.number}_{current_time_str}"

                    mlflow.log_param("model_log_name", model_log_name)
                    mlflow.sklearn.log_model(model, model_log_name)

                    mlflow.log_param("run_time", current_time_str)

                    # Location of the pickled model inside the MLflow store
                    nested_run_id = nested_run.info.run_id
                    model_path = f"{path_experiments_storage}/{run.info.experiment_id}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                    mlflow.log_param("model_path", model_path)

        if not score_list:
            # Explicit error instead of a ZeroDivisionError below
            raise ValueError("No fold was evaluated; cannot compute an average score.")

        avg_score = sum(score_list) / len(score_list)

        return avg_score

In [18]:
def objective(trial, df_train):
    """Optuna objective: average fold score of one MLflow experiment run.

    Uses the notebook-level ``args`` settings bundle.
    """
    return run_mlflow_experiment(df_train, args, trial)

In [19]:
# Run the Optuna study
if TRAIN:
    study = optuna.create_study(
        study_name="Your Study Name",
        direction="minimize",
        load_if_exists=True,
    )

    def _objective_on_train(trial):
        """Bind the training frame to the objective."""
        return objective(trial, df_train)

    study.optimize(_objective_on_train, n_trials=N_TRIALS)

In [20]:
if TRAIN:
    # Collect the logged runs and keep the identification columns, the tuned
    # hyper-parameters and the path of each saved model.
    df_exp = experiments_data(client, list_experiment_id=None, save_df=None, list_columns=None)
    list_base_cols = [
        "run_time",
        "experiment_id",
        "run_id",
        "model_name",
        "fold_number"
    ]
    list_dynamic_params = list(model_params_dict["LGBMR"]["dynamic_params"].keys())

    list_cols_exp = list_base_cols + list_dynamic_params + ["model_path"]

    # .copy() makes the selection an independent frame, so the column
    # assignment below does not write to a slice of the original.
    df_exp = df_exp[list_cols_exp].copy()

    # Unparseable timestamps become NaT (errors="coerce").
    df_exp["run_time"] = pd.to_datetime(
        df_exp["run_time"], format="%Y%m%d_%H%M%S", errors="coerce"
    )

In [21]:
def ensemble_predict(model_paths, X_test):
    """Return the element-wise median of the predictions of several models.

    Parameters
    ----------
    model_paths : iterable whose items are either paths to ``.pkl`` files
        (loaded with joblib) or already-loaded objects exposing ``predict``.
        Passing loaded models avoids re-reading the pickles on every call.
    X_test : input forwarded unchanged to each model's ``predict``.

    Returns
    -------
    ``np.median`` of the stacked predictions along axis 0, or ``None`` when
    no model produced a prediction.

    Loading and prediction failures are reported with ``print`` and the
    affected model is skipped (best effort).
    """
    models = []
    predictions = []

    # Resolve each entry to a model object
    for entry in model_paths:
        if not isinstance(entry, (str, os.PathLike)):
            if hasattr(entry, "predict"):
                models.append(entry)
            else:
                print(f"Unsupported model entry {entry!r}. Skipping.")
            continue

        model_path = os.fspath(entry)
        try:
            if model_path.endswith(".pkl"):
                # SECURITY: joblib.load unpickles the file and can execute
                # arbitrary code; only load model files you trust.
                model = joblib.load(model_path)
            else:
                print(f"Unsupported model format for {model_path}. Skipping.")
                continue  # Skip this iteration

            models.append(model)
        except Exception as e:
            print(f"Failed to load model at {model_path}. Error: {e}")

    # Make predictions
    for model in models:
        try:
            predictions.append(model.predict(X_test))
        except Exception as e:
            print(f"Failed to make prediction with model. Error: {e}")

    # Combine predictions with the median across models
    if len(predictions) > 0:
        ensemble_pred = np.median(predictions, axis=0)
    else:
        print("No valid models loaded. Cannot make ensemble predictions.")
        ensemble_pred = None

    return ensemble_pred

In [22]:
if TRAIN:
    # Models logged by the selected experiment
    model_paths = list(
        df_exp[df_exp['experiment_id'] == '220790691490267461'
        ]["model_path"]
    )

    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    # Copy each model into models_dir, named after its artifact folder
    for model_path in model_paths:

        if not os.path.exists(model_path):
            print(f"File does not exist: {model_path}")
            continue  # Skip to the next iteration

        specific_part = model_path.split("/")[-2]
        dest_path = os.path.join(models_dir, f"{specific_part}.pkl")
        if not os.path.exists(dest_path):
            print(f"Copying from {model_path} to {dest_path}")
            shutil.copy(model_path, dest_path)
        else:
            print(f"File {dest_path} already exists. Skipping copy.")

    # Zip the folder; the context manager closes the archive even when a
    # write fails, so no truncated/locked file is left behind.
    # NOTE: models_dir above is relative to the working directory, while the
    # archive is built from /kaggle/working/<models_dir>.
    zip_root = f"/kaggle/working/{models_dir}"
    with zipfile.ZipFile(f"{zip_root}.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
        # Navigate through the folder and add each file to the ZIP
        for root, dirs, files in os.walk(zip_root):
            for file in files:
                file_path = os.path.join(root, file)
                # Store entries relative to the models folder
                zipf.write(file_path, os.path.relpath(file_path, zip_root))

In [23]:
# Kaggle dataset names use "-" where the local folder name uses "_".
models_dir_input = models_dir.replace("_", "-")
directory = f"/kaggle/input/{models_dir_input}"

if os.path.exists(directory):
    # Keep regular files only (sub-directories are ignored).
    candidate_paths = (
        os.path.join(directory, entry) for entry in os.listdir(directory)
    )
    model_paths = [path for path in candidate_paths if os.path.isfile(path)]
else:
    model_paths = []
    print(f"The directory {directory} does not exist.")

# Print or return the list of file paths
print("List of file paths:", model_paths)

List of file paths: ['/kaggle/input/models-8/LGBMR_0_20231109_173044.pkl', '/kaggle/input/models-8/LGBMR_0_20231109_171503.pkl', '/kaggle/input/models-8/LGBMR_0_20231109_174647.pkl', '/kaggle/input/models-8/LGBMR_0_20231109_180232.pkl', '/kaggle/input/models-8/LGBMR_0_20231109_181820.pkl', '/kaggle/input/models-8/LGBMR_0_20231109_183341.pkl']


In [24]:
# Assuming X_test for predict
# ensemble_predictions = ensemble_predict(model_paths, df_test, mlflow_client)

In [25]:
import optiver2023

# Competition API: `iter_test` is consumed by the inference loop below, which
# unpacks (test, revealed_targets, sample_prediction) from it and submits
# each batch's predictions through env.predict().
# NOTE(review): presumably the environment can only be created once per
# kernel session -- confirm before re-running this cell.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [26]:
# Per-stock statistics for inference, computed from the raw training file.
# NOTE: this rebinds df_train to the raw training frame.
df_train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')

by_stock = df_train.groupby("stock_id")
global_stock_id_feats = {
    "median_size": by_stock["bid_size"].median() + by_stock["ask_size"].median(),
    "std_size": by_stock["bid_size"].std() + by_stock["ask_size"].std(),
    "ptp_size": by_stock["bid_size"].max() - by_stock["bid_size"].min(),
    "median_price": by_stock["bid_price"].median() + by_stock["ask_price"].median(),
    "std_price": by_stock["bid_price"].std() + by_stock["ask_price"].std(),
    "ptp_price": by_stock["bid_price"].max() - by_stock["ask_price"].min(),
}
# The grouping helper is not needed afterwards.
del by_stock

In [27]:
# Loaded once, before the loop: this model is only used to recover the
# feature names (and their order) the trained models expect.
model = joblib.load('/kaggle/input/models-8/LGBMR_0_20231109_171503.pkl')
list_features = model.feature_name_
list_cols_drop = ["date_id"]

counter = 0
df_tot_test = []
for test, revealed_targets, sample_prediction in iter_test:

    # Rolling history of at most 13 batches, so the lag/diff features of the
    # current batch can look back at previous batches.
    if counter < 13:
        df_tot_test.append(test)
    else:
        df_tot_test = df_tot_test[1:]
        df_tot_test.append(test)

    df_test = pd.concat(df_tot_test, axis = 0, ignore_index = True)

    # Features are computed on the whole history; only the rows of the
    # current batch (the last len(test) rows) are kept.
    feat = feat_engineering(df_test)[-len(test):]
    # NOTE(review): predictions are assigned to sample_prediction by position,
    # so this sort assumes `test` already arrives ordered by stock_id -- confirm.
    feat = feat.sort_values(["date_id",'seconds_in_bucket','stock_id'])[-len(test):]

    # New frames instead of an in-place drop on a slice
    feat = feat.drop(list_cols_drop, axis=1)
    feat = feat[list_features]

    sample_prediction["target"] = ensemble_predict(model_paths, feat)
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [28]:
# clean_directory_except_one('/kaggle/working/', 'submission.csv')

In [29]:
def feat_engineering(df_train):
    """Build the model feature frame from raw order-book / auction rows.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Input rows. The columns read here are ``stock_id``, ``date_id``,
        ``seconds_in_bucket``, ``bid_size``, ``ask_size``, ``imbalance_size``,
        ``matched_size``, ``reference_price``, ``far_price``, ``near_price``,
        ``ask_price``, ``bid_price``, ``wap`` and, when the global ``TRAIN`` is
        true, ``target``. The frame is copied, so the caller's object is not
        modified. The per-stock ``diff`` / ``pct_change`` features are computed
        in input row order, so rows are expected to be chronological within
        each stock.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns followed by ``date_id`` and ``stock_id``
        (plus ``target`` when ``TRAIN`` is true). Rows are sorted by
        ``stock_id``, ``date_id``, ``seconds_in_bucket`` — NOT in input order —
        and ``seconds_in_bucket`` is returned divided by 10 as an integer.
    """
    # Per-stock weights: position i in this list is the weight of stock_id i.
    weights = [
        0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002,
        0.002, 0.008, 0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002,
        0.004, 0.001, 0.006, 0.004, 0.002, 0.002, 0.004, 0.002, 0.004, 0.004,
        0.001, 0.001, 0.002, 0.002, 0.006, 0.004, 0.004, 0.004, 0.006, 0.002,
        0.002, 0.04, 0.002, 0.002, 0.004, 0.04, 0.002, 0.001, 0.006, 0.004,
        0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
        0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004,
        0.002, 0.004, 0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004,
        0.001, 0.002, 0.002, 0.008, 0.02, 0.004, 0.006, 0.002, 0.02, 0.002,
        0.002, 0.006, 0.004, 0.002, 0.001, 0.02, 0.006, 0.001, 0.002, 0.004,
        0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002, 0.004, 0.006,
        0.006, 0.001, 0.04, 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
        0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002,
        0.006, 0.002, 0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004,
        0.008, 0.006, 0.008, 0.002, 0.004, 0.002, 0.001, 0.004, 0.004, 0.004,
        0.006, 0.008, 0.004, 0.001, 0.001, 0.002, 0.006, 0.004, 0.001, 0.002,
        0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002, 0.04, 0.002,
        0.002, 0.004, 0.002, 0.002, 0.006, 0.02, 0.004, 0.002, 0.006, 0.02,
        0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002,
        0.004, 0.04, 0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
    ]
    df = df_train.copy()
    # stock_ids without an entry in the mapping get NaN as their weight.
    df["stock_weights"] = df["stock_id"].map(dict(enumerate(weights)))

    # --- Row-wise size features -------------------------------------------
    df["volume"] = df.eval("ask_size + bid_size")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval(
        "(imbalance_size-matched_size)/(matched_size+imbalance_size)"
    )
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    prices = [
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # Pairwise normalised differences between every pair of price columns.
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalance features; the helper is defined elsewhere in the notebook.
    for c in [["ask_price", "bid_price", "wap", "reference_price"], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # --- Per-stock momentum / spread features (computed in input row order) -
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df["wap_momentum"] = df.groupby("stock_id")["weighted_wap"].pct_change(periods=6)
    df["imbalance_momentum"] = (
        df.groupby(["stock_id"])["imbalance_size"].diff(periods=1) / df["matched_size"]
    )
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(["stock_id"])["price_spread"].diff()
    df["market_urgency"] = df["price_spread"] * df["liquidity_imbalance"]
    df["depth_pressure"] = (df["ask_size"] - df["bid_size"]) * (
        df["far_price"] - df["near_price"]
    )
    df["spread_depth_ratio"] = (df["ask_price"] - df["bid_price"]) / (
        df["bid_size"] + df["ask_size"]
    )

    # Row-wise statistics across all price columns and across all size columns.
    for func in ["mean", "std", "skew"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # Infinite values (e.g. produced by divisions by zero above) become 0.
    df = df.replace([np.inf, -np.inf], 0)

    # --- Rolling features, computed with polars ----------------------------
    df = pl.DataFrame(df)
    # The rolling aggregation needs the index column ordered within each group;
    # this sort also determines the row order of the returned frame.
    df = df.sort(["stock_id", "date_id", "seconds_in_bucket"])

    # Turn seconds into an integer step index (seconds / 10) so that the rolling
    # windows below can be expressed as a number of steps.
    df = df.with_columns(
        (pl.col("seconds_in_bucket") / 10).cast(pl.Int32).alias("seconds_in_bucket")
    )

    # Features returned to the caller. Names ending in "_mean_<n>" / "_std_<n>"
    # are rolling statistics of the column named by the prefix, over <n> steps.
    list_cols = [
        "all_prices_mean",
        "all_prices_mean_mean_15",
        "all_prices_mean_mean_3",
        "all_prices_mean_mean_8",
        "all_prices_mean_std_8",
        "all_prices_skew_mean_10",
        "all_prices_skew_mean_15",
        "all_prices_skew_std_10",
        "all_prices_skew_std_8",
        "all_prices_std_std_8",
        "all_sizes_mean_mean_15",
        "all_sizes_mean_std_10",
        "all_sizes_mean_std_15",
        "all_sizes_std",
        "all_sizes_std_std_15",
        "ask_price_bid_price_imb_std_10",
        "ask_price_bid_price_imb_std_15",
        "ask_price_bid_price_reference_price_imb2",
        "ask_price_bid_price_reference_price_imb2_std_10",
        "ask_price_bid_price_reference_price_imb2_std_15",
        "ask_price_bid_price_reference_price_imb2_std_8",
        "ask_price_bid_price_wap_imb2",
        "ask_price_wap_imb",
        "ask_price_wap_imb_mean_8",
        "ask_price_wap_imb_std_10",
        "ask_price_wap_imb_std_8",
        "ask_price_wap_reference_price_imb2",
        "ask_price_wap_reference_price_imb2_mean_15",
        "bid_price_wap_imb",
        "bid_price_wap_imb_std_10",
        "bid_price_wap_imb_std_8",
        "bid_price_wap_reference_price_imb2",
        "bid_price_wap_reference_price_imb2_std_15",
        "bid_price_wap_reference_price_imb2_std_8",
        "bid_size_ask_size_imbalance_size_imb2_mean_8",
        "far_price_near_price_imb_mean_3",
        "far_price_near_price_imb_std_8",
        "imbalance_momentum_mean_10",
        "imbalance_momentum_mean_15",
        "imbalance_momentum_mean_8",
        "imbalance_momentum_std_15",
        "imbalance_size",
        "imbalance_size_mean_15",
        "imbalance_size_std_10",
        "imbalance_size_std_8",
        "market_urgency",
        "market_urgency_mean_8",
        "matched_size_ask_size_imbalance_size_imb2_std_15",
        "matched_size_bid_size_ask_size_imb2",
        "matched_size_bid_size_ask_size_imb2_std_10",
        "matched_size_bid_size_imbalance_size_imb2_std_8",
        "price_spread_std_15",
        "reference_price_ask_price_imb",
        "reference_price_ask_price_imb_mean_10",
        "reference_price_ask_price_imb_mean_15",
        "reference_price_ask_price_imb_std_15",
        "reference_price_bid_price_imb",
        "reference_price_bid_price_imb_mean_15",
        "reference_price_bid_price_imb_std_8",
        "reference_price_wap_imb",
        "reference_price_wap_imb_mean_10",
        "reference_price_wap_imb_mean_15",
        "reference_price_wap_imb_mean_8",
        "reference_price_wap_imb_std_8",
        "seconds_in_bucket",
        "size_imbalance",
        "spread_depth_ratio",
        "spread_depth_ratio_mean_15",
        "spread_depth_ratio_std_10",
        "spread_depth_ratio_std_15",
        "spread_depth_ratio_std_8",
        "spread_intensity_std_10",
        "spread_intensity_std_15",
        "volume_mean_15",
        "wap_momentum",
        "wap_momentum_mean_10",
        "wap_momentum_mean_15",
        "wap_momentum_mean_3",
        "wap_momentum_mean_6",
        "wap_momentum_mean_8",
        "weighted_wap",
        "weighted_wap_mean_15",
        "weighted_wap_mean_3",
        "weighted_wap_mean_8",
        "weighted_wap_std_15",
    ]

    def rolling_polars(df, list_cols, col_group_by, index_column):
        """Add every rolling mean/std column named in ``list_cols`` to ``df``.

        A name of the form ``<base>_<mean|std>_<window>`` requests that
        statistic of column ``<base>`` over ``<window>`` steps of
        ``index_column``, computed separately for each ``col_group_by`` group;
        any other name is ignored. All features sharing a window size are
        computed in a single rolling aggregation and joined back once, instead
        of one aggregation and one join per feature.
        """
        # window size (as found in the name) -> aggregation expressions
        exprs_by_window = {}
        for col in list_cols:
            if TRAIN:
                print(col)
            parts = col.rsplit("_", 2)
            if len(parts) != 3:
                continue
            base_col, function, window = parts
            if function == "mean":
                expr = pl.col(base_col).mean()
            elif function == "std":
                expr = pl.col(base_col).std()
            else:
                continue
            exprs_by_window.setdefault(window, []).append(expr.alias(col))

        # Join on the same keys that define the rolling groups and their index.
        join_keys = [*col_group_by, index_column]
        for window, exprs in exprs_by_window.items():
            rolled = df.group_by_rolling(
                index_column=index_column,
                period=f"{window}i",  # 'i' denotes index count (integer)
                by=col_group_by,
                closed="left",  # window excludes the current row
            ).agg(exprs)
            df = df.join(rolled, on=join_keys, how="left")
        return df

    df = rolling_polars(df, list_cols, ["stock_id", "date_id"], "seconds_in_bucket")

    df = df.to_pandas()

    output_cols = list_cols + ["date_id", "stock_id"]
    if TRAIN:
        output_cols.append("target")
    return df[output_cols]