In [1]:
# Constants
# Master switch: the training, tuning and experiment-reporting cells are guarded by `if TRAIN:`.
TRAIN = False
# When True, the training data is re-read from the raw CSV even if a processed copy exists.
OVERWRITE = False
# When True, training is restricted to stock_id 0-2 and uses a small, fixed number of estimators.
DEBUG = False

# NOTE(review): not referenced in the visible cells - presumably the model output directory; confirm.
models_dir = "models_4"

# Number of Optuna trials passed to `study.optimize`.
N_TRIALS = 3

# Logged on every parent MLflow run as the `version_nb` tag.
VERSION_NB = 4

# Random seed shared by the KFold splitter and the LightGBM static parameters.
state = 42

# When True (non-Kaggle runs only), the path-setup cell downloads the dataset with `get_data`.
download_kaggle_data = False

# External general-purpose modules
import os
import shutil
import warnings
from datetime import datetime
import glob
from itertools import combinations
from warnings import simplefilter

import numpy as np
import pandas as pd
import polars as pl
from dotenv import load_dotenv
from joblib import dump
import joblib
import os

# Setting pandas options and warning filters
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# NOTE(review): this blanket filter ignores every warning category, which makes the two
# category-specific filters above and the PerformanceWarning filter below redundant.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Load environment variables
# (ROOT_PATH is read from the environment when the project directory is resolved.)
load_dotenv()

False

In [2]:

# Resolve the project root. On Kaggle / Colab the working directory is the root;
# elsewhere it is taken from the ROOT_PATH environment variable.
path_project_dir = os.getcwd()
if path_project_dir not in ["/kaggle/working", "/content"]:
    # Fall back to the working directory when ROOT_PATH is unset or empty, so the
    # later os.path.join(path_project_dir, ...) calls never receive None.
    path_project_dir = os.getenv("ROOT_PATH") or path_project_dir

# Imports and setup for training
# NOTE(review): `get_data`, `logger` and `itt` are only bound when TRAIN is True, yet they
# are referenced by code that is not guarded by TRAIN (the download branch of the
# path-setup cell, `reduce_mem_usage` when verbose, and `time_series_split`).
if TRAIN:
    
    import itertools as itt
    # Install packages and import logging libraries
    if path_project_dir == '/kaggle/working':
        # NOTE(review): unpinned install; output is discarded.
        !pip install loguru mlflow optuna > /dev/null
        

    # Project-local helpers (utils.py must be importable from the notebook's path).
    from utils import log_feature_importance, create_model, log_training_details, aggregate_feature_importance,  get_data, clean_directory_except_one
    
    from loguru import logger
    import mlflow
    import optuna
    from optuna.integration.mlflow import MLflowCallback
    from mlflow.tracking import MlflowClient
    import zipfile
    
    from tqdm import tqdm

    # Import machine learning libraries
    import lightgbm as lgbm
    
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR
    from sklearn.model_selection import KFold
    from xgboost import XGBRegressor as XGBR

    # Set logging
    # Loguru also writes to logs.log; Optuna only reports warnings and above.
    logger.add("logs.log", format="{time:YYYY-MM-DD HH:mm} | {level} | {message}")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
    
    # Auto-reload modules
    %load_ext autoreload
    %autoreload 2

    # Initialize MLflow callback
    # NOTE(review): the tracking URI is read here, before the path-setup cell calls
    # `mlflow.set_tracking_uri`, and this callback is not passed to `study.optimize`
    # in the visible cells.
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )

In [3]:
# Resolve the data, config and experiment-storage locations for the current environment.
if path_project_dir != "/kaggle/working":
    # Non-Kaggle layout: everything hangs off <project root>/data.
    name_folder_data_project = "kaggle_optiver_trading_at_the_close"

    path_data_dir = os.path.join(path_project_dir, "data")
    path_dataset_train_raw = os.path.join(
        path_data_dir, "kaggle_optiver_trading_at_the_close/raw", "train.csv"
    )
    path_dataset_processed = os.path.join(
        path_data_dir, "kaggle_optiver_trading_at_the_close/processed"
    )
    path_data_project_dir = os.path.join(path_data_dir, name_folder_data_project)
    path_experiments_storage = os.path.join(
        path_data_project_dir, "experiments_storage"
    )

    path_config_dir = os.path.join(path_project_dir, "config")
    path_config_train = os.path.join(path_config_dir, "train_config.yaml")

    # Optional one-off download of the dataset into the project data folder.
    if download_kaggle_data:
        dataset_name = "ravi20076/optiver-memoryreduceddatasets"
        kaggle_json_path = os.path.join(path_project_dir, "kaggle.json")
        get_data(
            kaggle_json_path,
            path_data_project_dir,
            dataset_name=dataset_name,
            specific_file=None,
        )

    file_name_df_train = "train.csv"
    file_name_df_test = "test.csv"
    path_dataset_train = os.path.join(path_data_project_dir, file_name_df_train)
    path_dataset_test = os.path.join(path_data_project_dir, file_name_df_test)

else:
    # Kaggle layout: read-only inputs under /kaggle/input, outputs under /kaggle/working.
    path_data_project_dir = "/kaggle/input/optiver-trading-at-the-close"
    path_experiments_storage = os.path.join(path_project_dir, "experiments_storage")

    path_dataset_train_raw = "/kaggle/input/optiver-trading-at-the-close/train.csv"
    path_dataset_test_raw = (
        "/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv"
    )

    path_dataset_processed = "/kaggle/working/processed_data"
    path_dataset_train = os.path.join(path_dataset_processed, "train.csv")
    path_dataset_test = os.path.join(path_dataset_processed, "test.csv")

# Point MLflow at the experiment store and create the client used for run queries.
if TRAIN:
    mlflow.set_tracking_uri(path_experiments_storage)
    client = MlflowClient()

In [4]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a dataframe to reduce memory usage.

    Every column except ``target`` that is backed by a NumPy signed-integer,
    unsigned-integer or floating dtype is converted, in place, within its own
    family:

    * integers move to the narrowest 8/16/32/64-bit type whose range contains
      the column's minimum and maximum (bounds inclusive);
    * floats are stored as ``float32``, unless the cast would turn a finite
      value into +/-inf, in which case the column is left unchanged.

    Boolean, datetime, object and pandas extension dtypes (categorical,
    nullable integers, ...) are left untouched: they are either already
    compact or cannot be represented safely as a small numeric type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : int or bool, default 0
        When truthy, the memory usage before and after is reported through the
        module-level ``logger``, which must already be defined.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    # Candidate dtypes per NumPy kind code, ordered from narrowest to widest.
    int_families = {
        "i": (np.int8, np.int16, np.int32, np.int64),
        "u": (np.uint8, np.uint16, np.uint32, np.uint64),
    }

    for col in df.columns:
        if col == "target":
            continue

        col_type = df[col].dtype
        # Extension dtypes are not np.dtype instances; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        kind = col_type.kind
        if kind in int_families:
            c_min = df[col].min()
            c_max = df[col].max()
            # An empty column yields NaN bounds; every comparison is then False
            # and the column keeps its dtype.
            for candidate in int_families[kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == "f":
            with np.errstate(over="ignore"):
                as_float32 = df[col].astype(np.float32)
            # Accept the cast only if it created no new infinities, i.e. no
            # finite value overflowed the float32 range.
            if np.isinf(as_float32).sum() == np.isinf(df[col]).sum():
                df[col] = as_float32

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    return df


In [5]:
if TRAIN:
    # Make sure the processed-data directory is available.
    if not os.path.exists(path_dataset_processed):
        os.makedirs(path_dataset_processed)

    # Read the raw CSV when no processed copy exists yet or a refresh is forced;
    # otherwise reuse the processed copy.
    if OVERWRITE or not os.path.exists(path_dataset_train):
        df_train_raw = pd.read_csv(path_dataset_train_raw)
    else:
        df_train_raw = pd.read_csv(path_dataset_train)

    # Debug runs keep only the first three stocks to stay fast.
    if DEBUG:
        df_train_raw = df_train_raw[df_train_raw["stock_id"].isin([0, 1, 2])]
        


In [6]:
if TRAIN:
    # Rows without a target cannot be used for training: collect their index
    # labels, remove them, renumber the index, and finally discard the
    # identifier columns that are not used as features.
    drop_idx = df_train_raw.index[df_train_raw["target"].isna()].to_list()
    df_train_raw = (
        df_train_raw.drop(index=drop_idx)
        .reset_index(drop=True)
        .drop(columns=["time_id", "row_id"])
    )

In [7]:
def feat_engineering(df_train):
    """
    Build the extended order-book feature set from the training frame.

    The input is converted to a polars DataFrame for the expression-based
    features (ratios, per-stock/day shifts, cross-sectional market averages,
    polynomial/interaction terms, rolling means, pairwise price imbalances) and
    then to pandas for the remaining row-wise statistics, triplet imbalances,
    lagged features and calendar-like features. A fixed list of columns is
    dropped at the end.

    NOTE(review): a later cell defines another function with the same name;
    when the notebook is run top to bottom that later definition replaces this
    one.

    NOTE(review): this function calls ``calculate_triplet_imbalance_numba``,
    which is not defined in the visible cells and must exist at module level.

    Parameters
    ----------
    df_train : pandas.DataFrame (or anything ``pl.DataFrame`` accepts)
        Must contain the columns referenced below, e.g. ``stock_id``,
        ``date_id``, ``seconds_in_bucket``, the price columns, the size
        columns and ``imbalance_buy_sell_flag``.

    Returns
    -------
    pandas.DataFrame
        The engineered features, with +/-inf replaced by 0.
    """
    df = pl.DataFrame(df_train)
    # 7. Handle Missing Values
    # NOTE(review): the forward fill is applied over the whole frame, not per
    # stock/day - confirm this is intended.
    df = df.with_columns(
        [
            pl.col("far_price").fill_null(strategy="forward").alias("far_price"),
            pl.col("near_price").fill_null(strategy="forward").alias("near_price"),
        ]
    )
    # Level 1 Features
    level_one_features = [
        (pl.col("imbalance_size") / pl.col("matched_size")).alias(
            "imbalance_to_matched_size"
        ),
        (pl.col("imbalance_size") * pl.col("imbalance_buy_sell_flag")).alias(
            "imbalance_flag_to_size"
        ),
        (pl.col("ask_price") - pl.col("bid_price")).alias("spread"),
        (pl.col("bid_size") - pl.col("ask_size")).alias("bid_ask_imbalance"),
        (pl.col("bid_size") / pl.col("ask_size")).alias("liquidity"),
      #  (pl.col("ask_size") - pl.col("wap")).alias("size_diff_ask_to_wap"),
        (pl.col("wap") - pl.col("wap").shift(1).over(["stock_id", "date_id"])).alias(
            "wap_velocity"
        ),
        (
            pl.col("wap") / pl.col("wap").shift(5).over(["stock_id", "date_id"]) - 1
        ).alias("wap_momentum_5"),
        (
            pl.col("wap")
            .std()
            .over(["stock_id", "date_id"])
            .alias("short_term_volatility")
        ),
        (
            (
                pl.col("imbalance_size")
                / (pl.col("matched_size") + pl.col("imbalance_size"))
            ).alias("price_impact")
        ),
        (
            (pl.col("bid_size") - pl.col("ask_size"))
            / (pl.col("bid_size") + pl.col("ask_size"))
        ).alias("order_imbalance_ratio"),
        (
            (pl.col("ask_price") - pl.col("bid_price"))
            / (pl.col("ask_price") + pl.col("bid_price"))
        ).alias("price_skewness"),
        (pl.col("seconds_in_bucket") / 600).alias("time_decay"),
    ]

    # Level 2 Features (first differences / relative changes of level-1 features)
    level_two_features = [
        (
            pl.col("wap_velocity")
            - pl.col("wap_velocity").shift(1).over(["stock_id", "date_id"])
        ).alias("wap_acceleration"),
        (
            pl.col("short_term_volatility").shift(1).over(["stock_id", "date_id"])
            - pl.col("short_term_volatility")
        ).alias("volatility_rate_of_change"),
        (
            (
                pl.col("liquidity")
                - pl.col("liquidity").shift(1).over(["stock_id", "date_id"])
            )
            / pl.col("liquidity").shift(1).over(["stock_id", "date_id"])
        ).alias("liquidity_ratio_change"),
        (
            (
                pl.col("order_imbalance_ratio")
                - pl.col("order_imbalance_ratio").shift(1).over(["stock_id", "date_id"])
            )
            / pl.col("order_imbalance_ratio").shift(1).over(["stock_id", "date_id"])
        ).alias("order_imbalance_over_time"),
        (
            (
                pl.col("price_skewness")
                - pl.col("price_skewness").shift(1).over(["stock_id", "date_id"])
            )
            / pl.col("price_skewness").shift(1).over(["stock_id", "date_id"])
        ).alias("price_skewness_rate_of_change"),
    ]

    # Level 3 Features (means across all stocks at the same date and time bucket)
    level_three_aggregations = [
        pl.col("wap").mean().alias("avg_wap_by_market"),
        pl.col("near_price").mean().alias("avg_near_price_by_market"),
        pl.col("matched_size").mean().alias("avg_matched_size_by_market"),
        pl.col("imbalance_to_matched_size")
        .mean()
        .alias("avg_imbalance_to_matched_size_by_market"),
        pl.col("spread").mean().alias("avg_spread_by_market"),
        pl.col("liquidity").mean().alias("avg_liquidity_by_market"),
        pl.col("short_term_volatility").mean().alias("avg_market_volatility"),
        pl.col("order_imbalance_ratio").mean().alias("avg_market_imbalance"),
        pl.col("liquidity").mean().alias("avg_market_liquidity"),
        pl.col("price_impact").mean().alias("avg_market_price_impact"),
        pl.col("price_skewness").mean().alias("avg_market_price_skewness"),
    ]

    # Adding all features and performing join operation
    df = df.with_columns(level_one_features)
    df = df.with_columns(level_two_features)
    group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
        *level_three_aggregations
    )
    df = df.join(group_by_market, on=["date_id", "seconds_in_bucket"], how="left")

    polynomial_and_interaction_features = [
        (pl.col("seconds_in_bucket") * pl.col("near_price")).alias(
            "seconds_in_bucket_X_near_price"
        ),
        (pl.col("matched_size") * pl.col("near_price")).alias(
            "matched_size_X_near_price"
        ),
        (pl.col("near_price") ** 2).alias("near_price_squared"),
        (pl.col("matched_size") ** 2).alias("matched_size_squared"),
        (pl.col("seconds_in_bucket") * pl.col("imbalance_flag_to_size")).alias(
            "seconds_in_bucket_X_imbalance_flag_to_size"
        ),
        (pl.col("seconds_in_bucket") ** 2).alias("seconds_in_bucket_squared"),
        (pl.col("imbalance_flag_to_size") ** 2).alias("imbalance_flag_to_size_squared"),
    ]

    # Relative to Market Features
    relative_to_market_features = [
        (pl.col("wap") / pl.col("avg_wap_by_market")).alias("relative_wap_to_market"),
        (pl.col("near_price") / pl.col("avg_near_price_by_market")).alias(
            "relative_near_price_to_market"
        ),
        (pl.col("matched_size") / pl.col("avg_matched_size_by_market")).alias(
            "relative_matched_size_to_market"
        ),
        (
            pl.col("imbalance_to_matched_size")
            / pl.col("avg_imbalance_to_matched_size_by_market")
        ).alias("relative_imbalance_to_matched_size_to_market"),
        (pl.col("spread") / pl.col("avg_spread_by_market")).alias(
            "relative_spread_to_market"
        ),
        (pl.col("liquidity") / pl.col("avg_liquidity_by_market")).alias(
            "relative_liquidity_to_market"
        ),
    ]

    # Combine all Level 4 features and add them to the DataFrame
    all_level_four_features = (
        polynomial_and_interaction_features + relative_to_market_features
    )
    df = df.with_columns(all_level_four_features)

    # Rolling means per stock/day over the previous `window` index units.
    for window in [5, 10]:
        rolling_group = df.group_by_rolling(
            index_column="seconds_in_bucket",
            period=f"{window}i",  # 'i' denotes index count (integer)
            by=["stock_id", "date_id"],
            closed="left",  # Adjust as needed
        )

        # Apply to basic and new features
        for col in [
            "wap",
            "imbalance_size",
            "bid_price",
            "ask_price",
            "relative_wap_to_market",
            "wap_momentum_5",
        ]:
            df = df.join(
                rolling_group.agg(pl.col(col).mean().alias(f"{col}_mean_{window}")),
                on=["stock_id", "date_id", "seconds_in_bucket"],
                how="left",
            )

    low_importance_cols = [
        "wap_mean_5",
        #"imbalance_buy_sell_flag",
        "imbalance_flag_to_size_squared",
        "imbalance_size_mean_5",
        "bid_price_mean_5",
        "ask_price_mean_5",
        "wap_momentum_5_mean_5",
        "relative_wap_to_market_mean_5",
        "volatility_rate_of_change",
        "avg_market_liquidity",
        "seconds_in_bucket_squared",
        "order_imbalance_over_time"
        # Add more columns as needed
    ]

    existing_cols = df.columns

    # Drop columns only if they exist in DataFrame
    cols_to_drop = [col for col in low_importance_cols if col in existing_cols]

    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    df = df.with_columns(
        (pl.col("ask_size") + pl.col("bid_size")).alias("volume")
    )
    df = df.with_columns(
        ((pl.col("ask_price") + pl.col("bid_price")) / 2).alias("mid_price")
    )
    df = df.with_columns(
        ((pl.col("bid_size") - pl.col("ask_size")) / (pl.col("bid_size") + pl.col("ask_size"))).alias("liquidity_imbalance")
    )

    # Normalised difference for every pair of price columns.
    for c in combinations(prices, 2):
        df = df.with_columns(
            ((pl.col(c[0]) - pl.col(c[1])) / (pl.col(c[0]) + pl.col(c[1]))).alias(f"{c[0]}_{c[1]}_imb")
        )

    # Convert to pandas exactly once. The previous code converted twice when
    # `cols_to_drop` was empty, which fails because a pandas DataFrame has no
    # `to_pandas` method.
    if cols_to_drop:
        df = df.drop(cols_to_drop)
    engineered_df = df.to_pandas()

    print("# V2")
    # NOTE(review): this diff is grouped by stock only, so it spans day boundaries.
    engineered_df["imbalance_momentum"] = engineered_df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / engineered_df['matched_size']

    engineered_df['price_pressure'] = engineered_df['imbalance_size'] * (engineered_df['ask_price'] - engineered_df['bid_price'])
    engineered_df['market_urgency'] = engineered_df['spread'] * engineered_df['liquidity_imbalance']
    engineered_df['depth_pressure'] = (engineered_df['ask_size'] - engineered_df['bid_size']) * (engineered_df['far_price'] - engineered_df['near_price'])
    # Row-wise statistics across all price columns and across all size columns.
    for func in ["mean", "std", "skew", "kurt"]:
        engineered_df[f"all_prices_{func}"] = engineered_df[prices].agg(func, axis=1)
        engineered_df[f"all_sizes_{func}"] = engineered_df[sizes].agg(func, axis=1)

    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, engineered_df)
        engineered_df[triplet_feature.columns] = triplet_feature.values

    print("V3")
    # Lagged values and relative changes within each stock/day.
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [ 3]:
            engineered_df[f"{col}_shift_{window}"] = engineered_df.groupby(['stock_id','date_id'])[col].shift(window)
            engineered_df[f"{col}_ret_{window}"] = engineered_df.groupby(['stock_id','date_id'])[col].pct_change(window)

    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [ 3]:
            engineered_df[f"{col}_diff_{window}"] = engineered_df.groupby(["stock_id",'date_id'])[col].diff(window)

    engineered_df["dow"] = engineered_df["date_id"] % 5
    engineered_df["seconds"] = engineered_df["seconds_in_bucket"] % 60
    engineered_df["minute"] = engineered_df["seconds_in_bucket"] // 60

    # Columns removed from the final feature set. The list contains repeated
    # names and names that may not exist; only existing columns are dropped.
    cols_to_drop = ['near_price_squared', 'matched_size_bid_size_imbalance_size_imb2',
       'minute', 'all_prices_kurt', 'imbalance_size_mean_10',
       'avg_liquidity_by_market', 'imbalance_size',
       'far_price_ask_price_imb', 'far_price_bid_price_imb',
       'wap_mean_10', 'matched_size_ask_size_imbalance_size_imb2',
       'all_prices_skew', 'all_sizes_skew', 'price_pressure',
       'all_sizes_kurt', 'reference_price_far_price_imb',
       'far_price_wap_imb', 'relative_liquidity_to_market',
       'bid_size_diff_3', 'imbalance_buy_sell_flag',
       'matched_size_squared', 'liquidity_ratio_change', 'price_impact',
       'ask_price_bid_price_imb', 'seconds',
       'bid_size_ask_size_imbalance_size_imb2', 'wap_acceleration',
       'ask_size_diff_3', 'depth_pressure',
       'imbalance_buy_sell_flag_ret_3','reference_price_far_price_imb', 'all_sizes_skew',
       'matched_imbalance', 'relative_near_price_to_market',
       'avg_market_imbalance', 'all_sizes_kurt',
       'avg_market_price_skewness',
       'relative_imbalance_to_matched_size_to_market', 'liquidity',
       'far_price_wap_imb', 'relative_wap_to_market_mean_10',
       'time_decay', 'matched_size', 'wap_velocity', 'bid_size_diff_3',
       'imbalance_to_matched_size',
       'bid_size_ask_size_imbalance_size_imb2', 'ask_size_diff_3',
       'matched_size_X_near_price', 'bid_size', 'depth_pressure',
       'relative_spread_to_market', 'seconds', 'spread',
       'bid_price_mean_10', 'ask_price_mean_10', 'ask_size',
       'order_imbalance_ratio', 'price_skewness', 'bid_ask_imbalance',
       'price_skewness_rate_of_change', 'imbalance_size_mean_10',
       'avg_liquidity_by_market', 'imbalance_buy_sell_flag',
       'imbalance_size', 'size_diff_ask_to_wap', 'price_diff_ask_to_wap',
       'price_diff_bid_to_wap', 'size_diff_bid_to_wap',
       'imbalance_buy_sell_flag_ret_3', 'wap_mean_10', 'price_spread',
       'wap_acceleration', 'liquidity_ratio_change',
       'ask_price_bid_price_imb', 'near_price_squared',
       'relative_liquidity_to_market', 'minute', 'liquidity_imbalance',
       'price_impact', 'matched_size_squared', 'size_imbalance',
       'ask_price_mean_1', 'wap_mean_1', 'wap_momentum_5_mean_3',
       'wap_momentum_5_mean_2', 'wap_momentum_5_mean_1', 'wap_mean_3',
       'wap_mean_2', 'bid_price_mean_2', 'bid_price_mean_3',
       'imbalance_size_mean_2', 'ask_price_mean_3', 'ask_price_mean_2',
       'imbalance_size_mean_1', 'relative_wap_to_market_mean_1',
       'bid_price_mean_1', 'relative_wap_to_market_mean_2',
       'relative_wap_to_market_mean_3', 'imbalance_size_mean_3']
    cols_drop_in = [col for col in cols_to_drop if col in engineered_df.columns]

    engineered_df = engineered_df.drop(cols_drop_in, axis=1)

    # Divisions by zero above produce +/-inf; replace them with 0.
    return engineered_df.replace([np.inf, -np.inf], 0)

In [8]:
def feat_engineering(df_train):
    """
    Add rolling mean/std features of the main order-book columns.

    The frame is sorted by stock, day and time, ``seconds_in_bucket`` is
    replaced by ``seconds_in_bucket / 10`` cast to Int32, and for every window
    size in ``range(1, 30, 3)`` the rolling mean and standard deviation of
    ``wap``, ``bid_price``, ``ask_price`` and ``imbalance_size`` are computed
    per ``(stock_id, date_id)`` with a left-closed window and joined back as
    ``<col>_mean_<window>`` / ``<col>_std_<window>``.

    All statistics of one window size are computed in a single rolling
    aggregation and attached with a single join, instead of one aggregation
    and one join per statistic; the resulting columns and their order are the
    same.

    Parameters
    ----------
    df_train : pandas.DataFrame (or anything ``pl.DataFrame`` accepts)
        Must contain ``stock_id``, ``date_id``, ``seconds_in_bucket``, ``wap``,
        ``bid_price``, ``ask_price`` and ``imbalance_size``.

    Returns
    -------
    pandas.DataFrame
        The sorted input (with the rescaled ``seconds_in_bucket``) plus the
        rolling feature columns.
    """
    join_keys = ["stock_id", "date_id", "seconds_in_bucket"]
    rolled_cols = ["wap", "bid_price", "ask_price", "imbalance_size"]

    df = pl.DataFrame(df_train)
    # The rolling aggregation needs the index column ordered within each group.
    df = df.sort(join_keys)

    df = df.with_columns(
        (pl.col("seconds_in_bucket") / 10).cast(pl.Int32).alias("seconds_in_bucket")
    )
    for window in range(1, 30, 3):
        if TRAIN:
            print(f"Processing window size: {window}")

        # Mean then std for each column, in the order the features are expected.
        rolling_stats = []
        for col in rolled_cols:
            rolling_stats.append(pl.col(col).mean().alias(f"{col}_mean_{window}"))
            rolling_stats.append(pl.col(col).std().alias(f"{col}_std_{window}"))

        window_features = df.group_by_rolling(
            index_column="seconds_in_bucket",
            period=f"{window}i",  # 'i' denotes index count (integer)
            by=["stock_id", "date_id"],
            closed="left",  # Adjust as needed
        ).agg(rolling_stats)

        df = df.join(window_features, on=join_keys, how="left")

    return df.to_pandas()

In [9]:
if TRAIN:
    # Build the model features from the cleaned raw frame. `feat_engineering` resolves
    # to whichever definition of that name was executed most recently.
    df_train = feat_engineering(df_train_raw)
    
    # Memory downcasting is currently disabled.
    #df_train = reduce_mem_usage(df_train, verbose=1)

In [10]:
#engineered_df = df_train.copy()
#numerical_cols = list(engineered_df.select_dtypes(include=['number']).columns)
#list_cols = [col for col in numerical_cols if col not in ['stock_id', 'date_id', 'seconds_in_bucket']]
#correlation_matrix = engineered_df[list_cols].corr()
#high_correlation = correlation_matrix[correlation_matrix > 0.8]

# Or, for the highest N correlations
#N = 30
#highest_correlations = correlation_matrix.unstack().sort_values(ascending=False).drop_duplicates()
#highest_N_correlations = highest_correlations.head(N)

In [11]:



def time_series_split(X, n_splits, n_test_splits, embargo_td=2):
    """
    Yield combinatorial train/test splits over the groups in ``X["factorized"]``.

    The sorted unique group labels are cut into ``n_splits`` contiguous folds.
    Every combination of ``n_test_splits`` folds is used once as the test set,
    starting with the combination made of the latest folds; the remaining
    labels form the train set.

    Fold boundaries are positions in the sorted label array, so the labels do
    not have to be the consecutive integers ``0..n-1``. Folds that would be
    empty (``n_splits`` larger than the number of labels) are skipped.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a ``factorized`` column holding the group label of each row.
    n_splits : int
        Number of contiguous folds.
    n_test_splits : int
        Number of folds that make up each test set.
    embargo_td : int, default 2
        Currently unused; kept so existing callers keep working. No purge or
        embargo is applied.

    Yields
    ------
    tuple of numpy.ndarray
        ``(train_labels, test_labels)``: sorted group labels, to be matched
        against ``X["factorized"]`` by the caller.
    """
    factorized_indices = np.unique(X["factorized"])
    n_groups = len(factorized_indices)

    # Compute the fold boundaries as half-open position ranges
    fold_bounds = [
        (int(fold[0]), int(fold[-1]) + 1)
        for fold in np.array_split(np.arange(n_groups), n_splits)
        if fold.size
    ]

    # Create the list of all fold combinations that will become the test sets
    selected_fold_bounds = list(combinations(fold_bounds, n_test_splits))

    # Reverse to start the testing from the most recent part of the dataset
    selected_fold_bounds.reverse()

    for fold_bound_list in selected_fold_bounds:
        is_test = np.zeros(n_groups, dtype=bool)
        for fold_start, fold_end in fold_bound_list:
            is_test[fold_start:fold_end] = True

        # Purge and embargo can be added here if needed
        # ...

        yield factorized_indices[~is_test], factorized_indices[is_test]


# # Example usage:


In [12]:
if TRAIN:
    col_split = "date_id"
    list_cols_drop = ["date_id","stock_id"]

    # Order the rows by day and renumber the index.
    df_train = df_train.sort_values([col_split]).reset_index(drop=True)

    # Give every distinct day a consecutive integer id; the cross-validation
    # splitter works on this column.
    df_train["factorized"] = pd.factorize(df_train[col_split])[0]

    # The identifier columns are not used as model features.
    df_train = df_train.drop(list_cols_drop, axis=1)

In [13]:
# The search range for n_estimators is a single value: small for quick DEBUG
# runs, the full value otherwise.
n_estimators_min = n_estimators_max = 50 if DEBUG else 500
    

In [14]:
# Cross-validation and training knobs.
gpu_switch = "OFF"
n_splits = 5
n_test_split = 1
embargo_td = 100

n_repeats = 1
nbrnd_erly_stp = 130

cv_mthd = "KF"

# Cross-Validation Setup
if TRAIN:
    all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=state)}
    cv = all_cv[cv_mthd]

    # Hyper-parameter search space: (name, type, low, high). Equal bounds pin
    # the parameter to a single value.
    lgbmr_search_space = [
        ("n_estimators", "int", n_estimators_min, n_estimators_max),
        ("learning_rate", "float", 0.01, 0.05),
        ("max_depth", "int", 20, 70),
        ("num_leaves", "int", 60, 300),
        ("min_child_samples", "int", 20, 40),
        ("subsample", "float", 0.7, 1),
        ("colsample_bytree", "float", 1, 1),
        ("min_split_gain", "float", 0, 1),
        ("reg_alpha", "float", 0, 1),
        ("reg_lambda", "float", 0, 1),
    ]

    model_params_dict = {
        "LGBMR": {
            # Parameters that are the same for every trial.
            "static_params": {
                "device": "gpu" if gpu_switch == "ON" else "cpu",
                "objective": "mae",
                "boosting_type": "gbdt",
                "random_state": state,
                "n_jobs" : 4,
                "verbose": -1,
                "importance_type" : "gain",
            },
            # Parameters sampled per trial.
            "dynamic_params": {
                param_name: {"type": param_type, "low": low, "high": high}
                for param_name, param_type, low, high in lgbmr_search_space
            },
        },
    }

    dict_models = {"LGBMR": LGBMR}

    log_model = True

    # Each notebook run gets its own timestamped MLflow experiment.
    experiment_date_str = datetime.now().strftime("%Y%m%d_%H_%M_%S")
    experiment_purpose = "optiver_trading_at_the_close"
    experiment_name = f"{experiment_purpose}_{experiment_date_str}"

    mlflow.set_experiment(experiment_name)


In [15]:



def objective(trial, df_train):
    """
    Optuna objective: cross-validated MAE of one sampled model configuration.

    A parent MLflow run is opened for the trial. The model is built by
    ``create_model`` from the static and dynamic parameter specs, then fitted
    once per split produced by ``time_series_split`` inside a nested MLflow
    run, which records the fold parameters, the feature importances and
    (when ``log_model`` is true) the fitted model and its artifact path.

    Note that the function returns inside the loop over ``dict_models``, so
    only the first model in that dict is evaluated.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.
    df_train : pandas.DataFrame
        Training frame containing the feature columns plus ``target`` and
        ``factorized``.

    Returns
    -------
    float
        Mean of the per-fold best validation L1 scores, or ``inf`` if any
        exception was raised (the exception is logged with its traceback).
    """
    try:
        print(f"trial: {trial.number}")
        with mlflow.start_run() as run:
            mlflow.log_param("cv_mthd", cv_mthd)
            mlflow.set_tag("experiment_purpose", experiment_purpose)
            mlflow.set_tag("experiment_name", experiment_name)
            mlflow.set_tag("version_nb", VERSION_NB)
            for model_name, model_class in dict_models.items():
                model = create_model(
                    trial,
                    dict_models[model_name],
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )
                mae_list = []

                log_training_details(logger, model, trial, model_name)

                for fold_n, (train_indices, test_indices) in enumerate(
                    time_series_split(
                        df_train, n_splits=n_splits, n_test_splits=n_test_split
                    )
                ):
                    with mlflow.start_run(
                        run_name=f"fold_{fold_n+1}", nested=True
                    ) as nested_run:
                        mask_train = df_train["factorized"].isin(list(train_indices))
                        mask_test = df_train["factorized"].isin(list(test_indices))

                        # Filter based on the 'factorized' field
                        y_train = df_train.loc[mask_train, "target"].squeeze()
                        y_val = df_train.loc[mask_test, "target"].squeeze()
                        X_train = df_train[mask_train].drop(["target","factorized"], axis=1)
                        X_val = df_train[mask_test].drop(["target","factorized"], axis=1)

                        mlflow.log_param("training_data_rows", X_train.shape[0])
                        mlflow.log_param("training_data_columns", X_train.shape[1])

                        model.fit(
                            X_train,
                            y_train,
                            eval_set=[(X_val, y_val)],
                            eval_metric="mae",
                            callbacks=[
                                lgbm.callback.early_stopping(stopping_rounds=100),
                                lgbm.callback.log_evaluation(period=100),
                            ],
                        )

                        log_feature_importance(
                            trial.number,
                            model,
                            X_train,
                            fold_n,
                            experiment_purpose,
                            experiment_date_str,
                        )

                        fold_mae = model.best_score_["valid_0"]["l1"]
                        print(model.best_score_)
                        mae_list.append(fold_mae)
                        logger.info(f"{fold_n + 1:<5} {'|':<2} {fold_mae:<20}")

                        mlflow.log_param("fold_number", fold_n + 1)
                        mlflow.log_param("model_name", model_name)
                        mlflow.log_param("log_model", log_model)

                        params_to_log = model.get_params()
                        mlflow.log_params(params_to_log)

                        if log_model:
                            current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                            model_log_name = (
                                f"{model_name}_{trial.number}_{current_time_str}"
                            )

                            mlflow.log_param("model_log_name", model_log_name)

                            mlflow.sklearn.log_model(model, model_log_name)

                            mlflow.log_param("run_time", current_time_str)

                            # The artifact path depends on `model_log_name`, so it
                            # is only built and logged when a model was logged.
                            # Doing this unconditionally raised a NameError with
                            # log_model disabled, which the handler below turned
                            # into an `inf` score for every trial.
                            nested_run_id = nested_run.info.run_id
                            model_path = f"{path_experiments_storage}/{run.info.experiment_id}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                            mlflow.log_param("model_path", model_path)
                avg_mae = sum(mae_list) / len(mae_list)

                mlflow.log_param("model_name", model_name)
                mlflow.log_param("mae", avg_mae)

                return avg_mae

    except Exception as e:
        # Best effort: a failed trial scores `inf` instead of aborting the study,
        # but the traceback is kept so the failure can be diagnosed.
        logger.exception(f"An exception occurred: {e}")
        return float("inf")

In [16]:
# Run the Optuna study
if TRAIN:

    def _run_trial(trial):
        """Evaluate one trial on the engineered training frame."""
        return objective(trial, df_train)

    # Lower MAE is better, hence a minimisation study.
    study = optuna.create_study(
        study_name="Your Study Name",
        direction="minimize",
        load_if_exists=True,
    )
    study.optimize(_run_trial, n_trials=N_TRIALS)

In [17]:
def experiments_data(list_experiment_id = None, save_df = None, list_columns = None):
    """
    Read the MLflow runs through the module-level ``client`` and return them as a DataFrame.

    Every call queries the experiments again, so the returned frame is a fresh
    snapshot of the whole recorded history. Only rows that carry a
    ``fold_number`` value are kept.

    Parameters
    ----------
    list_experiment_id : collection of experiment ids, optional
        When given, only experiments whose id is contained in it are read.
        ``None`` (default) reads every experiment.
    save_df : bool-like, optional
        When truthy, the result is also written to
        ``df_runs_<YYYYmmdd_HHMM>.csv`` in the current working directory.
    list_columns : list of str, optional
        When given and non-empty, the result is restricted to these columns.

    Returns
    -------
    pandas.DataFrame
        One row per kept run with ``experiment_id``, ``experiment_name``,
        ``run_id`` and one column per logged metric and parameter. The frame
        has zero rows when no run logged ``fold_number``.

    Raises
    ------
    KeyError
        If ``list_columns`` names a column that is not present.
    """
    all_runs_data = []
    for exp in client.search_experiments():
        experiment_id = exp.experiment_id
        # Identity check: `== None` would be evaluated element-wise on array-like filters.
        if list_experiment_id is not None and experiment_id not in list_experiment_id:
            continue

        for run_info in client.search_runs(experiment_ids=[experiment_id]):
            run_data = {
                "experiment_id": experiment_id,
                "experiment_name": exp.name,
                "run_id": run_info.info.run_id,
            }
            # Metrics first, then params: a param overrides a metric with the same key.
            for key, value in run_info.data.metrics.items():
                run_data[f"{key}"] = value
            for key, value in run_info.data.params.items():
                run_data[f"{key}"] = value

            all_runs_data.append(run_data)

    df_runs_new = pd.DataFrame(all_runs_data)

    if "fold_number" in df_runs_new.columns:
        df_runs_new = df_runs_new[~df_runs_new["fold_number"].isna()]
    else:
        # No run logged a fold number (or there are no runs at all):
        # return zero rows instead of failing with a KeyError.
        df_runs_new = df_runs_new.iloc[0:0]

    if list_columns:
        df_runs_new = df_runs_new[list_columns]

    if save_df:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
        csv_filename = f"df_runs_{timestamp}.csv"
        df_runs_new.to_csv(csv_filename, index=False)

        # Report the frame that was actually written (the previous message
        # referenced an undefined name and raised NameError).
        print(f"DataFrame saved to {csv_filename}, Shape: {df_runs_new.shape}")

    return df_runs_new

In [18]:
if TRAIN:
    # Fresh snapshot of every fold-level run returned by experiments_data().
    df_exp = experiments_data(list_experiment_id=None, save_df=None, list_columns=None)

    # Columns kept for the experiment overview: fixed bookkeeping columns,
    # the LGBMR 'dynamic_params' keys, and the path of the stored model.
    list_base_cols = ['run_time','experiment_id','model_name','fold_number','mae','training_data_rows','training_data_columns']
    list_dynamic_params = list(model_params_dict["LGBMR"]['dynamic_params'].keys())
    list_cols_exp = list_base_cols + list_dynamic_params + ['model_path']

    # .copy() detaches the column subset from the original frame so the
    # assignment below writes to an independent object instead of a slice
    # (avoids the chained-assignment / SettingWithCopy hazard).
    df_exp = df_exp[list_cols_exp].copy()

    # run_time is parsed with the '%Y%m%d_%H%M%S' layout; values that do not
    # match become NaT because of errors='coerce'.
    df_exp['run_time'] = pd.to_datetime(df_exp['run_time'], format='%Y%m%d_%H%M%S', errors='coerce')

In [19]:
if TRAIN:
    # Feature-importance CSV exports to combine, one path per line.
    list_files_feat_importance = [
        '/kaggle/working/feat_impor_optiver_trading_at_the_close_20231101_072036.csv',
        '/kaggle/working/feat_impor_optiver_trading_at_the_close_20231031_095537.csv',
        '/kaggle/working/feat_impor_optiver_trading_at_the_close_20231031_132330.csv',
    ]

    # Aggregate the exports and take the last 70 entries of the 'feat' column as an array.
    # NOTE(review): the value is neither stored nor displayed (expression inside an `if`) — confirm intent.
    aggregate_feature_importance(list_files_feat_importance)['feat'][-70:].values


In [20]:
def ensemble_predict(model_paths, X_test):
    """
    Average the predictions of several pickled models.

    Parameters
    ----------
    model_paths : iterable of str
        Paths of the stored models; only entries ending in ".pkl" are loaded
        (via ``joblib.load``), every other entry is reported and skipped.
    X_test : object accepted by each model's ``predict``
        Inputs to score.

    Returns
    -------
    The element-wise mean (``np.mean(..., axis=0)``) of the predictions that
    succeeded, or ``None`` when no model produced a prediction. Load and
    prediction failures are printed, never raised.
    """
    # Phase 1: load everything that can be loaded.
    loaded_models = []
    for model_path in model_paths:
        try:
            if not model_path.endswith(".pkl"):
                print(f"Unsupported model format for {model_path}. Skipping.")
                continue
            loaded_models.append(joblib.load(model_path))
        except Exception as e:
            print(f"Failed to load model at {model_path}. Error: {e}")

    # Phase 2: score with each loaded model, keeping only successful outputs.
    collected = []
    for fitted in loaded_models:
        try:
            collected.append(fitted.predict(X_test))
        except Exception as e:
            print(f"Failed to make prediction with model. Error: {e}")

    # Phase 3: combine.
    if not collected:
        print("No valid models loaded. Cannot make ensemble predictions.")
        return None

    return np.mean(collected, axis=0)

In [21]:


if TRAIN:
    # To export only recent runs, filter on run_time first, e.g.:
    # model_paths = list(df_exp[df_exp['run_time'] >= pd.to_datetime("2023-11-01 22:10:54")]['model_path'])
    model_paths = list(df_exp['model_path'])

    # exist_ok=True replaces the separate "exists?" check before creating the folder.
    os.makedirs(models_dir, exist_ok=True)

    # Copy every stored model that can be found into models_dir.
    for model_path in model_paths:
        print(f"Checking if model path exists: {model_path}")

        if not os.path.exists(model_path):
            print(f"File does not exist: {model_path}")
            continue  # Skip to the next iteration

        # The second-to-last path component (the folder holding the file)
        # becomes the name of the copied file.
        specific_part = model_path.split("/")[-2]
        dest_path = os.path.join(models_dir, f"{specific_part}.pkl")
        if not os.path.exists(dest_path):
            print(f"Copying from {model_path} to {dest_path}")
            shutil.copy(model_path, dest_path)
        else:
            print(f"File {dest_path} already exists. Skipping copy.")

    # NOTE(review): the copies above go to the relative `models_dir`, while the
    # archive is built from /kaggle/working/{models_dir}; the two coincide only
    # when the working directory is /kaggle/working — confirm.
    models_root = f"/kaggle/working/{models_dir}"

    # The context manager closes (and finalises) the archive even if a write fails.
    with zipfile.ZipFile(f"{models_root}.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
        # Walk the folder and add each file under its path relative to models_root.
        for root, dirs, files in os.walk(models_root):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, models_root))

In [22]:
# Input folder name: models_dir with "_" replaced by "-".
models_dir_input = models_dir.replace("_", "-")
directory = f"/kaggle/input/{models_dir_input}"

if os.path.exists(directory):
    # Keep regular files only (sub-directories are ignored), in os.listdir order.
    model_paths = [
        entry
        for entry in (os.path.join(directory, name) for name in os.listdir(directory))
        if os.path.isfile(entry)
    ]
else:
    model_paths = []
    print(f"The directory {directory} does not exist.")

# Show what was found
print("List of file paths:", model_paths)

List of file paths: ['/kaggle/input/models-4/LGBMR_0_20231102_113839.pkl', '/kaggle/input/models-4/LGBMR_0_20231102_121251.pkl', '/kaggle/input/models-4/LGBMR_1_20231102_123627.pkl', '/kaggle/input/models-4/LGBMR_0_20231102_114958.pkl', '/kaggle/input/models-4/LGBMR_0_20231102_122359.pkl', '/kaggle/input/models-4/LGBMR_0_20231102_120127.pkl']


In [23]:
# Example usage, assuming df_test holds the features to score (ensemble_predict takes exactly two arguments):
# ensemble_predictions = ensemble_predict(model_paths, df_test)

In [24]:
import optiver2023

# Create the environment object provided by the optiver2023 module and obtain
# its test iterator; the loop in the following cell iterates over `iter_test`
# and sends each batch of predictions back through `env.predict`.
# NOTE(review): presumably the iterator can only be consumed once per kernel
# session — confirm before re-running this cell.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [25]:
counter = 0
for test, revealed_targets, sample_prediction in iter_test:
    # Build the model inputs for this batch.
    feat = feat_engineering(test)

    # Remove the identifier columns in place before scoring.
    # (assumes `feat` is a pandas DataFrame — TODO confirm against feat_engineering)
    list_cols_drop = ["date_id", "stock_id", "row_id"]
    feat.drop(columns=list_cols_drop, inplace=True)

    # Averaged prediction of all stored models, then hand the batch back to the environment.
    batch_pred = ensemble_predict(model_paths, feat)
    sample_prediction["target"] = batch_pred
    env.predict(sample_prediction)

    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [26]:
#clean_directory_except_one('/kaggle/working/', 'submission.csv')