In [1]:
# Constants and Configuration Variables
DEBUG = False  # True: training cells keep only stock_id 0-4 and use "_debug" experiment names
TRAIN = False  # gates every training / MLflow cell in this notebook
TUNING = True  # True: build models from model_params_dict; False: reuse model_prod's parameters
OVERWRITE_PROCESSED_DATA = True  # True: always re-read the raw training CSV
N_TRIALS = 10  # number of Optuna trials
STATE = 42  # random seed passed to the model and to KFold
N_FOLD_BREAK = 7  # the fold loop stops once the zero-based fold index reaches this value

GPU_SWITCH = "ON"  # "ON" selects LightGBM device "gpu", anything else "cpu"
N_SPLITS = 5  # the four values below are forwarded to time_series_split
N_TEST_SPLITS = 1
N_PURGE = 20
N_EMBARGO = 20

SAVE_MODELS = False  # True: build and register the ensemble from model_paths

VERSION_NB = 22  # suffix of the MLflow experiment name and model folder
EXPERIMENT_PURPOSE = "optiver_trading_at_the_close"


# Parameters recorded from an earlier run, kept for reference:
#   learning_rate: 0.01299 | max_depth: 13 | boosting_type: gbdt | colsample_bytree: 0.5 |
#   importance_type: gain | min_child_samples: 20 | min_child_weight: 0.001 |
#   min_split_gain: 0.0 | n_estimators: 1000 | num_leaves: 205 |
#   objective: mae | reg_alpha: 0.0 | reg_lambda: 0.0 |
#   subsample: 0.45 | subsample_for_bin: 200000 |
#   subsample_freq: 0 | max_bin: 254
# NOTE(review): model_params_dict below pins learning_rate to 0.0131, not 0.01299 - confirm which is intended.

# Arguments for list_path_models (experiment ids and run-name prefix to search for).
list_experiment_id = ['329561590225205643']
run_name_startswith = "23_11_20_111711_fold"

# Per-model hyper-parameter specification, keyed by model name.
#   "static_params":  values passed to create_model unchanged.
#   "dynamic_params": search ranges (type / low / high) passed to create_model together with the trial.
# Every dynamic range below has low == high, so each parameter is effectively pinned to one value.
model_params_dict = {
    "LGBMR": {
        "static_params": {
            "device": "gpu" if GPU_SWITCH == "ON" else "cpu",
            "objective": "mae",
            "boosting_type": "gbdt",
            "random_state": STATE,
            "n_jobs": 4,
            "verbose": -1,
            "importance_type": "gain",
            "max_bin": 254
        },
        "dynamic_params": {
            "n_estimators": {
                "type": "int",
                "low": 1000,
                "high": 1000,
            },
            "learning_rate": {
                "type": "float",
                "low":0.0131,
                "high": 0.0131,
            },
            "max_depth": {"type": "int", "low": 13, "high": 13},
            "num_leaves": {
                "type": "int",
                "low": 205,
                "high": 205,
            },
            "min_child_samples": {
                "type": "int",
                "low": 20,
                "high": 20,
            },
            "subsample": {
                "type": "float",
                "low": 0.45,
                "high": 0.45,
            },
            "colsample_bytree": {
                "type": "float",
                "low": 0.5,
                "high": 0.5,
            },
        },
    },
}

# Ensemble members: S3 locations of the per-fold models to combine.
# NOTE: the SAVE_MODELS cell further down assigns its own model_paths list, replacing this one.
model_paths = [
    "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/59356aefe3c64eaba02b9c7fbb11c5e8/artifacts/LGBMR_0_20231120_001712/model.pkl",
    "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/8a881129db7c45e9bd92094cd1097297/artifacts/LGBMR_0_20231118_044743/model.pkl",
    "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/9e402e7a59be460897bf7cee58fc85c5/artifacts/LGBMR_0_20231118_045214/model.pkl",
    "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/1ffb209c2fe2420db01bb4df17d86f1d/artifacts/LGBMR_0_20231118_045642/model.pkl",
    "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/562d3c3b41c74c9a90e080b88eb5af3a/artifacts/LGBMR_0_20231118_050108/model.pkl",
]

# File name used both for the local pickle / MLflow registration and for the
# production model loaded from /kaggle/input/<folder_model>/<model_name>.

version = 10
model_name = f"ensemble_model_{version}.pkl"
folder_model = f"models-v{version}"

In [2]:
# External general-purpose modules
import gc
import sys
import os
import pickle
import itertools as itt
from itertools import combinations, product
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import joblib
import time

from pathlib import Path
import warnings
from numba import njit, prange
import boto3
from botocore.exceptions import NoCredentialsError
from dotenv import load_dotenv


# Setting pandas options and warning filters
pd.set_option("display.max_columns", None)
# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation / SettingWithCopy warnings - confirm that is intended.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Load environment variables
load_dotenv()
path_root_project = Path.cwd()
# When the working directory is not named "working" or "content", use ROOT_PATH
# (falling back to the cwd) and expose the project's source folders to `import`.
if path_root_project.name not in ["working", "content"]:
    path_root_project = Path(os.getenv("ROOT_PATH") or path_root_project)

    directories_to_add = ["utils", "feat_engineering", "validation"]
    for directory in directories_to_add:
        sys.path.append(str(path_root_project / "src" / directory))

from utils_training import create_model, experiments_data
from utils_data import load_config, load_dataset, reduce_mem_usage, PathManager
from utils_kaggle import (
    setup_kaggle,
    download_data,
    get_data,
    clean_directory_except_one,
)

from fe_optiver_trading_at_the_close import (
    calculate_triplet_imbalance_numba,
    convert_weights_to_dict,
    global_stock_id_feats,
    compute_rolling_averages,
    generate_rsi,
)

# Project path helper; later cells read its path_* attributes.
pm = PathManager(path_root_project)

# AWS credentials are only resolved when training.
if TRAIN:
    if pm.path_root_project.name == "working":
        # Project root named "working": read the credentials from Kaggle secrets.
        from kaggle_secrets import UserSecretsClient

        user_secrets = UserSecretsClient()
        aws_access_key_id = user_secrets.get_secret("AWS_ACCESS_KEY_ID")
        aws_region = user_secrets.get_secret("AWS_DEFAULT_REGION")
        aws_secret_access_key = user_secrets.get_secret("AWS_SECRET_ACCESS_KEY")
        s3_bucket_name = user_secrets.get_secret("S3_BUCKET")

        # Set AWS credentials in the environment variables
        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
        os.environ["AWS_DEFAULT_REGION"] = aws_region
    else:
        # Elsewhere the keys come from the environment (possibly via load_dotenv above).
        # aws_region and s3_bucket_name are NOT defined on this branch.
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
# Production ensemble used by the inference loop (and for its parameters when TUNING is False).
# joblib.load unpickles the file, so the input dataset must be trusted.
model_prod = joblib.load(f'/kaggle/input/{folder_model}/{model_name}')

In [4]:
def list_path_models(list_experiment_id, run_name_startswith):
    """Return the ``model_path`` values of runs whose name starts with a prefix.

    Args:
        list_experiment_id: experiment ids forwarded to ``experiments_data``.
        run_name_startswith: prefix matched against the ``mlflow.runName`` column.

    Returns:
        list: ``model_path`` entries of the matching rows, in frame order.

    Relies on the module-level ``client``, which is only created when TRAIN is True.
    """
    runs = experiments_data(
        client, list_experiment_id=list_experiment_id, save_df=None, list_columns=None
    )
    name_matches = runs['mlflow.runName'].str.startswith(run_name_startswith)
    matching_paths = runs[name_matches]['model_path']
    return list(matching_paths)

In [5]:
#df_feat = pd.read_csv("/kaggle/working/feat_impor_optiver_trading_at_the_close_23_11_18_0438.csv").sort_values("t0_imp_fold_1")

In [6]:
def aggregate_feature_importance(df_feat_importance):
    """Summarise per-fold feature importances, in place.

    Every column other than ``feat`` and the three derived columns written by this
    function is treated as one fold's importance values.

    Adds / overwrites:
        feat_imp_mean:        row-wise mean of the fold columns.
        feat_imp_std:         row-wise standard deviation of the fold columns.
        feat_imp_variability: feat_imp_std / feat_imp_mean.

    The frame is sorted by ``feat_imp_mean`` (descending) and its index is reset.

    Args:
        df_feat_importance: DataFrame with a ``feat`` column and one column per fold.

    Returns:
        The same DataFrame object, mutated.
    """
    derived_cols = ("feat", "feat_imp_mean", "feat_imp_std", "feat_imp_variability")
    # Exclude the derived columns as well as "feat": re-running this on an already
    # aggregated frame must not fold the previous mean/std back into the statistics.
    fold_cols = [col for col in df_feat_importance.columns if col not in derived_cols]

    df_feat_importance["feat_imp_mean"] = df_feat_importance[fold_cols].mean(
        axis=1, skipna=True
    )

    df_feat_importance.sort_values("feat_imp_mean", inplace=True, ascending=False)
    df_feat_importance.reset_index(drop=True, inplace=True)

    df_feat_importance["feat_imp_std"] = df_feat_importance[fold_cols].std(
        axis=1, skipna=True
    )
    df_feat_importance["feat_imp_variability"] = (
        df_feat_importance["feat_imp_std"] / df_feat_importance["feat_imp_mean"]
    )

    return df_feat_importance

In [7]:


# Conditional imports and settings based on TRAIN constant
if TRAIN:
    # Project root named "working": install the extra packages first.
    if pm.path_root_project.name == "working":
        !pip install loguru mlflow optuna > /dev/null

    

    # NOTE: experiments_data was already imported from utils_training, and
    # save_and_register_model is redefined by a later cell of this notebook.
    from utils_mlflow import (
        get_experiments_df,
        delete_runs_and_artifacts,
        download_and_load_model,
        load_models_and_create_ensemble,
        save_and_register_model,
        log_model_parameters,
        get_or_create_experiment,
        experiments_data,
    )
    # NOTE: this import replaces the aggregate_feature_importance defined earlier in the notebook.
    from utils_feat_importance import log_feature_importance, aggregate_feature_importance
    from model_validation import time_series_split

    # External Libraries
    import boto3
    from botocore.exceptions import NoCredentialsError
    from mlflow.exceptions import MlflowException
    import lightgbm as lgbm
    import mlflow
    import optuna
    from mlflow.tracking import MlflowClient
    from optuna.integration.mlflow import MLflowCallback
    from sklearn.model_selection import KFold
    from xgboost import XGBRegressor as XGBR
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR



    # Auto-reload modules - Specific to Jupyter Notebooks
    %load_ext autoreload
    %autoreload 2
    # Outside DEBUG, point MLflow at the project's experiments directory.
    if not DEBUG:
        mlflow.set_tracking_uri(pm.path_experiments_dir)
        
    # `client` is read as a global by list_path_models and the experiment cells.
    client = MlflowClient()
    
    # Create an S3 client
    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

In [8]:
def save_and_register_model(ensemble_model, model_name):
    """
    Pickle the ensemble to a local file, log it to a new MLflow run and register it.

    Args:
        ensemble_model: object to serialise with ``pickle``.
        model_name: used as the local file name AND as the registered model name.

    The local pickle file is left on disk after registration.
    """
    with open(model_name, "wb") as handle:
        pickle.dump(ensemble_model, handle)

    with mlflow.start_run() as active_run:
        # The file is stored under the run's "model" artifact directory ...
        mlflow.log_artifact(model_name, "model")
        run_id = active_run.info.run_id
        # ... and that directory is what gets recorded in the Model Registry.
        mlflow.register_model(f"runs:/{run_id}/model", model_name)

    print(f"Ensemble model registered under run_id: {run_id}")
In [9]:
# Display the ensemble file name configured in the first cell.
model_name

'ensemble_model_10.pkl'

In [10]:
# Build the ensemble from the per-fold models on S3 and register it.
# NOTE: aws_access_key_id / aws_secret_access_key, boto3's client setup imports,
# mlflow and load_models_and_create_ensemble are only defined by the TRAIN-gated
# cells above, so this cell needs TRAIN to be True as well.
if SAVE_MODELS:
    #model_paths = list_path_models(list_experiment_id, run_name_startswith)
    # Overrides the model_paths list from the configuration cell.
    model_paths = ['s3://mlflow-v1/kaggle_optiver_trading_at_the_close/f485ee23b66a4108b77e96a000fa1aab/artifacts/LGBMR_0_20231120_115148/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/c62ada9bdc0041f08cae8413ddc2a4e5/artifacts/LGBMR_0_20231120_114452/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/8c5000761f264b8ab28f95cd460dc4e1/artifacts/LGBMR_0_20231120_113758/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/9caf6f7a804e4a8ba48f8d42919cec06/artifacts/LGBMR_0_20231120_113057/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/d40e52ddb6fa4d59af9bc43dc5fd7426/artifacts/LGBMR_0_20231120_112401/model.pkl']
    s3 = boto3.client(
     "s3",
     aws_access_key_id=aws_access_key_id,
     aws_secret_access_key=aws_secret_access_key,
    )
    # Load the models and create an ensemble
    ensemble_model = load_models_and_create_ensemble(s3, model_paths)

    # Save and register the ensemble model in MLflow
    save_and_register_model(ensemble_model, model_name)

In [11]:
# Load the training frame: rebuild it from the raw CSV when no processed file
# exists (or OVERWRITE_PROCESSED_DATA is set), otherwise read the processed CSV.
if TRAIN:
    if not os.path.exists(pm.path_dataset_processed) or OVERWRITE_PROCESSED_DATA:
        df_train_raw = pd.read_csv(pm.path_data_train_raw)

        if DEBUG:
            # Debug runs keep only the first five stocks.
            keep_rows = df_train_raw["stock_id"].isin([0, 1, 2, 3, 4])
            df_train_raw = df_train_raw[keep_rows]

        # Rows without a target cannot be trained on.
        df_train = df_train_raw.dropna(subset=["target"]).reset_index(drop=True)
    else:
        df_train = pd.read_csv(pm.path_dataset_processed)
        if DEBUG:
            keep_rows = df_train["stock_id"].isin([0, 1, 2, 3, 4])
            df_train = df_train[keep_rows]

    df_train.sort_values(["time_id", "stock_id"], inplace=True)

In [12]:
# Per-stock aggregates from the training frame; other_features reads this global
# and maps each entry onto stock_id.
if TRAIN:
    dict_global_stock_id_feats = global_stock_id_feats(df_train)

In [13]:
@njit(parallel=True)
def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    """Compute MACD, signal line and histogram independently for every column.

    Args:
        data: 2-D array (rows x cols); each column is processed as its own series.
        short_window: span of the fast EMA (smoothing factor 2 / (span + 1)).
        long_window: span of the slow EMA.
        signal_window: span of the EMA applied to the MACD line.

    Returns:
        (macd_values, signal_line_values, histogram_values), each of shape (rows, cols).

    All EMAs are seeded with 0 at row 0 and updated from row 1 onwards, so row 0 of
    every output is 0.
    NOTE(review): seeding with 0 instead of the first observation biases the early
    rows towards 0 - confirm this is intended.
    """
    rows, cols = data.shape
    macd_values = np.empty((rows, cols))
    signal_line_values = np.empty((rows, cols))
    histogram_values = np.empty((rows, cols))

    # Columns are independent of each other, hence the parallel loop.
    for i in prange(cols):
        short_ema = np.zeros(rows)
        long_ema = np.zeros(rows)

        # Recursive EMA update: ema[j] = ema[j-1] + alpha * (x[j] - ema[j-1]).
        for j in range(1, rows):
            short_ema[j] = (data[j, i] - short_ema[j - 1]) * (2 / (short_window + 1)) + short_ema[j - 1]
            long_ema[j] = (data[j, i] - long_ema[j - 1]) * (2 / (long_window + 1)) + long_ema[j - 1]

        macd_values[:, i] = short_ema - long_ema

        # Signal line: the same EMA recursion applied to the MACD line.
        signal_line = np.zeros(rows)
        for j in range(1, rows):
            signal_line[j] = (macd_values[j, i] - signal_line[j - 1]) * (2 / (signal_window + 1)) + signal_line[j - 1]

        signal_line_values[:, i] = signal_line
        histogram_values[:, i] = macd_values[:, i] - signal_line

    return macd_values, signal_line_values, histogram_values


def generate_macd(df):
    """Add per-stock MACD columns for every price column, writing into ``df``.

    For each price column ``c`` three columns are filled: ``macd_c``,
    ``macd_sig_c`` and ``macd_hist_c``, computed by ``calculate_macd`` on the
    rows of one stock at a time.

    Returns:
        The same DataFrame, mutated.
    """
    price_cols = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]

    for _, group in df.groupby(['stock_id'])[price_cols]:
        macd, signal, histogram = calculate_macd(group.values)
        rows = group.index
        # One (prefix, matrix) pair per output family, written in the original order.
        for prefix, matrix in (("macd_", macd), ("macd_sig_", signal), ("macd_hist_", histogram)):
            target_cols = [f'{prefix}{col}' for col in group.columns]
            df.loc[rows, target_cols] = matrix

    return df

def generate_rsi(df):
    """Add per-stock ``rsi_<price column>`` columns, writing into ``df``.

    The values come from ``calculate_rsi`` applied to the price columns of one
    stock at a time. This definition replaces the ``generate_rsi`` imported from
    ``fe_optiver_trading_at_the_close`` earlier in the notebook.

    Returns:
        The same DataFrame, mutated.
    """
    price_cols = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]

    for _, group in df.groupby(['stock_id'])[price_cols]:
        rsi_cols = [f'rsi_{col}' for col in group.columns]
        rsi_matrix = calculate_rsi(group.values)
        df.loc[group.index, rsi_cols] = rsi_matrix

    return df



@njit(parallel = True)
def calculate_rsi(prices, period=14):
    """Compute an RSI-style indicator independently for every column of ``prices``.

    Args:
        prices: 2-D array; each column is treated as one price series.
        period: look-back used for the initial averages and the smoothing.

    Returns:
        Array shaped (and typed) like ``prices``. The first ``period`` rows of a
        column all hold the value derived from the mean gain/loss of the first
        ``period`` deltas; later rows use the smoothed averages.
    """
    rsi_values = np.zeros_like(prices)

    for col in prange(prices.shape[1]):
        price_data = prices[:, col]
        # One-step price changes; delta[0] stays 0.
        delta = np.zeros_like(price_data)
        delta[1:] = price_data[1:] - price_data[:-1]
        gain = np.where(delta > 0, delta, 0)
        loss = np.where(delta < 0, -delta, 0)

        avg_gain = np.mean(gain[:period])
        avg_loss = np.mean(loss[:period])
        
        if avg_loss != 0:
            rs = avg_gain / avg_loss
        else:
            # NOTE(review): with no losses this makes the RSI come out near 0; the
            # conventional value in that case is 100 - confirm the intent.
            rs = 1e-9  # or any other appropriate default value
            
        rsi_values[:period, col] = 100 - (100 / (1 + rs))

        # Smoothed update: row i+1 is written from gain[i] / loss[i].
        # NOTE(review): the loop starts at period-1, whose gain/loss already went
        # into the initial mean - check for an off-by-one.
        # NOTE(review): avg_gain / avg_loss carry over between iterations, so this
        # loop is inherently sequential even though it is written with prange.
        for i in prange(period-1, len(price_data)-1):
            avg_gain = (avg_gain * (period - 1) + gain[i]) / period
            avg_loss = (avg_loss * (period - 1) + loss[i]) / period
            if avg_loss != 0:
                rs = avg_gain / avg_loss
            else:
                rs = 1e-9  # or any other appropriate default value
            rsi_values[i+1, col] = 100 - (100 / (1 + rs))

    return rsi_values



In [14]:
def order_flow_imbalance(df):
    """Add ``ofi`` (spread x size gap) and ``ofi_normalized`` (ofi / volume) to ``df``.

    Requires ``ask_price``, ``bid_price``, ``ask_size``, ``bid_size`` and ``volume``.
    Returns the same DataFrame, mutated.
    """
    spread = df['ask_price'] - df['bid_price']
    size_gap = df['ask_size'] - df['bid_size']
    df['ofi'] = spread * size_gap
    df['ofi_normalized'] = df['ofi'] / df['volume']
    return df

def trend_momentum_features(df):
    """Add per-stock EMA and MACD columns for ``wap``, ``ask_price`` and ``bid_price``.

    Columns added, in this order:
        ``<col>_ema_<span>`` for spans 5, 10, 15 (all three source columns first),
        then ``<col>_macd`` (EMA 12 minus EMA 26) and ``<col>_macd_signal``
        (EMA 9 of the MACD) for each source column.

    Returns the same DataFrame, mutated.
    """
    tracked_cols = ('wap', 'ask_price', 'bid_price')
    fast_window, slow_window, signal = 12, 26, 9

    def _ema(series, span):
        return series.ewm(span=span).mean()

    # Exponential moving averages.
    for col in tracked_cols:
        for window in (5, 10, 15):
            df[f'{col}_ema_{window}'] = df.groupby('stock_id')[col].transform(
                lambda s, span=window: _ema(s, span)
            )

    # MACD line and its signal line.
    for col in tracked_cols:
        macd_name = f'{col}_macd'
        df[macd_name] = df.groupby('stock_id')[col].transform(
            lambda s: _ema(s, fast_window) - _ema(s, slow_window)
        )
        df[f'{col}_macd_signal'] = df.groupby('stock_id')[macd_name].transform(
            lambda s: _ema(s, signal)
        )

    return df


def relative_strength_index(df, periods=14):
    """Add an ``rsi`` column computed per stock from ``wap``.

    Args:
        df: DataFrame with ``stock_id`` and ``wap``; rows of a stock must be in
            time order.
        periods: rolling window (in rows) for the average gain and loss.

    Returns:
        The same DataFrame with ``rsi`` added. Rows with fewer than ``periods``
        observations of their stock are NaN.
    """
    # Difference within each stock: a plain df['wap'].diff() would make the first
    # row of every stock compare against the last row of the previous stock.
    delta = df.groupby('stock_id')['wap'].diff(1)
    # NaN deltas (first row of a stock) fail both comparisons and count as 0.
    gain = (delta.where(delta > 0, 0)).groupby(df['stock_id']).transform(lambda x: x.rolling(periods).mean())
    loss = (-delta.where(delta < 0, 0)).groupby(df['stock_id']).transform(lambda x: x.rolling(periods).mean())
    rs = gain / loss
    df['rsi'] = 100 - (100 / (1 + rs))
    return df


import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

def calculate_trend(df):
    """Add ``trend``: the fitted slope of ``wap`` within each (stock_id, time_id) group.

    Every row of a group receives the same value. Returns the same DataFrame, mutated.
    """
    def _linear_coefficient(data, order=1):
        # Fit a polynomial against the row position; the second-to-last
        # coefficient is the linear term (the slope when order == 1).
        positions = range(len(data))
        coefficients = np.polyfit(positions, list(data), order)
        return float(coefficients[-2])

    wap_by_group = df.groupby(['stock_id', 'time_id'])['wap']
    df['trend'] = wap_by_group.transform(_linear_coefficient)
    return df



In [15]:
import pandas as pd

def calculate_market_pressure(df):
    """Add bid-minus-ask size and price gaps and their product (``market_pressure``).

    Returns the same DataFrame, mutated.
    """
    size_gap = df['bid_size'] - df['ask_size']
    price_gap = df['bid_price'] - df['ask_price']
    df['bid_ask_size_diff'] = size_gap
    df['bid_ask_price_diff'] = price_gap
    df['market_pressure'] = size_gap * price_gap
    return df

def calculate_imbalance_ratio(df):
    """Add ``imbalance_ratio`` = ``imbalance_size`` / ``matched_size``; returns ``df``."""
    ratio = df['imbalance_size'].div(df['matched_size'])
    df['imbalance_ratio'] = ratio
    return df

def calculate_auction_price_indicators(df):
    """Add ``far_price_deviation`` and ``near_price_deviation`` (price minus ``wap``).

    Returns the same DataFrame, mutated.
    """
    for side in ('far_price', 'near_price'):
        df[f'{side}_deviation'] = df[side] - df['wap']
    return df

def calculate_auction_time_series_features(df):
    """Add ``<col>_time_change``: row-to-row difference within each (stock_id, time_id) group.

    Covers ``imbalance_size``, ``matched_size``, ``far_price`` and ``near_price``.
    Returns the same DataFrame, mutated.
    """
    tracked_cols = ['imbalance_size', 'matched_size', 'far_price', 'near_price']
    # Difference all tracked columns in one grouped pass, then attach them one by one.
    changes = df.groupby(['stock_id', 'time_id'])[tracked_cols].diff()
    for col in tracked_cols:
        df[f'{col}_time_change'] = changes[col]
    return df



In [16]:
def imbalance_features(df):
    """Add order-book imbalance, spread, lag and diff features.

    New columns are assigned onto ``df`` itself; the value returned is the result
    of ``df.replace`` (not in place), with +/-inf replaced by 0.

    Reads the six price and four size columns listed below, plus ``stock_id``
    and ``imbalance_buy_sell_flag``. Lag/diff features are grouped by ``stock_id``
    and follow the row order of ``df``.
    """
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Normalised difference for every pair of price columns.
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet features from the project helper, once for four price columns and once for the sizes.
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
   
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    

        
    # Calculate various statistical aggregation features
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        

    # Per-stock lagged values and percentage changes.
    for col in ['matched_size', 'imbalance_size', 'reference_price', "wap"]:
        for window in [1, 2, 3,6, 10,15]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    print("Calculate diff features for specific columns...")
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'imbalance_momentum', 'size_imbalance']:
        for window in [1, 2, 3,6, 10,15]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    # NOTE: price_delta, price_spread and bid_ask_spread are all ask_price - bid_price.
    df["price_delta"] = df["ask_price"] - df["bid_price"]
    df["size_delta"] = df["ask_size"] - df["bid_size"]
    # Absolute value of the buy/sell flag.
    df["auction_imbalance_ratio"] = df["imbalance_buy_sell_flag"].abs()
    
    df["volume_weighted_imbalance"] = (df["imbalance_size"] * df["wap"]) / df["volume"]

    
    # Bid-Ask Spread Over Time
    df["bid_ask_spread"] = df["ask_price"] - df["bid_price"]
    df["bid_ask_spread_change"] = df.groupby("stock_id")["bid_ask_spread"].diff()

    # Order Book Density
    df["order_book_density"] = (df["bid_size"] + df["ask_size"]) / df["bid_ask_spread"]
        
    # Divisions above can produce +/-inf (e.g. a zero spread); those become 0.
    return df.replace([np.inf, -np.inf], 0)

def other_features(df):
    """Add calendar-style columns and the per-stock global features.

    Adds ``dow`` (``date_id`` mod 5), ``seconds`` (``seconds_in_bucket`` mod 60)
    and one ``global_<key>`` column per entry of the module-level
    ``dict_global_stock_id_feats``, looked up by ``stock_id``.

    Returns the same DataFrame, mutated.
    """
    df["dow"] = df["date_id"] % 5  # position in a five-day cycle
    df["seconds"] = df["seconds_in_bucket"] % 60
    for name, per_stock in dict_global_stock_id_feats.items():
        lookup = per_stock.to_dict()
        df[f"global_{name}"] = df["stock_id"].map(lookup)

    return df




In [17]:
 def rolling_polars(df, list_cols, col_group_by, index_column):
    """Append rolling mean / std columns computed per group with polars.

    For every column in ``list_cols`` this adds ``<col>_mean_<w>`` for windows
    3, 5 and 9 and ``<col>_std_<w>`` for window 9 only (std is skipped for
    windows of 6 rows or fewer). Windows are counted in index steps and are
    closed on the left, so the current row is not part of its own window.

    Args:
        df: polars DataFrame, sorted by ``index_column`` within each group.
        list_cols: names of the columns to aggregate.
        col_group_by: column name or list of names defining the groups.
        index_column: integer column the rolling window moves along.

    Returns:
        The DataFrame with the rolling columns joined on.
    """
    # Join on the same keys the rolling aggregation is grouped/indexed by, rather
    # than on hard-coded column names.
    group_keys = [col_group_by] if isinstance(col_group_by, str) else list(col_group_by)
    join_keys = group_keys + [index_column]

    for col in list_cols:
        for function in ["mean", "std"]:
            print(f"rolling {function} ...")
            for window in [3, 5, 9]:
                if function == "mean":
                    aggregation = pl.col(col).mean().alias(f"{col}_mean_{window}")
                elif window > 6:
                    aggregation = pl.col(col).std().alias(f"{col}_std_{window}")
                else:
                    # No std feature for short windows: skip building the rolling group.
                    continue

                rolling_group = df.group_by_rolling(
                    index_column=index_column,
                    period=f"{window}i",  # 'i' denotes index count (integer)
                    by=col_group_by,
                    closed="left",
                )
                df = df.join(rolling_group.agg(aggregation), on=join_keys, how="left")
    return df

In [18]:


def feat_engineering(df):
    """Run the full feature pipeline and return the engineered frame.

    Steps: drop ``row_id``, add imbalance and calendar/global features, compute
    polars rolling statistics, then add the pandas-based momentum, RSI, auction
    and WAP features.

    NOTE: the frame is sorted by (stock_id, time_id) for the polars step and is
    not re-sorted afterwards, so the returned rows are ordered by stock first,
    then time - not in the order of the input.
    ``other_features`` reads the module-level ``dict_global_stock_id_feats``.
    """
    # Select relevant columns for feature generation
    cols = [c for c in df.columns if c not in ["row_id"]]
    # NOTE(review): df[cols] may be a copy/view of the caller's frame and the helpers
    # below assign into it; warnings are silenced globally - confirm this is safe.
    df = df[cols]
    
    # Generate imbalance features
    df = imbalance_features(df)
    df = other_features(df)
    
    

    df_pl = pl.DataFrame(df)
    df_pl = df_pl.sort(["stock_id", "time_id"])

    # seconds_in_bucket is rescaled (divided by 10) and cast to Int32; this happens
    # after other_features has already derived "seconds" from the original values.
    df_pl = df_pl.with_columns(
        (pl.col("seconds_in_bucket") / 10).cast(pl.Int32).alias("seconds_in_bucket")
    )
    # The rolling index column is cast to Int32 before the rolling step.
    df_pl = df_pl.with_columns(
        pl.col("time_id").cast(pl.Int32).alias("time_id")
    )

    df_pl = rolling_polars(df_pl, ["wap","volume","price_delta","size_delta"], ["stock_id"], "time_id")

    df = df_pl.to_pandas()

    gc.collect()  # free the intermediate polars frame early
    df = order_flow_imbalance(df)
    
    df = trend_momentum_features(df)
    df = relative_strength_index(df, periods=14)
    #df = calculate_trend(df)
    
    gc.collect()  
    
    # The message below ("apply the functions that generate new features") is emitted as-is.
    print("Applica le funzioni per generare nuove caratteristiche...")
    df = calculate_market_pressure(df)
    df = calculate_imbalance_ratio(df)
    df = calculate_auction_price_indicators(df)
    df = calculate_auction_time_series_features(df)
   
    # Past percentage change of WAP (per stock)
    df['wap_change'] = df.groupby('stock_id')['wap'].pct_change()

    # WAP momentum: difference from the previous row of the same stock
    df['wap_momentum'] = df['wap'] - df.groupby('stock_id')['wap'].shift(1)

    # WAP volatility: 5-row rolling standard deviation per stock
    df['wap_volatility'] = df.groupby('stock_id')['wap'].rolling(window=5).std().reset_index(level=0, drop=True)

    # Ratio between WAP and volume
    df['wap_volume_ratio'] = df['wap'] / df['volume']

    # Dynamic imbalance: signed imbalance size
    df['dynamic_imbalance'] = df['imbalance_buy_sell_flag'] * df['imbalance_size']

    list_cols = [i for i in df.columns if i not in ["row_id"]]
    gc.collect()  
    
    return df[list_cols]
    

In [19]:
# Build the training feature frame, then shrink its memory footprint.
if TRAIN:
    df_train_feats = feat_engineering(df_train)
    print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

In [20]:
# Add the integer "factorized" column that the cross-validation masks are built on,
# then drop the raw id columns so they are not used as features.
if TRAIN:
    col_split = "time_id"
    # Sort first: factorize numbers values in order of first appearance, so after
    # this sort the codes increase with time_id.
    df_train_feats.sort_values([col_split], inplace=True)

    df_train_feats["factorized"] = pd.factorize(df_train_feats[col_split])[0]

    df_train_feats.sort_values(["time_id", "stock_id"], inplace=True)
    list_cols_drop = ["date_id", "time_id"]

    df_train_feats.reset_index(drop=True, inplace=True)
    df_train_feats.drop(list_cols_drop, axis=1, inplace=True)

In [21]:
# Create / look up the MLflow experiment and assemble the `args` dictionary
# consumed by run_mlflow_experiment.
if TRAIN:
    experiment_name = f"{EXPERIMENT_PURPOSE}_v{VERSION_NB}"
    name_folder_models = f"models_v{VERSION_NB}"

    experiment_date_str = datetime.now().strftime("%y_%m_%d_%H%M")

    if DEBUG:
        experiment_name = f"{experiment_name}_debug"
        name_folder_models = f"{name_folder_models}_debug"
        path_artifact_location = "."
    else:
        path_artifact_location = pm.path_artifact_location

    # experiment_id is read as a global by run_mlflow_experiment.
    experiment_id = get_or_create_experiment(
        client, experiment_name, artifact_location=path_artifact_location
    )

    # NOTE(review): nbrnd_erly_stp, mlflow_callback, cv and log_model are not referenced
    # by any other cell visible in this notebook - confirm whether they are still needed.
    nbrnd_erly_stp = 130
    cv_mthd = "KF"

    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )

    all_cv = {"KF": KFold(n_splits=5, shuffle=True, random_state=STATE)}
    cv = all_cv[cv_mthd]

    dict_models = {"LGBMR": LGBMR}

    log_model = True

    args = {
        "cv_mthd": cv_mthd,
        "experiment_purpose": EXPERIMENT_PURPOSE,
        "experiment_name": experiment_name,
        "dict_models": dict_models,
        "model_params_dict": model_params_dict,
        "n_splits": N_SPLITS,
        "n_test_splits": N_TEST_SPLITS,
        "n_purge": N_PURGE,
        "n_embargo": N_EMBARGO,
        "experiment_date_str": experiment_date_str,
        # NOTE(review): this always uses pm.path_artifact_location, even in DEBUG where the
        # experiment itself was created with "." - confirm which location is intended.
        "path_artifact_location": pm.path_artifact_location,
        "target_col": "target",
    }

In [22]:
def run_mlflow_experiment(df_train, args, trial=None):
    """Train and score every configured model over the time-series folds, logging to MLflow.

    One parent run is opened per call; every fold gets a nested run holding its
    score, parameters, feature importances and the fitted model.

    Args:
        df_train: feature frame containing the target column and the integer
            ``factorized`` column the fold indices refer to.
        args: dict with the keys ``experiment_purpose``, ``dict_models``,
            ``model_params_dict``, ``n_splits``, ``n_test_splits``, ``n_purge``,
            ``n_embargo``, ``experiment_date_str``, ``path_artifact_location``
            and ``target_col``.
        trial: Optuna trial supplying the dynamic parameters. When omitted, a
            ``FixedTrial`` with default values is used.

    Returns:
        float: mean of the fold scores collected so far (across all models).

    Raises:
        ValueError: if the splitter produced no fold, so there is nothing to score.

    Also reads the module-level ``experiment_id``, ``TUNING``, ``GPU_SWITCH``,
    ``N_FOLD_BREAK`` and ``model_prod``.
    """
    experiment_purpose = args["experiment_purpose"]
    dict_models = args["dict_models"]
    model_params_dict = args["model_params_dict"]

    n_splits = args["n_splits"]
    n_test_splits = args["n_test_splits"]
    n_purge = args["n_purge"]
    n_embargo = args["n_embargo"]

    experiment_date_str = args["experiment_date_str"]
    path_artifact_location = args["path_artifact_location"]
    target_col = args["target_col"]

    if trial is None:
        trial = optuna.trial.FixedTrial(
            {
                "n_estimators": 500,
                "learning_rate": 0.005,
                "max_depth": 10,
                "num_leaves": 20,
                "min_child_samples": 10,
                "subsample": 0.7,
                "colsample_bytree": 1.0,
                "min_split_gain": 0.0,
                "reg_alpha": 0.0,
                "reg_lambda": 0.0,
                "device": "gpu" if GPU_SWITCH == "ON" else "cpu",
            }
        )

    run_time_start_trial = datetime.now().strftime("%y_%m_%d_%H%M%S")

    with mlflow.start_run(run_name=run_time_start_trial, experiment_id=experiment_id):
        score_list = []
        avg_score = None  # stays None until at least one fold has been scored

        mlflow.set_tag("n_splits", n_splits)
        mlflow.set_tag("n_test_splits", n_test_splits)
        mlflow.set_tag("n_purge", n_purge)
        mlflow.set_tag("n_embargo", n_embargo)

        for model_name, model_class in dict_models.items():
            if TUNING:
                model = create_model(
                    trial,
                    model_class,
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )
            else:
                # Reuse the production model's parameters, overriding only the device.
                params = model_prod.get_params()
                params["device"] = "gpu" if GPU_SWITCH == "ON" else "cpu"

                model = model_class(**params)

            priority_params = ["learning_rate", "max_depth"]
            excluded_params = [
                "device",
                "class_weight",
                "random_state",
                "silent",
                "verbose",
                "n_jobs",
            ]

            ordered_params = log_model_parameters(
                model, priority_params, excluded_params, verbose=True
            )

            mlflow.log_params(ordered_params)

            fold_splits = time_series_split(
                df_train,
                n_splits=n_splits,
                n_test_splits=n_test_splits,
                n_purge=n_purge,
                n_embargo=n_embargo,
            )
            for fold_n, (train_indices, test_indices) in enumerate(fold_splits):
                with mlflow.start_run(
                    run_name=f"{run_time_start_trial}_fold_{fold_n+1}",
                    nested=True,
                    experiment_id=experiment_id,
                ) as nested_run:
                    mlflow.set_tag("n_trial", str(trial.number))

                    # Fold indices refer to the "factorized" codes, not to row positions.
                    mask_train = df_train["factorized"].isin(train_indices)
                    mask_test = df_train["factorized"].isin(test_indices)

                    y_train = df_train.loc[mask_train, target_col]
                    y_val = df_train.loc[mask_test, target_col]
                    X_train = df_train.loc[mask_train].drop(
                        [target_col, "factorized"], axis=1
                    )
                    X_val = df_train.loc[mask_test].drop(
                        [target_col, "factorized"], axis=1
                    )

                    print(f"X_train shape: {X_train.shape}")

                    mlflow.log_param("train_rows", X_train.shape[0])
                    mlflow.log_param("train_cols", X_train.shape[1])

                    model.fit(
                        X_train,
                        y_train,
                        eval_set=[(X_val, y_val)],
                        eval_metric="mae",
                        callbacks=[
                            lgbm.callback.early_stopping(stopping_rounds=100),
                            lgbm.callback.log_evaluation(period=100),
                        ],
                    )

                    log_feature_importance(
                        trial.number,
                        model,
                        X_train,
                        fold_n,
                        experiment_purpose,
                        experiment_date_str,
                    )

                    # Release the large training slices before logging.
                    del mask_train, mask_test, X_train, y_train

                    fold_score = model.best_score_["valid_0"]["l1"]

                    score_list.append(fold_score)

                    mlflow.log_metric("fold_score", round(fold_score, 6))
                    mlflow.log_param("fold_number", fold_n + 1)
                    mlflow.log_param("model_name", model_name)

                    mlflow.log_params(ordered_params)

                    current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                    model_log_name = f"{model_name}_{trial.number}_{current_time_str}"

                    mlflow.sklearn.log_model(model, model_log_name)

                    mlflow.log_param("run_time", current_time_str)

                    nested_run_id = nested_run.info.run_id
                    model_path = f"{path_artifact_location}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                    mlflow.log_param("model_path", model_path)

                # Back in the parent run: refresh the running aggregates after each fold.
                avg_score = sum(score_list) / len(score_list)
                median_score = np.median(score_list)
                mlflow.log_metric("avg score", round(avg_score, 6))
                mlflow.log_metric("median score", round(median_score, 6))
                gc.collect()
                if fold_n >= N_FOLD_BREAK:
                    break

        if avg_score is None:
            raise ValueError(
                "No cross-validation fold was produced; cannot compute an average score."
            )

        return avg_score


def objective(trial, df_train):
    """Optuna objective: score ``trial`` on ``df_train`` using the module-level ``args``.

    Returns the average fold score reported by ``run_mlflow_experiment``.
    """
    return run_mlflow_experiment(df_train, args, trial)


# Run the Optuna study
# Minimises the average fold score returned by `objective` over N_TRIALS trials.
# NOTE(review): no `storage` is passed, so the study presumably lives in memory only and
# `load_if_exists` has nothing to load from - confirm against the Optuna documentation.
if TRAIN:
    study = optuna.create_study(
        direction="minimize",
        study_name="Your Study Name",
        load_if_exists=True,
    )
    study.optimize(lambda trial: objective(trial, df_train_feats), n_trials=N_TRIALS)

In [23]:
# Collect the logged runs into a tidy table: one row per run, with the base
# columns, the tuned hyper-parameters and the stored model path.
if TRAIN:
    df_exp = experiments_data(
        client, list_experiment_id=None, save_df=None, list_columns=None
    )
    list_base_cols = [
        "run_time",
        "experiment_id",
        "n_trial",
        "run_id",
        "model_name",
        "fold_number",
        "fold_score",
    ]
    list_dynamic_params = list(model_params_dict["LGBMR"]["dynamic_params"].keys())

    df_exp["run_time"] = pd.to_datetime(
        df_exp["run_time"], format="%Y%m%d_%H%M%S", errors="coerce"
    )

    # Convert columns to numbers where every value parses; leave the others untouched.
    # run_time is skipped so the timestamps parsed above are not coerced as well.
    for col in df_exp.columns:
        if col == "run_time":
            continue
        try:
            df_exp[col] = pd.to_numeric(df_exp[col])
        except (ValueError, TypeError):
            pass

    for col in df_exp.select_dtypes(include=["float", "int"]):
        df_exp[col] = df_exp[col].round(5)

    # list_base_cols already starts with "run_time"; adding it again would select
    # the column twice.
    list_cols_exp = list_base_cols + list_dynamic_params + ["model_path"]

    df_exp = df_exp[df_exp["experiment_id"] != 0]

    df_exp = df_exp[list_cols_exp]

In [24]:
# Inference path: load the competition training CSV (overwrites any df_train built
# by the TRAIN cells above).
df_train = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")

In [25]:
# Per-stock aggregates read by other_features during inference.
dict_global_stock_id_feats = global_stock_id_feats(df_train)

In [26]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so that they sum to zero, weighting the shift by sqrt(volume).

    Each element is reduced by its share of the total, where the shares are
    proportional to the square root of the matching entry in ``volumes``.

    Args:
        prices: array-like of values to adjust.
        volumes: array-like of non-negative weights, same length as ``prices``.

    Returns:
        The adjusted values (``prices - sqrt(volumes) * sum(prices) / sum(sqrt(volumes))``).
    """
    weights = np.sqrt(volumes)
    per_unit_shift = np.sum(prices) / np.sum(weights)
    return prices - weights * per_unit_shift

import optiver2023

# Submission loop: the competition API yields one batch per time step; each batch
# is appended to a rolling per-stock cache, features are rebuilt on the cache, and
# the production ensemble predicts the rows of the current step.
env = optiver2023.make_env()
iter_test = env.iter_test()
counter = 0
y_min, y_max = -64, 64
qps, predictions = [], []
cache = pd.DataFrame()

# Feature names the ensemble members were fitted on; constant across batches.
list_features = model_prod.models[0].feature_name_

for (test, revealed_targets, sample_prediction) in iter_test:
    now_time = time.time()

    # The loop counter doubles as the time index used by the rolling features.
    test['time_id'] = counter

    cache = pd.concat([cache, test], ignore_index=True, axis=0)
    if counter > 0:
        # Keep only the latest 21 rows of every stock.
        cache = (
            cache.groupby(['stock_id'])
            .tail(21)
            .sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id'])
            .reset_index(drop=True)
        )

    # added after new API, reference: https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/455690#2526672
    if not test.currently_scored.iloc[0]:
        # Unscored batch: submit zeros and skip the feature computation (the cache
        # has already been updated above).
        sample_prediction['target'] = 0
    else:
        feat = feat_engineering(cache)
        # feat_engineering returns rows sorted by (stock_id, time_id), so the rows of
        # the current step are NOT the last len(test) rows. Select them by time_id
        # and put them in the same stock order as `test`.
        feat = feat[feat["time_id"] == counter]
        feat = feat.set_index("stock_id").loc[test["stock_id"].to_numpy()].reset_index()
        feat = feat[list_features]

        lgb_predictions = model_prod.predict(feat, "mean")

        # Plain arrays keep the adjustment and the assignment positional.
        volumes = (test['bid_size'] + test['ask_size']).to_numpy()
        lgb_predictions = zero_sum(lgb_predictions, volumes)
        sample_prediction['target'] = np.clip(lgb_predictions, y_min, y_max)

    env.predict(sample_prediction)
    counter += 1
    qps.append(time.time() - now_time)
    if counter % 10 == 0:
        print(counter, 'qps:', np.mean(qps))

# NOTE(review): 1.146 is an unexplained scaling constant from seconds-per-batch to hours - confirm.
time_cost = 1.146 * np.mean(qps)
print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
Calculate diff features for specific columns...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
Applica le funzioni per generare nuove caratteristiche...
Calculate diff features for specific columns...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
Applica le funzioni per generare nuove caratteristiche...
Calculate diff features for specific columns...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
Applica le funzioni per generare nuove caratteristiche...
Calculate diff features for specific columns...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean ...
rolling std ...
rolling mean .

In [27]:
# clean_directory_except_one('/kaggle/working/','submission.csv')