In [1]:
# Constants and Configuration Variables
DEBUG = True
TRAIN = True
TUNING = False
OVERWRITE_PROCESSED_DATA = True
N_TRIALS = 10
STATE = 42

VERSION_NB = 1
EXPERIMENT_PURPOSE = "optiver_trading_at_the_close"


# External general-purpose modules
import gc
import zipfile
import shutil
import os
import itertools as itt
from itertools import combinations, product
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import joblib
from pathlib import Path
import warnings

from dotenv import load_dotenv

# Setting pandas options and warning filters
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("AWS_ACCESS_KEY_ID")
secret_value_1 = user_secrets.get_secret("AWS_DEFAULT_REGION")
secret_value_2 = user_secrets.get_secret("AWS_SECRET_ACCESS_KEY")
s3_bucket_name = user_secrets.get_secret("S3_BUCKET")

# Set AWS credentials in the environment variables
os.environ['AWS_ACCESS_KEY_ID'] = secret_value_0
os.environ['AWS_SECRET_ACCESS_KEY'] = secret_value_2
os.environ['AWS_DEFAULT_REGION'] = secret_value_1

# Load environment variables
load_dotenv()
path_root_project = Path.cwd()
if path_root_project.name not in ["working", "content"]:
    path_root_project = Path(os.getenv("ROOT_PATH") or path_root_project)
    
    


In [2]:
class PathManager:
    def __init__(self, path_project_dir):
        self.path_root_project = Path(path_project_dir)

        self.path_data_dir = None
        self.path_experiments_dir = None
        self.name_train_file = None
        self.name_test_file = None

        self.name_project_dir = None

        self.path_experiments_dir = (
            "http://ec2-13-38-228-107.eu-west-3.compute.amazonaws.com:5000"
        )
        self.path_artifact_location = (
            f"s3://mlflow-v1/kaggle_optiver_trading_at_the_close/"
        )

        self.initialize_paths()

    def initialize_paths(self):
        if self.path_root_project.name == "working":
            self.setup_kaggle_paths()
        else:
            self.setup_local_paths()

    def setup_kaggle_paths(self):
        self.name_project_dir = "optiver-trading-at-the-close"

        self.name_train_file = "train.csv"
        self.name_test_file = "test.csv"

        self.data_dir = Path("/kaggle/input") / self.name_project_dir
        self.path_data_train_raw = self.data_dir / self.name_train_file
        self.path_data_test_raw = self.data_dir / self.name_test_file

        self.path_dataset_processed = "/kaggle/working/processed_data"
        # path_dataset_train = os.path.join(path_dataset_processed, "train.csv")
        # path_dataset_test = os.path.join(path_dataset_processed, "test.csv")

    def setup_local_paths(self):
        self.name_project_dir = "kaggle_optiver_trading_at_the_close"
        self.data_dir = self.path_root_project / "data" / self.name_project_dir

        self.path_data_train_raw = self.data_dir / "raw" / "train.csv"
        self.path_data_test_raw = self.data_dir / "raw" / "test.csv"

        self.path_dataset_processed = self.path_root_project / "data" / "processed"
        # path_dataset_train = path_dataset_processed / "train.csv"
        # path_dataset_test = path_dataset_processed / "test.csv"
        
pm = PathManager(path_root_project)

In [3]:

# Conditional imports and settings based on TRAIN constant
if TRAIN:
    if pm.path_root_project.name == "working":
        !pip install loguru mlflow optuna > /dev/null

    # External Libraries
    import boto3
    import lightgbm as lgbm
    import mlflow
    import optuna
    from mlflow.tracking import MlflowClient
    from optuna.integration.mlflow import MLflowCallback
    from sklearn.model_selection import KFold
    from tqdm import tqdm
    from xgboost import XGBRegressor as XGBR
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR

    # Local Modules Imports
    from utils import log_feature_importance, create_model, log_training_details, aggregate_feature_importance, get_data, clean_directory_except_one, experiments_data, reduce_mem_usage
    from fe_optiver_trading_at_the_close import calculate_triplet_imbalance_numba, compute_triplet_imbalance, convert_weights_to_dict
    
    # Auto-reload modules - Specific to Jupyter Notebooks
    %load_ext autoreload
    %autoreload 2

    mlflow.set_tracking_uri(pm.path_experiments_dir)
    client = MlflowClient()
    
    s3_client = boto3.client('s3')


In [4]:
if TRAIN:
    if not os.path.exists(pm.path_dataset_processed) or OVERWRITE_PROCESSED_DATA:
        df_train_raw = pd.read_csv(pm.path_data_train_raw)

        if DEBUG:
            df_train_raw = df_train_raw[df_train_raw["stock_id"].isin([0, 1, 2])]

        drop_idx = df_train_raw.loc[
            df_train_raw["target"].isna(), "target"
        ].index.to_list()
        df_train = df_train_raw.drop(drop_idx, axis=0)
        df_train.reset_index(drop=True, inplace=True)
    else:
        df_train = pd.read_csv(pm.path_dataset_processed)
        if DEBUG:
            df_train = df_train[df_train["stock_id"].isin([0, 1, 2])]

In [5]:
if TRAIN:
    global_stock_id_feats = {
        "median_size": df_train.groupby("stock_id")["bid_size"].median()
        + df_train.groupby("stock_id")["ask_size"].median(),
        "std_size": df_train.groupby("stock_id")["bid_size"].std()
        + df_train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df_train.groupby("stock_id")["bid_size"].max()
        - df_train.groupby("stock_id")["bid_size"].min(),
        "median_price": df_train.groupby("stock_id")["bid_price"].median()
        + df_train.groupby("stock_id")["ask_price"].median(),
        "std_price": df_train.groupby("stock_id")["bid_price"].std()
        + df_train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df_train.groupby("stock_id")["bid_price"].max()
        - df_train.groupby("stock_id")["ask_price"].min(),
    }

In [6]:

def get_experiments_df(client):
    experiments =client.search_experiments()
    data = []
    for exp in experiments:
        exp_detail = {
            "Experiment ID": exp.experiment_id,
            "Creation Time": datetime.fromtimestamp(exp.creation_time / 1000.0),
            "Name": exp.name,
            "Artifact Location": exp.artifact_location,
            "Lifecycle Stage": exp.lifecycle_stage
        }
        data.append(exp_detail)

    df = pd.DataFrame(data)
    return df

df_experiments = get_experiments_df(client)


In [7]:
df_experiments

Unnamed: 0,Experiment ID,Creation Time,Name,Artifact Location,Lifecycle Stage
0,462236245386961179,2023-11-11 13:50:58.934,optiver_trading_at_the_close_v1,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...,active
1,275946284308535692,2023-11-11 13:44:48.776,23_11_11_1344_optiver_trading_at_the_close,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...,active
2,592537742477651992,2023-11-11 13:39:12.211,23_11_11_1339_optiver_trading_at_the_close,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...,active
3,500744986040112169,2023-11-11 13:38:11.856,23_11_11_1338_optiver_trading_at_the_close,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...,active
4,195291970476306379,2023-11-11 13:37:52.807,23_11_11_1337_optiver_trading_at_the_close,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...,active
5,0,2023-11-10 19:26:01.927,Default,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...,active


In [8]:
import boto3
from mlflow.tracking import MlflowClient
from mlflow.exceptions import MlflowException

def delete_experiment_and_artifacts(experiment_ids, bucket_name):
    client = MlflowClient()
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)

    for exp_id in experiment_ids:
        try:
            # Get experiment data to find the artifact location
            experiment_data = client.get_experiment(exp_id)
            artifact_uri = experiment_data.artifact_location

            # Assuming the artifact URI is an S3 path
            if "s3://" in artifact_uri:
                artifact_path = artifact_uri.replace(f"s3://{bucket_name}/", "")

                # Delete artifacts from S3 corresponding to the experiment ID
                # It's crucial that the artifact_path is specific to the experiment
                if exp_id in artifact_path:
                    bucket.objects.filter(Prefix=artifact_path).delete()

            # Delete the experiment from MLflow
            client.delete_experiment(exp_id)
            print(f"Deleted experiment {exp_id} and its artifacts.")

        except MlflowException as e:
            print(f"Error deleting experiment {exp_id}: {e}")

# Example usage
experiment_ids_to_delete = ["521097711691933856"]  
delete_experiment_and_artifacts(experiment_ids_to_delete, s3_bucket_name)


Error deleting experiment 521097711691933856: RESOURCE_DOES_NOT_EXIST: Could not find experiment with ID 521097711691933856


In [9]:
# generate imbalance features
def imbalance_features(df):
    prices = [
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval(
        "(imbalance_size-matched_size)/(matched_size+imbalance_size)"
    )
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [["ask_price", "bid_price", "wap", "reference_price"], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # V2
    weights = convert_weights_to_dict()
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df["wap_momentum"] = df.groupby("stock_id")["weighted_wap"].pct_change(periods=6)
    df["imbalance_momentum"] = (
        df.groupby(["stock_id"])["imbalance_size"].diff(periods=1) / df["matched_size"]
    )
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(["stock_id"])["price_spread"].diff()
    df["price_pressure"] = df["imbalance_size"] * (df["ask_price"] - df["bid_price"])
    df["market_urgency"] = df["price_spread"] * df["liquidity_imbalance"]
    df["depth_pressure"] = (df["ask_size"] - df["bid_size"]) * (
        df["far_price"] - df["near_price"]
    )
    df["spread_depth_ratio"] = (df["ask_price"] - df["bid_price"]) / (
        df["bid_size"] + df["ask_size"]
    )
    df["mid_price_movement"] = (
        df["mid_price"]
        .diff(periods=5)
        .apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    )
    df["micro_price"] = (
        (df["bid_price"] * df["ask_size"]) + (df["ask_price"] * df["bid_size"])
    ) / (df["bid_size"] + df["ask_size"])
    df["relative_spread"] = (df["ask_price"] - df["bid_price"]) / df["wap"]

    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # V3
    for col in [
        "matched_size",
        "imbalance_size",
        "reference_price",
        "imbalance_buy_sell_flag",
    ]:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby(["stock_id"])[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby(["stock_id"])[col].pct_change(window)

    for col in [
        "ask_price",
        "bid_price",
        "ask_size",
        "bid_size",
        "wap",
        "near_price",
        "far_price",
    ]:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby(["stock_id"])[col].diff(window)

    return df.replace([np.inf, -np.inf], 0)


# generate time & stock features
def other_features(df):
    df["dow"] = df["date_id"] % 5
    df["dom"] = df["date_id"] % 20
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60

    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df


# generate all features
def feat_engineering(df):
    cols = [c for c in df.columns if c not in ["row_id"]]
    df = df[cols]
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    list_cols = [i for i in df.columns if i not in ["row_id"]]

    return df[list_cols]

In [10]:
if TRAIN:
    df_train_feats = feat_engineering(df_train)
    print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

    df_train = df_train_feats.copy()

    del df_train_feats

Build Online Train Feats Finished.


In [11]:
def time_series_split(X, n_splits, n_test_splits, embargo_td=2):
    factorized_indices = np.unique(X["factorized"])

    # Compute the fold boundaries
    fold_bounds = [
        (fold[0], fold[-1] + 1) for fold in np.array_split(factorized_indices, n_splits)
    ]

    # Create the list of all tests test_fold_bounds that will become the test sets
    selected_fold_bounds = list(itt.combinations(fold_bounds, n_test_splits))

    # Reverse to start the testing from the most recent part of the dataset
    selected_fold_bounds.reverse()

    for fold_bound_list in selected_fold_bounds:
        test_factorized_indices = np.empty(0)
        test_fold_bounds = []

        for fold_start, fold_end in fold_bound_list:
            # Records the boundaries of the current test split
            if not test_fold_bounds or fold_start != test_fold_bounds[-1][-1]:
                test_fold_bounds.append((fold_start, fold_end))
            elif fold_start == test_fold_bounds[-1][-1]:
                test_fold_bounds[-1] = (test_fold_bounds[-1][0], fold_end)

            test_factorized_indices = np.union1d(
                test_factorized_indices, factorized_indices[fold_start:fold_end]
            ).astype(int)

        # Compute the train set indices
        train_indices = np.setdiff1d(factorized_indices, test_factorized_indices)

        # Purge and embargo can be added here if needed
        # ...

        yield train_indices, test_factorized_indices

In [12]:
if TRAIN:
    col_split = "date_id"
    df_train.sort_values([col_split], inplace=True)
    df_train.reset_index(drop=True, inplace=True)
    df_train["factorized"] = pd.factorize(df_train[col_split])[0]

    list_cols_drop = ["date_id", "time_id"]
    df_train.drop(list_cols_drop, axis=1, inplace=True)

In [13]:
if DEBUG:
    n_estimators_min = n_estimators_max = 50
else:
    n_estimators_min = 100
    n_estimators_max = 100

In [14]:
from collections import OrderedDict
import mlflow

def log_model_parameters(model, priority_params, excluded_params):
    """
    Logs the model's parameters to MLflow, with priority parameters logged first.

    Parameters:
    model: The model object with a get_params() method.
    priority_params (list): A list of parameter names to log first.
    excluded_params (list): A list of parameter names to exclude from logging.
    """
    params_to_log = model.get_params()
    
    # Create an OrderedDict to keep the priority parameters first
    ordered_params = OrderedDict()

    # Add the priority parameters with rounding if they are floats or ints
    for key in priority_params:
        if key in params_to_log:
            value = params_to_log[key]
            if isinstance(value, (int, float)):
                ordered_params[key] = round(value, 5)
            else:
                ordered_params[key] = value

    # Add the remaining parameters, excluding the ones in excluded_params and already added priority keys
    for key, value in params_to_log.items():
        if key not in excluded_params and key not in priority_params:
            if isinstance(value, (int, float)):
                ordered_params[key] = round(value, 5)
            else:
                ordered_params[key] = value

    return ordered_params
    


In [18]:
def get_or_create_experiment(client, experiment_name, artifact_location):
    """
    Get the ID of an existing MLflow experiment with the given name or create a new one if it
    does not exist.

    Parameters:
    experiment_name (str): The name of the experiment.
    artifact_location (str): The location for storing artifacts for the experiment.

    Returns:
    str: The experiment ID of the existing or newly created experiment.
    """

    try:
        # Check if the experiment already exists
        experiment = client.get_experiment_by_name(experiment_name)
        if experiment:
            experiment_id = experiment.experiment_id
            print(f"Experiment '{experiment_name}' already exists with ID {experiment_id}.")
        else:
            # If the experiment does not exist, create it
            experiment_id = client.create_experiment(name=experiment_name, artifact_location=artifact_location)
            print(f"Created new experiment with ID {experiment_id}.")
            
    except MlflowException as e:
        raise e
    
    return experiment_id

In [21]:
experiment_name = f"{EXPERIMENT_PURPOSE}_v{VERSION_NB}"
name_folder_models = f"models_v{VERSION_NB}"

experiment_date_str = datetime.now().strftime("%y_%m_%d_%H%M")
experiment_id = get_or_create_experiment(client, experiment_name, artifact_location=pm.path_artifact_location)


gpu_switch = "OFF"
n_splits = 6
n_test_split = 1
embargo_td = 100

n_repeats = 1
nbrnd_erly_stp = 130

cv_mthd = "KF"

# Cross-Validation Setup
if TRAIN:
    # Initialize MLflow callback
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )

    all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=STATE)}
    cv = all_cv[cv_mthd]

    model_params_dict = {
        "LGBMR": {
            "static_params": {
                "device": "gpu" if gpu_switch == "ON" else "cpu",
                "objective": "mae",
                "boosting_type": "gbdt",
                "random_state": STATE,
                "n_jobs": 4,
                "verbose": -1,
                "importance_type": "gain",
            },
            "dynamic_params": {
                "n_estimators": {
                    "type": "int",
                    "low": n_estimators_min,
                    "high": n_estimators_max,
                },
                "learning_rate": {
                    "type": "float",
                    "low": 0.005,
                    "high": 0.06,
                },
                "max_depth": {"type": "int", "low": 10, "high": 90},
                "num_leaves": {
                    "type": "int",
                    "low": 20,
                    "high": 90,
                },
                "min_child_samples": {
                    "type": "int",
                    "low": 10,
                    "high": 70,
                },
                "subsample": {
                    "type": "float",
                    "low": 0.7,
                    "high": 1,
                },
                "colsample_bytree": {
                    "type": "float",
                    "low": 1,
                    "high": 1,
                },
                "min_split_gain": {
                    "type": "float",
                    "low": 0,
                    "high": 2,
                },
                "reg_alpha": {
                    "type": "float",
                    "low": 0,
                    "high": 3,
                },
                "reg_lambda": {
                    "type": "float",
                    "low": 0,
                    "high": 3,
                },
            },
        },
    }

    dict_models = {"LGBMR": LGBMR}

    log_model = True

   

    
    

def get_params_trained_models(model_path):
    model = joblib.load(model_path)
    return model.get_params()


if TRAIN:
    dict_fixed_model_params = get_params_trained_models(
        "/kaggle/input/models-6/LGBMR_0_20231108_235435.pkl"
    )

if TRAIN:
    args = {
        "cv_mthd": cv_mthd,
        "experiment_purpose": EXPERIMENT_PURPOSE,
        "experiment_name": experiment_name,
        "dict_models": dict_models,
        "model_params_dict": model_params_dict,
        "n_splits": n_splits,
        "n_test_split": n_test_split,
        "experiment_date_str": experiment_date_str,
        "path_experiments_storage": pm.path_experiments_dir,
        "target_col": "target",
    }

def run_mlflow_experiment(df_train, args, trial=None):
    cv_mthd = args["cv_mthd"]
    experiment_purpose = args["experiment_purpose"]
    experiment_name = args["experiment_name"]
    dict_models = args["dict_models"]
    model_params_dict = args["model_params_dict"]

    n_splits = args["n_splits"]
    n_test_split = args["n_test_split"]
    experiment_date_str = args["experiment_date_str"]
    path_experiments_storage = args["path_experiments_storage"]
    target_col = args["target_col"]

    if trial == None:
        trial = optuna.trial.FixedTrial(
            {
                "n_estimators": 500,
                "learning_rate": 0.005,
                "max_depth": 10,
                "num_leaves": 20,
                "min_child_samples": 10,
                "subsample": 0.7,
                "colsample_bytree": 1.0,
                "min_split_gain": 0.0,
                "reg_alpha": 0.0,
                "reg_lambda": 0.0,
                "device": "gpu" if gpu_switch == "ON" else "cpu",
            }
        )

    run_time_start_trial = datetime.now().strftime("%y_%m_%d_%H%M%S")
    
    with mlflow.start_run(run_name=run_time_start_trial, experiment_id = experiment_id) as run:
        score_list = []
        
        mlflow.set_tag("cv_mthd", cv_mthd)

        for model_name, model_class in dict_models.items():
            if TUNING:
                model = create_model(
                    trial,
                    model_class,
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )
            else:
                
                model = model_class(**dict_fixed_model_params)
                

            priority_params = ['learning_rate', 'max_depth']
            excluded_params = ['device', 'class_weight','random_state','silent','verbose','n_jobs']

            
            ordered_params = log_model_parameters(model, priority_params, excluded_params)

            mlflow.log_params(ordered_params)
         
            print(ordered_params)
            

            for fold_n, (train_indices, test_indices) in enumerate(
                time_series_split(
                    df_train, n_splits=n_splits, n_test_splits=n_test_split
                )
            ):
                with mlflow.start_run(
                    run_name=f"fold_{fold_n+1}", nested=True, experiment_id = experiment_id
                ) as nested_run:
                    
                    mlflow.set_tag("n_trial", str(trial.number))
            
                    mask_train = df_train["factorized"].isin(train_indices)
                    mask_test = df_train["factorized"].isin(test_indices)

                    y_train = df_train.loc[mask_train, target_col]
                    y_val = df_train.loc[mask_test, target_col]
                    X_train = df_train.loc[mask_train].drop(
                        [target_col, "factorized"], axis=1
                    )
                    X_val = df_train.loc[mask_test].drop(
                        [target_col, "factorized"], axis=1
                    )

                    mlflow.log_param("train_rows", X_train.shape[0])
                    mlflow.log_param("train_cols", X_train.shape[1])

                    model.fit(
                        X_train,
                        y_train,
                        eval_set=[(X_val, y_val)],
                        eval_metric="mae",
                        callbacks=[
                            lgbm.callback.early_stopping(stopping_rounds=100),
                            lgbm.callback.log_evaluation(period=100000),
                        ],
                    )

                    log_feature_importance(
                        trial.number,
                        model,
                        X_train,
                        fold_n,
                        experiment_purpose,
                        experiment_date_str,
                    )

                    fold_score = model.best_score_["valid_0"]["l1"]

                    score_list.append(fold_score)

          
                    mlflow.log_metric("fold_score", round(fold_score, 6))
                    mlflow.log_param("fold_number", fold_n + 1)
                    mlflow.log_param("model_name", model_name)


                    mlflow.log_params(rounded_params)

                    current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                    model_log_name = f"{model_name}_{trial.number}_{current_time_str}"

                  
                    mlflow.sklearn.log_model(model, model_log_name)

                    mlflow.log_param("run_time", current_time_str)

                    nested_run_id = nested_run.info.run_id
                    model_path = f"{path_experiments_storage}/{run.info.experiment_id}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                    mlflow.log_param("model_path", model_path)

            avg_score = sum(score_list) / len(score_list)
            median_score = np.median(score_list) 
            mlflow.log_metric("avg score",round(avg_score,6))
            mlflow.log_metric("median score",round(median_score,6))

        return avg_score



def objective(trial, df_train):
    avg_score = run_mlflow_experiment(df_train, args, trial)
    return avg_score


# Run the Optuna study
if TRAIN:
    study = optuna.create_study(
        direction="minimize",
        study_name="Your Study Name",
        load_if_exists=True,
    )
    study.optimize(lambda trial: objective(trial, df_train), n_trials=N_TRIALS)

[I 2023-11-11 14:17:43,182] A new study created in memory with name: Your Study Name


Experiment 'optiver_trading_at_the_close_v1' already exists with ID 462236245386961179.
OrderedDict([('learning_rate', 0.01395), ('max_depth', 43), ('boosting_type', 'gbdt'), ('colsample_bytree', 1.0), ('importance_type', 'gain'), ('min_child_samples', 18), ('min_child_weight', 0.001), ('min_split_gain', 0.20082), ('n_estimators', 500), ('n_jobs', 4), ('num_leaves', 60), ('objective', 'mae'), ('reg_alpha', 1.78651), ('reg_lambda', 2.54073), ('subsample', 0.93474), ('subsample_for_bin', 200000), ('subsample_freq', 0)])
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[218]	valid_0's l1: 6.31829


[W 2023-11-11 14:18:18,566] Trial 0 failed with parameters: {} because of the following error: NameError("name 'rounded_params' is not defined").
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1542/1971828427.py", line 275, in <lambda>
    study.optimize(lambda trial: objective(trial, df_train), n_trials=N_TRIALS)
  File "/tmp/ipykernel_1542/1971828427.py", line 264, in objective
    avg_score = run_mlflow_experiment(df_train, args, trial)
  File "/tmp/ipykernel_1542/1971828427.py", line 240, in run_mlflow_experiment
    mlflow.log_params(rounded_params)
NameError: name 'rounded_params' is not defined
[W 2023-11-11 14:18:18,567] Trial 0 failed with value None.


NameError: name 'rounded_params' is not defined

In [None]:
if TRAIN:
    df_exp = experiments_data(
        client, list_experiment_id=None, save_df=None, list_columns=None
    )
    list_base_cols = [
        "run_time",
        "experiment_id",
        "n_trial",
        "run_id",
        "model_name",
        "fold_number",
        "fold_score",
    ]
    list_dynamic_params = list(model_params_dict["LGBMR"]["dynamic_params"].keys())

    df_exp["run_time"] = pd.to_datetime(
        df_exp["run_time"], format="%Y%m%d_%H%M%S", errors="coerce"
    )

    for col in df_exp.columns:
        df_exp[col] = pd.to_numeric(df_exp[col], errors="ignore")

    for col in df_exp.select_dtypes(include=["float", "int"]):
        df_exp[col] = df_exp[col].round(5)

    list_cols_exp = ["run_time"] + list_base_cols + list_dynamic_params + ["model_path"]

    df_exp = df_exp[list_cols_exp]

In [None]:
def ensemble_predict(model_paths, X_test):
    models = []
    predictions = []

    # Load models based on full artifact paths
    for model_path in model_paths:
        try:
            # If using direct path to pkl
            if model_path.endswith(".pkl"):
                model = joblib.load(model_path)
            else:
                print(f"Unsupported model format for {model_path}. Skipping.")
                continue  # Skip this iteration

            models.append(model)
        except Exception as e:
            print(f"Failed to load model at {model_path}. Error: {e}")

    # Make predictions
    for model in models:
        try:
            pred = model.predict(X_test)
            predictions.append(pred)
        except Exception as e:
            print(f"Failed to make prediction with model. Error: {e}")

    # Average predictions
    if len(predictions) > 0:
        ensemble_pred = np.mean(predictions, axis=0)
    else:
        print("No valid models loaded. Cannot make ensemble predictions.")
        ensemble_pred = None

    return ensemble_pred

In [None]:
if TRAIN:
    model_paths = [
        path
        for path in list(
            df_exp[df_exp["experiment_id"] == 223740748204133848]["model_path"]
        )
        if path is not np.nan
    ]

    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    for model_path in model_paths:
        if not os.path.exists(model_path):
            print(f"File does not exist: {model_path}")
            continue  # Skip to the next iteration

        specific_part = model_path.split("/")[-2]
        dest_path = os.path.join(models_dir, f"{specific_part}.pkl")
        if not os.path.exists(dest_path):
            print(f"Copying model to {dest_path}")
            shutil.copy(model_path, dest_path)
        else:
            print(f"File {dest_path} already exists. Skipping copy.")

    zipf = zipfile.ZipFile(
        f"/kaggle/working/{models_dir}.zip", "w", zipfile.ZIP_DEFLATED
    )

    # Navigate through the folder and add each file to the ZIP
    for root, dirs, files in os.walk(f"/kaggle/working/{models_dir}"):
        for file in files:
            zipf.write(
                os.path.join(root, file),
                os.path.relpath(
                    os.path.join(root, file), f"/kaggle/working/{models_dir}"
                ),
            )

    zipf.close()

In [None]:
model_paths = []
models_dir_input = models_dir.replace("_", "-")
directory = f"/kaggle/input/{models_dir_input}"

# Check if the directory exists
if os.path.exists(directory):
    # Traverse the directory and collect file paths
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)

        # Check if the item is a file (and not a sub-directory)
        if os.path.isfile(full_path):
            model_paths.append(full_path)
else:
    print(f"The directory {directory} does not exist.")

# Print or return the list of file paths
print("List of file paths:", model_paths)

In [None]:
# Assuming X_test for predict
# ensemble_predictions = ensemble_predict(model_paths, df_test, mlflow_client)

In [None]:
import optiver2023

env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
df_train = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
global_stock_id_feats = {
    "median_size": df_train.groupby("stock_id")["bid_size"].median()
    + df_train.groupby("stock_id")["ask_size"].median(),
    "std_size": df_train.groupby("stock_id")["bid_size"].std()
    + df_train.groupby("stock_id")["ask_size"].std(),
    "ptp_size": df_train.groupby("stock_id")["bid_size"].max()
    - df_train.groupby("stock_id")["bid_size"].min(),
    "median_price": df_train.groupby("stock_id")["bid_price"].median()
    + df_train.groupby("stock_id")["ask_price"].median(),
    "std_price": df_train.groupby("stock_id")["bid_price"].std()
    + df_train.groupby("stock_id")["ask_price"].std(),
    "ptp_price": df_train.groupby("stock_id")["bid_price"].max()
    - df_train.groupby("stock_id")["ask_price"].min(),
}

In [None]:
counter = 0
df_tot_test = []
for test, revealed_targets, sample_prediction in iter_test:
    test["time_id"] = counter
    test["target"] = "none"

    if counter < 17:
        df_tot_test.append(test)
    else:
        df_tot_test = df_tot_test[1:]
        df_tot_test.append(test)

    df_test = pd.concat(df_tot_test, axis=0, ignore_index=True)

    feat = feat_engineering(df_test)
    feat = feat.sort_values(["date_id", "seconds_in_bucket", "stock_id"])[-len(test) :]

    list_cols_drop = ["date_id"]
    feat.drop(list_cols_drop, axis=1, inplace=True)

    model = joblib.load("/kaggle/input/models-12/LGBMR_0_20231110_100236.pkl")

    list_features = model.feature_name_
    feat = feat[list_features]
    sample_prediction["target"] = ensemble_predict(model_paths, feat)
    env.predict(sample_prediction)
    counter += 1