In [30]:
# Constants and Configuration Variables
DEBUG = True
TRAIN = True
TUNING = False
OVERWRITE_PROCESSED_DATA = True
N_TRIALS = 2
STATE = 42


N_SPLITS = 5
N_TEST_SPLITS = 1
N_PURGE = 6
N_EMBARGO = 6


VERSION_NB = 6
EXPERIMENT_PURPOSE = "optiver_trading_at_the_close"


# External general-purpose modules
import gc
import zipfile
import shutil
import os
import itertools as itt
from itertools import combinations, product
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import joblib
from pathlib import Path
import warnings

from dotenv import load_dotenv

# Setting pandas options and warning filters
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
aws_access_key_id = user_secrets.get_secret("AWS_ACCESS_KEY_ID")
aws_region = user_secrets.get_secret("AWS_DEFAULT_REGION")
aws_secret_access_key = user_secrets.get_secret("AWS_SECRET_ACCESS_KEY")
s3_bucket_name = user_secrets.get_secret("S3_BUCKET")

# Set AWS credentials in the environment variables
os.environ['AWS_ACCESS_KEY_ID'] = aws_access_key_id
os.environ['AWS_SECRET_ACCESS_KEY'] = aws_region
os.environ['AWS_DEFAULT_REGION'] = secret_value_1

# Load environment variables
load_dotenv()
path_root_project = Path.cwd()
if path_root_project.name not in ["working", "content"]:
    path_root_project = Path(os.getenv("ROOT_PATH") or path_root_project)
    
    


In [8]:
class PathManager:
    def __init__(self, path_project_dir):
        self.path_root_project = Path(path_project_dir)

        self.path_data_dir = None
        self.path_experiments_dir = None
        self.name_train_file = None
        self.name_test_file = None

        self.name_project_dir = None

        self.path_experiments_dir = (
            "http://ec2-13-38-228-107.eu-west-3.compute.amazonaws.com:5000"
        )
        self.path_artifact_location = (
            f"s3://mlflow-v1/kaggle_optiver_trading_at_the_close"
        )

        self.initialize_paths()

    def initialize_paths(self):
        if self.path_root_project.name == "working":
            self.setup_kaggle_paths()
        else:
            self.setup_local_paths()

    def setup_kaggle_paths(self):
        self.name_project_dir = "optiver-trading-at-the-close"

        self.name_train_file = "train.csv"
        self.name_test_file = "test.csv"

        self.data_dir = Path("/kaggle/input") / self.name_project_dir
        self.path_data_train_raw = self.data_dir / self.name_train_file
        self.path_data_test_raw = self.data_dir / self.name_test_file

        self.path_dataset_processed = "/kaggle/working/processed_data"
        # path_dataset_train = os.path.join(path_dataset_processed, "train.csv")
        # path_dataset_test = os.path.join(path_dataset_processed, "test.csv")

    def setup_local_paths(self):
        self.name_project_dir = "kaggle_optiver_trading_at_the_close"
        self.data_dir = self.path_root_project / "data" / self.name_project_dir

        self.path_data_train_raw = self.data_dir / "raw" / "train.csv"
        self.path_data_test_raw = self.data_dir / "raw" / "test.csv"

        self.path_dataset_processed = self.path_root_project / "data" / "processed"
        # path_dataset_train = path_dataset_processed / "train.csv"
        # path_dataset_test = path_dataset_processed / "test.csv"
        
pm = PathManager(path_root_project)

In [9]:

# Conditional imports and settings based on TRAIN constant
if TRAIN:
    if pm.path_root_project.name == "working":
        !pip install loguru mlflow optuna > /dev/null

    # External Libraries
    import boto3
    import boto3
    from mlflow.exceptions import MlflowException
    import lightgbm as lgbm
    import mlflow
    import optuna
    from mlflow.tracking import MlflowClient
    from optuna.integration.mlflow import MLflowCallback
    from sklearn.model_selection import KFold
    from tqdm import tqdm
    from xgboost import XGBRegressor as XGBR
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR

    # Local Modules Imports
    from utils import log_feature_importance, create_model, log_training_details, aggregate_feature_importance, get_data, clean_directory_except_one, experiments_data, reduce_mem_usage
    from fe_optiver_trading_at_the_close import calculate_triplet_imbalance_numba, compute_triplet_imbalance, convert_weights_to_dict
    
    # Auto-reload modules - Specific to Jupyter Notebooks
    %load_ext autoreload
    %autoreload 2

    mlflow.set_tracking_uri(pm.path_experiments_dir)
    client = MlflowClient()
    
    s3_client = boto3.client('s3')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
if TRAIN:
    if not os.path.exists(pm.path_dataset_processed) or OVERWRITE_PROCESSED_DATA:
        df_train_raw = pd.read_csv(pm.path_data_train_raw)

        if DEBUG:
            df_train_raw = df_train_raw[df_train_raw["stock_id"].isin([0, 1, 2])]

        drop_idx = df_train_raw.loc[
            df_train_raw["target"].isna(), "target"
        ].index.to_list()
        df_train = df_train_raw.drop(drop_idx, axis=0)
        df_train.reset_index(drop=True, inplace=True)
    else:
        df_train = pd.read_csv(pm.path_dataset_processed)
        if DEBUG:
            df_train = df_train[df_train["stock_id"].isin([0, 1, 2])]

In [11]:
if TRAIN:
    global_stock_id_feats = {
        "median_size": df_train.groupby("stock_id")["bid_size"].median()
        + df_train.groupby("stock_id")["ask_size"].median(),
        "std_size": df_train.groupby("stock_id")["bid_size"].std()
        + df_train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df_train.groupby("stock_id")["bid_size"].max()
        - df_train.groupby("stock_id")["bid_size"].min(),
        "median_price": df_train.groupby("stock_id")["bid_price"].median()
        + df_train.groupby("stock_id")["ask_price"].median(),
        "std_price": df_train.groupby("stock_id")["bid_price"].std()
        + df_train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df_train.groupby("stock_id")["bid_price"].max()
        - df_train.groupby("stock_id")["ask_price"].min(),
    }

In [12]:

def get_experiments_df(client):
    experiments =client.search_experiments()
    data = []
    for exp in experiments:
        exp_detail = {
            "Experiment ID": exp.experiment_id,
            "Creation Time": datetime.fromtimestamp(exp.creation_time / 1000.0),
            "Name": exp.name,
            "Artifact Location": exp.artifact_location,
            "Lifecycle Stage": exp.lifecycle_stage
        }
        data.append(exp_detail)

    df = pd.DataFrame(data)
    return df

df_experiments = get_experiments_df(client)


In [7]:

def delete_runs_and_artifacts(client, experiment_ids_to_remove, bucket_name):
    """
    Deletes all the runs and their associated artifacts for the experiments listed in `experiment_ids_to_remove`
    Deletes MLflow runs with a 'FAILED' status or with a 'debug' tag set to 'True' for all others experiments.
    """
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(bucket_name)

    for exp in client.search_experiments():
        exp_id = exp.experiment_id
        if exp_id not in experiment_ids_to_remove:
            runs = client.search_runs([exp.experiment_id], "")
            for run in runs:
                # Check run status and tags
                if run.info.status == "FAILED" or (run.data.tags.get("debug") == "True"):
                    try:
                        # Delete artifacts from S3 corresponding to the run ID
                        artifact_uri = run.info.artifact_uri
                        if "s3://" in artifact_uri:
                            artifact_path = artifact_uri.replace(f"s3://{bucket_name}/", "")
                            bucket.objects.filter(Prefix=artifact_path).delete()

                        # Delete the run
                        client.delete_run(run.info.run_id)
                        print(f"Deleted run {run.info.run_id} and its artifacts.")

                    except MlflowException as e:
                        print(f"Error deleting run {run.info.run_id}: {e}")
        else:
            try:
                # Get experiment data to find the artifact location
                experiment_data = client.get_experiment(exp_id)
                artifact_uri = experiment_data.artifact_location

                # Assuming the artifact URI is an S3 path
                if "s3://" in artifact_uri:
                    artifact_path = artifact_uri.replace(f"s3://{bucket_name}/", "")

                    # Delete artifacts from S3 corresponding to the experiment ID
                    # It's crucial that the artifact_path is specific to the experiment
                    if exp_id in artifact_path:
                        bucket.objects.filter(Prefix=artifact_path).delete()

                # Delete the experiment from MLflow
                client.delete_experiment(exp_id)
                print(f"Deleted experiment {exp_id} and its artifacts.")

            except MlflowException as e:
                print(f"Error deleting experiment {exp_id}: {e}")

# Example usage
experiment_ids_to_remove = ["999333986568643837","867182202959923683","773486292991054569","195291970476306379"]



In [8]:
delete_runs_and_artifacts(client, experiment_ids_to_remove, s3_bucket_name)

Deleted run a1b70f1f0f41457785a3deaf55d7b323 and its artifacts.
Deleted run c9629ba3989c4811810989a7b367b5ee and its artifacts.


In [20]:
import boto3
import pickle
from botocore.exceptions import NoCredentialsError

# Set up AWS credentials (ensure you have these set up in your environment or notebook)
aws_access_key_id = secret_value_0
aws_secret_access_key = secret_value_2

# Define the S3 bucket and object path
bucket_name = 'mlflow-v1'
object_key = 'kaggle_optiver_trading_at_the_close/015f4a285dcb48b292eb4522c9e265bf/artifacts/LGBMR_8_20231111_183922/model.pkl'

# Create an S3 client
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

try:
    # Download the file
    with open('model.pkl', 'wb') as f:
        s3.download_fileobj(bucket_name, object_key, f)

    # Load the model
    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)

    print("Model loaded successfully.")

except NoCredentialsError:
    print("Credentials not available.")
except Exception as e:
    print(f"Error occurred: {e}")


Model loaded successfully.


In [21]:
['s3://mlflow-v1/kaggle_optiver_trading_at_the_close/d60d3f6cb2ae4e12af4414bd87c2f1f8/artifacts/LGBMR_0_20231112_010902/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/2df0a094697247c08d7f8cd15f491684/artifacts/LGBMR_0_20231112_004637/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/6891e5cf212b46fbb1bbe7ef611b875e/artifacts/LGBMR_0_20231112_002442/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/ff61f916a31143468b73932e430aadc8/artifacts/LGBMR_0_20231112_000222/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/bef29681919f455088555bc1a92206e0/artifacts/LGBMR_0_20231111_234007/model.pkl']

In [31]:
import boto3
import pickle
from botocore.exceptions import NoCredentialsError
import numpy as np
import mlflow

aws_access_key_id = secret_value_0
aws_secret_access_key = secret_value_2

# Lista dei percorsi dei modelli su S3
model_paths = ['s3://mlflow-v1/kaggle_optiver_trading_at_the_close/d60d3f6cb2ae4e12af4414bd87c2f1f8/artifacts/LGBMR_0_20231112_010902/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/2df0a094697247c08d7f8cd15f491684/artifacts/LGBMR_0_20231112_004637/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/6891e5cf212b46fbb1bbe7ef611b875e/artifacts/LGBMR_0_20231112_002442/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/ff61f916a31143468b73932e430aadc8/artifacts/LGBMR_0_20231112_000222/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close/bef29681919f455088555bc1a92206e0/artifacts/LGBMR_0_20231111_234007/model.pkl']

# Inizializza una lista per i modelli caricati
loaded_models = []

# Carica ogni modello dalla lista
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)
for path in model_paths:
    bucket_name = path.split('/')[2]
    object_key = '/'.join(path.split('/')[3:])

    try:
        with open('temp_model.pkl', 'wb') as f:
            s3.download_fileobj(bucket_name, object_key, f)

        with open('temp_model.pkl', 'rb') as f:
            model = pickle.load(f)
            loaded_models.append(model)

        print(f"Model at {path} loaded successfully.")

    except NoCredentialsError:
        print("Credentials not available.")
    except Exception as e:
        print(f"Error occurred while loading model from {path}: {e}")


Model at s3://mlflow-v1/kaggle_optiver_trading_at_the_close/d60d3f6cb2ae4e12af4414bd87c2f1f8/artifacts/LGBMR_0_20231112_010902/model.pkl loaded successfully.
Model at s3://mlflow-v1/kaggle_optiver_trading_at_the_close/2df0a094697247c08d7f8cd15f491684/artifacts/LGBMR_0_20231112_004637/model.pkl loaded successfully.
Model at s3://mlflow-v1/kaggle_optiver_trading_at_the_close/6891e5cf212b46fbb1bbe7ef611b875e/artifacts/LGBMR_0_20231112_002442/model.pkl loaded successfully.
Model at s3://mlflow-v1/kaggle_optiver_trading_at_the_close/ff61f916a31143468b73932e430aadc8/artifacts/LGBMR_0_20231112_000222/model.pkl loaded successfully.
Model at s3://mlflow-v1/kaggle_optiver_trading_at_the_close/bef29681919f455088555bc1a92206e0/artifacts/LGBMR_0_20231111_234007/model.pkl loaded successfully.


In [32]:

# Definisci il modello ensemble
class EnsembleModel:
    def __init__(self, models):
        self.models = models

    def predict(self, X):
        # Calcola la media delle previsioni di tutti i modelli
        predictions = np.mean([model.predict(X) for model in self.models], axis=0)
        return predictions

# Crea il modello ensemble
ensemble_model = EnsembleModel(loaded_models)

# Salva il modello ensemble
with open('ensemble_model.pkl', 'wb') as f:
    pickle.dump(ensemble_model, f)


In [36]:
import mlflow

# Avvia un run MLflow
with mlflow.start_run() as run:
    # Logga il modello ensemble come artefatto
    mlflow.log_artifact('ensemble_model.pkl', 'model')

    # Ottieni l'ID del run
    run_id = run.info.run_id

    # Registra il modello nel Model Registry
    model_uri = f"runs:/{run_id}/model/ensemble_model.pkl"
    mlflow.register_model(model_uri, "YourEnsembleModelName")


Registered model 'YourEnsembleModelName' already exists. Creating a new version of this model...
2023/11/12 09:51:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: YourEnsembleModelName, version 1
Created version '1' of model 'YourEnsembleModelName'.


In [39]:
import mlflow
import zipfile
import os

# Nome del modello nel Model Registry
model_name = "YourEnsembleModelName"

# Crea un client MLflow
client = mlflow.tracking.MlflowClient()

# Ottieni le informazioni sulla versione più recente del modello
model_version_info = client.get_latest_versions(model_name, stages=["None", "Staging", "Production"])[0]


In [46]:
model_source = model_version_info.source

s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key)

# Estrai il nome del bucket e il percorso dell'oggetto dal percorso sorgente del modello
bucket_name = model_source.split('/')[2]
object_key = '/'.join(model_source.split('/')[3:])

# Nome del file locale per il download
local_model_file = 'downloaded_model.pkl'

# Scarica il modello da S3
s3.download_file(bucket_name, object_key, local_model_file)

# Comprimi il modello scaricato in formato ZIP
zip_file_name = f"{model_name}_version_{model_version_info.version}.zip"
with zipfile.ZipFile(zip_file_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(local_model_file)

print(f"Modello scaricato e salvato in formato ZIP come {zip_file_name}")

Modello scaricato e salvato in formato ZIP come YourEnsembleModelName_version_1.zip


In [9]:
# generate imbalance features
def imbalance_features(df):
    prices = [
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval(
        "(imbalance_size-matched_size)/(matched_size+imbalance_size)"
    )
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in [["ask_price", "bid_price", "wap", "reference_price"], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # V2
    weights = convert_weights_to_dict()
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df["wap_momentum"] = df.groupby("stock_id")["weighted_wap"].pct_change(periods=6)
    df["imbalance_momentum"] = (
        df.groupby(["stock_id"])["imbalance_size"].diff(periods=1) / df["matched_size"]
    )
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(["stock_id"])["price_spread"].diff()
    df["price_pressure"] = df["imbalance_size"] * (df["ask_price"] - df["bid_price"])
    df["market_urgency"] = df["price_spread"] * df["liquidity_imbalance"]
    df["depth_pressure"] = (df["ask_size"] - df["bid_size"]) * (
        df["far_price"] - df["near_price"]
    )
    df["spread_depth_ratio"] = (df["ask_price"] - df["bid_price"]) / (
        df["bid_size"] + df["ask_size"]
    )
    df["mid_price_movement"] = (
        df["mid_price"]
        .diff(periods=5)
        .apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    )
    df["micro_price"] = (
        (df["bid_price"] * df["ask_size"]) + (df["ask_price"] * df["bid_size"])
    ) / (df["bid_size"] + df["ask_size"])
    df["relative_spread"] = (df["ask_price"] - df["bid_price"]) / df["wap"]

    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # V3
    for col in [
        "matched_size",
        "imbalance_size",
        "reference_price",
        "imbalance_buy_sell_flag",
    ]:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby(["stock_id"])[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby(["stock_id"])[col].pct_change(window)

    for col in [
        "ask_price",
        "bid_price",
        "ask_size",
        "bid_size",
        "wap",
        "near_price",
        "far_price",
    ]:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby(["stock_id"])[col].diff(window)

    return df.replace([np.inf, -np.inf], 0)


# generate time & stock features
def other_features(df):
    df["dow"] = df["date_id"] % 5
    df["dom"] = df["date_id"] % 20
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60

    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df


# generate all features
def feat_engineering(df):
    cols = [c for c in df.columns if c not in ["row_id"]]
    df = df[cols]
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    list_cols = [i for i in df.columns if i not in ["row_id"]]

    return df[list_cols]

In [10]:
if TRAIN:
    df_train_feats = feat_engineering(df_train)
    print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)


Build Online Train Feats Finished.


In [11]:
if TRAIN:
    col_split = "time_id"
    df_train_feats.sort_values([col_split], inplace=True)
    df_train_feats.reset_index(drop=True, inplace=True)
    df_train_feats["factorized"] = pd.factorize(df_train_feats[col_split])[0]

    list_cols_drop = ["date_id", "time_id"]
    df_train_feats.drop(list_cols_drop, axis=1, inplace=True)

In [12]:
def time_series_split(X, n_splits, n_test_splits, n_purge, n_embargo):
    factorized_indices = np.unique(X["factorized"])

    # Calcola i confini dei fold
    fold_bounds = [
        (fold[0], fold[-1] + 1) for fold in np.array_split(factorized_indices, n_splits)
    ]

    # Crea la lista dei confini di test che diventeranno i set di test
    selected_fold_bounds = list(itt.combinations(fold_bounds, n_test_splits))

    # Inverte per iniziare i test dalla parte più recente del dataset
    selected_fold_bounds.reverse()

    for fold_bound_list in selected_fold_bounds:
        test_factorized_indices = np.empty(0)
        test_fold_bounds = []

        for fold_start, fold_end in fold_bound_list:
            # Registra i confini dell'attuale split di test
            if not test_fold_bounds or fold_start != test_fold_bounds[-1][-1]:
                test_fold_bounds.append((fold_start, fold_end))
            elif fold_start == test_fold_bounds[-1][-1]:
                test_fold_bounds[-1] = (test_fold_bounds[-1][0], fold_end)

            # Aggiunge gli indici al set di test
            test_factorized_indices = np.union1d(
                test_factorized_indices, factorized_indices[fold_start:fold_end]
            ).astype(int)

        # Calcola gli indici del set di addestramento
        train_indices = np.setdiff1d(factorized_indices, test_factorized_indices)

        # Applica il purging
        if n_purge > 0:
            purge_indices = np.arange(test_factorized_indices[0] - n_purge, test_factorized_indices[0])
            train_indices = np.setdiff1d(train_indices, purge_indices)

        # Applica l'embargo
        if n_embargo > 0:
            embargo_indices = np.arange(test_factorized_indices[-1] + 1, test_factorized_indices[-1] + 1 + n_embargo)
            train_indices = np.setdiff1d(train_indices, embargo_indices)

        yield train_indices, test_factorized_indices

In [13]:
from collections import OrderedDict
import mlflow

def log_model_parameters(model, priority_params, excluded_params, verbose = None):
    """
    Logs the model's parameters to MLflow, with priority parameters logged first.

    Parameters:
    model: The model object with a get_params() method.
    priority_params (list): A list of parameter names to log first.
    excluded_params (list): A list of parameter names to exclude from logging.
    """
    params_to_log = model.get_params()
    
    # Create an OrderedDict to keep the priority parameters first
    ordered_params = OrderedDict()

    # Add the priority parameters with rounding if they are floats or ints
    for key in priority_params:
        if key in params_to_log:
            value = params_to_log[key]
            if isinstance(value, (int, float)):
                ordered_params[key] = round(value, 5)
            else:
                ordered_params[key] = value

    # Add the remaining parameters, excluding the ones in excluded_params and already added priority keys
    for key, value in params_to_log.items():
        if key not in excluded_params and key not in priority_params:
            if isinstance(value, (int, float)):
                ordered_params[key] = round(value, 5)
            else:
                ordered_params[key] = value

    if verbose:
        formatted_params = ' | '.join(f"{key}: {value}" for key, value in ordered_params.items())
        print(f"\n{formatted_params}\n")
    return ordered_params
    


In [25]:
def get_or_create_experiment(client, experiment_name, artifact_location):
    """
    Get the ID of an existing MLflow experiment with the given name or create a new one if it
    does not exist.

    Parameters:
    experiment_name (str): The name of the experiment.
    artifact_location (str): The location for storing artifacts for the experiment.

    Returns:
    str: The experiment ID of the existing or newly created experiment.
    """

    try:
        # Check if the experiment already exists
        experiment = client.get_experiment_by_name(experiment_name)
        if experiment:
            experiment_id = experiment.experiment_id
            print(f"Experiment '{experiment_name}' already exists with ID {experiment_id}.")
        else:
            # If the experiment does not exist, create it
            experiment_id = client.create_experiment(name=experiment_name, artifact_location=artifact_location)
            print(f"Created new experiment with ID {experiment_id}.")
            
    except MlflowException as e:
        raise e
    
    return experiment_id

In [26]:
experiment_name = f"{EXPERIMENT_PURPOSE}_v{VERSION_NB}"
name_folder_models = f"models_v{VERSION_NB}"

experiment_date_str = datetime.now().strftime("%y_%m_%d_%H%M")
experiment_id = get_or_create_experiment(client, experiment_name, artifact_location=pm.path_artifact_location)


gpu_switch = "OFF"

nbrnd_erly_stp = 130
cv_mthd = "KF"

# Cross-Validation Setup
if TRAIN:
    # Initialize MLflow callback
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )

    all_cv = {"KF": KFold(n_splits=5, shuffle=True, random_state=STATE)}
    cv = all_cv[cv_mthd]

    model_params_dict = {
        "LGBMR": {
            "static_params": {
                "device": "gpu" if gpu_switch == "ON" else "cpu",
                "objective": "mae",
                "boosting_type": "gbdt",
                "random_state": STATE,
                "n_jobs": 4,
                "verbose": -1,
                "importance_type": "gain",
            },
            "dynamic_params": {
                "n_estimators": {
                    "type": "int",
                    "low": 100,
                    "high": 100,
                },
                "learning_rate": {
                    "type": "float",
                    "low": 0.005,
                    "high": 0.06,
                },
                "max_depth": {"type": "int", "low": 10, "high": 90},
                "num_leaves": {
                    "type": "int",
                    "low": 20,
                    "high": 90,
                },
                "min_child_samples": {
                    "type": "int",
                    "low": 10,
                    "high": 70,
                },
                "subsample": {
                    "type": "float",
                    "low": 0.7,
                    "high": 1,
                },
                "colsample_bytree": {
                    "type": "float",
                    "low": 1,
                    "high": 1,
                },
                "min_split_gain": {
                    "type": "float",
                    "low": 0,
                    "high": 2,
                },
                "reg_alpha": {
                    "type": "float",
                    "low": 0,
                    "high": 3,
                },
                "reg_lambda": {
                    "type": "float",
                    "low": 0,
                    "high": 3,
                },
            },
        },
    }

    dict_models = {"LGBMR": LGBMR}

    log_model = True

   

    
    

def get_params_trained_models(model_path):
    model = joblib.load(model_path)
    return model.get_params()


if TRAIN:
    dict_fixed_model_params = get_params_trained_models(
        "/kaggle/input/models-6/LGBMR_0_20231108_235435.pkl"
    )

if TRAIN:
    args = {
        "cv_mthd": cv_mthd,
        "experiment_purpose": EXPERIMENT_PURPOSE,
        "experiment_name": experiment_name,
        "dict_models": dict_models,
        "model_params_dict": model_params_dict,
        "n_splits": N_SPLITS,
        "n_test_splits": N_TEST_SPLITS,
        "n_purge": N_PURGE,
        "n_embargo": N_EMBARGO,
        "experiment_date_str": experiment_date_str,
        "path_artifact_location": pm.path_artifact_location,
        "target_col": "target",
    }
    

Experiment 'optiver_trading_at_the_close_v6' already exists with ID 880376480100122300.


In [16]:

    

def run_mlflow_experiment(df_train, args, trial=None):
    cv_mthd = args["cv_mthd"]
    experiment_purpose = args["experiment_purpose"]
    experiment_name = args["experiment_name"]
    dict_models = args["dict_models"]
    model_params_dict = args["model_params_dict"]

    n_splits = args["n_splits"]
    n_test_splits = args["n_test_splits"]
    n_purge = args["n_purge"]
    n_embargo = args["n_embargo"]
    
    experiment_date_str = args["experiment_date_str"]
    path_artifact_location = args["path_artifact_location"]
    target_col = args["target_col"]

    if trial == None:
        trial = optuna.trial.FixedTrial(
            {
                "n_estimators": 500,
                "learning_rate": 0.005,
                "max_depth": 10,
                "num_leaves": 20,
                "min_child_samples": 10,
                "subsample": 0.7,
                "colsample_bytree": 1.0,
                "min_split_gain": 0.0,
                "reg_alpha": 0.0,
                "reg_lambda": 0.0,
                "device": "gpu" if gpu_switch == "ON" else "cpu",
            }
        )

    run_time_start_trial = datetime.now().strftime("%y_%m_%d_%H%M%S")
    
    with mlflow.start_run(run_name=run_time_start_trial, experiment_id = experiment_id) as run:
        score_list = []

        #mlflow.set_tag("cv_mthd", cv_mthd)
        mlflow.set_tag("n_splits", n_splits)
        mlflow.set_tag("n_test_splits", n_test_splits)
        mlflow.set_tag("n_purge", n_purge)
        mlflow.set_tag("n_embargo", n_embargo)

        for model_name, model_class in dict_models.items():
            if TUNING:
                model = create_model(
                    trial,
                    model_class,
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )
            else:
                
                model = model_class(**dict_fixed_model_params)
                

            priority_params = ['learning_rate', 'max_depth']
            excluded_params = ['device', 'class_weight','random_state','silent','verbose','n_jobs']

            
            ordered_params = log_model_parameters(model, priority_params, excluded_params, verbose = True)

            mlflow.log_params(ordered_params)
         

            for fold_n, (train_indices, test_indices) in enumerate(
                time_series_split(
                    df_train, n_splits=n_splits, n_test_splits=n_test_splits, n_purge=n_purge, n_embargo=n_embargo
                )
            ):
                with mlflow.start_run(
                    run_name=f"fold_{fold_n+1}", nested=True, experiment_id = experiment_id
                ) as nested_run:
                    
                    mlflow.set_tag("n_trial", str(trial.number))
            
                    mask_train = df_train["factorized"].isin(train_indices)
                    mask_test = df_train["factorized"].isin(test_indices)

                    y_train = df_train.loc[mask_train, target_col]
                    y_val = df_train.loc[mask_test, target_col]
                    X_train = df_train.loc[mask_train].drop(
                        [target_col, "factorized"], axis=1
                    )
                    X_val = df_train.loc[mask_test].drop(
                        [target_col, "factorized"], axis=1
                    )

                    mlflow.log_param("train_rows", X_train.shape[0])
                    mlflow.log_param("train_cols", X_train.shape[1])

                    model.fit(
                        X_train,
                        y_train,
                        eval_set=[(X_val, y_val)],
                        eval_metric="mae",
                        callbacks=[
                            lgbm.callback.early_stopping(stopping_rounds=100),
                            lgbm.callback.log_evaluation(period=100000),
                        ],
                    )

                    log_feature_importance(
                        trial.number,
                        model,
                        X_train,
                        fold_n,
                        experiment_purpose,
                        experiment_date_str,
                    )

                    fold_score = model.best_score_["valid_0"]["l1"]

                    score_list.append(fold_score)

          
                    mlflow.log_metric("fold_score", round(fold_score, 6))
                    mlflow.log_param("fold_number", fold_n + 1)
                    mlflow.log_param("model_name", model_name)


                    mlflow.log_params(ordered_params)

                    current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                    model_log_name = f"{model_name}_{trial.number}_{current_time_str}"

                  
                    mlflow.sklearn.log_model(model, model_log_name)

                    mlflow.log_param("run_time", current_time_str)

                    nested_run_id = nested_run.info.run_id
                    model_path = f"{path_artifact_location}/{run.info.experiment_id}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                    mlflow.log_param("model_path", model_path)

                avg_score = sum(score_list) / len(score_list)
                median_score = np.median(score_list) 
                mlflow.log_metric("avg score",round(avg_score,6))
                mlflow.log_metric("median score",round(median_score,6))

        return avg_score



def objective(trial, df_train):
    avg_score = run_mlflow_experiment(df_train, args, trial)
    return avg_score


# Run the Optuna study
if TRAIN:
    study = optuna.create_study(
        direction="minimize",
        study_name="Your Study Name",
        load_if_exists=True,
    )
    study.optimize(lambda trial: objective(trial, df_train_feats), n_trials=N_TRIALS)

[I 2023-11-11 23:18:02,023] A new study created in memory with name: Your Study Name



learning_rate: 0.01395 | max_depth: 43 | boosting_type: gbdt | colsample_bytree: 1.0 | importance_type: gain | min_child_samples: 18 | min_child_weight: 0.001 | min_split_gain: 0.20082 | n_estimators: 500 | num_leaves: 60 | objective: mae | reg_alpha: 1.78651 | reg_lambda: 2.54073 | subsample: 0.93474 | subsample_for_bin: 200000 | subsample_freq: 0

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[499]	valid_0's l1: 5.93434
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.30224
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.38112
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 7.03172
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best

[I 2023-11-12 01:09:10,608] Trial 0 finished with value: 6.254037503960918 and parameters: {}. Best is trial 0 with value: 6.254037503960918.



learning_rate: 0.01395 | max_depth: 43 | boosting_type: gbdt | colsample_bytree: 1.0 | importance_type: gain | min_child_samples: 18 | min_child_weight: 0.001 | min_split_gain: 0.20082 | n_estimators: 500 | num_leaves: 60 | objective: mae | reg_alpha: 1.78651 | reg_lambda: 2.54073 | subsample: 0.93474 | subsample_for_bin: 200000 | subsample_freq: 0

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[499]	valid_0's l1: 5.93434
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.30224
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 6.38112
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l1: 7.03172
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best

[I 2023-11-12 03:01:27,171] Trial 1 finished with value: 6.254037503960918 and parameters: {}. Best is trial 0 with value: 6.254037503960918.


In [22]:
def experiments_data(client, list_experiment_id=None, save_df=None, list_columns=None):
    """
    Every time this function is called, it reads all experiments and a new version of the file returns with all the historical experiments
    """
    experiments = client.search_experiments()
    all_runs_data = []
    for exp in experiments:
        experiment_id = exp.experiment_id
        if (list_experiment_id == None) or (experiment_id in list_experiment_id):
            run_infos = client.search_runs(experiment_ids=[experiment_id])

            for run_info in run_infos:
                run_data = {
                    "experiment_id": experiment_id,
                    "experiment_name": exp.name,
                    "run_id": run_info.info.run_id,
                }

                # Add metrics to run_data
                for key, value in run_info.data.metrics.items():
                    run_data[f"{key}"] = value

                # Add params to run_data
                for key, value in run_info.data.params.items():
                    run_data[f"{key}"] = value

                # Add tags to run_data
                for key, value in run_info.data.tags.items():
                    run_data[f"{key}"] = value

                all_runs_data.append(run_data)

    df_runs_new = pd.DataFrame(all_runs_data)

    if list_columns:
        df_runs_new = df_runs_new[list_columns]

    return df_runs_new


In [27]:
if TRAIN:
    df_exp = experiments_data(
        client, list_experiment_id=None, save_df=None, list_columns=None
    )
    list_base_cols = [
        "run_time",
        "experiment_id",
        "n_trial",
        "run_id",
        "model_name",
        "fold_number",
        "fold_score",
    ]
    list_dynamic_params = list(model_params_dict["LGBMR"]["dynamic_params"].keys())

    df_exp["run_time"] = pd.to_datetime(
        df_exp["run_time"], format="%Y%m%d_%H%M%S", errors="coerce"
    )

    for col in df_exp.columns:
        df_exp[col] = pd.to_numeric(df_exp[col], errors="ignore")

    for col in df_exp.select_dtypes(include=["float", "int"]):
        df_exp[col] = df_exp[col].round(5)

    list_cols_exp = ["run_time"] + list_base_cols + list_dynamic_params + ["model_path"]
    
    experiment_id
    df_exp = df_exp[df_exp['experiment_id'] != 0]

    df_exp = df_exp[list_cols_exp]

In [29]:
list(
            df_exp[(df_exp['experiment_id'] == 880376480100122300) &(df_exp['n_trial'] == 0) ]['model_path']
        )

['s3://mlflow-v1/kaggle_optiver_trading_at_the_close//880376480100122300/d60d3f6cb2ae4e12af4414bd87c2f1f8/artifacts/LGBMR_0_20231112_010902/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close//880376480100122300/2df0a094697247c08d7f8cd15f491684/artifacts/LGBMR_0_20231112_004637/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close//880376480100122300/6891e5cf212b46fbb1bbe7ef611b875e/artifacts/LGBMR_0_20231112_002442/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close//880376480100122300/ff61f916a31143468b73932e430aadc8/artifacts/LGBMR_0_20231112_000222/model.pkl',
 's3://mlflow-v1/kaggle_optiver_trading_at_the_close//880376480100122300/bef29681919f455088555bc1a92206e0/artifacts/LGBMR_0_20231111_234007/model.pkl']

In [28]:
df_exp

Unnamed: 0,run_time,run_time.1,experiment_id,n_trial,run_id,model_name,fold_number,fold_score,n_estimators,learning_rate,max_depth,num_leaves,min_child_samples,subsample,colsample_bytree,min_split_gain,reg_alpha,reg_lambda,model_path
0,1699758078000000000,1699758078000000000,880376480100122300,1.0,e09f317cd14b43828b61d25287dd401a,LGBMR,5.0,5.62076,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
1,1699756699000000000,1699756699000000000,880376480100122300,1.0,2abde29763e54dfa88342332d5866d3d,LGBMR,4.0,7.03172,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
2,1699755336000000000,1699755336000000000,880376480100122300,1.0,0925766b9e2a4bd5bae24e65d179a0d6,LGBMR,3.0,6.38112,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
3,1699753998000000000,1699753998000000000,880376480100122300,1.0,b0fdcb3becad4836b017e141b275f8f8,LGBMR,2.0,6.30224,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
4,1699752668000000000,1699752668000000000,880376480100122300,1.0,08f314be471446acb9831eaf9ded3723,LGBMR,1.0,5.93434,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
5,-9223372036854775808,-9223372036854775808,880376480100122300,,980298ebfc6b4323b7e9303d593af8bd,,,,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,
6,1699751342000000000,1699751342000000000,880376480100122300,0.0,d60d3f6cb2ae4e12af4414bd87c2f1f8,LGBMR,5.0,5.62076,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
7,1699749997000000000,1699749997000000000,880376480100122300,0.0,2df0a094697247c08d7f8cd15f491684,LGBMR,4.0,7.03172,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
8,1699748682000000000,1699748682000000000,880376480100122300,0.0,6891e5cf212b46fbb1bbe7ef611b875e,LGBMR,3.0,6.38112,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...
9,1699747342000000000,1699747342000000000,880376480100122300,0.0,ff61f916a31143468b73932e430aadc8,LGBMR,2.0,6.30224,500,0.01395,43,60,18,0.93474,1.0,0.20082,1.78651,2.54073,s3://mlflow-v1/kaggle_optiver_trading_at_the_c...


In [19]:
def ensemble_predict(model_paths, X_test):
    models = []
    predictions = []

    # Load models based on full artifact paths
    for model_path in model_paths:
        try:
            # If using direct path to pkl
            if model_path.endswith(".pkl"):
                model = joblib.load(model_path)
            else:
                print(f"Unsupported model format for {model_path}. Skipping.")
                continue  # Skip this iteration

            models.append(model)
        except Exception as e:
            print(f"Failed to load model at {model_path}. Error: {e}")

    # Make predictions
    for model in models:
        try:
            pred = model.predict(X_test)
            predictions.append(pred)
        except Exception as e:
            print(f"Failed to make prediction with model. Error: {e}")

    # Average predictions
    if len(predictions) > 0:
        ensemble_pred = np.mean(predictions, axis=0)
    else:
        print("No valid models loaded. Cannot make ensemble predictions.")
        ensemble_pred = None

    return ensemble_pred

In [20]:
#artifact_paths = list(df_exp[(df_exp['experiment_id'] == 907402546884726807) &(df_exp['n_trial'] == 0) ]['model_path'])

In [22]:
if TRAIN:
    model_paths = [
        path
        for path in list(
            df_exp[(df_exp['experiment_id'] == 773486292991054569) &(df_exp['n_trial'] == 0) ]['model_path']
        )
        if path is not np.nan
    ]

    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    for model_path in model_paths:
        if not os.path.exists(model_path):
            print(f"File does not exist: {model_path}")
            continue  # Skip to the next iteration

        specific_part = model_path.split("/")[-2]
        dest_path = os.path.join(models_dir, f"{specific_part}.pkl")
        if not os.path.exists(dest_path):
            print(f"Copying model to {dest_path}")
            shutil.copy(model_path, dest_path)
        else:
            print(f"File {dest_path} already exists. Skipping copy.")

    zipf = zipfile.ZipFile(
        f"/kaggle/working/{models_dir}.zip", "w", zipfile.ZIP_DEFLATED
    )

    # Navigate through the folder and add each file to the ZIP
    for root, dirs, files in os.walk(f"/kaggle/working/{models_dir}"):
        for file in files:
            zipf.write(
                os.path.join(root, file),
                os.path.relpath(
                    os.path.join(root, file), f"/kaggle/working/{models_dir}"
                ),
            )

    zipf.close()

NameError: name 'models_dir' is not defined

In [None]:
model_paths = []
models_dir_input = models_dir.replace("_", "-")
directory = f"/kaggle/input/{models_dir_input}"

# Check if the directory exists
if os.path.exists(directory):
    # Traverse the directory and collect file paths
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)

        # Check if the item is a file (and not a sub-directory)
        if os.path.isfile(full_path):
            model_paths.append(full_path)
else:
    print(f"The directory {directory} does not exist.")

# Print or return the list of file paths
print("List of file paths:", model_paths)

In [None]:
# Assuming X_test for predict
# ensemble_predictions = ensemble_predict(model_paths, df_test, mlflow_client)

In [None]:
import optiver2023

env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
df_train = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
global_stock_id_feats = {
    "median_size": df_train.groupby("stock_id")["bid_size"].median()
    + df_train.groupby("stock_id")["ask_size"].median(),
    "std_size": df_train.groupby("stock_id")["bid_size"].std()
    + df_train.groupby("stock_id")["ask_size"].std(),
    "ptp_size": df_train.groupby("stock_id")["bid_size"].max()
    - df_train.groupby("stock_id")["bid_size"].min(),
    "median_price": df_train.groupby("stock_id")["bid_price"].median()
    + df_train.groupby("stock_id")["ask_price"].median(),
    "std_price": df_train.groupby("stock_id")["bid_price"].std()
    + df_train.groupby("stock_id")["ask_price"].std(),
    "ptp_price": df_train.groupby("stock_id")["bid_price"].max()
    - df_train.groupby("stock_id")["ask_price"].min(),
}

In [None]:
counter = 0
df_tot_test = []
for test, revealed_targets, sample_prediction in iter_test:
    test["time_id"] = counter
    test["target"] = "none"

    if counter < 17:
        df_tot_test.append(test)
    else:
        df_tot_test = df_tot_test[1:]
        df_tot_test.append(test)

    df_test = pd.concat(df_tot_test, axis=0, ignore_index=True)

    feat = feat_engineering(df_test)
    feat = feat.sort_values(["date_id", "seconds_in_bucket", "stock_id"])[-len(test) :]

    list_cols_drop = ["date_id"]
    feat.drop(list_cols_drop, axis=1, inplace=True)

    model = joblib.load("/kaggle/input/models-12/LGBMR_0_20231110_100236.pkl")

    list_features = model.feature_name_
    feat = feat[list_features]
    sample_prediction["target"] = ensemble_predict(model_paths, feat)
    env.predict(sample_prediction)
    counter += 1