In [1]:
# Constants and Configuration Variables

# --- Run-mode switches -------------------------------------------------------
DEBUG = False
TRAIN = False
TUNING = True
OVERWRITE_PROCESSED_DATA = True
SAVE_MODELS = False
GPU_SWITCH = "ON"

# --- Search size / seed / fold cut-off ----------------------------------------
N_TRIALS = 10
STATE = 42
N_FOLD_BREAK = 7

# --- Time-series split settings ----------------------------------------------
N_SPLITS = 5
N_TEST_SPLITS = 1
N_PURGE = 20
N_EMBARGO = 20

# --- Experiment identification -----------------------------------------------
VERSION_NB = 22
EXPERIMENT_PURPOSE = "optiver_trading_at_the_close"

list_experiment_id = ["329561590225205643"]
run_name_startswith = "23_11_20_111711_fold"

# Per-model hyper-parameters. "static_params" are passed as-is; every entry of
# "dynamic_params" is a search range whose low and high bounds coincide, i.e.
# each searched value is pinned to a single number.
model_params_dict = {
    "LGBMR": {
        "static_params": dict(
            device="gpu" if GPU_SWITCH == "ON" else "cpu",
            objective="mae",
            boosting_type="gbdt",
            random_state=STATE,
            n_jobs=4,
            verbose=-1,
            importance_type="gain",
            max_bin=254,
        ),
        "dynamic_params": {
            param: {"type": kind, "low": value, "high": value}
            for param, kind, value in (
                ("n_estimators", "int", 1000),
                ("learning_rate", "float", 0.0131),
                ("max_depth", "int", 13),
                ("num_leaves", "int", 205),
                ("min_child_samples", "int", 20),
                ("subsample", "float", 0.45),
                ("colsample_bytree", "float", 0.5),
            )
        },
    },
}


# Define the model name for registration in MLflow
version = 12
model_name = f"ensemble_model_{version}.pkl"
folder_model = f"models-v{version}"

In [2]:
# External general-purpose modules
import gc
import sys
import os
import pickle
import itertools as itt
from itertools import combinations, product
from datetime import datetime
import numpy as np
import pandas as pd

import joblib
import time
from tqdm import tqdm

from pathlib import Path
import warnings
from numba import njit, prange
import boto3
from botocore.exceptions import NoCredentialsError
from dotenv import load_dotenv


# Setting pandas options and warning filters
# Show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# NOTE(review): this silences *all* warnings for the rest of the session, which
# also makes the more specific PerformanceWarning filter below redundant.
warnings.filterwarnings("ignore")
warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Load environment variables
load_dotenv()
# Start from the current working directory. When it is not named "working" or
# "content" (presumably the Kaggle / Colab working directories -- confirm), the
# project root is taken from the ROOT_PATH environment variable if it is set,
# and the project's source sub-packages are made importable.
path_root_project = Path.cwd()
if path_root_project.name not in ["working", "content"]:
    path_root_project = Path(os.getenv("ROOT_PATH") or path_root_project)

    # These folders hold the `utils_*`, `fe_*` and `model_validation` modules
    # imported right after this block (assumed from the import names).
    directories_to_add = ["utils", "feat_engineering", "validation"]
    for directory in directories_to_add:
        sys.path.append(str(path_root_project / "src" / directory))


from utils_training import create_model, experiments_data
from utils_data import load_config, load_dataset, reduce_mem_usage, PathManager
from utils_kaggle import (
    setup_kaggle,
    download_data,
    get_data,
    clean_directory_except_one,
)

from fe_optiver_trading_at_the_close import (
    calculate_triplet_imbalance_numba,
    convert_weights_to_dict,
    global_stock_id_feats,
    compute_rolling_averages,
    generate_rsi,
)

# Project path helper; its attributes (path_model_production, path_data_train_raw,
# path_artifact_location, ...) are used throughout the notebook.
pm = PathManager(path_root_project)

# AWS credentials are only resolved when training is enabled.
# NOTE(review): the SAVE_MODELS cell further down also reads
# `aws_access_key_id` / `aws_secret_access_key`; with TRAIN = False those names
# are never bound and that cell raises NameError.
if TRAIN:
    if pm.path_root_project.name == "working":
        # Root folder is named "working": read the secrets through the
        # kaggle_secrets client.
        from kaggle_secrets import UserSecretsClient

        user_secrets = UserSecretsClient()
        aws_access_key_id = user_secrets.get_secret("AWS_ACCESS_KEY_ID")
        aws_region = user_secrets.get_secret("AWS_DEFAULT_REGION")
        aws_secret_access_key = user_secrets.get_secret("AWS_SECRET_ACCESS_KEY")
        s3_bucket_name = user_secrets.get_secret("S3_BUCKET")

        # Set AWS credentials in the environment variables
        os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
        os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
        os.environ["AWS_DEFAULT_REGION"] = aws_region
    else:
        # Elsewhere the credentials come from the process environment (populated
        # by load_dotenv() earlier). `aws_region` and `s3_bucket_name` are NOT
        # defined on this branch.
        aws_access_key_id = os.getenv("AWS_ACCESS_KEY_ID")
        aws_secret_access_key = os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
# Load the production ensemble from disk.
# NOTE: joblib.load unpickles arbitrary Python objects -- only point this at
# artifacts produced by a trusted pipeline.
model_prod = joblib.load(pm.path_model_production / folder_model / model_name)

# Print the recorded best validation score of every member of the ensemble.
# (The former `model = model_prod.models[0]` assignment was dead code: the loop
# variable overwrote it immediately.)
for model in model_prod.models:
    print(model.best_score_)

defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('l1', 5.655647631686068)])})
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('l1', 7.070589227107189)])})
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('l1', 6.410497376211345)])})
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('l1', 6.336143849666122)])})
defaultdict(<class 'collections.OrderedDict'>, {'valid_0': OrderedDict([('l1', 5.967429157930154)])})


In [4]:
# df_feat = pd.read_csv("/kaggle/working/feat_impor_optiver_trading_at_the_close_23_11_18_0438.csv").sort_values("t0_imp_fold_1")

In [5]:
def aggregate_feature_importance(df_feat_importance):
    """Add mean / std / variability summaries over the per-fold importance columns.

    The frame is modified in place (columns added, rows re-sorted, index reset)
    and also returned.

    Parameters
    ----------
    df_feat_importance : pandas.DataFrame
        One row per feature. Must contain a ``feat`` column; every other column,
        except the three summary columns written by this function, is treated as
        an importance measurement.

    Returns
    -------
    pandas.DataFrame
        The same object, sorted by ``feat_imp_mean`` descending, with
        ``feat_imp_mean``, ``feat_imp_std`` and ``feat_imp_variability``
        (std / mean) columns.
    """
    summary_cols = ("feat_imp_mean", "feat_imp_std", "feat_imp_variability")

    # Only raw importance columns take part in the statistics. Excluding the
    # summary columns makes the function idempotent: calling it again on its own
    # output no longer averages previous summaries into the mean/std.
    cols = [
        col
        for col in df_feat_importance.columns
        if col != "feat" and col not in summary_cols
    ]

    df_feat_importance["feat_imp_mean"] = df_feat_importance.loc[:, cols].mean(
        axis=1, skipna=True
    )

    df_feat_importance.sort_values("feat_imp_mean", inplace=True, ascending=False)
    df_feat_importance.reset_index(drop=True, inplace=True)

    df_feat_importance["feat_imp_std"] = df_feat_importance.loc[:, cols].std(
        axis=1, skipna=True
    )

    # Coefficient of variation across folds; a zero mean yields inf/NaN here.
    df_feat_importance["feat_imp_variability"] = (
        df_feat_importance["feat_imp_std"] / df_feat_importance["feat_imp_mean"]
    )

    return df_feat_importance

In [6]:
def log_feature_importance(trial_number, model, X, fold_n, exp_purpose, exp_date_str):
    """Append one fold's feature importances to the experiment's CSV file.

    The file ``feat_impor_{exp_purpose}_{exp_date_str}.csv`` (relative to the
    current working directory) holds one row per feature and one column per
    (trial, fold), named ``t{trial_number}_imp_fold_{fold_n+1}``.

    Parameters
    ----------
    trial_number : int
        Trial identifier used in the column name.
    model : object
        Fitted estimator exposing ``feature_importances_``.
    X : pandas.DataFrame
        Training features; only ``X.columns`` is used.
    fold_n : int
        Zero-based fold index (stored one-based in the column name).
    exp_purpose, exp_date_str : str
        Components of the CSV file name.
    """
    importance_col = f"t{trial_number}_imp_fold_{fold_n+1}"
    new_importance_df = pd.DataFrame(
        {"feat": X.columns, importance_col: model.feature_importances_}
    )

    csv_path = f"feat_impor_{exp_purpose}_{exp_date_str}.csv"

    if os.path.exists(csv_path):
        existing_df = pd.read_csv(csv_path)
        # If this trial/fold was already logged (e.g. the cell was re-run),
        # replace the stale column. Merging without this would make pandas emit
        # `<col>_x` / `<col>_y` duplicates.
        existing_df = existing_df.drop(columns=[importance_col], errors="ignore")
        importance_df = pd.merge(existing_df, new_importance_df, on="feat", how="outer")
    else:
        # First fold logged for this experiment: start a new table.
        importance_df = new_importance_df

    # Save the updated DataFrame to CSV
    importance_df.to_csv(csv_path, index=False)


#    mlflow.log_artifact(csv_path)

In [7]:


# Conditional imports and settings based on TRAIN constant
# Everything below (MLflow/Optuna/LightGBM imports, the MLflow client and the
# S3 client) only exists when TRAIN is True.
if TRAIN:
    # Root folder named "working": install the missing packages with pip.
    # NOTE(review): the installs are unpinned, so the polars version (and with it
    # the availability of `groupby` / `group_by_rolling`) depends on the run date.
    if pm.path_root_project.name == "working":
        !pip install loguru mlflow optuna > /dev/null

        #!pip uninstall -y polars

        #!pip install functime --no-index --find-links=file:///kaggle/input/functime/functime/
        #!pip install polars --no-index --find-links=file:///kaggle/input/polars/polars/
        !pip install polars
        from functime.functime.cross_validation import train_test_split
        #import polars.polars as pl
        # NOTE(review): polars is only imported here on the "working" branch;
        # otherwise `pl` is first imported by a cell that comes after the
        # training cells that call feat_engineering.
        import polars as pl
    # NOTE(review): `save_and_register_model` imported here is redefined by a
    # later notebook cell, so the notebook-level definition wins.
    from utils_mlflow import (
        get_experiments_df,
        delete_runs_and_artifacts,
        download_and_load_model,
        load_models_and_create_ensemble,
        save_and_register_model,
        log_model_parameters,
        get_or_create_experiment,
        experiments_data,
      #  list_path_models
        )
    # NOTE(review): these two names are also defined as functions in earlier
    # notebook cells; when TRAIN is True this import rebinds them to the module
    # versions.
    from utils_feat_importance import log_feature_importance, aggregate_feature_importance
    from model_validation import time_series_split

    # External Libraries
    import boto3
    from botocore.exceptions import NoCredentialsError
    from mlflow.exceptions import MlflowException
    import lightgbm as lgbm
    import mlflow
    import optuna
    from mlflow.tracking import MlflowClient
    from optuna.integration.mlflow import MLflowCallback
    from sklearn.model_selection import KFold
    from xgboost import XGBRegressor as XGBR
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR



    # Auto-reload modules - Specific to Jupyter Notebooks
    %load_ext autoreload
    %autoreload 2
    # In DEBUG mode the tracking URI is left at MLflow's default.
    if not DEBUG:
        mlflow.set_tracking_uri(pm.path_experiments_dir)

    client = MlflowClient()

    # Create an S3 client
    # Uses the credentials resolved in the setup cell (no region is passed).
    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )

In [8]:
def save_and_register_model(ensemble_model, model_name):
    """Pickle the ensemble to ``model_name`` and register it in MLflow.

    The pickle is written to a local file named ``model_name``, uploaded as an
    artifact under ``model`` in a fresh MLflow run, and that artifact location
    is registered in the Model Registry under the same ``model_name``. The
    local file is left in place afterwards.
    """
    with open(model_name, "wb") as pickle_file:
        pickle.dump(ensemble_model, pickle_file)

    with mlflow.start_run() as active_run:
        mlflow.log_artifact(model_name, "model")
        run_id = active_run.info.run_id

        # Register the logged artifact folder in the Model Registry.
        mlflow.register_model(f"runs:/{run_id}/model", model_name)

    # Clean up the local file system
    # if os.path.exists(temp_ensemble_path):
    #     os.remove(temp_ensemble_path)

    print(f"Ensemble model registered under run_id: {run_id}")

In [9]:
# Build the production ensemble from previously trained fold models and
# register it in MLflow.
# NOTE(review): this cell uses `boto3` (imported at the top of the notebook) but
# also `aws_access_key_id`, `aws_secret_access_key`,
# `load_models_and_create_ensemble` and `mlflow`, which are only bound by cells
# guarded with `if TRAIN:` -- with TRAIN = False and SAVE_MODELS = True it
# raises NameError.
if SAVE_MODELS:
    # model_paths = list_path_models(list_experiment_id, run_name_startswith)
    # Hard-coded S3 locations of the five fold models that make up the ensemble.
    model_paths = [
        "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/b5c5ccea4db545a7ad3f8db19780e059/artifacts/LGBMR_0_20231130_184805/model.pkl",
        "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/cd6c9ceddd4c45c69e14311da59f66c6/artifacts/LGBMR_0_20231130_184325/model.pkl",
        "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/2438bcb4cafd401697dcad61c79e1517/artifacts/LGBMR_0_20231130_183846/model.pkl",
        "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/6f22898cbccc4959b8cd9c70bc5de2b5/artifacts/LGBMR_0_20231130_183400/model.pkl",
        "s3://mlflow-v1/kaggle_optiver_trading_at_the_close/239b00d1308a4c5faf19a5c479705a8d/artifacts/LGBMR_0_20231130_182911/model.pkl",
    ]
    # A new S3 client is created here, rebinding the notebook-level `s3`.
    s3 = boto3.client(
        "s3",
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    # Load the models and create an ensemble
    ensemble_model = load_models_and_create_ensemble(s3, model_paths)

    # Save and register the ensemble model in MLflow
    save_and_register_model(ensemble_model, model_name)

In [10]:
if TRAIN:
    # The processed dataset is reused only when it exists AND overwriting is
    # switched off; otherwise the raw CSV is loaded and cleaned.
    if os.path.exists(pm.path_dataset_processed) and not OVERWRITE_PROCESSED_DATA:
        df_train = pd.read_csv(pm.path_dataset_processed)
        if DEBUG:
            # Debug runs keep only the first five stock ids.
            df_train = df_train.loc[df_train["stock_id"].isin([0, 1, 2, 3, 4])]
    else:
        df_train_raw = pd.read_csv(pm.path_data_train_raw)

        if DEBUG:
            # Debug runs keep only the first five stock ids.
            df_train_raw = df_train_raw.loc[
                df_train_raw["stock_id"].isin([0, 1, 2, 3, 4])
            ]

        # Remove rows without a target and renumber the index.
        drop_idx = df_train_raw.index[df_train_raw["target"].isna()].to_list()
        df_train = df_train_raw.drop(index=drop_idx).reset_index(drop=True)

    # df_train.sort_values(["time_id", "stock_id"], inplace=True)

In [11]:
def feat_engineering(df_train):
    """Build the WAP / index-relative return features used by the model.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``stock_id``, ``time_id``, ``row_id``, ``date_id``,
        ``imbalance_size``, ``imbalance_buy_sell_flag``, ``matched_size`` and
        ``wap``. Any other column is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        The input columns (minus ``row_id``, ``date_id`` and
        ``imbalance_buy_sell_flag``) plus the engineered features.

        IMPORTANT: the frame is sorted by ``(stock_id, time_id)`` before the
        rolling features are added, so rows do NOT come back in input order.
        Callers must select rows by key rather than by position.

    Notes
    -----
    Uses the polars >= 0.19 spelling ``group_by`` throughout, consistent with
    the ``group_by_rolling`` calls this function already relied on.
    """
    # Imported locally so the function does not depend on which notebook cell
    # happened to import polars first.
    import polars as pl

    df = pl.DataFrame(df_train)

    df = df.with_columns(
        [
            pl.col("time_id").cast(pl.Int32),
            # Signed imbalance: size multiplied by the buy/sell flag.
            (pl.col("imbalance_size") * pl.col("imbalance_buy_sell_flag")).alias(
                "imbalance_size"
            ),
        ]
    )

    df = df.drop("row_id", "date_id", "imbalance_buy_sell_flag")

    # Total matched size per time step.
    df_size_sum = df.group_by("time_id").agg(
        [pl.col("matched_size").sum().alias("matched_size_sum")]
    )

    df = df.join(df_size_sum, on="time_id")

    # Share of each stock in the time step's matched size ...
    df = df.with_columns(
        (pl.col("matched_size") / pl.col("matched_size_sum")).alias(
            "matched_size_ratio"
        )
    )

    # ... used as the weight of its WAP.
    df = df.with_columns(
        [(pl.col("matched_size_ratio") * pl.col("wap")).alias("weighted_wap")]
    )

    # Sum of the weighted WAPs per time_id: a matched-size-weighted index proxy.
    df_weighted_wap_sum = df.group_by("time_id").agg(
        [pl.col("weighted_wap").sum().alias("index_approx")]
    )

    # Join the index proxy back onto the original rows.
    df = df.join(df_weighted_wap_sum, on="time_id")

    #################################################################################

    # Cross-sectional (market-wide) WAP statistics per time step.
    wap_market_stats = df.group_by(["time_id"]).agg(
        [
            pl.col("wap").mean().alias("wap_mean"),
            pl.col("wap").median().alias("wap_median"),
            pl.col("wap").std().alias("wap_std"),
            pl.col("wap").quantile(0.25).alias("wap_quantile_25"),
            pl.col("wap").quantile(0.75).alias("wap_quantile_75"),
        ]
    )

    # Join the market statistics back onto the original rows.
    df = df.join(wap_market_stats, on=["time_id"])

    # Shifts and rolling windows below need each stock's rows in time order.
    df = df.sort(["stock_id", "time_id"])

    # Ratio of the current value to the value 6 rows earlier for the same stock,
    # scaled by 10000.
    df = df.with_columns(
        [
            ((pl.col("wap") / pl.col("wap").shift(6)) * 10000)
            .over("stock_id")
            .alias("wap_return_6"),
            ((pl.col("wap_mean") / pl.col("wap_mean").shift(6)) * 10000)
            .over("stock_id")
            .alias("wap_mean_return_6"),
            ((pl.col("index_approx") / pl.col("index_approx").shift(6)) * 10000)
            .over("stock_id")
            .alias("index_approx_return_6"),
        ]
    )

    # Difference between the stock's scaled return and the market-mean one.
    df = df.with_columns(
        (pl.col("wap_return_6") - pl.col("wap_mean_return_6")).alias(
            "wap_delta_return_6"
        )
    )

    # Same difference against the index proxy.
    df = df.with_columns(
        (pl.col("wap_return_6") - pl.col("index_approx_return_6")).alias(
            "target_approx"
        )
    )

    list_cols = ["wap_delta_return_6", "target_approx"]

    # (statistic, window sizes), in the order the columns must be appended.
    rolling_specs = (("mean", [3, 6, 9, 15]), ("std", [9, 15, 30]))

    for col in list_cols:
        for stat, windows in rolling_specs:
            for window in tqdm(windows, desc=f"rolling {stat} for {col}"):
                rolling_group = df.group_by_rolling(
                    index_column="time_id",
                    period=f"{window}i",
                    by="stock_id",
                    closed="left",  # Adjust as needed
                )

                # pl.col(col).mean() or pl.col(col).std(), chosen by name.
                stat_expr = getattr(pl.col(col), stat)().alias(
                    f"{col}_{stat}_{window}"
                )

                df = df.join(
                    rolling_group.agg(stat_expr),
                    on=["stock_id", "time_id"],
                    how="left",
                )
    return df.to_pandas()

In [12]:
# Build the training feature table (training mode only).
if TRAIN:
    # Run the full feature pipeline on the cleaned training data.
    df_train_feats = feat_engineering(df_train)
    print("Build Online Train Feats Finished.")

    # Project helper from utils_data; presumably downcasts dtypes to save
    # memory -- confirm against its implementation.
    df_train_feats = reduce_mem_usage(df_train_feats)

In [13]:
if TRAIN:
    col_split = "time_id"
    list_cols_drop = [ "time_id"]

    # Sort on the split column first so the factorize codes (assigned in order
    # of appearance) increase together with time_id.
    df_train_feats.sort_values(by=[col_split], inplace=True)
    df_train_feats["factorized"] = pd.factorize(df_train_feats[col_split])[0]

    # Final row order: by time step, then by stock, with a fresh 0..n-1 index.
    df_train_feats.sort_values(by=["time_id", "stock_id"], inplace=True)
    df_train_feats.reset_index(drop=True, inplace=True)

    # "factorized" now carries the time information; the raw column is removed
    # so it is not used as a feature.
    df_train_feats.drop(columns=list_cols_drop, inplace=True)

In [14]:
# Experiment bookkeeping: names, MLflow experiment, and the `args` bundle that
# run_mlflow_experiment consumes.
if TRAIN:
    experiment_name = f"{EXPERIMENT_PURPOSE}_v{VERSION_NB}"
    name_folder_models = f"models_v{VERSION_NB}"

    experiment_date_str = datetime.now().strftime("%y_%m_%d_%H%M")

    # Debug runs get their own experiment name and keep artifacts locally.
    if DEBUG:
        experiment_name = f"{experiment_name}_debug"
        name_folder_models = f"{name_folder_models}_debug"
        path_artifact_location = "."
    else:
        path_artifact_location = pm.path_artifact_location

    experiment_id = get_or_create_experiment(
        client, experiment_name, artifact_location=path_artifact_location
    )

    # NOTE(review): `nbrnd_erly_stp`, `mlflow_callback`, `all_cv`/`cv` and
    # `log_model` are not referenced by the training code in this notebook
    # (early stopping there is hard-coded); kept in case other cells use them.
    nbrnd_erly_stp = 130
    cv_mthd = "KF"

    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )

    all_cv = {"KF": KFold(n_splits=5, shuffle=True, random_state=STATE)}
    cv = all_cv[cv_mthd]

    dict_models = {"LGBMR": LGBMR}

    log_model = True

    args = {
        "cv_mthd": cv_mthd,
        "experiment_purpose": EXPERIMENT_PURPOSE,
        "experiment_name": experiment_name,
        "dict_models": dict_models,
        "model_params_dict": model_params_dict,
        "n_splits": N_SPLITS,
        "n_test_splits": N_TEST_SPLITS,
        "n_purge": N_PURGE,
        "n_embargo": N_EMBARGO,
        "experiment_date_str": experiment_date_str,
        # Use the location the experiment was actually created with. Previously
        # this was always pm.path_artifact_location, so DEBUG runs logged model
        # paths pointing at the non-debug artifact store.
        "path_artifact_location": path_artifact_location,
        "target_col": "target",
    }

In [15]:
def run_mlflow_experiment(df_train, args, trial=None):
    """Fit one model per time-series fold inside MLflow runs and return the mean score.

    A parent MLflow run is opened for the trial; each fold is trained and
    logged in a nested run (parameters, fold score, feature importances, the
    fitted model and its artifact path). After every fold the running mean and
    median of all fold scores so far are logged on the parent run.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature table containing the target column and a ``factorized`` column
        whose values are matched against the indices yielded by
        ``time_series_split``.
    args : dict
        Must provide ``cv_mthd``, ``experiment_purpose``, ``experiment_name``,
        ``dict_models``, ``model_params_dict``, ``n_splits``, ``n_test_splits``,
        ``n_purge``, ``n_embargo``, ``experiment_date_str``,
        ``path_artifact_location`` and ``target_col``.
    trial : optuna trial, optional
        When omitted, a ``FixedTrial`` with default hyper-parameters is used.

    Returns
    -------
    float
        Mean of the per-fold validation L1 scores.

    Raises
    ------
    ValueError
        If no fold was evaluated (empty ``dict_models`` or no splits).

    Notes
    -----
    Reads the module-level names ``experiment_id``, ``TUNING``, ``GPU_SWITCH``,
    ``N_FOLD_BREAK`` and, when ``TUNING`` is False, ``model_prod``.
    """
    cv_mthd = args["cv_mthd"]
    experiment_purpose = args["experiment_purpose"]
    experiment_name = args["experiment_name"]
    dict_models = args["dict_models"]
    model_params_dict = args["model_params_dict"]

    n_splits = args["n_splits"]
    n_test_splits = args["n_test_splits"]
    n_purge = args["n_purge"]
    n_embargo = args["n_embargo"]

    experiment_date_str = args["experiment_date_str"]
    path_artifact_location = args["path_artifact_location"]
    target_col = args["target_col"]

    # Identity check: `== None` would invoke a custom __eq__ if the trial
    # object defined one.
    if trial is None:
        trial = optuna.trial.FixedTrial(
            {
                "n_estimators": 500,
                "learning_rate": 0.005,
                "max_depth": 10,
                "num_leaves": 20,
                "min_child_samples": 10,
                "subsample": 0.7,
                "colsample_bytree": 1.0,
                "min_split_gain": 0.0,
                "reg_alpha": 0.0,
                "reg_lambda": 0.0,
                "device": "gpu" if GPU_SWITCH == "ON" else "cpu",
            }
        )

    run_time_start_trial = datetime.now().strftime("%y_%m_%d_%H%M%S")

    with mlflow.start_run(
        run_name=run_time_start_trial, experiment_id=experiment_id
    ) as run:
        score_list = []

        # mlflow.set_tag("cv_mthd", cv_mthd)
        mlflow.set_tag("n_splits", n_splits)
        mlflow.set_tag("n_test_splits", n_test_splits)
        mlflow.set_tag("n_purge", n_purge)
        mlflow.set_tag("n_embargo", n_embargo)

        for model_name, model_class in dict_models.items():
            if TUNING:
                # Hyper-parameters come from the trial plus the static ones.
                model = create_model(
                    trial,
                    model_class,
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )

            else:
                # Reuse the production model's parameters, forcing the device.
                params = model_prod.get_params()
                params["device"] = "gpu" if GPU_SWITCH == "ON" else "cpu"

                model = model_class(**params)

            priority_params = ["learning_rate", "max_depth"]
            excluded_params = [
                "device",
                "class_weight",
                "random_state",
                "silent",
                "verbose",
                "n_jobs",
            ]

            ordered_params = log_model_parameters(
                model, priority_params, excluded_params, verbose=True
            )

            mlflow.log_params(ordered_params)

            for fold_n, (train_indices, test_indices) in enumerate(
                time_series_split(
                    df_train,
                    n_splits=n_splits,
                    n_test_splits=n_test_splits,
                    n_purge=n_purge,
                    n_embargo=n_embargo,
                )
            ):
                with mlflow.start_run(
                    run_name=f"{run_time_start_trial}_fold_{fold_n+1}",
                    nested=True,
                    experiment_id=experiment_id,
                ) as nested_run:
                    mlflow.set_tag("n_trial", str(trial.number))

                    # Split indices refer to the `factorized` values, not to
                    # row positions.
                    mask_train = df_train["factorized"].isin(train_indices)
                    mask_test = df_train["factorized"].isin(test_indices)

                    y_train = df_train.loc[mask_train, target_col]
                    y_val = df_train.loc[mask_test, target_col]
                    X_train = df_train.loc[mask_train].drop(
                        [target_col, "factorized"], axis=1
                    )
                    X_val = df_train.loc[mask_test].drop(
                        [target_col, "factorized"], axis=1
                    )

                    print(f"X_train shape: {X_train.shape}")

                    mlflow.log_param("train_rows", X_train.shape[0])
                    mlflow.log_param("train_cols", X_train.shape[1])

                    model.fit(
                        X_train,
                        y_train,
                        eval_set=[(X_val, y_val)],
                        eval_metric="mae",
                        callbacks=[
                            lgbm.callback.early_stopping(stopping_rounds=100),
                            lgbm.callback.log_evaluation(period=100),
                        ],
                    )

                    log_feature_importance(
                        trial.number,
                        model,
                        X_train,
                        fold_n,
                        experiment_purpose,
                        experiment_date_str,
                    )

                    # Release the largest per-fold objects before logging.
                    del mask_train, mask_test, X_train, y_train

                    fold_score = model.best_score_["valid_0"]["l1"]

                    score_list.append(fold_score)

                    mlflow.log_metric("fold_score", round(fold_score, 6))
                    mlflow.log_param("fold_number", fold_n + 1)
                    mlflow.log_param("model_name", model_name)

                    mlflow.log_params(ordered_params)

                    current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                    model_log_name = f"{model_name}_{trial.number}_{current_time_str}"

                    mlflow.sklearn.log_model(model, model_log_name)

                    mlflow.log_param("run_time", current_time_str)

                    nested_run_id = nested_run.info.run_id
                    model_path = f"{path_artifact_location}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                    mlflow.log_param("model_path", model_path)

                # Back on the parent run: log running aggregates over all folds
                # evaluated so far.
                avg_score = sum(score_list) / len(score_list)
                median_score = np.median(score_list)
                mlflow.log_metric("avg score", round(avg_score, 6))
                mlflow.log_metric("median score", round(median_score, 6))
                gc.collect()
                # Stops after the fold with zero-based index N_FOLD_BREAK.
                if fold_n >= N_FOLD_BREAK:
                    break

        # Without this guard an empty run failed with an unbound `avg_score`.
        if not score_list:
            raise ValueError(
                "No fold was evaluated: dict_models is empty or "
                "time_series_split yielded no splits."
            )

        return avg_score


def objective(trial, df_train):
    """Optuna objective: mean fold score of one MLflow-tracked trial.

    Delegates to ``run_mlflow_experiment`` using the notebook-level ``args``.
    """
    return run_mlflow_experiment(df_train, args, trial)


# Run the Optuna study
# The study minimizes the value returned by `objective` over N_TRIALS trials on
# the engineered training table.
# NOTE(review): "Your Study Name" looks like a placeholder, and no `storage` is
# passed to create_study, so `load_if_exists=True` presumably has nothing to
# load from -- confirm against the Optuna documentation.
if TRAIN:
    study = optuna.create_study(
        direction="minimize",
        study_name="Your Study Name",
        load_if_exists=True,
    )
    study.optimize(lambda trial: objective(trial, df_train_feats), n_trials=N_TRIALS)

In [16]:
# Collect the logged runs into a tidy table: one row per fold run with its
# score, hyper-parameters and model path.
if TRAIN:
    df_exp = experiments_data(
        client, list_experiment_id=None, save_df=None, list_columns=None
    )
    list_base_cols = [
        "run_time",
        "experiment_id",
        "n_trial",
        "run_id",
        "model_name",
        "fold_number",
        "fold_score",
    ]
    list_dynamic_params = list(model_params_dict["LGBMR"]["dynamic_params"].keys())

    # Values that do not match the logged timestamp format become NaT.
    df_exp["run_time"] = pd.to_datetime(
        df_exp["run_time"], format="%Y%m%d_%H%M%S", errors="coerce"
    )

    for col in df_exp.columns:
        if col == "run_time":
            # Keep the parsed timestamps: pd.to_numeric would convert the
            # datetime column back to integers.
            continue
        try:
            df_exp[col] = pd.to_numeric(df_exp[col])
        except (ValueError, TypeError):
            # Not a numeric column: leave it untouched (what the deprecated
            # errors="ignore" option used to do).
            pass

    for col in df_exp.select_dtypes(include=["float", "int"]):
        df_exp[col] = df_exp[col].round(5)

    # "run_time" is already the first entry of list_base_cols; prepending it
    # again produced a duplicated column in the final selection.
    list_cols_exp = list_base_cols + list_dynamic_params + ["model_path"]

    # Drop runs recorded under experiment id 0.
    df_exp = df_exp[df_exp["experiment_id"] != 0]

    df_exp = df_exp[list_cols_exp]

In [17]:
import polars as pl

In [18]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so they sum to zero, spreading the shift by sqrt(volume).

    Each element is reduced by its share of ``sum(prices)``, where the shares
    are proportional to ``sqrt(volumes)``.
    """
    weights = np.sqrt(volumes)
    # Amount of total price mass removed per unit of weight.
    per_weight = np.sum(prices) / np.sum(weights)
    return prices - weights * per_weight


import optiver2023

# Stream the test set through the competition API, predicting one batch at a time.
env = optiver2023.make_env()
iter_test = env.iter_test()
counter = 0  # number of batches submitted so far; also used as `time_id`
y_min, y_max = -64, 64  # clipping bounds applied to the final predictions
qps, predictions = [], []  # `qps` collects the wall-clock seconds spent per batch
cache = pd.DataFrame()  # rolling history of recent rows fed to feature engineering
list_cols_drop = ["date_id"]

for test, revealed_targets, sample_prediction in iter_test:
    now_time = time.time()

    test["time_id"] = counter

    cache = pd.concat([cache, test], ignore_index=True, axis=0)
    if counter > 0:
        # Keep only the 21 most recent rows per stock so the cache stays small.
        cache = (
            cache.groupby(["stock_id"])
            .tail(21)
            .sort_values(by=["date_id", "seconds_in_bucket", "stock_id"])
            .reset_index(drop=True)
        )

    # Only the last len(test) rows are kept as this batch's features.
    # NOTE(review): assumes the current batch sorts last in `cache` — confirm.
    feat = feat_engineering(cache)[-len(test) :]

    # added after new API, reference: https://www.kaggle.com/competitions/optiver-trading-at-the-close/discussion/455690#2526672
    if not test.currently_scored.iloc[0]:
        # Batch is not scored: submit a constant placeholder and skip the model.
        sample_prediction["target"] = 0
    else:
        # Non-inplace drop: `feat` is a slice of the engineered frame, and
        # mutating it in place triggers pandas' SettingWithCopyWarning.
        feat = feat.drop(columns=list_cols_drop)

        # Select the columns named by the first model's `feature_name_`, in that order.
        list_features = model_prod.models[0].feature_name_
        feat = feat[list_features]
        lgb_predictions = model_prod.predict(feat, "mean")

        # Re-centre the predictions so they sum to zero, then clip them.
        lgb_predictions = zero_sum(lgb_predictions, test["bid_size"] + test["ask_size"])
        clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
        sample_prediction["target"] = clipped_predictions

    # Shared bookkeeping for both branches: submit, count, and time the batch.
    env.predict(sample_prediction)
    counter += 1
    qps.append(time.time() - now_time)
    if counter % 10 == 0:
        print(counter, "qps:", np.mean(qps))

# NOTE(review): 1.146 presumably converts mean seconds per batch into total
# hours for the hidden test set (about 4125 batches / 3600 s) — confirm.
time_cost = 1.146 * np.mean(qps)
print(
    f"The code will take approximately {np.round(time_cost, 4)} hours to reason about"
)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 588.47it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 1327.45it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 916.84it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 1316.34it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 682.03it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 658.03it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 708.80it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 646.57it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 646.55it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 598.05it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 744.10it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 734.00it/s]
rolling mean for wap_delta

10 qps: 0.08384740352630615


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 434.63it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 481.38it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 553.72it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 373.79it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 439.50it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 411.46it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 539.22it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 426.34it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 433.30it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 404.28it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 534.95it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 391.42it/s]
rolling mean for wap_delta_r

20 qps: 0.08209611177444458


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 327.68it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 224.62it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 251.52it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 200.86it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 318.41it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 247.74it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 342.46it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 243.52it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 314.33it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 233.03it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 311.83it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 231.05it/s]
rolling mean for wap_delta_r

30 qps: 0.08518937428792318


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 336.59it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 248.62it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 338.71it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 264.90it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 339.90it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 251.41it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 342.29it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 249.30it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 331.78it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 252.95it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 341.95it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 252.39it/s]
rolling mean for wap_delta_r

40 qps: 0.08651257753372192


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 340.03it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 248.51it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 337.03it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 248.88it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 305.26it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 215.64it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 297.66it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 230.25it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 318.69it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 229.19it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 309.74it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 241.64it/s]
rolling mean for wap_delta_r

50 qps: 0.08758815288543702


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 300.25it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 231.41it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 315.06it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 238.98it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 317.78it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 224.17it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 303.39it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 254.74it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 329.35it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 241.07it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 344.48it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 251.97it/s]
rolling mean for wap_delta_r

60 qps: 0.08850327730178834


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 319.56it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 224.46it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 267.59it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 251.47it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 300.15it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 248.69it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 344.90it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 246.64it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 326.28it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 244.24it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 278.16it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 207.33it/s]
rolling mean for wap_delta_r

70 qps: 0.08951924868992396


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 320.52it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 240.47it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 320.78it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 260.94it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 311.01it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 252.55it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 342.71it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 260.57it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 323.84it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 245.32it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 317.26it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 245.47it/s]
rolling mean for wap_delta_r

80 qps: 0.08987179398536682


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 325.20it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 242.75it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 330.32it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 225.38it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 331.00it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 238.80it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 342.33it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 249.96it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 305.77it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 235.10it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 306.28it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 233.65it/s]
rolling mean for wap_delta_r

90 qps: 0.09194361633724636


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 312.74it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 230.10it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 323.83it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 228.20it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 307.37it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 230.67it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 284.55it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 235.52it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 304.66it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 214.97it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 293.40it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 214.42it/s]
rolling mean for wap_delta_r

100 qps: 0.09350323915481568


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 269.51it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 227.17it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 292.89it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 238.43it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 309.54it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 233.59it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 315.86it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 215.09it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 324.84it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 238.94it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 345.02it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 248.05it/s]
rolling mean for wap_delta_r

110 qps: 0.09366058869795366


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 287.25it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 237.98it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 310.33it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 203.68it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 304.76it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 250.01it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 333.77it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 234.62it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 297.26it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 235.83it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 277.08it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 212.92it/s]
rolling mean for wap_delta_r

120 qps: 0.09407228032747904


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 319.98it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 245.10it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 322.36it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 225.84it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 308.98it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 232.27it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 319.97it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 204.47it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 314.96it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 237.54it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 333.87it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 236.19it/s]
rolling mean for wap_delta_r

130 qps: 0.09406865743490365


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 303.98it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 237.88it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 334.59it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 251.67it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 318.99it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 233.81it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 277.72it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 227.35it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 311.38it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 208.26it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 306.30it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 234.45it/s]
rolling mean for wap_delta_r

140 qps: 0.09409404311861311


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 311.37it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 234.52it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 323.24it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 227.44it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 270.00it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 244.42it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 325.45it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 251.57it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 310.17it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 222.90it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 298.67it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 254.71it/s]
rolling mean for wap_delta_r

150 qps: 0.09422580242156982


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 330.43it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 246.49it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 337.16it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 247.93it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 324.22it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 252.59it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 338.99it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 247.82it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 322.58it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 242.45it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 300.62it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 227.80it/s]
rolling mean for wap_delta_r

160 qps: 0.09425586313009263


rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 283.48it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 221.64it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 286.70it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 233.35it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 301.93it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 208.92it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 325.00it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 216.79it/s]
rolling mean for wap_delta_return_6: 100%|██████████| 4/4 [00:00<00:00, 308.06it/s]
rolling std for wap_delta_return_6: 100%|██████████| 3/3 [00:00<00:00, 238.08it/s]
rolling mean for target_approx: 100%|██████████| 4/4 [00:00<00:00, 318.78it/s]
rolling std for target_approx: 100%|██████████| 3/3 [00:00<00:00, 244.73it/s]
rolling mean for wap_delta_r

The code will take approximately 0.1083 hours to reason about


In [19]:
# clean_directory_except_one('/kaggle/working/','submission.csv')