In [2]:
# Run-mode switches read by the cells below
DEBUG = True  # when training, fit on a small random sample instead of all rows
TRAIN = False  # gates every training-only cell (imports, CV setup, Optuna study)
OVERWRITE = True  # rebuild the processed feature CSVs even if they already exist

# Local runs only: fetch the dataset via `get_data` before loading it
download_kaggle_data = False

# External general-purpose modules
import os
import shutil
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from joblib import dump
import joblib

# Setting pandas options and warning filters
pd.set_option("display.max_columns", None)  # never truncate columns when displaying frames
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Load environment variables (ROOT_PATH is read from the environment below)
load_dotenv()

# Setting up the project directory path
# On Kaggle ("/kaggle/working") and Colab ("/content") the working directory is
# used as the project root. Anywhere else the root comes from ROOT_PATH.
path_project_dir = os.getcwd()
if path_project_dir not in ["/kaggle/working", "/content"]:
    # os.getenv returns None when ROOT_PATH is unset, which would make every
    # later os.path.join(path_project_dir, ...) fail with a TypeError. Fall back
    # to the current working directory when the variable is missing or empty.
    path_project_dir = os.getenv("ROOT_PATH") or path_project_dir

# Imports and setup for training
# Everything in this block only runs when TRAIN is True, so the names it binds
# (logger, mlflow, optuna, colored, LGBMR, XGBR, KFold, mlflow_callback, ...)
# do not exist in an inference-only session.
if TRAIN:
    # Install packages and import logging libraries
    # Only the Kaggle working directory triggers the install; output is discarded.
    if path_project_dir == '/kaggle/working':
        !pip install loguru mlflow optuna > /dev/null
    
    from loguru import logger
    import mlflow
    import optuna
    from optuna.integration.mlflow import MLflowCallback
    from mlflow.tracking import MlflowClient
    
    from termcolor import colored
    from tqdm import tqdm

    # Import machine learning libraries
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR
    from sklearn.model_selection import KFold
    from xgboost import XGBRegressor as XGBR

    # Set logging
    # Log records are also appended to "logs.log" with minute-resolution timestamps.
    logger.add("logs.log", format="{time:YYYY-MM-DD HH:mm} | {level} | {message}")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
    
    # Auto-reload modules
    %load_ext autoreload
    %autoreload 2

    # Initialize MLflow callback
    # NOTE(review): this callback captures the tracking URI that is active here,
    # i.e. before a later cell calls mlflow.set_tracking_uri(...), and it is not
    # passed to study.optimize in the visible cells — confirm whether it is
    # still needed.
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )


In [3]:
def log_training_details(logger, model, trial, model_name):
    """Write the header lines that precede the per-fold MAE log entries.

    Parameters
    ----------
    logger : object exposing ``info(message)``.
    model : accepted for call-site compatibility; not used here.
    trial : object exposing ``number`` and a ``params`` mapping.
    model_name : label of the model about to be trained.
    """
    banner = colored(f"Training model: {model_name}", "blue")
    logger.info(banner)

    # One "name: value" fragment per sampled hyper-parameter.
    param_fragments = [f"{name}: {val}" for name, val in trial.params.items()]
    trial_line = f"Trial {trial.number:<4} | " + " | ".join(param_fragments)
    logger.info(colored(trial_line, "green"))

    # Table header followed by its underline, using the same column widths.
    for left, right in (("Fold", "MAE"), ("-----", "--------------------")):
        logger.info(f"{left:<5} {'|':<2} {right:<20}")

## Feature Engineering


In [5]:
def generate_features(df):
    """Build the model feature frame from a raw order-book frame.

    The returned frame contains, in this order:

    1. the 12 raw columns listed in ``base_columns``;
    2. ``imb_s1`` = (bid_size - ask_size) / (bid_size + ask_size) and
       ``imb_s2`` = (imbalance_size - matched_size) / (matched_size + imbalance_size);
    3. one ``{a}_{b}_imb`` = (a - b) / (a + b) column per unordered pair of price
       columns;
    4. one ``{a}_{b}_{c}_imb2`` = (max - mid) / (mid - min) column per unordered
       triple of price columns.

    Column order is part of the contract: it is the order the features are
    handed to the models.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column in ``base_columns``. It is NOT modified; the
        derived columns are attached to a shallow copy.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the feature columns.

    Notes
    -----
    Denominators are not guarded, so rows where they are zero yield inf/NaN,
    and missing prices propagate according to pandas' default NaN handling.
    """
    base_columns = [
        "seconds_in_bucket",
        "imbalance_buy_sell_flag",
        "imbalance_size",
        "matched_size",
        "bid_size",
        "ask_size",
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]
    prices = [
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]

    # Shallow copy: shares the existing column data (no extra memory) but new
    # columns are added to the copy only, so the caller's frame stays untouched.
    frame = df.copy(deep=False)
    features = base_columns + ["imb_s1", "imb_s2"]

    frame["imb_s1"] = frame.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    frame["imb_s2"] = frame.eval(
        "(imbalance_size-matched_size)/(matched_size+imbalance_size)"
    )

    # Pairwise price imbalances. Iterating i over all prices and j over the
    # prices before it yields (1,0), (2,0), (2,1), ... which fixes column order.
    for i in range(len(prices)):
        for j in range(i):
            a, b = prices[i], prices[j]
            name = f"{a}_{b}_imb"
            frame[name] = frame.eval(f"({a}-{b})/({a}+{b})")
            features.append(name)

    # Triplet imbalances over every i > j > k combination, same nesting order.
    for i in range(len(prices)):
        for j in range(i):
            for k in range(j):
                a, b, c = prices[i], prices[j], prices[k]
                triple = frame[[a, b, c]]
                max_ = triple.max(axis=1)
                min_ = triple.min(axis=1)
                # The middle value is what is left after removing min and max.
                mid_ = triple.sum(axis=1) - min_ - max_

                name = f"{a}_{b}_{c}_imb2"
                frame[name] = (max_ - mid_) / (mid_ - min_)
                features.append(name)

    return frame[features]

In [None]:
# Resolve every data/experiment path for the current environment.
# Both branches define the names the loading cell reads:
# path_dataset_train_raw, path_dataset_test_raw and path_dataset_processed.
if path_project_dir == "/kaggle/working":
    path_data_project_dir = "/kaggle/input/optiver-trading-at-the-close"
    path_experiments_storage = os.path.join(path_project_dir, "experiments_storage")

    path_dataset_train_raw = "/kaggle/input/optiver-trading-at-the-close/train.csv"
    path_dataset_test_raw = (
        "/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv"
    )

    path_dataset_processed = "/kaggle/working/processed_data"

    # Same names the local branch exposes, so later cells can use either.
    path_dataset_train = path_dataset_train_raw
    path_dataset_test = path_dataset_test_raw

else:
    name_folder_data_project = "kaggle_optiver_trading_at_the_close"

    path_data_dir = os.path.join(path_project_dir, "data")
    path_data_project_dir = os.path.join(path_data_dir, name_folder_data_project)

    path_config_dir = os.path.join(path_project_dir, "config")
    path_config_train = os.path.join(path_config_dir, "train_config.yaml")

    path_experiments_storage = os.path.join(
        path_data_project_dir, "experiments_storage"
    )

    if download_kaggle_data:
        dataset_name = "ravi20076/optiver-memoryreduceddatasets"
        kaggle_json_path = os.path.join(path_project_dir, "kaggle.json")
        # NOTE(review): get_data is not defined or imported in the visible
        # cells; this call raises NameError unless it is provided elsewhere.
        get_data(
            kaggle_json_path,
            path_data_project_dir,
            dataset_name=dataset_name,
            specific_file=None,
        )

    file_name_df_train = "train.csv"
    file_name_df_test = "test.csv"

    path_dataset_train = os.path.join(path_data_project_dir, file_name_df_train)
    path_dataset_test = os.path.join(path_data_project_dir, file_name_df_test)

    # The loading cell reads these three names; previously only the Kaggle
    # branch defined them, so a local TRAIN run failed with NameError.
    path_dataset_train_raw = path_dataset_train
    path_dataset_test_raw = path_dataset_test
    path_dataset_processed = os.path.join(path_data_project_dir, "processed_data")

## Constants and Settings


In [None]:
# --- Data / hardware ---
testing_sample = 1000  # rows sampled for X_train when DEBUG is on
gpu_switch = "OFF"  # "ON" switches the model configs to their GPU settings

# --- Cross-validation ---
cv_mthd = "KF"  # key used to pick the splitter from `all_cv`
n_splits = 3  # number of KFold splits
n_repeats = 1  # not referenced in the visible cells
n_test_split = 1  # not referenced in the visible cells
embargo_td = 100  # not referenced in the visible cells

# --- Reproducibility / training ---
state = 42  # random_state shared by the splitter and the models
nbrnd_erly_stp = 1000  # early-stopping rounds handed to the models

In [6]:
# Build (or reload) the processed feature files and derive X_train / y_train.
if TRAIN:
    os.makedirs(path_dataset_processed, exist_ok=True)

    # The raw training frame is always needed: the cached feature CSV holds
    # only the columns returned by generate_features, so "target" has to come
    # from the raw file. Previously the cache branch left df_train_raw
    # undefined and the y_train line below raised NameError.
    df_train_raw = pd.read_csv(path_dataset_train_raw)

    train_file_path = os.path.join(path_dataset_processed, "train.csv")
    if OVERWRITE or not os.path.exists(train_file_path):
        df_train = generate_features(df_train_raw)
        df_train.to_csv(train_file_path, index=False)
    else:
        df_train = pd.read_csv(train_file_path)

    test_file_path = os.path.join(path_dataset_processed, "test.csv")
    if OVERWRITE or not os.path.exists(test_file_path):
        df_test = generate_features(pd.read_csv(path_dataset_test_raw))
        df_test.to_csv(test_file_path, index=False)
    else:
        df_test = pd.read_csv(test_file_path)

    if DEBUG:
        # Seeded so a re-run trains on the same rows; capped so a dataset
        # smaller than testing_sample does not make sample() raise.
        X_train = df_train.sample(
            n=min(testing_sample, len(df_train)), random_state=state
        )
    else:
        X_train = df_train.copy()

    # Both frames carry the default positional index produced by read_csv /
    # generate_features, so the sampled index selects the matching targets.
    y_train = df_train_raw.loc[X_train.index, "target"]
    del df_train_raw

In [None]:
# Cross-Validation Setup
# Defines the splitter, the per-model search spaces and the MLflow experiment
# used by `objective`. Only runs in training mode.
if TRAIN:
    # Registry of available splitters; cv_mthd selects one by key.
    all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=state)}
    cv = all_cv[cv_mthd]

    # For each model: "static_params" are passed to the constructor as-is,
    # "dynamic_params" describe the ranges Optuna samples from (see create_model).
    model_params_dict = {
        "LGBMR": {
            "static_params": {
                "device": "gpu" if gpu_switch == "ON" else "cpu",
                "objective": "regression_l1",
                "boosting_type": "gbdt",
                "random_state": state,
                "colsample_bytree": 0.7,
                # NOTE(review): LightGBM only applies row subsampling when
                # subsample_freq is non-zero — confirm whether this is intended.
                "subsample": 0.65,
                "max_depth": 6,
                "verbose": -1,
                "num_leaves": 150,
                "reg_alpha": 0.01,
                "reg_lambda": 3.25,
                # "verbose_eval" was dropped: it is not an LGBMRegressor
                # parameter, and per-iteration logging is already silenced by
                # the log_evaluation(0) callback passed to fit().
            },
            "dynamic_params": {
                "n_estimators": {"type": "int", "low": 100, "high": 500},
                "learning_rate": {"type": "float", "low": 0.01, "high": 0.1},
            },
        },
        "XGBR": {
            "static_params": {
                "tree_method": "gpu_hist" if gpu_switch == "ON" else "hist",
                "objective": "reg:absoluteerror",
                "random_state": state,
                "colsample_bytree": 0.7,
                "max_depth": 6,
                "reg_alpha": 0.025,
                "reg_lambda": 1.75,
                "min_child_weight": 1000,
                # NOTE(review): with nbrnd_erly_stp above the n_estimators
                # upper bound, early stopping cannot trigger — confirm.
                "early_stopping_rounds": nbrnd_erly_stp,
            },
            "dynamic_params": {
                "n_estimators": {"type": "int", "low": 100, "high": 500},
                "learning_rate": {"type": "float", "low": 0.01, "high": 0.1},
            },
        },
    }

    # Model label -> estimator class.
    dict_models = {"LGBMR": LGBMR, "XGBR": XGBR}

    # When True, every fitted fold model is stored as an MLflow artifact.
    log_model = True

    # One experiment per calendar day: "<purpose>_<YYYYMMDD>".
    current_date_str = datetime.now().strftime("%Y%m%d")
    experiment_purpose = "hyperparam_optimization"
    experiment_name = f"{experiment_purpose}_{current_date_str}"

    mlflow.set_tracking_uri(path_experiments_storage)
    mlflow.set_experiment(experiment_name)

    client = MlflowClient()


def create_model(trial, model_class, static_params, dynamic_params):
    """Instantiate ``model_class`` with fixed and trial-sampled parameters.

    Parameters
    ----------
    trial : Optuna trial (or any object exposing ``suggest_int``,
        ``suggest_float`` and ``suggest_categorical``).
    model_class : callable invoked as ``model_class(**params)``.
    static_params : dict of constructor arguments used as-is.
    dynamic_params : dict mapping a parameter name to a search spec with a
        ``"type"`` key:

        * ``"int"`` / ``"float"`` / ``"loguniform"``: needs ``"low"``, ``"high"``;
        * ``"discrete_uniform"``: needs ``"low"``, ``"high"`` and step ``"q"``;
        * ``"categorical"``: needs ``"choices"``.

    Returns
    -------
    The object returned by ``model_class``. Sampled values override static
    ones that share a name.

    Raises
    ------
    ValueError
        If a spec has an unknown ``"type"``.
    """
    sampled = {}
    for param_name, spec in dynamic_params.items():
        kind = spec["type"]
        if kind == "int":
            sampled[param_name] = trial.suggest_int(
                param_name, spec["low"], spec["high"]
            )
        elif kind == "float":
            sampled[param_name] = trial.suggest_float(
                param_name, spec["low"], spec["high"]
            )
        elif kind == "categorical":
            sampled[param_name] = trial.suggest_categorical(
                param_name, spec["choices"]
            )
        elif kind == "discrete_uniform":
            # suggest_discrete_uniform is deprecated in Optuna; suggest_float
            # with step= is its documented replacement.
            sampled[param_name] = trial.suggest_float(
                param_name, spec["low"], spec["high"], step=spec["q"]
            )
        elif kind == "loguniform":
            # suggest_loguniform is deprecated in Optuna; suggest_float with
            # log=True is its documented replacement.
            sampled[param_name] = trial.suggest_float(
                param_name, spec["low"], spec["high"], log=True
            )
        else:
            raise ValueError(f"Unsupported suggestion type: {kind}")

    return model_class(**{**static_params, **sampled})


def _fit_and_score_fold(model, X_fit, y_fit, X_val, y_val):
    """Fit ``model`` on one fold and return its validation MAE."""
    if isinstance(model, LGBMR):
        # LightGBM: silence per-iteration logs and use its early-stopping
        # callback; the score is read from the booster's own bookkeeping.
        model.fit(
            X_fit,
            y_fit,
            eval_set=[(X_val, y_val)],
            eval_metric="mae",
            callbacks=[
                log_evaluation(0),
                early_stopping(nbrnd_erly_stp, verbose=False),
            ],
        )
        return model.best_score_["valid_0"]["l1"]

    # Other estimators (XGBR here) do not accept LightGBM callbacks and expose
    # no best_score_ mapping, so fit plainly and compute the MAE directly.
    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    residuals = np.asarray(y_val) - np.asarray(model.predict(X_val))
    return float(np.mean(np.abs(residuals)))


def _cross_validate_model(trial, model_name, X, y):
    """Cross-validate one model, log each fold as a nested run, return mean MAE."""
    spec = model_params_dict[model_name]
    model = create_model(
        trial,
        dict_models[model_name],
        spec["static_params"],
        spec["dynamic_params"],
    )
    mae_list = []

    log_training_details(logger, model, trial, model_name)

    for fold_n, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        with mlflow.start_run(run_name=f"Fold_{fold_n+1}", nested=True):
            X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_fit, y_val = y.iloc[train_idx], y.iloc[val_idx]

            mlflow.log_param("training_data_rows", X_fit.shape[0])
            mlflow.log_param("training_data_columns", X_fit.shape[1])

            fold_mae = _fit_and_score_fold(model, X_fit, y_fit, X_val, y_val)
            mae_list.append(fold_mae)
            logger.info(f"{fold_n + 1:<5} {'|':<2} {fold_mae:<20}")

            mlflow.log_metric("mae", fold_mae)
            mlflow.log_param("fold_number", fold_n + 1)
            mlflow.log_param("model_name", model_name)
            mlflow.log_param("log_model", log_model)
            mlflow.log_params(model.get_params())

            if log_model:
                current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                model_log_name = f"{model_name}_{trial.number}_{current_time_str}"

                mlflow.log_param("model_log_name", model_log_name)
                mlflow.sklearn.log_model(model, model_log_name)

    return sum(mae_list) / len(mae_list)


def objective(trial, X, y):
    """Optuna objective: cross-validate every configured model for one trial.

    Each model in ``dict_models`` is built from the trial's sampled parameters
    and cross-validated with ``cv``. Parameters that share a name across
    models are requested from the same trial under that name. Every fold is
    logged as a nested MLflow run under one parent run per trial.

    Previously the function returned from inside the model loop, so only the
    first model was ever trained. Now all models are evaluated; the parent run
    records each model's mean MAE (``avg_mae_<model>``) and the name of the
    best one (``model_name``).

    Parameters
    ----------
    trial : Optuna trial.
    X : feature frame supporting ``.iloc`` row selection.
    y : target series supporting ``.iloc`` row selection.

    Returns
    -------
    float
        Lowest mean validation MAE across the models, or ``inf`` when no model
        could be evaluated (errors are logged rather than raised so the study
        keeps running).
    """
    try:
        with mlflow.start_run():
            mlflow.log_param("cv_mthd", cv_mthd)
            mlflow.set_tag("experiment_purpose", experiment_purpose)
            mlflow.set_tag("experiment_name", experiment_name)

            avg_mae_by_model = {}
            for model_name in dict_models:
                try:
                    avg_mae = _cross_validate_model(trial, model_name, X, y)
                except Exception:
                    # One failing model must not discard the others' results.
                    logger.exception(
                        f"Cross-validation failed for model {model_name}"
                    )
                    continue
                mlflow.log_metric(f"avg_mae_{model_name}", avg_mae)
                avg_mae_by_model[model_name] = avg_mae

            if not avg_mae_by_model:
                return float("inf")

            best_model_name = min(avg_mae_by_model, key=avg_mae_by_model.get)
            # Logged once per parent run: MLflow params cannot be overwritten.
            mlflow.log_param("model_name", best_model_name)

            return avg_mae_by_model[best_model_name]

    except Exception as e:
        logger.error(f"An exception occurred: {e}")
        return float("inf")

In [None]:
# Run the Optuna study
# Minimises the value returned by `objective` over three trials on the training
# data prepared above.
# NOTE(review): no `storage` is passed to create_study, so presumably there is
# nothing for load_if_exists to load — confirm.
if TRAIN:
    study = optuna.create_study(
        study_name="Your Study Name",
        direction="minimize",
        load_if_exists=True,
    )

    def _run_trial(trial):
        """Bind the training data to the objective for Optuna."""
        return objective(trial, X_train, y_train)

    study.optimize(_run_trial, n_trials=3)

In [7]:
def save_and_version_df(df_new):
    """Merge ``df_new`` with earlier ``df_runs_*.csv`` snapshots and re-save.

    All ``df_runs_*.csv`` files in the current working directory are read,
    concatenated with ``df_new``, de-duplicated and written to a new file named
    ``df_runs_<YYYYMMDD_HHMM>.csv``. The older files are deleted only after the
    new file has been written, so a failed write cannot lose history.

    De-duplication: when a ``run_id`` column is present, rows sharing a
    non-null ``run_id`` are collapsed to the most recent occurrence. Comparing
    whole rows is not sufficient because values reloaded from CSV come back
    with different dtypes (e.g. ``250`` vs ``"250"``) and never compare equal.
    Remaining rows are de-duplicated on all columns.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Newly collected rows.

    Returns
    -------
    pandas.DataFrame
        The merged, de-duplicated frame that was written to disk.
    """
    # Sorted so the concatenation order does not depend on directory listing order.
    existing_files = sorted(
        f for f in os.listdir() if f.startswith("df_runs_") and f.endswith(".csv")
    )
    old_frames = [pd.read_csv(f) for f in existing_files]

    df_concatenated = pd.concat(old_frames + [df_new], ignore_index=True)

    # Remove duplicates
    if "run_id" in df_concatenated.columns:
        # df_new is last in the concatenation, so keep="last" prefers fresh rows.
        stale = df_concatenated["run_id"].notna() & df_concatenated.duplicated(
            subset="run_id", keep="last"
        )
        df_concatenated = df_concatenated[~stale]
    df_unique = df_concatenated.drop_duplicates()

    # Save the new concatenated and deduplicated df_runs to a new versioned CSV file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    csv_filename = f"df_runs_{timestamp}.csv"
    df_unique.to_csv(csv_filename, index=False)

    # Only now is it safe to drop the superseded snapshots. A snapshot created
    # within the same minute has the new file's name and must not be removed.
    for old_file, old_df in zip(existing_files, old_frames):
        if old_file == csv_filename:
            continue
        os.remove(old_file)
        print(f"Removed old file: {old_file}, Shape: {old_df.shape}")

    print(f"DataFrame saved to {csv_filename}, Shape: {df_unique.shape}")

    return df_unique

In [8]:
def _search_all_pages(search_fn, **kwargs):
    """Call a paginated MLflow search function until no page token remains."""
    results = []
    token = None
    while True:
        page = search_fn(page_token=token, **kwargs)
        results.extend(page)
        token = getattr(page, "token", None)
        if not token:
            return results


def gather_runs_data():
    """Collect every MLflow run into a DataFrame and persist a versioned copy.

    For each experiment known to the module-level ``client``, every run
    contributes one row holding ``experiment_id``, ``experiment_name``,
    ``run_id``, its metrics and its params (a param overrides a metric with the
    same key). ``date_exp`` is the text after the last underscore of the
    experiment name. The frame is merged with earlier snapshots through
    ``save_and_version_df``.

    Result pages are followed to the end, so experiments with more runs than a
    single search page are not truncated.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``model_name`` is set; an empty frame when no run logged
        that parameter.
    """
    all_runs_data = []
    for exp in _search_all_pages(client.search_experiments):
        experiment_id = exp.experiment_id
        runs = _search_all_pages(client.search_runs, experiment_ids=[experiment_id])

        for run in runs:
            run_data = {
                "experiment_id": experiment_id,
                "experiment_name": exp.name,
                "run_id": run.info.run_id,
            }
            # Metrics first, then params, so params win on a key collision.
            run_data.update(run.data.metrics)
            run_data.update(run.data.params)
            all_runs_data.append(run_data)

    if all_runs_data:
        df_runs_new = pd.DataFrame(all_runs_data)
    else:
        # Keep the identifying columns so the steps below work with zero runs.
        df_runs_new = pd.DataFrame(
            columns=["experiment_id", "experiment_name", "run_id"]
        )

    df_runs_new["date_exp"] = df_runs_new["experiment_name"].apply(
        lambda x: x.split("_")[-1]
    )

    df_runs_unique = save_and_version_df(df_runs_new)

    if "model_name" not in df_runs_unique.columns:
        # No run logged a model name: nothing qualifies.
        return df_runs_unique.iloc[0:0]

    return df_runs_unique[~df_runs_unique["model_name"].isna()]

In [9]:
# Column groups used to order the runs table.
cols_id = [
    "experiment_id",
    "run_id",
]
cols_info = [
    "training_data_rows",
    "training_data_columns",
]
cols_metrics = [
    "mae",
]
cols_model_params = [
    "date_exp",
    "log_model",
    "cv_mthd",
    "fold_number",
]

In [None]:
# Collect the runs table and move the key columns to the front.
if TRAIN:
    df_runs = gather_runs_data()

    leading_cols = cols_model_params + cols_metrics + cols_info + cols_id

    # Lead only with columns that exist. A column is absent when no run logged
    # it, and list.remove() used to abort the cell with ValueError in that case.
    present_leading = [col for col in leading_cols if col in df_runs.columns]
    missing_leading = [col for col in leading_cols if col not in df_runs.columns]
    if missing_leading:
        print(f"Columns missing from df_runs: {missing_leading}")

    # Everything else keeps its original relative order.
    all_cols = [col for col in df_runs.columns.tolist() if col not in present_leading]

    new_col_order = present_leading + all_cols
    df_runs = df_runs[new_col_order]

In [None]:
# Models already loaded by ensemble_predict, keyed by artifact path.
_LOADED_MODELS = {}


def ensemble_predict(model_paths, X_test, model_cache=None):
    """Average the predictions of several pickled models.

    Parameters
    ----------
    model_paths : iterable of str
        Paths to ``.pkl`` model artifacts. Other extensions are skipped with a
        message.
    X_test : features passed unchanged to each model's ``predict``.
    model_cache : dict, optional
        Mapping ``path -> loaded model``. Paths found in it are not read from
        disk, and newly loaded models are added to it. Defaults to a
        module-level cache, so calling this once per inference batch loads each
        model only once instead of on every call.

    Returns
    -------
    numpy.ndarray or None
        Element-wise mean of the individual predictions, or ``None`` when no
        model produced a prediction. Load and predict errors are printed and
        the affected model is skipped.

    Notes
    -----
    ``joblib.load`` unpickles the file and can execute arbitrary code; only
    pass paths to artifacts you trust.
    """
    cache = _LOADED_MODELS if model_cache is None else model_cache
    models = []

    # Load models based on full artifact paths (or reuse cached ones)
    for model_path in model_paths:
        if model_path in cache:
            models.append(cache[model_path])
            continue

        print(f"Loading model from: {model_path}")
        if not model_path.endswith(".pkl"):
            print(f"Unsupported model format for {model_path}. Skipping.")
            continue

        try:
            model = joblib.load(model_path)
        except Exception as e:
            print(f"Failed to load model at {model_path}. Error: {e}")
            continue

        cache[model_path] = model
        models.append(model)

    # Make predictions
    predictions = []
    for model in models:
        try:
            predictions.append(model.predict(X_test))
        except Exception as e:
            print(f"Failed to make prediction with model. Error: {e}")

    # Average predictions
    if len(predictions) > 0:
        ensemble_pred = np.mean(predictions, axis=0)
    else:
        print("No valid models loaded. Cannot make ensemble predictions.")
        ensemble_pred = None

    return ensemble_pred


# List of full artifact paths for the models you want to ensemble
# Each entry must end in ".pkl": ensemble_predict skips any other extension.
# NOTE(review): this is a hard-coded absolute path under /kaggle/working that
# embeds one experiment id and run id — update it when the models change.
model_paths = [
    "/kaggle/working/experiments_storage/730598797542573601/2225c7d78c4840758e330998f813371c/artifacts/LGBMR_1_20231023_213446/model.pkl"
]

In [None]:
# Create the 'models' directory if needed
models_dir = "models"
os.makedirs(models_dir, exist_ok=True)

# Copy each model to the 'models' directory.
# MLflow artifacts all share the file name "model.pkl", so copying several of
# them by base name alone would overwrite all but the last. The first file
# keeps its base name; later clashes are prefixed with their artifact folder.
_copied_names = set()
for model_path in model_paths:
    model_name = os.path.basename(model_path)
    if model_name in _copied_names:
        artifact_dir = os.path.basename(os.path.dirname(model_path))
        model_name = f"{artifact_dir}_{model_name}"
    _copied_names.add(model_name)

    dest_path = os.path.join(models_dir, model_name)
    shutil.copy(model_path, dest_path)

print(f"Models have been copied to {models_dir}.")

In [None]:
# Example usage (assuming df_test holds the engineered test features):
# ensemble_predictions = ensemble_predict(model_paths, df_test)

In [None]:
import optiver2023

# Create the competition environment and its test iterator.
# The inference loop below unpacks each item of iter_test into
# (test, revealed_targets, sample_prediction).
# NOTE(review): optiver2023 is presumably only importable inside the Kaggle
# competition runtime — confirm before running elsewhere.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
# Predict every batch served by the competition iterator and submit it.
counter = 0  # number of batches processed so far
for test, revealed_targets, sample_prediction in iter_test:
    # Same feature pipeline as used for the training data.
    feat = generate_features(test)
    prediction = ensemble_predict(model_paths, feat)

    sample_prediction["target"] = prediction
    env.predict(sample_prediction)
    counter = counter + 1

In [None]:
def clean_directory_except_one(dir_path, file_to_keep):
    """
    Remove all files and folders in a directory except for one specified entry.

    Real sub-directories are removed recursively. Everything else (regular
    files and symbolic links, including dangling ones and links that point to
    directories) is unlinked without following the link.

    Parameters:
    - dir_path (str): The path of the directory to clean.
    - file_to_keep (str): The name of the entry to keep (compared against the
      bare entry name, not a full path).

    Nothing is removed, and a message is printed, when dir_path does not exist
    or is not a directory.
    """
    if not os.path.exists(dir_path):
        print(f"Directory {dir_path} does not exist.")
        return

    # os.listdir would raise on a regular file; report it instead.
    if not os.path.isdir(dir_path):
        print(f"{dir_path} is not a directory; nothing was removed.")
        return

    for filename in os.listdir(dir_path):
        # Skip the entry you want to keep
        if filename == file_to_keep:
            continue

        file_path = os.path.join(dir_path, filename)

        # shutil.rmtree refuses symlinks, and a dangling link is neither a file
        # nor a directory, so only genuine directories go through rmtree.
        if os.path.isdir(file_path) and not os.path.islink(file_path):
            shutil.rmtree(file_path)
        else:
            os.remove(file_path)

    print(
        f"All files and folders in {dir_path} have been removed, except for {file_to_keep}."
    )


# Example usage:
# clean_directory_except_one('/kaggle/working/', 'submission.csv')