In [2]:
# Built-in modules
import os
import warnings

# External general-purpose modules
import pandas as pd
import numpy as np 
from dotenv import load_dotenv
from tqdm import tqdm
from termcolor import colored

# Logging and optimization modules
from loguru import logger
import mlflow
import optuna
from optuna.integration.mlflow import MLflowCallback

# Machine learning and model validation modules
from sklearn.model_selection import KFold
from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR

# Custom modules
from src.utils.utils_kaggle import get_data
from src.utils.utils_general import get_project_directory, load_config
from src.experiments.mlflow_optuna_init import initialize_optuna
from src.feat_engineering.fe_opriver_trading_at_the_close import generate_features
# Suppress warnings
# Both filters are process-wide: every FutureWarning and UserWarning raised
# after this point is hidden, whichever library emits it.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Auto-reload modules
# IPython magics: this cell only runs inside IPython/Jupyter, not as a plain
# Python script.
%load_ext autoreload
%autoreload 2

# Load environment variables
# (python-dotenv; presumably reads a local .env file -- confirm it exists.)
load_dotenv()

# Configure Optuna logging
# Only WARNING and above from Optuna is printed.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Configure Loguru
# Adds a file sink; the path is relative to the current working directory.
# NOTE(review): logger.add() registers a new sink on every call, so
# re-executing this cell likely duplicates each line in the log file -- confirm.
logger.add(
    "objective_logs.log", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
)


# Initialize MLflow callback
# NOTE(review): the tracking URI is read here, before mlflow.set_tracking_uri()
# is called further down, so the callback gets whatever URI is active at this
# point. The callback is also not passed to study.optimize() in this cell.
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
)

# --- Project-specific names ---------------------------------------------------
name_folder_data_project = "kaggle_optiver_trading_at_the_close"
file_name_df_train, file_name_df_test = "train.csv", "test.csv"

# When True, get_data() is called below before anything else is loaded.
download_kaggle_data = False

# --- Derived locations (everything hangs off the project directory) ------------
path_project_dir = get_project_directory()
path_data_dir, path_config_dir = (
    os.path.join(path_project_dir, folder) for folder in ("data", "config")
)

path_data_project_dir = os.path.join(path_data_dir, name_folder_data_project)
path_experiments_storage = os.path.join(path_data_project_dir, "experiments_storage")

path_config_train = os.path.join(path_config_dir, "train_config.yaml")
path_dataset_train, path_dataset_test = (
    os.path.join(path_data_project_dir, csv_name)
    for csv_name in (file_name_df_train, file_name_df_test)
)

# Training configuration; used later through config.get(...).
config = load_config(path_config_train)

# Optional dataset download into the project data folder.
if download_kaggle_data:
    kaggle_json_path = os.path.join(path_project_dir, "kaggle.json")
    dataset_name = "ravi20076/optiver-memoryreduceddatasets"
    get_data(
        kaggle_json_path,
        path_data_project_dir,
        dataset_name=dataset_name,
        specific_file=None,
    )
# Constants and Settings
debug = True  # True: train on a small random sample instead of the full table
testing_sample = 1000  # number of rows sampled when debug is True
gpu_switch = "OFF"  # "ON" selects the LightGBM GPU device, anything else the CPU
n_splits = 3  # number of cross-validation folds
n_test_split = 1  # not used in this cell
embargo_td = 100  # not used in this cell
state = 42  # shared random seed
cv_mthd = "KF"  # "KF" or "PurgedKF" (only "KF" is registered in all_cv below)
n_repeats = 1  # not used in this cell
model_mthd = "LGBMR"  # key into dict_models
nbrnd_erly_stp = 1000  # early-stopping patience, in boosting rounds

# Experiment tracking: MLflow and Optuna share the same storage folder.
mlflow.set_tracking_uri(path_experiments_storage)
mlflow.set_experiment(config.get('experiment_name', 'Default_Experiment2'))
study = initialize_optuna(path_experiments_storage, config)

# Only the training table is needed here. The test table was previously read
# and deleted again without being used, so it is no longer loaded.
df_train_raw = pd.read_csv(path_dataset_train)
df_train = generate_features(df_train_raw)

if debug:
    # Seeded so debug runs are reproducible, and capped so a table smaller
    # than `testing_sample` does not make sample() raise.
    X_train = df_train.sample(
        n=min(testing_sample, len(df_train)), random_state=state
    )
else:
    X_train = df_train.copy()

# Targets are looked up by index label.
# NOTE(review): this assumes generate_features() keeps the raw frame's index
# and does not leave the "target" column among the features -- confirm.
# No .squeeze(): it would collapse a one-row Series into a scalar.
y_train = df_train_raw["target"].loc[X_train.index]
del df_train_raw
# Cross-Validation Setup
# Registry of available splitters, keyed by the `cv_mthd` setting. Only "KF"
# is registered, so any other value raises KeyError on the lookup below.
all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=state)}
cv = all_cv[cv_mthd]

# Model Setup
# Registry of candidate models, keyed by the `model_mthd` setting.
# `verbose_eval` was removed: it is not an LGBMRegressor constructor argument
# and was only being forwarded to LightGBM as an unknown parameter.
dict_models = {
    "LGBMR": LGBMR(
        device="gpu" if gpu_switch == "ON" else "cpu",
        objective="regression_l1",  # L1 loss, matching the MAE metric
        boosting_type="gbdt",
        random_state=state,
        colsample_bytree=0.7,
        # NOTE(review): LightGBM applies row subsampling only when
        # subsample_freq > 0, and it is left at its default here, so this
        # value probably has no effect -- confirm and set subsample_freq
        # if bagging is intended.
        subsample=0.65,
        learning_rate=0.065,  # overridden per trial inside objective()
        max_depth=6,
        n_estimators=500,  # overridden per trial inside objective()
        verbose=-1,
        num_leaves=150,
        reg_alpha=0.01,
        reg_lambda=3.25,
    )
}

# The single estimator instance that objective() reconfigures and refits.
model = dict_models[model_mthd]

def objective(trial, X, y):
    """Optuna objective: cross-validated MAE of the module-level ``model``.

    Samples ``n_estimators`` and ``learning_rate`` from ``trial``, applies them
    to the shared ``model``, fits it once per split of the module-level ``cv``
    and records parameters, the averaged metric and the model in a new MLflow
    run.

    Args:
        trial: Optuna trial used to sample the two hyperparameters.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Targets aligned positionally with ``X``.

    Returns:
        The mean of the per-fold validation MAE, or ``float("inf")`` when any
        exception is raised (the exception is logged, not propagated).
    """
    try:
        with mlflow.start_run():
            n_estimators = trial.suggest_int("n_estimators", 100, 500)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
            params = {"n_estimators": n_estimators, "learning_rate": learning_rate}
            model.set_params(**params)
            # Logged before training so a run that fails mid-way still records
            # which parameters it was trying.
            mlflow.log_params(params)

            logger.info(
                colored("------------------------------------------------", "blue")
            )
            logger.info(
                colored(
                    f"Trial {trial.number:<4} | n_estimators: {n_estimators:<4} | learning_rate: {learning_rate:<10}",
                    "green",
                )
            )

            logger.info(f"{'Fold':<5} {'|':<2} {'MAE':<20}")
            logger.info(f"{'-----':<5} {'|':<2} {'--------------------':<20}")

            mae_list = []
            for fold_n, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
                # Fold-local names, so the module-level X_train / y_train are
                # not shadowed inside this function.
                X_fit, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_fit, y_val = y.iloc[train_idx], y.iloc[val_idx]

                model.fit(
                    X_fit,
                    y_fit,
                    eval_set=[(X_val, y_val)],
                    eval_metric="mae",
                    callbacks=[
                        log_evaluation(0),
                        early_stopping(nbrnd_erly_stp, verbose=False),
                    ],
                )

                # LightGBM reports the "mae" metric under the key "l1".
                fold_mae = model.best_score_["valid_0"]["l1"]
                mae_list.append(fold_mae)
                logger.info(f"{fold_n:<5} {'|':<2} {fold_mae:<20}")

            avg_mae = sum(mae_list) / len(mae_list)
            logger.warning(colored(f"Average MAE: {avg_mae}", "yellow"))
            mlflow.log_metric("mae", avg_mae)
            # The logged artifact is the estimator as fitted on the last fold.
            mlflow.sklearn.log_model(model, "model")
            return avg_mae

    except Exception as e:
        # Best-effort: the study keeps running and this trial gets the worst
        # possible score. logger.exception keeps the traceback, which the
        # message alone would lose.
        logger.exception(f"An exception occurred: {e}")
        return float("inf")


# Suppress warnings from Optuna and other libraries
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)

# The study database lives in the experiments folder computed from the project
# directory, so it no longer depends on the notebook's working directory.
# SQLite does not create missing parent folders, so make sure it exists.
os.makedirs(path_experiments_storage, exist_ok=True)
optuna_storage_url = (
    f"sqlite:///{os.path.join(path_experiments_storage, 'optuna.db')}"
)

# Run the Optuna study
# NOTE(review): this rebinds `study`, discarding the object returned earlier
# by initialize_optuna(); "Your Study Name" looks like a placeholder but is
# kept so trials already stored under that name are still picked up by
# load_if_exists=True.
study = optuna.create_study(
    direction="minimize",
    study_name="Your Study Name",
    storage=optuna_storage_url,
    load_if_exists=True,
)
study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=3)



2023/10/21 21:25:05 INFO mlflow.tracking.fluent: Experiment with name 'Default_Experiment2' does not exist. Creating a new experiment.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[32m2023-10-21 21:25:34.005[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m143[0m - [1m[34m------------------------------------------------[0m[0m
[32m2023-10-21 21:25:34.006[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m146[0m - [1m[32mTrial 6    | n_estimators: 237  | learning_rate: 0.08549053734229291[0m[0m
[32m2023-10-21 21:25:34.007[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m153[0m - [1mFold  |  MAE                 [0m
[32m2023-10-21 21:25:34.008[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m154[0m - [1m----- |  --------------------[0m
[32m2023-10-21 21:25:34.102[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m173[0m - [1m1     |  6.5699646311969335  [0m
[32m2023-10-21 21:25:34.199[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m173[0m - [1m2     |  6.220175976074886   [0m
[32m2023-10-21 21:25:34.292[0m | [1mINFO    [0m | [36m_