In [6]:
# Built-in modules
import os
import warnings

# External general-purpose modules
import pandas as pd
import numpy as np 
from dotenv import load_dotenv
from tqdm import tqdm
from termcolor import colored

# Logging and optimization modules
from loguru import logger
import mlflow
import optuna
from optuna.integration.mlflow import MLflowCallback

# Machine learning and model validation modules
from sklearn.model_selection import KFold
from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR

# Custom modules
from src.utils.utils_kaggle import get_data
from src.utils.utils_general import get_project_directory, load_config
from src.experiments.mlflow_optuna_init import initialize_mlflow, initialize_optuna

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Auto-reload modules
%load_ext autoreload
%autoreload 2

# Load environment variables
load_dotenv()


  from .autonotebook import tqdm as notebook_tqdm


True

In [8]:
# Project root: every path below is built on top of it
path_project_directory = get_project_directory()

# Kaggle dataset parameters
dataset_name = "ravi20076/optiver-memoryreduceddatasets"
specific_file = "XTrIntCmpNewFtre.parquet"
kaggle_json_path = os.path.join(path_project_directory, "kaggle.json")
dest_folder = os.path.join(
    path_project_directory, "data/kaggle_optiver_trading_at_the_close"
)

# Local dataset and experiment-storage locations
dataset_path = os.path.join(
    path_project_directory, "data/processed/synthetic_ticker_data.csv"
)
path_experiments_storage = os.path.join(
    path_project_directory, "data/experiments_storage"
)

# Training configuration
config_path = os.path.join(path_project_directory, "config/train_config.yaml")
config = load_config(config_path)

In [None]:
# Download data from Kaggle
get_data(
    kaggle_json_path,
    dest_folder,
    dataset_name=dataset_name,
    specific_file=specific_file,
)

# Initialize MLFlow and Optuna
initialize_mlflow(path_experiments_storage, config)
study = initialize_optuna(path_experiments_storage, config)

# Configure Optuna logging
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Configure Loguru.
# logger.add() registers a *new* sink on every call, so re-running this cell
# would write every message to objective_logs.log once per execution. Remove
# the sink registered by a previous run of this cell before adding it again.
_previous_sink_id = globals().get("objective_log_sink_id")
if _previous_sink_id is not None:
    try:
        logger.remove(_previous_sink_id)
    except ValueError:
        # The sink was already removed elsewhere; nothing left to clean up.
        pass
objective_log_sink_id = logger.add(
    "objective_logs.log", format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}"
)

# Initialize MLflow callback
mlflow_callback = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
)

In [4]:
def generate_features(df):
    """Build the model feature frame from raw order-book columns.

    The result contains, in this order:

    * the 12 raw input columns listed in ``base_columns``;
    * ``imb_s1`` and ``imb_s2``, the normalised size imbalances;
    * one ``{a}_{b}_imb`` column per unordered pair of price columns,
      computed as ``(a - b) / (a + b)``;
    * one ``{a}_{b}_{c}_imb2`` column per unordered triple of price columns,
      computed as ``(max - mid) / (mid - min)`` of the three prices.

    Args:
        df: DataFrame holding at least the columns in ``base_columns``.
            It is not modified; all derived columns are written to a copy.

    Returns:
        A new DataFrame with the feature columns described above.

    Raises:
        KeyError: if any required input column is missing from ``df``.
    """
    base_columns = [
        "seconds_in_bucket",
        "imbalance_buy_sell_flag",
        "imbalance_size",
        "matched_size",
        "bid_size",
        "ask_size",
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]
    prices = [
        "reference_price",
        "far_price",
        "near_price",
        "ask_price",
        "bid_price",
        "wap",
    ]

    # Copy only the columns that are needed: the caller's frame stays
    # untouched, so calling this function twice on the same input is safe.
    work = df[base_columns].copy()
    features = list(base_columns)

    work["imb_s1"] = work.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    work["imb_s2"] = work.eval(
        "(imbalance_size-matched_size)/(matched_size+imbalance_size)"
    )
    features += ["imb_s1", "imb_s2"]

    n_prices = len(prices)

    # Pairwise imbalances. Triangular loops (j < i) visit each unordered pair
    # exactly once, with the later-listed price first in the column name.
    for i in range(n_prices):
        for j in range(i):
            a, b = prices[i], prices[j]
            name = f"{a}_{b}_imb"
            work[name] = work.eval(f"({a}-{b})/({a}+{b})")
            features.append(name)

    # Triplet imbalances over every unordered triple (k < j < i).
    for i in range(n_prices):
        for j in range(i):
            for k in range(j):
                a, b, c = prices[i], prices[j], prices[k]
                triple = work[[a, b, c]]
                max_ = triple.max(axis=1)
                min_ = triple.min(axis=1)
                # The middle value is whatever remains of the row sum.
                mid_ = triple.sum(axis=1) - min_ - max_

                name = f"{a}_{b}_{c}_imb2"
                # When mid_ equals min_ the denominator is zero and the
                # result is inf (or NaN when the numerator is zero as well).
                work[name] = (max_ - mid_) / (mid_ - min_)
                features.append(name)

    # Columns were appended in exactly the order recorded in `features`.
    return work[features]

In [13]:
# Read the raw competition training data from the project data folder
df_train = pd.read_csv(
    os.path.join(path_project_directory, "data/kaggle_optiver_trading_at_the_close/train.csv")
)

In [14]:
# Build the feature frame for the training data loaded above
df_ = generate_features(df_train)

In [None]:
# NOTE(review): optiver2023 looks like the competition's Kaggle-provided API
# module and is presumably only importable inside the Kaggle runtime — confirm
# before running this cell locally.
import optiver2023

# Create the competition environment and the iterator that yields
# (test, revealed_targets, sample_prediction) batches for inference
env = optiver2023.make_env()
iter_test = env.iter_test()

In [None]:
# NOTE(review): `models` is not defined in the cells visible here — confirm
# where the list of fitted models comes from before running this cell.
counter = 0
for test, revealed_targets, sample_prediction in iter_test:
    feat = generate_features(test)

    # Ensemble: element-wise mean of every model's prediction for this batch
    per_model_preds = [model.predict(feat) for model in models]
    sample_prediction["target"] = np.mean(per_model_preds, axis=0)

    env.predict(sample_prediction)
    counter += 1

In [None]:
# Load the raw competition training data. The previous line was not valid
# Python (a name directly followed by a string literal); the path is now built
# the same way as in the earlier train.csv loading cell.
df_train = pd.read_csv(
    os.path.join(
        path_project_directory, "data/kaggle_optiver_trading_at_the_close/train.csv"
    )
)


In [None]:
# Constants and Settings

# --- Run mode ---
debug = True            # when True, only a random subsample of the data is used
testing_sample = 1000   # number of rows drawn in debug mode
gpu_switch = "OFF"      # "ON" selects the LightGBM GPU device, anything else CPU
state = 42              # shared random seed

# --- Cross-validation ---
cv_mthd = "KF"  # "KF" or "PurgedKF"
n_splits = 3
n_test_split = 1
n_repeats = 1
embargo_td = 100

# --- Model ---
model_mthd = "LGBMR"
nbrnd_erly_stp = 1000   # patience (in boosting rounds) for early stopping

# Data Loading
X = pd.read_parquet(
    os.path.join(
        path_project_directory,
        "data/kaggle_optiver_trading_at_the_close/XTrIntCmpNewFtre.parquet",
    )
)
if debug:
    # Seed the subsample so that debug runs select the same rows after a
    # kernel restart and results stay comparable between runs.
    X = X.sample(n=testing_sample, random_state=state)

# Align the target with whichever rows of X were kept
y = (
    pd.read_parquet(
        os.path.join(
            path_project_directory,
            "data/kaggle_optiver_trading_at_the_close/Ytrain.parquet",
        )
    )
    .loc[X.index]
    .squeeze()
)

# Logging Data Shapes
print(f"X: {X.shape}, y: {y.shape[0]}")

# Cross-Validation Setup
# Only "KF" is registered here; any other value of cv_mthd raises KeyError.
all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=state)}
cv = all_cv[cv_mthd]

# Model Setup
# `verbose_eval` was dropped from the constructor: it is not an LGBMRegressor
# or LightGBM parameter, so it was only forwarded to the booster as an unknown
# parameter. Evaluation logging is controlled by the log_evaluation callback
# passed to fit().
# NOTE(review): LightGBM applies row subsampling only when subsample_freq > 0;
# it is left at its default here, so `subsample=0.65` likely has no effect —
# confirm whether bagging was intended.
# NOTE(review): max_depth=6 caps a tree at 2**6 = 64 leaves, so num_leaves=150
# can never be reached — confirm which limit is meant to bind.
dict_models = {
    "LGBMR": LGBMR(
        device="gpu" if gpu_switch == "ON" else "cpu",
        objective="regression_l1",
        boosting_type="gbdt",
        random_state=state,
        colsample_bytree=0.7,
        subsample=0.65,
        learning_rate=0.065,
        max_depth=6,
        n_estimators=500,
        verbose=-1,
        num_leaves=150,
        reg_alpha=0.01,
        reg_lambda=3.25,
    )
}

model = dict_models[model_mthd]

In [9]:
def objective(trial, X, y):
    """Optuna objective: mean cross-validated MAE of the shared LightGBM model.

    Samples ``n_estimators`` and ``learning_rate`` from ``trial``, applies them
    to the module-level ``model``, fits it once per fold of the module-level
    ``cv`` splitter and averages the best validation L1 score of each fold.
    The sampled parameters, the averaged metric and the model as fitted on the
    last fold are logged to a fresh MLflow run; progress goes to loguru.

    Relies on the module-level names ``model``, ``cv`` and ``nbrnd_erly_stp``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature DataFrame; rows are selected positionally via ``.iloc``.
        y: Target Series aligned with ``X``.

    Returns:
        The average MAE over all folds, or ``float("inf")`` if anything
        raised while running the trial (the error is logged, not re-raised).
    """
    try:
        with mlflow.start_run():
            mae_list = []
            n_estimators = trial.suggest_int("n_estimators", 100, 500)
            learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1)
            model.set_params(n_estimators=n_estimators, learning_rate=learning_rate)

            # Record the sampled parameters before training so that a run
            # which fails part-way through is still identifiable in MLflow.
            mlflow.log_params(
                {"n_estimators": n_estimators, "learning_rate": learning_rate}
            )

            logger.info(
                colored("------------------------------------------------", "blue")
            )
            logger.info(
                colored(
                    f"Trial {trial.number:<4} | n_estimators: {n_estimators:<4} | learning_rate: {learning_rate:<10}",
                    "green",
                )
            )

            logger.info(f"{'Fold':<5} {'|':<2} {'MAE':<20}")
            logger.info(f"{'-----':<5} {'|':<2} {'--------------------':<20}")

            for fold_n, (train_idx, val_idx) in enumerate(cv.split(X, y)):
                X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                model.fit(
                    X_train,
                    y_train,
                    eval_set=[(X_val, y_val)],
                    eval_metric="mae",
                    callbacks=[
                        log_evaluation(0),
                        early_stopping(nbrnd_erly_stp, verbose=False),
                    ],
                )

                # Best L1 (MAE) score on this fold's validation set
                fold_mae = model.best_score_["valid_0"]["l1"]
                mae_list.append(fold_mae)
                logger.info(f"{fold_n + 1:<5} {'|':<2} {fold_mae:<20}")

            avg_mae = sum(mae_list) / len(mae_list)
            logger.warning(colored(f"Average MAE: {avg_mae}", "yellow"))
            mlflow.log_metric("mae", avg_mae)
            mlflow.sklearn.log_model(model, "model")
            return avg_mae

    except Exception as e:
        # Best-effort by design: a failing trial must not abort the study.
        # logger.exception keeps the traceback, which the plain error message
        # used to discard.
        logger.exception(f"An exception occurred: {e}")
        return float("inf")


# Silence Optuna's experimental-feature warnings
warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)


def _run_trial(trial):
    """Evaluate one Optuna trial on the module-level X and y."""
    return objective(trial, X, y)


# Create the study, or resume it when it already exists in the SQLite storage.
# NOTE(review): the storage URL is relative to the current working directory,
# unlike the other paths in this notebook — confirm this is intended.
study = optuna.create_study(
    study_name="Your Study Name",
    storage="sqlite:///data/optuna.db",
    direction="minimize",
    load_if_exists=True,
)
study.optimize(_run_trial, n_trials=3)

[32m2023-10-21 11:19:32.267[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m9[0m - [1m[34m------------------------------------------------[0m[0m
[32m2023-10-21 11:19:32.268[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m12[0m - [1m[32mTrial 78   | n_estimators: 150  | learning_rate: 0.010518498605383094[0m[0m
[32m2023-10-21 11:19:32.269[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m19[0m - [1mFold  |  MAE                 [0m
[32m2023-10-21 11:19:32.269[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m20[0m - [1m----- |  --------------------[0m
[32m2023-10-21 11:19:32.343[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m39[0m - [1m1     |  6.346515089764429   [0m
[32m2023-10-21 11:19:32.409[0m | [1mINFO    [0m | [36m__main__[0m:[36mobjective[0m:[36m39[0m - [1m2     |  6.544533798190073   [0m
[32m2023-10-21 11:19:32.475[0m | [1mINFO    [0m | [36m__main_