In [9]:
# Built-in modules
import os
import warnings
from tqdm import tqdm 

# External modules
import pandas as pd
from dotenv import load_dotenv

# Custom modules
from src.utils.utils_kaggle import get_data
from src.utils.utils_general import get_project_directory, load_config
from src.experiments.mlflow_optuna_init import initialize_mlflow, initialize_optuna
from src.validation.cv_setup import initialize_cv_method
from src.experiments.optuna_objective import objective

# Suppress warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# Auto-reload modules
%load_ext autoreload
%autoreload 2

load_dotenv()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


True

In [2]:
# Resolve the project directory once; later cells build the config, data and
# experiment-storage paths from this value.
path_project_directory = get_project_directory()

In [3]:
# Load the training configuration from the project's YAML file.
config_path = os.path.join(path_project_directory, "config/train_config.yaml")
config = load_config(config_path)

# Define paths (all resolved relative to the project directory).
dataset_path = os.path.join(
    path_project_directory, "data/processed/synthetic_ticker_data.csv"
)
path_experiments_storage = os.path.join(
    path_project_directory, "data/experiments_storage"
)

# Kaggle dataset parameters
# The credentials file location can be overridden with the KAGGLE_JSON_PATH
# environment variable (e.g. from the .env file loaded in the first cell);
# the previous hard-coded location is kept as the fallback.
kaggle_json_path = os.environ.get(
    "KAGGLE_JSON_PATH", "/home/sam/github/kaggle.json"
)
# Download target lives inside the project's data folder, the same folder the
# data-loading cell reads the parquet files from, instead of an absolute
# machine-specific path.
dest_folder = os.path.join(
    path_project_directory, "data/kaggle_optiver_trading_at_the_close"
)
dataset_name = "ravi20076/optiver-memoryreduceddatasets"
specific_file = "XTrIntCmpNewFtre.parquet"

# Download data from Kaggle
get_data(
    kaggle_json_path,
    dest_folder,
    dataset_name=dataset_name,
    specific_file=specific_file,
)

# Initialize MLFlow and Optuna; both receive the experiment storage path and
# the loaded config. The Optuna study object is kept for later use.
initialize_mlflow(path_experiments_storage, config)
study = initialize_optuna(path_experiments_storage, config)

2023/10/21 08:50:04 INFO mlflow.tracking.fluent: Experiment with name 'Default_Experiment' does not exist. Creating a new experiment.


Kaggle package is already installed.
Directory /home/sam/github/mlops-hub/data/kaggle_optiver_trading_at_the_close already exists.
File XTrIntCmpNewFtre.parquet already exists. Skipping download.


[I 2023-10-21 08:50:04,746] A new study created in RDB with name: Default_Study_Name


In [5]:
# Debug switch: when enabled, the data-loading cell keeps only a random sample
# of the feature rows instead of the full table.
debug = True

# Number of rows drawn for that sample.
testing_sample = 1_000

In [8]:
%%time 
if debug:
    X = pd.read_parquet(path_project_directory + 'data/kaggle_optiver_trading_at_the_close/XTrIntCmpNewFtre.parquet').sample(n = testing_sample)
else:
    X = pd.read_parquet(path_project_directory + 'data/kaggle_optiver_trading_at_the_close/XTrIntCmpNewFtre.parquet')
    
y = pd.read_parquet(path_project_directory + "data/kaggle_optiver_trading_at_the_close/Ytrain.parquet").loc[X.index].squeeze()
X.index, y.index = range(len(X)), range(len(y))
print(f"X:{X.shape}, y:{y.shape[0]}")

X:(1000, 52), y:1000
CPU times: user 3.02 s, sys: 2.33 s, total: 5.35 s
Wall time: 1.74 s


In [21]:
# Cross-validation layout: total folds, test folds per split, and embargo size
# (the latter two are passed to CombPurgedKFoldCV).
n_splits, n_test_splits, embargo_td = 3, 1, 100

# Seed handed to the shuffling KFold splitter as random_state.
state = 42

# Splitter name ("KF" or "PurgedKF") and repeat count shown in the progress label.
cv_mthd, n_repeats = "KF", 1

In [22]:
from src.validation.model_validation import CombPurgedKFoldCV
from sklearn.model_selection import (
    KFold,
)

In [23]:
# Registry of cross-validation splitters keyed by method name.
# NOTE(review): the cv_mthd comment also mentions "PurgedKF", but only "KF" is
# registered here; confirm whether a purged entry is meant to be added.
all_cv = dict(
    KF=KFold(
        n_splits=n_splits,
        shuffle=True,
        random_state=state,
    ),
)

In [None]:
# Pick the splitter named by cv_mthd from the registry defined above. Without
# this, the loop relied on a `cv` variable that is only bound in a later cell,
# which fails with NameError on a fresh kernel (Restart & Run All).
cv = all_cv[cv_mthd]

# Iterate over the folds with a progress bar labelled by method and layout.
for fold_nb, (train_idx, dev_idx) in tqdm(
    enumerate(cv.split(X, y)), desc=f"{cv_mthd} CV {n_splits}x{n_repeats}"
):
    # Creating the cv folds: positional selection of train/dev rows for
    # features and target.
    Xtr = X.iloc[train_idx]
    Xdev = X.iloc[dev_idx]
    ytr = y.iloc[train_idx]
    ydev = y.iloc[dev_idx]

    print(f"Fold {fold_nb} - Train: {Xtr.shape}, Dev: {Xdev.shape}")

In [16]:
# Instantiate the CombPurgedKFoldCV splitter from the fold count, test-fold
# count and embargo value set in the CV parameter cell.
# NOTE(review): this binds `cv` after the fold loop cell that consumes it;
# confirm the intended cell order.
cv = CombPurgedKFoldCV(
    n_splits=n_splits,
    n_test_splits=n_test_splits,
    embargo_td=embargo_td,
)

In [17]:
# Calling split() only creates a generator object (see the cell output); no
# train/dev indices are produced until it is iterated.
cv.split(X, y)

<generator object CombPurgedKFoldCV.split at 0x7f33a8512650>