In [1]:
    # Constants
TRAIN = False
OVERWRITE = False
DEBUG = False
DEBUG_SAMPLE = 10000

N_TRIALS = 10

VERSION_NB = 1 

state = 42

download_kaggle_data = False

# External general-purpose modules
import os
import shutil
import warnings
from datetime import datetime
import glob

import numpy as np
import pandas as pd
import polars as pl
from dotenv import load_dotenv
from joblib import dump
import joblib
import os

# Setting pandas options and warning filters
pd.set_option("display.max_columns", None)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Load environment variables
load_dotenv()

False

In [2]:
# Setting up the project directory path
path_project_dir = os.getcwd()
if path_project_dir not in ["/kaggle/working", "/content"]:
    path_project_dir = os.getenv("ROOT_PATH")

print(path_project_dir)

# Imports and setup for training
if TRAIN:
    # Install packages and import logging libraries
    if path_project_dir == '/kaggle/working':
        !pip install loguru mlflow optuna > /dev/null
        from utils_1 import get_data, log_training_details, clean_directory_except_one

    from utils_1 import log_feature_importance
    
    from loguru import logger
    import mlflow
    import optuna
    from optuna.integration.mlflow import MLflowCallback
    from mlflow.tracking import MlflowClient
    import zipfile
    
    from tqdm import tqdm

    # Import machine learning libraries
    from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR
    from sklearn.model_selection import KFold
    from xgboost import XGBRegressor as XGBR

    # Set logging
    logger.add("logs.log", format="{time:YYYY-MM-DD HH:mm} | {level} | {message}")
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    warnings.filterwarnings("ignore", category=optuna.exceptions.ExperimentalWarning)
    
    # Auto-reload modules
    %load_ext autoreload
    %autoreload 2

    # Initialize MLflow callback
    mlflow_callback = MLflowCallback(
        tracking_uri=mlflow.get_tracking_uri(), metric_name="mae"
    )

/kaggle/working


In [3]:
if path_project_dir == "/kaggle/working":
    path_data_project_dir = "/kaggle/input/optiver-trading-at-the-close"
    path_experiments_storage = os.path.join(path_project_dir, "experiments_storage")

    path_dataset_train_raw = "/kaggle/input/optiver-trading-at-the-close/train.csv"
    path_dataset_test_raw = (
        "/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv"
    )

    path_dataset_processed = "/kaggle/working/processed_data"
    path_dataset_train = os.path.join(path_dataset_processed, "train.csv")
    path_dataset_test = os.path.join(path_dataset_processed, "test.csv")

else:
    name_folder_data_project = "kaggle_optiver_trading_at_the_close"

    path_data_dir = os.path.join(path_project_dir, "data")
    path_dataset_train_raw = os.path.join(
        path_data_dir, "kaggle_optiver_trading_at_the_close/raw", "train.csv"
    )
    path_dataset_processed = os.path.join(
        path_data_dir, "kaggle_optiver_trading_at_the_close/processed"
    )

    path_data_project_dir = os.path.join(path_data_dir, name_folder_data_project)

    path_config_dir = os.path.join(path_project_dir, "config")
    path_config_train = os.path.join(path_config_dir, "train_config.yaml")

    path_experiments_storage = os.path.join(
        path_data_project_dir, "experiments_storage"
    )

    if download_kaggle_data:
        dataset_name = "ravi20076/optiver-memoryreduceddatasets"
        kaggle_json_path = os.path.join(path_project_dir, "kaggle.json")
        get_data(
            kaggle_json_path,
            path_data_project_dir,
            dataset_name=dataset_name,
            specific_file=None,
        )

    file_name_df_train = "train.csv"
    file_name_df_test = "test.csv"

    path_dataset_train = os.path.join(path_data_project_dir, file_name_df_train)
    path_dataset_test = os.path.join(path_data_project_dir, file_name_df_test)

if TRAIN:
    mlflow.set_tracking_uri(path_experiments_storage)
    client = MlflowClient()

## Constants and Settings


In [4]:
if TRAIN:
    if not os.path.exists(path_dataset_processed):
        os.makedirs(path_dataset_processed)

    if not os.path.exists(path_dataset_train) or OVERWRITE:
        df_train = pd.read_csv(path_dataset_train_raw)
    else:
        df_train = pd.read_csv(path_dataset_train)

    if DEBUG:
        df_train = df_train[df_train["stock_id"].isin([0, 1, 2, 3, 4, 5])]

In [5]:
if TRAIN:
    # Dropping rows with null targets:-
    drop_idx = df_train.loc[df_train["target"].isna(), "target"].index.to_list()
    df_train = df_train.drop(drop_idx, axis=0)
    # df_train.drop("row_id", axis=1, inplace=True)

In [6]:

def feat_engineering(df_train):
    df_train_pol = pl.DataFrame(df_train)
    df = df_train_pol
    # 7. Handle Missing Values
    df = df.with_columns(
        [
            pl.col("far_price").fill_null(strategy="forward").alias("far_price"),
            pl.col("near_price").fill_null(strategy="forward").alias("near_price"),
        ]
    )
    # Level 1 Features
    level_one_features = [
        (pl.col("imbalance_size") / pl.col("matched_size")).alias(
            "imbalance_to_matched_size"
        ),
        (pl.col("imbalance_size") * pl.col("imbalance_buy_sell_flag")).alias(
            "imbalance_flag_to_size"
        ),
        (pl.col("ask_price") - pl.col("bid_price")).alias("spread"),
        (pl.col("bid_size") - pl.col("ask_size")).alias("bid_ask_imbalance"),
        (pl.col("bid_size") / pl.col("ask_size")).alias("liquidity"),
        (pl.col("bid_price") - pl.col("wap")).alias("price_diff_bid_to_wap"),
        (pl.col("ask_price") - pl.col("wap")).alias("price_diff_ask_to_wap"),
        (pl.col("bid_size") - pl.col("wap")).alias("size_diff_bid_to_wap"),
        (pl.col("ask_size") - pl.col("wap")).alias("size_diff_ask_to_wap"),
        (pl.col("wap") - pl.col("wap").shift(1).over(["stock_id", "date_id"])).alias(
            "wap_velocity"
        ),
        (
            pl.col("wap") / pl.col("wap").shift(5).over(["stock_id", "date_id"]) - 1
        ).alias("wap_momentum_5"),
        (
            pl.col("wap")
            .std()
            .over(["stock_id", "date_id"])
            .alias("short_term_volatility")
        ),
        (
            (
                pl.col("imbalance_size")
                / (pl.col("matched_size") + pl.col("imbalance_size"))
            ).alias("price_impact")
        ),
        (
            (pl.col("bid_size") - pl.col("ask_size"))
            / (pl.col("bid_size") + pl.col("ask_size"))
        ).alias("order_imbalance_ratio"),
        (
            (pl.col("ask_price") - pl.col("bid_price"))
            / (pl.col("ask_price") + pl.col("bid_price"))
        ).alias("price_skewness"),
        (pl.col("seconds_in_bucket") / 600).alias("time_decay"),
    ]

    # Level 2 Features
    level_two_features = [
        (
            pl.col("wap_velocity")
            - pl.col("wap_velocity").shift(1).over(["stock_id", "date_id"])
        ).alias("wap_acceleration"),
        (
            pl.col("short_term_volatility").shift(1).over(["stock_id", "date_id"])
            - pl.col("short_term_volatility")
        ).alias("volatility_rate_of_change"),
        (
            (
                pl.col("liquidity")
                - pl.col("liquidity").shift(1).over(["stock_id", "date_id"])
            )
            / pl.col("liquidity").shift(1).over(["stock_id", "date_id"])
        ).alias("liquidity_ratio_change"),
        (
            (
                pl.col("order_imbalance_ratio")
                - pl.col("order_imbalance_ratio").shift(1).over(["stock_id", "date_id"])
            )
            / pl.col("order_imbalance_ratio").shift(1).over(["stock_id", "date_id"])
        ).alias("order_imbalance_over_time"),
        (
            (
                pl.col("price_skewness")
                - pl.col("price_skewness").shift(1).over(["stock_id", "date_id"])
            )
            / pl.col("price_skewness").shift(1).over(["stock_id", "date_id"])
        ).alias("price_skewness_rate_of_change"),
    ]

    # Level 3 Features
    level_three_aggregations = [
        pl.col("wap").mean().alias("avg_wap_by_market"),
        pl.col("near_price").mean().alias("avg_near_price_by_market"),
        pl.col("matched_size").mean().alias("avg_matched_size_by_market"),
        pl.col("imbalance_to_matched_size")
        .mean()
        .alias("avg_imbalance_to_matched_size_by_market"),
        pl.col("spread").mean().alias("avg_spread_by_market"),
        pl.col("liquidity").mean().alias("avg_liquidity_by_market"),
        pl.col("short_term_volatility").mean().alias("avg_market_volatility"),
        pl.col("order_imbalance_ratio").mean().alias("avg_market_imbalance"),
        pl.col("liquidity").mean().alias("avg_market_liquidity"),
        pl.col("price_impact").mean().alias("avg_market_price_impact"),
        pl.col("price_skewness").mean().alias("avg_market_price_skewness"),
    ]

    # Adding all features and performing join operation
    df = df.with_columns(level_one_features)
    df = df.with_columns(level_two_features)
    group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
        *level_three_aggregations
    )
    df = df.join(group_by_market, on=["date_id", "seconds_in_bucket"], how="left")


    polynomial_and_interaction_features = [
        (pl.col("seconds_in_bucket") * pl.col("near_price")).alias(
            "seconds_in_bucket_X_near_price"
        ),
        (pl.col("matched_size") * pl.col("near_price")).alias(
            "matched_size_X_near_price"
        ),
        (pl.col("near_price") ** 2).alias("near_price_squared"),
        (pl.col("matched_size") ** 2).alias("matched_size_squared"),
        (pl.col("seconds_in_bucket") * pl.col("imbalance_flag_to_size")).alias(
            "seconds_in_bucket_X_imbalance_flag_to_size"
        ),
        (pl.col("seconds_in_bucket") ** 2).alias("seconds_in_bucket_squared"),
        (pl.col("imbalance_flag_to_size") ** 2).alias("imbalance_flag_to_size_squared"),
    ]

    # Relative to Market Features
    relative_to_market_features = [
        (pl.col("wap") / pl.col("avg_wap_by_market")).alias("relative_wap_to_market"),
        (pl.col("near_price") / pl.col("avg_near_price_by_market")).alias(
            "relative_near_price_to_market"
        ),
        (pl.col("matched_size") / pl.col("avg_matched_size_by_market")).alias(
            "relative_matched_size_to_market"
        ),
        (
            pl.col("imbalance_to_matched_size")
            / pl.col("avg_imbalance_to_matched_size_by_market")
        ).alias("relative_imbalance_to_matched_size_to_market"),
        (pl.col("spread") / pl.col("avg_spread_by_market")).alias(
            "relative_spread_to_market"
        ),
        (pl.col("liquidity") / pl.col("avg_liquidity_by_market")).alias(
            "relative_liquidity_to_market"
        ),
    ]

    # Combine all Level 4 features and add them to the DataFrame
    all_level_four_features = (
        polynomial_and_interaction_features + relative_to_market_features
    )
    df = df.with_columns(all_level_four_features)

    for window in [5, 10]:
        rolling_group = df.group_by_rolling(
            index_column="seconds_in_bucket",
            period=f"{window}i",  # 'i' denotes index count (integer)
            by=["stock_id", "date_id"],
            closed="left",  # Adjust as needed
        )

        # Apply to basic and new features
        for col in [
            "wap",
            "imbalance_size",
            "bid_price",
            "ask_price",
            "relative_wap_to_market",
            "wap_momentum_5",
        ]:
            df = df.join(
                rolling_group.agg(pl.col(col).mean().alias(f"{col}_mean_{window}")),
                on=["stock_id", "date_id", "seconds_in_bucket"],
                how="left",
            )

    low_importance_cols = [
        "wap_mean_5",
        "imbalance_buy_sell_flag",
        "imbalance_flag_to_size_squared",
        # Add more columns as needed
    ]

    existing_cols = df.columns

    # Drop columns only if they exist in DataFrame
    cols_to_drop = [col for col in low_importance_cols if col in existing_cols]

    if cols_to_drop:
        engineered_df = df.drop(cols_to_drop)
    else:
        engineered_df = df.to_pandas()

    engineered_df = engineered_df.to_pandas()

    return engineered_df

In [7]:
if TRAIN:
    engineered_df = feat_engineering(df_train)

In [8]:
if TRAIN:
    list_cols_drop = ["stock_id", "date_id", "row_id", "time_id", "target"]

    y_train = engineered_df["target"]

    X_train = engineered_df.drop(list_cols_drop, axis=1).copy()

    y_train = y_train.loc[X_train.index].squeeze()

In [9]:
gpu_switch = "OFF"
n_splits = 5
n_test_split = 1
embargo_td = 100

n_repeats = 1
nbrnd_erly_stp = 130

cv_mthd = "KF"

# Cross-Validation Setup
if TRAIN:
    all_cv = {"KF": KFold(n_splits=n_splits, shuffle=True, random_state=state)}
    cv = all_cv[cv_mthd]

    model_params_dict = {
        "LGBMR": {
            "static_params": {
                "device": "gpu" if gpu_switch == "ON" else "cpu",
                "objective": "regression_l1",
                "boosting_type": "gbdt",
                "random_state": state,
                "verbose": -1,
                "verbose_eval": False,
            },
            "dynamic_params": {
                "n_estimators": {
                    "type": "int",
                    "low": 620,
                    "high": 730,
                },
                "learning_rate": {
                    "type": "float",
                    "low": 0.05,
                    "high": 0.07,
                },
                "max_depth": {"type": "int", "low": 14, "high": 19},
                "num_leaves": {
                    "type": "int",
                    "low": 65,
                    "high": 85,
                },
                "min_child_samples": {
                    "type": "int",
                    "low": 71,
                    "high": 74,
                },
                "subsample": {
                    "type": "float",
                    "low": 0.63,
                    "high": 0.71,
                },
                "colsample_bytree": {
                    "type": "float",
                    "low": 0.74,
                    "high": 0.77,
                },
                "min_split_gain": {
                    "type": "float",
                    "low": 0.08,
                    "high": 0.11,
                },
                "reg_alpha": {
                    "type": "float",
                    "low": 0.7,
                    "high": 1.3,
                },
                "reg_lambda": {
                    "type": "float",
                    "low": 1.9,
                    "high": 3.0,
                },
            },
        },
    }

    dict_models = {"LGBMR": LGBMR}

    log_model = True

    experiment_date_str = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_purpose = "optiver_trading_at_the_close"
    experiment_name = f"{experiment_purpose}_{experiment_date_str}"

    mlflow.set_experiment(experiment_name)


def create_model(trial, model_class, static_params, dynamic_params):
    dynamic_params_values = {}
    for param_name, suggestions in dynamic_params.items():
        suggestion_type = suggestions["type"]
        if suggestion_type == "int":
            dynamic_params_values[param_name] = trial.suggest_int(
                param_name, suggestions["low"], suggestions["high"]
            )
        elif suggestion_type == "float":
            dynamic_params_values[param_name] = trial.suggest_float(
                param_name, suggestions["low"], suggestions["high"]
            )
        elif suggestion_type == "categorical":
            dynamic_params_values[param_name] = trial.suggest_categorical(
                param_name, suggestions["choices"]
            )
        elif suggestion_type == "discrete_uniform":
            dynamic_params_values[param_name] = trial.suggest_discrete_uniform(
                param_name, suggestions["low"], suggestions["high"], suggestions["q"]
            )
        elif suggestion_type == "loguniform":
            dynamic_params_values[param_name] = trial.suggest_loguniform(
                param_name, suggestions["low"], suggestions["high"]
            )
        else:
            raise ValueError(f"Unsupported suggestion type: {suggestion_type}")

    model_params = {**static_params, **dynamic_params_values}
    return model_class(**model_params)


def objective(trial, X, y):
    try:
        with mlflow.start_run() as run:
            mlflow.log_param("cv_mthd", cv_mthd)
            mlflow.set_tag("experiment_purpose", experiment_purpose)
            mlflow.set_tag("experiment_name", experiment_name)
            mlflow.set_tag("version_nb",VERSION_NB)
            for model_name, model_class in dict_models.items():
                model = create_model(
                    trial,
                    dict_models[model_name],
                    model_params_dict[model_name]["static_params"],
                    model_params_dict[model_name]["dynamic_params"],
                )
                mae_list = []

                log_training_details(logger, model, trial, model_name)

                for fold_n, (train_idx, val_idx) in enumerate(cv.split(X, y)):
                    with mlflow.start_run(
                        run_name=f"fold_{fold_n+1}", nested=True
                    ) as nested_run:
                        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

                        mlflow.log_param("training_data_rows", X_train.shape[0])
                        mlflow.log_param("training_data_columns", X_train.shape[1])

                        model.fit(
                            X_train,
                            y_train,
                            eval_set=[(X_val, y_val)],
                            eval_metric="mae",
                            early_stopping_rounds=nbrnd_erly_stp,
                            callbacks=[
                                log_evaluation(0),
                                early_stopping(nbrnd_erly_stp, verbose=False),
                            ],
                        )

                        log_feature_importance(model, X, fold_n, experiment_purpose, experiment_date_str)

                        fold_mae = model.best_score_["valid_0"]["l1"]
                        mae_list.append(fold_mae)
                        logger.info(f"{fold_n + 1:<5} {'|':<2} {fold_mae:<20}")

                        mlflow.log_metric("mae", fold_mae)
                        mlflow.log_param("fold_number", fold_n + 1)
                        mlflow.log_param("model_name", model_name)
                        mlflow.log_param("log_model", log_model)

                        params_to_log = model.get_params()
                        mlflow.log_params(params_to_log)

                        if log_model:
                            current_time_str = datetime.now().strftime("%Y%m%d_%H%M%S")
                            model_log_name = (
                                f"{model_name}_{trial.number}_{current_time_str}"
                            )

                            mlflow.log_param("model_log_name", model_log_name)

                            mlflow.sklearn.log_model(model, model_log_name)

                            mlflow.log_param("run_time", current_time_str)

                        nested_run_id = nested_run.info.run_id
                        model_path = f"{path_experiments_storage}/{run.info.experiment_id}/{nested_run_id}/artifacts/{model_log_name}/model.pkl"
                        mlflow.log_param("model_path", model_path)
                avg_mae = sum(mae_list) / len(mae_list)

                mlflow.log_param("model_name", model_name)

                return avg_mae

    except Exception as e:
        logger.error(f"An exception occurred: {e}")
        return float("inf")

In [10]:
# Run the Optuna study
if TRAIN:
    study = optuna.create_study(
        direction="minimize",
        study_name="Your Study Name",
        load_if_exists=True,
    )
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=N_TRIALS)

In [11]:
def save_and_version_df(df_new):
    # Read existing df_runs CSV files and concatenate them with the new df_runs
    existing_files = [
        f for f in os.listdir() if f.startswith("df_runs_") and f.endswith(".csv")
    ]
    dfs = [pd.read_csv(f) for f in existing_files]

    for old_file, old_df in zip(existing_files, dfs):
        print(f"Removed old file: {old_file}, Shape: {old_df.shape}")
        os.remove(old_file)

    dfs.append(df_new)
    df_concatenated = pd.concat(dfs, ignore_index=True)

    # Remove duplicates
    df_unique = df_concatenated.drop_duplicates()

    # Save the new concatenated and deduplicated df_runs to a new versioned CSV file
    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
    csv_filename = f"df_runs_{timestamp}.csv"
    df_unique.to_csv(csv_filename, index=False)

    print(f"DataFrame saved to {csv_filename}, Shape: {df_unique.shape}")

    return df_unique

In [12]:
def gather_runs_data():
    experiments = client.search_experiments()
    all_runs_data = []
    for exp in experiments:
        experiment_id = exp.experiment_id
        run_infos = client.search_runs(experiment_ids=[experiment_id])

        for run_info in run_infos:
            run_data = {
                "experiment_id": experiment_id,
                "experiment_name": exp.name,
                "run_id": run_info.info.run_id,
            }

            # Add metrics to run_data
            for key, value in run_info.data.metrics.items():
                run_data[f"{key}"] = value

            # Add params to run_data
            for key, value in run_info.data.params.items():
                run_data[f"{key}"] = value

            all_runs_data.append(run_data)

    df_runs_new = pd.DataFrame(all_runs_data)


    df_runs_unique = save_and_version_df(df_runs_new)

    df_runs_unique = df_runs_unique[~df_runs_unique["model_name"].isna()]

    return df_runs_unique

In [13]:
cols_id = ["experiment_id", "run_id"]
cols_info = ["training_data_rows", "training_data_columns"]
cols_metrics = ["mae"]
cols_param_exp = ["date_exp", "log_model", "cv_mthd", "fold_number"]
cols_others_info = [
    "experiment_name",
    "model_path",
    "device",
    "n_jobs",
    "importance_type",
    "random_state",
    "model_name",
    "subsample_freq",
    "verbose_eval",
    "class_weight",
    "model_log_name",
    "verbose",
    "silent",
]

In [14]:
if TRAIN:
    df_runs = gather_runs_data()
    df_runs = df_runs.drop_duplicates()
    cols_params = df_runs.columns.tolist()

    for col in cols_param_exp + cols_metrics + cols_info + cols_id + cols_others_info:
        cols_params.remove(col)

    new_col_order = (
        cols_param_exp
        + cols_metrics
        + cols_info
        + cols_id
        + cols_params
        + cols_others_info
    )

    df_runs = df_runs[~df_runs["fold_number"].isna()]
    df_runs = df_runs[new_col_order]

In [15]:
#df_runs[["date_exp"] + cols_info + cols_metrics + cols_params].drop_duplicates()

In [16]:
def aggregate_feature_importance(path_to_csvs):
    """
    Aggregates feature importances from multiple CSV files and calculates mean importance.
    """
    all_files = glob.glob(f"{path_to_csvs}/*.csv")
    list_of_dfs = [pd.read_csv(filename) for filename in all_files]

    if not list_of_dfs:
        print("No CSV files found.")
        return None

    # Concatenate all Jdataframes
    aggregated_df = pd.concat(list_of_dfs, ignore_index=True)

    # Calculate mean importance for each feature
    mean_importance = (
        aggregated_df.groupby("Feature")["Importance"].mean().reset_index()
    )
    mean_importance = mean_importance.sort_values(by="Importance", ascending=False)

    return mean_importance


def analyze_feature_importance(aggregated_df, top_n=100):
    """
    Analyzes aggregated feature importances.
    """
    if aggregated_df is None:
        print("No aggregated DataFrame provided.")
        return None

    top_n_features = aggregated_df.nlargest(top_n, "Importance")

    return top_n_features


def count_top_n_features(path_to_csvs, top_n=100):
    """
    Counts how many times each feature appears in the top N most important features across multiple experiments.

    Parameters:
    path_to_csvs (str): Path to the folder containing feature importance CSV files.
    top_n (int): The number of top features to consider.

    Returns:
    DataFrame: A DataFrame showing the count of appearances in the top N features for each feature.
    """

    all_files = glob.glob(f"{path_to_csvs}/*.csv")
    list_of_dfs = [pd.read_csv(filename) for filename in all_files]

    if not list_of_dfs:
        print("No CSV files found.")
        return None

    feature_count = {}

    # Count how many times each feature appears in the top N features
    for df in list_of_dfs:
        top_features = df.nlargest(top_n, "Importance")["Feature"].tolist()
        for feature in top_features:
            if feature in feature_count:
                feature_count[feature] += 1
            else:
                feature_count[feature] = 1

    # Convert to DataFrame for easier analysis
    feature_count_df = pd.DataFrame(
        list(feature_count.items()), columns=["Feature", "Count"]
    )
    feature_count_df = feature_count_df.sort_values(by="Count", ascending=False)

    return feature_count_df

In [17]:
# Example usageJ
if TRAIN:
    path_to_csvs = "/kaggle/working/"
    aggregated_df = aggregate_feature_importance(path_to_csvs)
    top_n_features = analyze_feature_importance(aggregated_df)
    list_features = [
        col for col in list(top_n_features["Feature"]) if col in engineered_df.columns
    ]

In [18]:
#top_n_features[top_n_features["Feature"].isin(list_features)]

In [19]:
def ensemble_predict(model_paths, X_test):
    models = []
    predictions = []

    # Load models based on full artifact paths
    for model_path in model_paths:
        try:
            # If using direct path to pkl
            if model_path.endswith(".pkl"):
                model = joblib.load(model_path)
            else:
                print(f"Unsupported model format for {model_path}. Skipping.")
                continue  # Skip this iteration

            models.append(model)
        except Exception as e:
            print(f"Failed to load model at {model_path}. Error: {e}")

    # Make predictions
    for model in models:
        try:
            pred = model.predict(X_test)
            predictions.append(pred)
        except Exception as e:
            print(f"Failed to make prediction with model. Error: {e}")

    # Average predictions
    if len(predictions) > 0:
        ensemble_pred = np.mean(predictions, axis=0)
    else:
        print("No valid models loaded. Cannot make ensemble predictions.")
        ensemble_pred = None

    return ensemble_pred

In [20]:
#df_runs = df_runs[df_runs['date_exp'] == 134604.0]

In [21]:

models_dir = "models_0"

if TRAIN:
    

    model_paths = list(
        df_runs[~df_runs["model_path"].isna()].sort_values("mae")["model_path"]
    )[:5]

    
    if not os.path.exists(models_dir):
        os.makedirs(models_dir)

    for model_path in model_paths:
        print(f"Checking if model path exists: {model_path}")

        if not os.path.exists(model_path):
            print(f"File does not exist: {model_path}")
            continue  # Skip to the next iteration

        specific_part = model_path.split("/")[-2]
        dest_path = os.path.join(models_dir, f"{specific_part}.pkl")
        if not os.path.exists(dest_path):
            print(f"Copying from {model_path} to {dest_path}")
            shutil.copy(model_path, dest_path)
        else:
            print(f"File {dest_path} already exists. Skipping copy.")

    zipf = zipfile.ZipFile(f"/kaggle/working/{models_dir}.zip", "w", zipfile.ZIP_DEFLATED)

    # Navigate through the folder and add each file to the ZIP
    for root, dirs, files in os.walk(f"/kaggle/working/{models_dir}"):
        for file in files:
            zipf.write(
                os.path.join(root, file),
                os.path.relpath(os.path.join(root, file), f"/kaggle/working/{models_dir}"),
            )


    zipf.close()


In [22]:
model_paths = []
# Directory to search for files
directory = f"/kaggle/input/{models_dir}"

# Check if the directory exists
if os.path.exists(directory):
    # Traverse the directory and collect file paths
    for filename in os.listdir(directory):
        full_path = os.path.join(directory, filename)

        # Check if the item is a file (and not a sub-directory)
        if os.path.isfile(full_path):
            model_paths.append(full_path)
else:
    print(f"The directory {directory} does not exist.")

# Print or return the list of file paths
print("List of file paths:", model_paths)

The directory /kaggle/input/models_0 does not exist.
List of file paths: []


In [23]:
# Assuming X_test for predict
# ensemble_predictions = ensemble_predict(model_paths, df_test, mlflow_client)

In [24]:
import optiver2023

env = optiver2023.make_env()
iter_test = env.iter_test()

In [25]:
counter = 0
for test, revealed_targets, sample_prediction in iter_test:
    #df_test_raw = pl.DataFrame(test)

    feat = feat_engineering(test)
    #feat = df_test.to_pandas()

    list_cols_drop = ["stock_id", "date_id", "row_id"]
    feat = feat.drop(list_cols_drop, axis=1)

    sample_prediction["target"] = ensemble_predict(model_paths, feat)
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(
  group_by_market = df.groupby(["date_id", "seconds_in_bucket"]).agg(


No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.
No valid models loaded. Cannot make ensemble predictions.


In [26]:
#clean_directory_except_one('/kaggle/working/', 'submission.csv')