In [None]:
"""
This script builds a final, advanced stacked ensemble model for the GoQuant competition.

This version uses a non-linear meta-model to intelligently combine predictions
and includes a full suite of robust features.
"""

import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import pearsonr

# Input / output locations on the local machine.
ETH_TRAIN_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\train\ETH.csv"
ETH_TEST_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\test\ETH.csv"
BTC_TRAIN_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\train\BTC.csv"
SUB_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission.csv"
OUT_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission_final.csv"

# Keyword arguments for the LightGBM base learner (lgb.LGBMRegressor).
LGBM_PARAMS = dict(
    objective='regression_l2',
    learning_rate=0.02,
    num_leaves=64,
    min_child_samples=100,
    colsample_bytree=0.8,
    subsample=0.8,
    n_estimators=10_000,
    reg_lambda=1.0,
    random_state=42,
    n_jobs=-1,
)

# Keyword arguments for the CatBoost base learner (CatBoostRegressor).
CATBOOST_PARAMS = dict(
    iterations=10000,
    learning_rate=0.02,
    depth=8,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=42,
    verbose=0,
    early_stopping_rounds=400,
)

# Keyword arguments for the LightGBM meta-model that combines the two
# base-model predictions.
META_MODEL_PARAMS = dict(
    objective='regression_l2',
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=16,
    random_state=42,
    n_jobs=-1,
)


def create_features(df: pd.DataFrame, is_primary_asset=True):
    """Return a copy of ``df`` with engineered order-book features appended.

    Every call adds ``wap``, ``log_return`` and ``realized_vol_100``.
    When ``is_primary_asset`` is true the copy additionally receives the
    time-of-day columns (``timestamp_dt``, ``hour``, ``hour_sin``,
    ``hour_cos``) and the liquidity / momentum columns (``bid_ask_spread``,
    ``total_volume``, ``full_obi``, ``wap_diff_10``).

    The input frame is not modified. ``timestamp`` is read as Unix seconds
    and is only needed when ``is_primary_asset`` is true.
    """
    out = df.copy()

    if is_primary_asset:
        stamps = pd.to_datetime(out['timestamp'], unit='s')
        out['timestamp_dt'] = stamps
        out['hour'] = stamps.dt.hour
        # Cyclical encoding so that hour 23 sits next to hour 0.
        angle = 2 * np.pi * out['hour'] / 24.0
        out['hour_sin'] = np.sin(angle)
        out['hour_cos'] = np.cos(angle)

    bid_px, ask_px = out['bid_price1'], out['ask_price1']
    bid_sz, ask_sz = out['bid_volume1'], out['ask_volume1']

    # Level-1 weighted average price: each side's price is weighted by the
    # opposite side's volume.
    out['wap'] = ((bid_px * ask_sz) + (ask_px * bid_sz)) / (bid_sz + ask_sz)
    out['log_return'] = np.log(out['wap'] / out['wap'].shift(1))

    if is_primary_asset:
        bid_depth = out[[f"bid_volume{level}" for level in range(1, 6)]].sum(axis=1)
        ask_depth = out[[f"ask_volume{level}" for level in range(1, 6)]].sum(axis=1)

        out['bid_ask_spread'] = ask_px - bid_px
        out['total_volume'] = bid_depth + ask_depth
        # Order-book imbalance across the five visible levels.
        out['full_obi'] = (bid_depth - ask_depth) / out['total_volume']
        # WAP change versus ten rows earlier.
        out['wap_diff_10'] = out['wap'].diff(10)

    # Standard deviation of log returns over a 100-row window.
    out['realized_vol_100'] = out['log_return'].rolling(window=100).std()
    return out


def load_and_prepare_data(eth_train_path, eth_test_path, btc_train_path, sub_path):
    """Load the raw CSVs and build the matrices used for modelling.

    Pipeline: parse ``timestamp`` to datetimes, drop rows whose timestamp
    cannot be parsed, average rows that share a timestamp, convert the
    timestamps to Unix seconds, engineer features, left-join the BTC
    ``log_return`` / ``realized_vol_100`` columns onto the ETH frames, add a
    100-row rolling ETH/BTC return correlation and finally fill gaps.

    Args:
        eth_train_path: CSV with the ETH training rows (must contain ``label``).
        eth_test_path: CSV with the ETH test rows.
        btc_train_path: CSV with the BTC rows used for cross-asset features.
        sub_path: CSV with the submission template.

    Returns:
        ``(X, y, X_test, submission_df)``. ``X`` and ``X_test`` share the same
        feature columns, ``y`` is the training target, and ``submission_df`` is
        the template exactly as read from disk.
    """
    # Load all raw data
    train_df = pd.read_csv(eth_train_path)
    test_df = pd.read_csv(eth_test_path)
    btc_df = pd.read_csv(btc_train_path)
    submission_df = pd.read_csv(sub_path)
    print(f"Loaded raw data: train={train_df.shape}, test={test_df.shape}, btc={btc_df.shape}")
    raw_test_rows = len(test_df)

    # The submission template is deliberately left untouched: its columns are
    # only written back out, so re-parsing them could only alter the file.
    print("Converting timestamps from text to a numerical format...")
    frames = {'train': train_df, 'test': test_df, 'btc': btc_df}
    for name, frame in frames.items():
        frame['timestamp'] = pd.to_datetime(frame['timestamp'], errors='coerce')
        # Rows whose timestamp could not be parsed (NaT) are discarded.
        frames[name] = frame.dropna(subset=['timestamp'])

    # Averaging rows that share a timestamp keeps the later merge on
    # 'timestamp' one-to-one (duplicates previously caused a MemoryError).
    print("Aggregating data to ensure unique timestamps...")
    for name, frame in frames.items():
        frame = frame.groupby('timestamp').mean().reset_index()
        # Nanosecond datetimes -> whole Unix seconds.
        frame['timestamp'] = frame['timestamp'].astype(np.int64) // 10**9
        frames[name] = frame
    train_df, test_df, btc_df = frames['train'], frames['test'], frames['btc']
    print("Timestamp conversion and de-duplication complete.")

    if len(test_df) != raw_test_rows:
        # Predictions are produced per cleaned test row, so a changed row
        # count means they no longer line up 1:1 with the raw test file.
        print(
            f"WARNING: test rows changed from {raw_test_rows} to {len(test_df)} "
            "during timestamp cleaning/de-duplication."
        )

    train_featured = create_features(train_df, is_primary_asset=True)
    test_featured = create_features(test_df, is_primary_asset=True)
    btc_featured = create_features(btc_df, is_primary_asset=False)

    # Merge cross-asset features
    # NOTE(review): the BTC frame comes from the *training* BTC file only;
    # confirm its timestamps overlap the test period, otherwise the BTC
    # columns are entirely NaN for the test rows.
    btc_features_to_merge = btc_featured[['timestamp', 'log_return', 'realized_vol_100']].rename(
        columns={'log_return': 'log_return_btc', 'realized_vol_100': 'realized_vol_100_btc'}
    )
    train_featured = pd.merge(train_featured, btc_features_to_merge, on='timestamp', how='left')
    test_featured = pd.merge(test_featured, btc_features_to_merge, on='timestamp', how='left')

    # rolling correlation
    train_featured['eth_btc_corr_100'] = (
        train_featured['log_return'].rolling(window=100).corr(train_featured['log_return_btc'])
    )
    test_featured['eth_btc_corr_100'] = (
        test_featured['log_return'].rolling(window=100).corr(test_featured['log_return_btc'])
    )
    print("Feature engineering and merging complete.")

    # Prepare Final Model Matrices
    TARGET_COL = 'label'
    features_to_drop = ['timestamp', 'timestamp_dt', 'hour', TARGET_COL, 'log_return', 'log_return_btc']
    MODEL_FEATURES = [col for col in train_featured.columns if col not in features_to_drop]

    def _clean_matrix(frame):
        # Works on a fresh object instead of mutating a column selection in
        # place (which triggered SettingWithCopyWarning), and uses
        # ffill()/bfill() rather than the deprecated fillna(method=...).
        return frame[MODEL_FEATURES].replace([np.inf, -np.inf], np.nan).ffill().bfill()

    X = _clean_matrix(train_featured)
    X_test = _clean_matrix(test_featured)
    y = train_featured[TARGET_COL]
    y = y.fillna(y.mean())

    print(f"Data prepared for training with {len(MODEL_FEATURES)} features.")

    return X, y, X_test, submission_df
X, y, X_test, submission_df = load_and_prepare_data(
    ETH_TRAIN_PATH, ETH_TEST_PATH, BTC_TRAIN_PATH, SUB_PATH
)

time_series_splitter = TimeSeriesSplit(n_splits=5)
lgbm_oof_preds = np.zeros(len(X))
cat_oof_preds = np.zeros(len(X))
# TimeSeriesSplit never uses the first chunk as a validation fold, so those
# rows keep their placeholder 0.0 and must be excluded from stacking/scoring.
oof_mask = np.zeros(len(X), dtype=bool)
# Early-stopped model sizes per fold, reused to size the final fits.
lgbm_tree_counts = []
cat_tree_counts = []

print("\nStarting cross-validation for base models...")
for fold, (train_idx, val_idx) in enumerate(time_series_splitter.split(X), 1):
    print(f"Fold {fold} ")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    lgbm = lgb.LGBMRegressor(**LGBM_PARAMS)
    lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(400, verbose=False)])
    lgbm_oof_preds[val_idx] = lgbm.predict(X_val)
    lgbm_tree_counts.append(lgbm.best_iteration_ or LGBM_PARAMS['n_estimators'])

    cat = CatBoostRegressor(**CATBOOST_PARAMS)
    cat.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    cat_oof_preds[val_idx] = cat.predict(X_val)
    # With an eval_set CatBoost keeps only the trees up to its best
    # iteration, so tree_count_ is the early-stopped model size.
    cat_tree_counts.append(cat.tree_count_)

    oof_mask[val_idx] = True


print("\n Training Stacking Meta-Model ")
meta_features_train = pd.DataFrame({'lgbm_pred': lgbm_oof_preds, 'cat_pred': cat_oof_preds})
meta_features_oof = meta_features_train[oof_mask]
y_oof = y.to_numpy()[oof_mask]

# Use a simple but non-linear model to combine predictions.
# Only rows with genuine out-of-fold predictions are used.
meta_model = lgb.LGBMRegressor(**META_MODEL_PARAMS)
meta_model.fit(meta_features_oof, y_oof)

# Evaluate all models on the out-of-fold rows only.
corr_lgbm_oof, _ = pearsonr(y_oof, lgbm_oof_preds[oof_mask])
corr_cat_oof, _ = pearsonr(y_oof, cat_oof_preds[oof_mask])
# NOTE: the meta-model is scored on the rows it was fitted on, so the stacked
# figure is an in-sample (optimistic) number, not a true CV score.
stacked_preds_oof = meta_model.predict(meta_features_oof)
corr_stacked_oof, _ = pearsonr(y_oof, stacked_preds_oof)

print(f"LightGBM CV Pearson : {corr_lgbm_oof:.6f}")
print(f"CatBoost CV Pearson : {corr_cat_oof:.6f}")
print(f"Stacked CV Pearson  : {corr_stacked_oof:.6f}")


# ==============================================================================
#  5. Final Prediction
# ==============================================================================
# The full-data fits have no validation set to stop on; without a cap they
# would run all 10,000 rounds and behave unlike the early-stopped CV models
# whose predictions the meta-model learned from. Use the mean CV model size.
final_lgbm_rounds = max(1, int(round(float(np.mean(lgbm_tree_counts)))))
final_cat_rounds = max(1, int(round(float(np.mean(cat_tree_counts)))))

print("\nTraining final base models on all data")
final_lgbm = lgb.LGBMRegressor(**{**LGBM_PARAMS, 'n_estimators': final_lgbm_rounds})
final_lgbm.fit(X, y)
lgbm_final_preds = final_lgbm.predict(X_test)

# early_stopping_rounds needs an eval_set, so it is dropped for this fit.
final_cat_params = {k: v for k, v in CATBOOST_PARAMS.items() if k != 'early_stopping_rounds'}
final_cat_params['iterations'] = final_cat_rounds
final_cat = CatBoostRegressor(**final_cat_params)
final_cat.fit(X, y)
cat_final_preds = final_cat.predict(X_test)

print("Generating final predictions with the trained meta-model")
meta_features_test = pd.DataFrame({'lgbm_pred': lgbm_final_preds, 'cat_pred': cat_final_preds})
final_predictions = meta_model.predict(meta_features_test)

# Test rows may have been dropped or merged during cleaning; fail loudly
# instead of surfacing an opaque pandas length error (or a misaligned file).
if len(final_predictions) != len(submission_df):
    raise ValueError(
        f"Prediction count ({len(final_predictions)}) does not match the "
        f"submission template ({len(submission_df)} rows)."
    )
submission_df['labels'] = final_predictions

try:
    submission_df.to_csv(OUT_PATH, index=False)
    print(f"\nStacked ensemble submission successfully saved to: {OUT_PATH}")
except Exception as e:
    print(f"Error saving submission file: {e}")

Loaded raw data: train=(631292, 23), test=(270548, 22), btc=(631292, 22)
Converting timestamps from text to a numerical format...


  df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')


Aggregating data to ensure unique timestamps...
Timestamp conversion and de-duplication complete.
Feature engineering and merging complete.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
  df.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method='bfill', inplace=True)


Data prepared for training with 31 features.

Starting cross-validation for base models...
--- Fold 1 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6943
[LightGBM] [Info] Number of data points in the train set: 105217, number of used features: 30
[LightGBM] [Info] Start training from score 0.000063
--- Fold 2 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6953
[LightGBM] [Info] Number of data points in the train set: 210432, number of used features: 30
[LightGBM] [Info] Start training from score 0.000060
--- Fold 3 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info

In [None]:
# ==============================================================================
# GoQuant Implied Volatility Forecasting
# Final Submission Notebook
#
# Author: [Your Name]
# Kaggle Username: [Your Kaggle Username]
#
# ---
#
# ## 1. My Approach & Objective
#
# This is my final notebook for the GoQuant assignment. My goal is to forecast
# the 10-second-ahead implied volatility for ETH using the provided order book data.
#
# After a lot of experimenting, I landed on a stacking ensemble. It combines
# LightGBM and CatBoost and gave me the best and most stable cross-validation
# score on the official Pearson Correlation metric.
#
# ==============================================================================

import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import pearsonr

# ==============================================================================
# ## 2. Configuration
#
# I'm putting all my file paths and model parameters here so they're easy to change later.
# ==============================================================================

# --- File Paths ---
# Pointing to my local data files. All of them live under the same
# competition folder.
ETH_TRAIN_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\train\ETH.csv"
ETH_TEST_PATH  = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\test\ETH.csv"
# Fixed: the folder name was misspelled ("gq-impl-..."), unlike every other path.
SUB_PATH   = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission.csv"
OUT_PATH   = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission_final.csv"

# --- Base Model Parameters ---
# These are the settings for my two main models. I tuned them to be a bit simpler
# and more robust to avoid overfitting on the noisy data.
LGBM_PARAMS = {
    'objective': 'regression_l2', 'learning_rate': 0.02, 'num_leaves': 64,
    'min_child_samples': 100, 'colsample_bytree': 0.8, 'subsample': 0.8,
    'n_estimators': 10_000, 'reg_lambda': 1.0, 'random_state': 42, 'n_jobs': -1
}
CATBOOST_PARAMS = {
    'iterations': 10000, 'learning_rate': 0.02, 'depth': 8,
    'loss_function': 'RMSE', 'eval_metric': 'RMSE',
    'random_seed': 42, 'verbose': 0, 'early_stopping_rounds': 400
}

# --- Meta-Model Parameters (for Stacking) ---
# This is a simpler model that just learns how to best combine the predictions
# from the two base models.
META_MODEL_PARAMS = {
    'objective': 'regression_l2', 'n_estimators': 1000,
    'learning_rate': 0.01, 'num_leaves': 16,
    'random_state': 42, 'n_jobs': -1
}

# ==============================================================================
# ## 3. Data Prep & Feature Engineering
#
# This is where most of the work happened. I'm loading the data, cleaning it,
# and creating a bunch of features to try and capture what's happening in the market.
# ==============================================================================

def create_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model features on a copy of ``df`` and return that copy.

    ``timestamp`` is read as Unix seconds. Added columns, in order:
    ``timestamp_dt``, ``hour``, ``hour_sin``, ``hour_cos``, ``wap``,
    ``log_return``, ``bid_ask_spread``, ``total_volume``, ``full_obi``,
    ``wap_diff_10`` and ``realized_vol_100``. The input is left untouched.
    """
    feats = df.copy()

    # --- Time Features ---
    # Hour of day, plus a sine/cosine pair so hour 23 ends up close to hour 0.
    feats['timestamp_dt'] = pd.to_datetime(feats['timestamp'], unit='s')
    feats['hour'] = feats['timestamp_dt'].dt.hour
    phase = 2 * np.pi * feats['hour'] / 24.0
    feats['hour_sin'] = np.sin(phase)
    feats['hour_cos'] = np.cos(phase)

    # --- Core Price & Volume Features ---
    # Level-1 weighted average price: each quote is weighted by the volume
    # resting on the opposite side of the book.
    cross_weighted = (
        feats['bid_price1'] * feats['ask_volume1']
        + feats['ask_price1'] * feats['bid_volume1']
    )
    top_level_size = feats['bid_volume1'] + feats['ask_volume1']
    feats['wap'] = cross_weighted / top_level_size

    # Row-over-row log return of the WAP.
    wap = feats['wap']
    feats['log_return'] = np.log(wap / wap.shift(1))

    # Liquidity and pressure across the five visible book levels.
    bid_cols = [f"bid_volume{lvl}" for lvl in range(1, 6)]
    ask_cols = [f"ask_volume{lvl}" for lvl in range(1, 6)]
    bid_depth = feats[bid_cols].sum(axis=1)
    ask_depth = feats[ask_cols].sum(axis=1)
    feats['bid_ask_spread'] = feats['ask_price1'] - feats['bid_price1']
    feats['total_volume'] = bid_depth + ask_depth
    feats['full_obi'] = (bid_depth - ask_depth) / feats['total_volume']

    # --- Momentum & Volatility Features ---
    # WAP change versus ten rows earlier, and the standard deviation of log
    # returns over a 100-row window.
    feats['wap_diff_10'] = feats['wap'].diff(10)
    feats['realized_vol_100'] = feats['log_return'].rolling(window=100).std()

    return feats

def load_and_prepare_data(paths: dict) -> tuple:
    """Load the ETH CSVs, clean them and build the model matrices.

    Pipeline: parse ``timestamp`` to datetimes, drop rows whose timestamp
    cannot be parsed, average rows that share a timestamp, convert timestamps
    to Unix seconds, engineer features and fill the remaining gaps.

    Args:
        paths: mapping with the keys ``'eth_train'``, ``'eth_test'`` and
            ``'sub'``, each holding the path of the corresponding CSV file.

    Returns:
        ``(X, y, X_test, submission_df)``. ``X`` and ``X_test`` share the same
        feature columns, ``y`` is the ``label`` column of the training data,
        and ``submission_df`` is the submission template as read from disk.

    Raises:
        ValueError: if the training feature matrix ends up empty.
    """
    print("--- Starting Data Preparation ---")
    train_df = pd.read_csv(paths['eth_train'])
    test_df = pd.read_csv(paths['eth_test'])
    submission_df = pd.read_csv(paths['sub'])
    print(f"Loaded raw data: train={train_df.shape}, test={test_df.shape}")
    raw_test_rows = len(test_df)

    # --- De-duplication and Timestamp Conversion ---
    # Rows sharing a timestamp are averaged so every timestamp appears once.
    cleaned = {}
    for name, raw in (('train', train_df), ('test', test_df)):
        # The timestamps arrive as date strings; unparseable ones become NaT
        # and are dropped.
        raw['timestamp'] = pd.to_datetime(raw['timestamp'], errors='coerce')
        deduped = raw.dropna(subset=['timestamp']).groupby('timestamp').mean().reset_index()
        # Nanosecond datetimes -> whole Unix seconds.
        deduped['timestamp'] = deduped['timestamp'].astype(np.int64) // 10**9
        cleaned[name] = deduped
    train_df, test_df = cleaned['train'], cleaned['test']
    print("Timestamp conversion and de-duplication complete.")

    if len(test_df) != raw_test_rows:
        # Predictions are produced per cleaned test row, so a changed row
        # count means they no longer line up 1:1 with the raw test file.
        print(
            f"WARNING: test rows changed from {raw_test_rows} to {len(test_df)} "
            "during timestamp cleaning/de-duplication."
        )

    # --- Feature Engineering ---
    train_featured = create_features(train_df)
    test_featured = create_features(test_df)
    print("Feature engineering complete.")

    # --- Final Data Matrices ---
    TARGET_COL = 'label'
    features_to_drop = ['timestamp', 'timestamp_dt', 'hour', TARGET_COL, 'log_return']
    MODEL_FEATURES = [col for col in train_featured.columns if col not in features_to_drop]

    def _clean_matrix(frame):
        # Builds a fresh object instead of mutating a column selection in
        # place (which triggered SettingWithCopyWarning), and uses
        # ffill()/bfill() rather than the deprecated fillna(method=...).
        return frame[MODEL_FEATURES].replace([np.inf, -np.inf], np.nan).ffill().bfill()

    # Final cleanup of any NaNs/infs created by the engineered features.
    X = _clean_matrix(train_featured)
    X_test = _clean_matrix(test_featured)
    y = train_featured[TARGET_COL]
    y = y.fillna(y.mean())

    if X.empty:
        raise ValueError("Dataframe X is empty after preparation. Something went wrong.")

    print(f"Data prepared for training with {len(MODEL_FEATURES)} features.")
    return X, y, X_test, submission_df

# Let's run the pipeline.
paths = {'eth_train': ETH_TRAIN_PATH, 'eth_test': ETH_TEST_PATH, 'sub': SUB_PATH}
X_train, y_train, X_test, submission_df = load_and_prepare_data(paths)

# ==============================================================================
# ## 4. Model Training & Stacking
#
# My modeling strategy is a stacking ensemble.
# 1.  I use TimeSeriesSplit for cross-validation to make sure I'm not looking into the future.
# 2.  I train two strong base models: LightGBM and CatBoost.
# 3.  I then train a final, simpler "meta-model" on their predictions to learn how to best combine them.
# ==============================================================================

time_series_splitter = TimeSeriesSplit(n_splits=5)
lgbm_oof_predictions = np.zeros(len(X_train))
catboost_oof_predictions = np.zeros(len(X_train))
# Marks the rows that were part of a validation fold. Tracking this explicitly
# (rather than testing "prediction != 0") keeps legitimate 0.0 predictions.
oof_mask = np.zeros(len(X_train), dtype=bool)
# Early-stopped model sizes per fold, reused to size the final fits.
lgbm_tree_counts = []
catboost_tree_counts = []

print("\n--- Starting Cross-Validation for Base Models ---")
for fold, (train_index, validation_index) in enumerate(time_series_splitter.split(X_train), 1):
    print(f"  Training Fold {fold}...")
    # Splitting the data for this fold.
    X_train_fold, X_validation_fold = X_train.iloc[train_index], X_train.iloc[validation_index]
    y_train_fold, y_validation_fold = y_train.iloc[train_index], y_train.iloc[validation_index]

    # --- Train LightGBM ---
    lgbm = lgb.LGBMRegressor(**LGBM_PARAMS)
    lgbm.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_validation_fold, y_validation_fold)],
        callbacks=[lgb.early_stopping(400, verbose=False)],
    )
    lgbm_oof_predictions[validation_index] = lgbm.predict(X_validation_fold)
    lgbm_tree_counts.append(lgbm.best_iteration_ or LGBM_PARAMS['n_estimators'])

    # --- Train CatBoost ---
    cat = CatBoostRegressor(**CATBOOST_PARAMS)
    cat.fit(X_train_fold, y_train_fold, eval_set=[(X_validation_fold, y_validation_fold)])
    catboost_oof_predictions[validation_index] = cat.predict(X_validation_fold)
    # With an eval_set CatBoost keeps only the trees up to its best
    # iteration, so tree_count_ is the early-stopped model size.
    catboost_tree_counts.append(cat.tree_count_)

    oof_mask[validation_index] = True

# --- Train the Stacking Meta-Model ---
print("\n--- Training Stacking Meta-Model ---")
# The features for my meta-model are just the predictions from the base models.
meta_features_train = pd.DataFrame({
    'lgbm_pred': lgbm_oof_predictions,
    'cat_pred': catboost_oof_predictions
})
# The meta-model is trained only on rows that belonged to a validation fold;
# the first TimeSeriesSplit chunk never does and is excluded by oof_mask.
meta_model = lgb.LGBMRegressor(**META_MODEL_PARAMS)
meta_model.fit(meta_features_train[oof_mask], y_train[oof_mask])

# ==============================================================================
# ## 5. Evaluation & Final Submission
#
# Time to see how I did. I'll check the Pearson score for each model and the final
# stacked ensemble, then generate the submission file.
# ==============================================================================

# --- Evaluate OOF Predictions ---
y_for_oof_eval = y_train[oof_mask]
lgbm_oof_eval = lgbm_oof_predictions[oof_mask]
catboost_oof_eval = catboost_oof_predictions[oof_mask]
# NOTE: the meta-model is scored on the rows it was fitted on, so the stacked
# figure is an in-sample (optimistic) number, not a true CV score.
stacked_oof_predictions = meta_model.predict(meta_features_train[oof_mask])

# Calculate Pearson Correlation for each model.
corr_lgbm, _ = pearsonr(y_for_oof_eval, lgbm_oof_eval)
corr_cat, _ = pearsonr(y_for_oof_eval, catboost_oof_eval)
corr_stacked, _ = pearsonr(y_for_oof_eval, stacked_oof_predictions)

print("\n-------------------------------------------")
print("  Cross-Validation Performance (Pearson)")
print("-------------------------------------------")
print(f"  LightGBM CV Pearson : {corr_lgbm:.6f}")
print(f"  CatBoost CV Pearson : {corr_cat:.6f}")
print(f"  Stacked CV Pearson  : {corr_stacked:.6f} ✨")
print("-------------------------------------------")

# --- Train Final Models on All Data ---
# The full-data fits have no validation set to stop on; without a cap they
# would run all 10,000 rounds and behave unlike the early-stopped CV models
# whose predictions the meta-model learned from. Use the mean CV model size.
final_lgbm_rounds = max(1, int(round(float(np.mean(lgbm_tree_counts)))))
final_catboost_rounds = max(1, int(round(float(np.mean(catboost_tree_counts)))))

print("\nTraining final models on all the data for the submission...")
final_lgbm = lgb.LGBMRegressor(**{**LGBM_PARAMS, 'n_estimators': final_lgbm_rounds})
final_lgbm.fit(X_train, y_train)
lgbm_final_predictions = final_lgbm.predict(X_test)

# early_stopping_rounds needs an eval_set, so it is dropped for this fit.
final_catboost_params = {k: v for k, v in CATBOOST_PARAMS.items() if k != 'early_stopping_rounds'}
final_catboost_params['iterations'] = final_catboost_rounds
final_catboost = CatBoostRegressor(**final_catboost_params)
final_catboost.fit(X_train, y_train)
catboost_final_predictions = final_catboost.predict(X_test)

# --- Generate Final Predictions with the Meta-Model ---
print("Generating final predictions with the trained meta-model...")
meta_features_test = pd.DataFrame({
    'lgbm_pred': lgbm_final_predictions,
    'cat_pred': catboost_final_predictions
})
final_predictions = meta_model.predict(meta_features_test)

# --- Create and Save Submission File ---
# Test rows may have been dropped or merged during cleaning; fail loudly
# instead of surfacing an opaque pandas length error (or a misaligned file).
if len(final_predictions) != len(submission_df):
    raise ValueError(
        f"Prediction count ({len(final_predictions)}) does not match the "
        f"submission template ({len(submission_df)} rows)."
    )
submission_df['labels'] = final_predictions
try:
    submission_df.to_csv(OUT_PATH, index=False)
    print(f"\n✅ Stacked ensemble submission successfully saved to: {OUT_PATH}")
except Exception as e:
    print(f"Error saving submission file: {e}")

# ==============================================================================
# To improve the model even more, I'd look into adding other data sources, like
# the OHLCV data or order book data from other cryptos like BTC.
# ==============================================================================


Loaded raw data: train=(631292, 23), test=(270548, 22)
Timestamp conversion and de-duplication complete.
Feature engineering complete.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
  df.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method='bfill', inplace=True)


Data prepared for training with 29 features.

Starting cross-validation for base models...
--- Fold 1 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006719 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6688
[LightGBM] [Info] Number of data points in the train set: 105217, number of used features: 29
[LightGBM] [Info] Start training from score 0.000063
--- Fold 2 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6698
[LightGBM] [Info] Number of data points in the train set: 210432, number of used features: 29
[LightGBM] [Info] Start training from score 0.000060
--- Fold 3 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info