In [1]:
import os
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error

# Resolve the dataset root: use the Kaggle input mount when it is present,
# otherwise fall back to the Colab/local location (edit it if yours differs).
_on_kaggle = os.path.exists("/kaggle/input")
DATA_PATH = (
    "/kaggle/input/gq-implied-volatility-forecasting"
    if _on_kaggle
    else "/content/gq-implied-volatility-forecasting"
)
print("data exists" if _on_kaggle else "local/colab mode")

# --- load + prep data ---
def load_data(data_path=None, asset="ETH"):
    """Load the train/test/submission CSVs and build the feature matrices.

    A ``bid_ask_spread`` column (``ask_price1 - bid_price1``) is added to both
    the train and test frames. Every train column except ``timestamp`` and
    ``label`` is then used as a feature, and the same columns are selected
    from the test frame.

    Args:
        data_path: Dataset root containing ``train/``, ``test/`` and
            ``submission.csv``. When ``None`` (the default) the module-level
            ``DATA_PATH`` is used.
        asset: Base name of the CSV read from ``train/`` and ``test/``.

    Returns:
        A tuple ``(X_train, y_train, X_test, submission)`` where ``X_train``
        and ``X_test`` are feature DataFrames, ``y_train`` is the ``label``
        column of the train file, and ``submission`` is the submission
        template DataFrame as read from disk.
    """
    if data_path is None:
        # Resolved at call time so the default follows the environment switch.
        data_path = DATA_PATH

    tr = pd.read_csv(f"{data_path}/train/{asset}.csv")
    ts = pd.read_csv(f"{data_path}/test/{asset}.csv")
    sub = pd.read_csv(f"{data_path}/submission.csv")

    # Engineered feature: top-of-book spread, computed identically on both sets.
    for frame in (tr, ts):
        frame["bid_ask_spread"] = frame["ask_price1"] - frame["bid_price1"]

    # Features are defined by the train columns (minus time and target) so the
    # test frame is selected in exactly the same column order.
    non_features = ("timestamp", "label")
    feats = [c for c in tr.columns if c not in non_features]

    return tr[feats], tr["label"], ts[feats], sub

# Materialise the train/test feature frames, the target and the submission
# template, then echo their shapes and the feature list as a sanity check.
X_train, y_train, X_test, sub_df = load_data()

train_shape = X_train.shape
test_shape = X_test.shape
feature_names = X_train.columns.tolist()

print("train:", train_shape)
print("test:", test_shape)
print("cols:", feature_names)

# --- train model ---
# Fit a gradient-boosted regressor on every training row (no hold-out split).
print("train lgbm...")
lgbm = LGBMRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,          # no depth cap; tree size is bounded by num_leaves
    num_leaves=64,
    subsample=0.8,
    # LightGBM only applies row subsampling when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=42,
)
lgbm.fit(X_train, y_train)
print("done")

# --- predict ---
# Score the test features with the fitted model.
preds = lgbm.predict(X_test)

# # --- rmse ---
# Optional in-sample check: this scores the model on its own training rows,
# so it is not a validation estimate.
# train_preds = lgbm.predict(X_train)
# rmse = mean_squared_error(y_train, train_preds, squared=False)
# print("train RMSE:", rmse)

# --- save sub ---
sub_df["labels"] = preds
# Write to the Kaggle working dir when it exists; otherwise (local/colab mode)
# fall back to the current directory instead of failing on a missing folder.
out_dir = "/kaggle/working" if os.path.isdir("/kaggle/working") else "."
sub_df.to_csv(os.path.join(out_dir, "submission.csv"), index=False)
print("sub saved: submission.csv")


data exists
train: (631292, 22)
test: (270548, 22)
cols: ['mid_price', 'bid_price1', 'bid_volume1', 'bid_price2', 'bid_volume2', 'bid_price3', 'bid_volume3', 'bid_price4', 'bid_volume4', 'bid_price5', 'bid_volume5', 'ask_price1', 'ask_volume1', 'ask_price2', 'ask_volume2', 'ask_price3', 'ask_volume3', 'ask_price4', 'ask_volume4', 'ask_price5', 'ask_volume5', 'bid_ask_spread']
train lgbm...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5388
[LightGBM] [Info] Number of data points in the train set: 631292, number of used features: 22
[LightGBM] [Info] Start training from score 0.000062
done
sub saved: submission.csv
