In [1]:
import polars as pl
import glob
import matplotlib.pyplot as plt
import lightgbm as lgb

In [2]:

# Step 1: Gather parquet files
all_files = glob.glob("/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=*/part-0.parquet")
print("files: ", all_files)

# Step 2: Create a lazy scan for each file and combine
df_list = [pl.scan_parquet(f) for f in all_files]
df_lazy = pl.concat(df_list)

# Step 3: Collect into an eager DataFrame
df_eager = df_lazy.collect()
print("shape of dataframe: ", df_eager.shape)

# Step 4: Interpolate numeric columns
df_interpolated = df_eager.with_columns(
    [pl.col(c).interpolate() for c in df_eager.select(pl.all().exclude(pl.Utf8)).columns]
)
print(df_interpolated)

files:  ['/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=1/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=6/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=8/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=9/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=7/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=5/part-0.parquet', '/Users/nicky/Documents/jane_street_kaggle/jan

1.	Time-based Train/Validation Split
Since this is a forecasting problem, you usually want a chronological split by date_id (or (date_id, time_id)) so your model learns from “past” data and is tested on a “future” portion.

In [3]:
cutoff_date = 700

# LazyFrame for the full data (already have df_interpolated as an *eager* frame).
# Convert it back to a LazyFrame for streaming:
df_lazy = df_interpolated.lazy()

# TRAIN split
df_lazy.filter(pl.col("date_id") < cutoff_date) \
       .collect(streaming=True) \
       .write_parquet("train_split.parquet")

# VAL split
df_lazy.filter(pl.col("date_id") >= cutoff_date) \
       .collect(streaming=True) \
       .write_parquet("val_split.parquet")


More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  .collect(streaming=True) \

More information on the new streaming engine: https://github.com/pola-rs/polars/issues/20947
  .collect(streaming=True) \


In [4]:
# Lazy scan of the train split
df_train_lazy = pl.scan_parquet("train_split.parquet")

chunk_size = 2_000_000
offset = 0

model = None
done = False

while not done:
    chunk = df_train_lazy.slice(offset, chunk_size).collect()
    if chunk.is_empty():
        done = True
        break

    # Convert to pandas
    chunk_pd = chunk.to_pandas()
    X_chunk = chunk_pd[[c for c in chunk_pd.columns if c.startswith("feature_")]]
    y_chunk = chunk_pd["responder_6"]

    # Construct LightGBM dataset
    train_data_chunk = lgb.Dataset(X_chunk, label=y_chunk)

    if model is None:
        # First training pass
        params = {
            "objective": "regression",
            "metric": "rmse",
            "learning_rate": 0.05,
            "num_leaves": 64
            # ...
        }
        model = lgb.train(params, train_data_chunk, num_boost_round=100)
    else:
        # Continue training from existing model
        model = lgb.train(params, train_data_chunk, num_boost_round=100,
                          init_model=model)

    offset += chunk_size

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.118973 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18261
[LightGBM] [Info] Number of data points in the train set: 2000000, number of used features: 75
[LightGBM] [Info] Start training from score -0.003645
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.128170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19322
[LightGBM] [Info] Number of data points in the train set: 2000000, number of used features: 79
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.127373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19289
[LightGBM] [Info] Number of data points in the train set: 2000000, number of used features: 79
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead

In [5]:
df_val_lazy = pl.scan_parquet("val_split.parquet")

all_val_chunks = []
offset = 0
chunk_size = 2_000_000
done = False

while not done:
    chunk_val = df_val_lazy.slice(offset, chunk_size).collect()
    if chunk_val.is_empty():
        done = True
        break
    
    chunk_val_pd = chunk_val.to_pandas()
    X_val_chunk = chunk_val_pd[[c for c in chunk_val_pd.columns if c.startswith("feature_")]]
    y_val_chunk = chunk_val_pd["responder_6"]
    
    y_pred_chunk = model.predict(X_val_chunk, num_iteration=model.best_iteration)
    # accumulate errors for an overall metric, e.g. MSE
    all_val_chunks.append((y_val_chunk.values, y_pred_chunk))
    
    offset += chunk_size

# Combine predictions to compute a final metric
import numpy as np

ys = []
preds = []
for y_arr, pred_arr in all_val_chunks:
    ys.append(y_arr)
    preds.append(pred_arr)

y_all = np.concatenate(ys)
pred_all = np.concatenate(preds)

# e.g. MSE
mse = np.mean((pred_all - y_all)**2)
rmse = np.sqrt(mse)
print("Validation RMSE:", rmse)

Validation RMSE: 0.9172958049421145


In [6]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(y_all, pred_all)
print("MAE:", mae)

MAE: 0.6099333041939601


In [7]:
from sklearn.metrics import r2_score

r2 = r2_score(y_all, pred_all)
print("R²:", r2)

R²: -0.11066256629964832


: 

In [8]:
import polars as pl
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score

##############################
# 1) Lazy-load & chunk train
##############################
df_train_lazy = pl.scan_parquet("train_split.parquet")  # partitioned or single file

exclude_cols = {
    "row_id", "date_id", "time_id", "symbol_id",
    "responder_0", "responder_1", "responder_2",
    "responder_3", "responder_4", "responder_5",
    "responder_6", "responder_7", "responder_8",
    "weight"
}
# Gather columns from a small collect
all_cols = df_train_lazy.slice(0,1_000).collect().columns
feature_cols = [c for c in all_cols if c not in exclude_cols]
target_col = "responder_6"

params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 31,        # Fewer leaves -> smaller, faster model
    "max_bin": 63,           # Lower max_bin reduces memory in histogram building
    "bagging_fraction": 0.8,
    "feature_fraction": 0.8,
    "seed": 42,
    "num_threads": 2
}

chunk_size = 50_000
offset = 0
done = False
partial_model_path = "lightgbm_incremental.txt"
model = None
num_boost_round_per_chunk = 50

while not done:
    chunk = df_train_lazy.slice(offset, chunk_size).collect()
    if chunk.is_empty():
        done = True
        break

    train_pd = chunk.to_pandas()
    X_chunk = train_pd[feature_cols]
    y_chunk = train_pd[target_col]

    train_data = lgb.Dataset(X_chunk, label=y_chunk)

    if model is None:
        # First chunk -> new model
        model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round_per_chunk
        )
    else:
        # Continue from previous
        model = lgb.train(
            params,
            train_data,
            num_boost_round=num_boost_round_per_chunk,
            init_model=model
        )

    offset += chunk_size

##############################
# 2) Validation
##############################
df_val = pl.read_parquet("val_split.parquet")
val_pd = df_val.to_pandas()

X_val = val_pd[feature_cols]
y_val = val_pd[target_col]

pred_val = model.predict(X_val)
mse = mean_squared_error(y_val, pred_val)
rmse = mse**0.5
r2 = r2_score(y_val, pred_val)
print("Validation RMSE:", rmse)
print("Validation R²:", r2)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001657 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4184
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 70
[LightGBM] [Info] Start training from score -0.036732
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001668 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4191
[LightGBM] [Info] Number of data points in the train set: 50000, number of used features: 70
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001626 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [In