In [None]:
import pandas as pd
import polars as pl
import numpy as np
import os
import gc
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold
#import xgboost as xgb
#from xgboost import XGBRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor, log_evaluation
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import pickle
import time

# Explicitly switch on automatic garbage collection for this kernel.
gc.enable()

# The triple-quoted text below is a bare string literal, not executed code: it parks a set of
# pandas / polars display options (unlimited rows / columns / string width) that are currently disabled.
'''
pd.options.display.max_columns = None
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)
'''

In [None]:
# Disabled helper (kept as a bare string literal, so none of it runs).
# The text builds two hive-partitioned parquet directories from train.parquet for local gateway runs:
#   * synthetic_test.parquet  - features for date_ids >= date_offset, re-based so the first day is 0,
#     with `row_id` and `is_scored` columns added;
#   * synthetic_lags.parquet  - the nine responders renamed to `responder_N_lag_1` and written under
#     the following day's date_id.
# To use it, turn the string back into code and point run_local_gateway at the two output directories.
'''
from pathlib import Path
DATA_DIR = Path('/kaggle/input/jane-street-real-time-market-data-forecasting')

date_offset = 1500

is_score_dates = 5

pl_all = pl.scan_parquet(DATA_DIR/"train.parquet").filter(pl.col("date_id") >= date_offset-1).collect()

# make syn_test 
syn_test = pl_all.with_columns(
    # pl.lit(True).alias("is_scored"),
    pl.col('date_id') - date_offset
    ).with_row_index(name="row_id", offset=0)

syn_test = syn_test.with_columns(
    pl.when(pl.col('date_id')<is_score_dates-1).then(pl.lit(False)).otherwise(pl.lit(True)).alias("is_scored")
)

syn_test = syn_test.select(
    ['row_id', 'date_id', 'time_id', 'symbol_id', 'weight', 'is_scored'] + [f'feature_{x:02}' for x in range(79)]
)

syn_test_partition = syn_test.partition_by('date_id', maintain_order=True, as_dict=True)

output_dir = "synthetic_test.parquet"
os.makedirs(output_dir, exist_ok=True)

row_id_offset = syn_test.filter(pl.col('date_id')<0).select('row_id').max().item()
print("row_id_offset:", row_id_offset)

for key, _df in syn_test_partition.items():
    if key[0] >= 0:
        os.makedirs(f"{output_dir}/date_id={key[0]}", exist_ok=True)
        _df = _df.with_columns(pl.col('row_id')-row_id_offset)
        _df.write_parquet(f"{output_dir}/date_id={key[0]}/part-0.parquet")

# make syn_lag

syn_lag = pl_all.select(
    ['date_id', 'time_id', 'symbol_id'] + [f'responder_{x}' for x in range(9)]
).with_columns(pl.col('date_id')-date_offset)

syn_lag = syn_lag.rename({f'responder_{x}': f'responder_{x}_lag_1' for x in range(9)})

syn_lag_partition = syn_lag.partition_by('date_id', maintain_order=True, as_dict=True)

output_dir = "synthetic_lags.parquet"
os.makedirs(output_dir, exist_ok=True)

for key, _df in syn_lag_partition.items():
    os.makedirs(f"{output_dir}/date_id={key[0]+1}", exist_ok=True)
    _df = _df.with_columns(pl.col('date_id')+1)
    _df.write_parquet(f"{output_dir}/date_id={key[0]+1}/part-0.parquet")
'''

In [None]:
# Root directory of the competition dataset; later cells append file names such as
# 'train.parquet/' to it, so the trailing slash is required.
path = "/kaggle/input/jane-street-real-time-market-data-forecasting/"

In [None]:
# Location of the pickled, pre-trained LightGBM model that serves as the starting point.
models_path = "/kaggle/input/js_lgb_20250101_lags_as_features/other/default/1/lgb_model.pkl"

In [None]:
# Load the pre-trained model used both for prediction and as `init_model` when refitting.
# NOTE(review): pickle.load executes arbitrary code embedded in the file - only point
# `models_path` at artefacts you produced yourself or otherwise trust.
with open(models_path, "rb") as f:
    model = pickle.load(f)

In [None]:
class TimerCallback:
    """LightGBM callback that aborts training once a wall-clock budget is used up.

    The clock is not started by the callback itself: the caller passes the reference
    timestamp (`loop_start_time`, as returned by `time.time()`), so work done before
    training began also counts against `max_time_seconds`.
    """

    def __init__(self, max_time_seconds, loop_start_time):
        # Reference timestamp first, then the number of seconds allowed after it.
        self.loop_start_time = loop_start_time
        self.max_time_seconds = max_time_seconds

    def _seconds_since_start(self):
        """Seconds elapsed between `loop_start_time` and now."""
        return time.time() - self.loop_start_time

    def __call__(self, env):
        """Invoked by LightGBM with its callback environment; raises when over budget."""
        elapsed_time = self._seconds_since_start()
        if elapsed_time > self.max_time_seconds:
            print(f"Stopping training after {elapsed_time:.2f} seconds.")
            booster = env.model
            # Hand the booster's current best iteration / score to LightGBM's early-stop signal.
            raise lgb.callback.EarlyStopException(booster.best_iteration, booster.best_score)

In [None]:
# Column layout of the training frame, in order: row identifiers, sample weight,
# feature_00..feature_78, the target (responder_6) and the nine lagged responders.
cols = (
    ['date_id', 'time_id', 'symbol_id', 'weight']
    + [f'feature_{idx:02d}' for idx in range(79)]
    + ['responder_6']
    + [f'responder_{idx}_lag_1' for idx in range(9)]
)

In [None]:
# Model inputs, in order: feature_00..feature_78 followed by the nine lagged responders.
feature_cols = [f'feature_{idx:02d}' for idx in range(79)] + [
    f'responder_{idx}_lag_1' for idx in range(9)
]

In [None]:
# Largest date_id present in the training set, read lazily so only that column is scanned.
date_id_max = (
    pl.scan_parquet(path + 'train.parquet/')
    .select('date_id')
    .max()
    .collect()
    ['date_id'][0]
)
date_id_max

In [None]:
# Pull every row whose date_id is within 10 of the newest training day.
train_df = (
    pl.scan_parquet(path + 'train.parquet/')
    .filter(pl.col('date_id') >= date_id_max - 10)
    .collect()
)

# Lag table: the responders of day D re-keyed to day D + 1 and renamed `responder_N_lag_1`;
# weight, partition_id and all feature columns are dropped.
lags_df = (
    train_df
    .with_columns(pl.col('date_id') + 1)
    .drop(['weight', 'partition_id'] + [col for col in train_df.columns if 'feature' in col])
    .rename({f'responder_{x}': f'responder_{x}_lag_1' for x in range(9)})
)

# Keep responder_6 as the only same-day responder (the target) and shrink dtypes.
train_df = (
    train_df
    .drop([f'responder_{x}' for x in range(9) if x != 6] + ['partition_id'])
    .select(pl.all().shrink_dtype())
)

# Attach the lags, then discard the oldest loaded day (date_id_max - 10).
train_df = (
    train_df
    .join(lags_df, on=['date_id', 'time_id', 'symbol_id'], how='left')
    .select(pl.all().shrink_dtype())
    .filter(pl.col('date_id') > date_id_max - 10)
)

# Re-base date_id so the newest training day becomes -1, and cast every column to Float32.
train_df = (
    train_df
    .with_columns(pl.col('date_id') - date_id_max - 1)
    .with_columns([pl.col(col).cast(pl.Float32) for col in train_df.columns])
)

del lags_df
gc.collect()
train_df

In [None]:
# LightGBM hyper-parameters reused for every online refit.
best_params = dict(
    learning_rate=0.04588738403235412,
    max_depth=12,
    min_data_in_leaf=60,
    num_leaves=4763,
    min_gain_to_split=0.25,
    lambda_l1=4.0,
    lambda_l2=1786.5166849320328,
    feature_fraction=0.9547872173111335,
)

In [None]:
import kaggle_evaluation.jane_street_inference_server

In [None]:
# ---- Mutable state carried across predict() calls ----

# Most recent lags frame handed over by the gateway (None until the first one arrives).
lags_df : pl.DataFrame | None = None

# Model refitted on streamed data; None until the first successful refit.
updated_model = None

# Number of lags-carrying calls processed so far.
temp_i = 0

# Per-batch feature frames collected since the last lags-carrying call.
streaming_data_X_list = []
# One target frame per lags-carrying call.
streaming_data_y_list = []
# One concatenated feature frame per lags-carrying call.
streaming_data_X_concat_list = []

# Latest concatenations of the feature / target buffers.
X_concat = None
y_concat = None

# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Score one test batch and, whenever a new lags frame arrives, refit the model online.

    Per call:
      1. If `lags` is given it replaces the cached `lags_df`, its first `date_id` is taken as
         the current day, and a wall-clock timer is started for this call.
      2. The batch (minus `row_id` / `is_scored`) is left-joined with the cached lags on
         (date_id, time_id, symbol_id), cast to Float32 and buffered.
      3. On a lags-carrying call, `responder_6_lag_1` with `date_id` shifted back by one is used
         as the target for the buffered feature rows. Rows of day `date_id - 1` that received a
         target are appended to the global `train_df`, and a LightGBM regressor is refit starting
         from the pickled `model` under a time budget enforced by `TimerCallback`.
      4. Predictions come from `updated_model` when one exists, otherwise from `model`.

    Args:
        test: Feature rows of the current batch, including `row_id` and `is_scored`.
        lags: Lagged responders, or None when the gateway supplies none for this batch.

    Returns:
        A polars DataFrame with exactly the columns `row_id` and `responder_6`, one row per
        row of `test`.

    Raises:
        ValueError: if no lags frame has been received yet (neither now nor in an earlier call).
        AssertionError: if the prediction frame does not have the expected columns / length.
    """
    # `streaming_data_lags_list` used to be listed here but is never defined or used.
    global lags_df, streaming_data_y_list, streaming_data_X_list, temp_i, y_concat, X_concat, streaming_data_X_concat_list, updated_model, train_df

    join_keys = ['date_id', 'time_id', 'symbol_id']

    if lags is not None:
        # The refit below is budgeted from this moment, so joins and concats count too.
        timer_start_time = time.time()
        lags_df = lags
        date_id = lags_df['date_id'][0]

    if lags_df is None:
        # Fail with a readable message instead of an opaque error inside the join below.
        raise ValueError('predict() received no lags and none were cached from an earlier call')

    test_df = test.drop(['row_id', 'is_scored'])
    test_df = test_df.join(lags_df, on=join_keys, how='left')
    test_df = test_df.with_columns([pl.col(col).cast(pl.Float32) for col in test_df.columns])
    streaming_data_X_list.append(test_df)

    if lags is not None:
        # Targets: responder_6_lag_1 re-keyed to the previous day and renamed to responder_6.
        y_df = (
            lags_df.select(join_keys + ['responder_6_lag_1'])
            .with_columns(pl.col('date_id') - 1)
            .rename({'responder_6_lag_1': 'responder_6'})
        )
        y_df = y_df.with_columns([pl.col(col).cast(pl.Float32) for col in y_df.columns])
        streaming_data_y_list.append(y_df)

        # Every buffered frame is already Float32, so the concatenations need no re-cast.
        y_concat = pl.concat(streaming_data_y_list)
        X_concat = pl.concat(streaming_data_X_list)
        streaming_data_X_concat_list.append(X_concat)
        streaming_data_X_list = []

        # The batch that carries the lags is buffered before the flush above, so one day's rows
        # span two consecutive flushes; keep the last two.
        if len(streaming_data_X_concat_list) > 2:
            del streaming_data_X_concat_list[0], streaming_data_y_list[0]

        streaming_data_X_concat_concat = pl.concat(streaming_data_X_concat_list)

        streaming_data_df = (
            streaming_data_X_concat_concat
            .join(y_concat, on=join_keys, how='left')
            .drop_nulls(subset=['responder_6'])
            .filter(pl.col('date_id') == date_id - 1)
            .select(cols)
        )
        if streaming_data_df.shape[0] > 0:
            train_df = pl.concat([train_df, streaming_data_df])

        if train_df['date_id'].n_unique() > 1:
            # Drop the oldest day once the frame's estimated size exceeds 25 GB.
            if train_df.estimated_size() / 1e9 > 25:
                train_df_date_id_min = train_df['date_id'].min()
                train_df = train_df.filter(pl.col('date_id') > train_df_date_id_min)
                gc.collect()

            # Newest day is the validation set, everything older is training data.
            streaming_data_date_id_max = train_df['date_id'].max()
            streaming_data_val_df = train_df.filter(pl.col('date_id') == streaming_data_date_id_max)
            streaming_data_train_df = train_df.filter(pl.col('date_id') < streaming_data_date_id_max)
            if streaming_data_train_df.shape[0] > 1100000:
                # Cap the training set; the sample is unseeded and differs between runs.
                streaming_data_train_df = streaming_data_train_df.sample(n=1100000)

            base_params = {
                'verbosity': -1,
                'device': 'gpu',
                'early_stopping_round': 20,
            }

            tuned_param_names = (
                'learning_rate', 'max_depth', 'min_data_in_leaf', 'num_leaves',
                'min_gain_to_split', 'lambda_l1', 'lambda_l2', 'feature_fraction',
            )
            tuned_params = {name: best_params[name] for name in tuned_param_names}

            # Fit a local candidate and publish it only after fit() returns: assigning the
            # unfitted estimator to `updated_model` first would break every later prediction
            # if fit() is aborted.
            candidate_model = LGBMRegressor(
                **base_params,
                **tuned_params,
                n_estimators=100000
            )

            X_train = streaming_data_train_df.select(feature_cols).select(pl.all().shrink_dtype()).to_pandas()
            X_val = streaming_data_val_df.select(feature_cols).select(pl.all().shrink_dtype()).to_pandas()

            y_train = streaming_data_train_df.select('responder_6').to_series().to_pandas()
            y_val = streaming_data_val_df.select('responder_6').to_series().to_pandas()

            weights_train = streaming_data_train_df.select('weight').to_series().to_pandas()
            weights_val = streaming_data_val_df.select('weight').to_series().to_pandas()

            # The first lags-carrying call gets a longer budget (110 s); later ones get 58 s.
            time_budget_seconds = 110 if temp_i == 0 else 58
            timer_callback = TimerCallback(max_time_seconds=time_budget_seconds, loop_start_time=timer_start_time)

            try:
                candidate_model.fit(
                    X_train, y_train,
                    sample_weight=weights_train,
                    eval_set=[(X_train, y_train), (X_val, y_val)],
                    eval_sample_weight=[weights_train, weights_val],
                    callbacks=[timer_callback],
                    init_model=model
                )
            except lgb.callback.EarlyStopException as e:
                # fit() did not complete, so the candidate is unusable: keep the previous model.
                print(f"Training stopped. Best iteration: {e.best_iteration}, Best score: {e.best_score}")
            else:
                updated_model = candidate_model

        temp_i += 1

        gc.collect()

    features = test_df.select(feature_cols).select(pl.all().shrink_dtype()).to_numpy()

    # Prefer the refitted model; fall back to the pickled one until a refit has succeeded.
    active_model = model if updated_model is None else updated_model
    preds = active_model.predict(features)

    predictions = test.select(
        'row_id',
        pl.lit(preds).alias('responder_6'),
    ).select(pl.all().shrink_dtype())

    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [None]:
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Interactive / local run: replay the bundled sample files through the gateway.
    # Alternative inputs for a longer local replay:
    #   '/kaggle/working/synthetic_test.parquet',
    #   '/kaggle/working/synthetic_lags.parquet',
    local_inputs = (
        '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
        '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
    )
    inference_server.run_local_gateway(local_inputs)
else:
    # Competition rerun: serve requests from the hidden test set.
    inference_server.serve()