In [None]:
import pandas as pd
import polars as pl
import numpy as np
import os
import gc
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, log_evaluation
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
#from sklearn.impute import IterativeImputer
import pickle
import optuna
import shap

# Make sure automatic garbage collection is switched on.
gc.enable()

# pandas display limits. The former `pd.options.display.max_columns = None` line was
# dead code: it was immediately overridden by the 500-column limit set below.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# polars display limits: a negative value removes the row/column cap.
pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)

In [None]:
# Root directory of the competition data. Set the JANE_STREET_DATA_DIR environment
# variable to run on another machine; the default is the original local location.
path = os.environ.get('JANE_STREET_DATA_DIR', 'I:/Kaggle/jane-street-real-time-market-data-forecasting/')
# Later cells build file names with `path + '...'`, so guarantee a trailing separator.
if not path.endswith(('/', '\\')):
    path += '/'

In [None]:
# Sanity check: show the entries found under the data root.
os.listdir(path)

In [None]:
# Read the training data, keep `responder_6` aside as the target, then strip every
# responder column plus `partition_id` and downcast what remains.
raw_train_df = pl.read_parquet(path + 'train.parquet/')
y_sr = raw_train_df['responder_6']
train_df = raw_train_df.drop([
    'responder_0', 'responder_1', 'responder_2',
    'responder_3', 'responder_4', 'responder_5',
    'responder_6', 'responder_7', 'responder_8',
    'partition_id',
])
del raw_train_df  # the original never kept a second reference to the raw frame
train_df = train_df.select(pl.all().shrink_dtype())
print(train_df.shape)
train_df.head()

In [None]:
# Estimated in-memory size of the training frame, scaled from bytes to GB (1e9 bytes).
train_df.estimated_size() / 1e9

In [None]:
# Load the test set without 'row_id' and 'is_scored', downcasting every column.
test_df = pl.read_parquet(path + 'test.parquet/').drop(['row_id', 'is_scored']).select(pl.all().shrink_dtype())
print(test_df.shape)
test_df.head()

In [None]:
# Series.unique() does not guarantee any particular order by default, but this Series
# is sliced positionally further down. Sort it so a leading slice always holds the
# lowest date_ids.
unique_date_ids = train_df['date_id'].unique().sort()
unique_date_ids

In [None]:
# Number of distinct date_ids in the training data.
len(unique_date_ids)

In [None]:
# Take the leading fifth (integer division) of the entries in unique_date_ids.
sample_date_ids = unique_date_ids[:len(unique_date_ids) // 5]

In [None]:
len(sample_date_ids)

In [None]:
sample_date_ids

In [None]:
# Restrict the training frame to rows whose date_id is in the sampled set.
sample_train_df = train_df.filter(pl.col('date_id').is_in(sample_date_ids))
print(sample_train_df.shape)

In [None]:
# Check: number of distinct dates that survived the filter.
sample_train_df['date_id'].n_unique()

In [None]:
# Check: largest date_id present in the sample.
sample_train_df['date_id'].max()

In [None]:
# Summary statistics of one feature on the sample.
sample_train_df['feature_01'].describe()

In [None]:
# Load the feature metadata table; besides 'feature' its columns are the tags used below.
features_df = pl.read_csv(path + 'features.csv').select(pl.all().shrink_dtype())
print(features_df.shape)
features_df.head()

In [None]:
# Names of the features for which tag_0 is set.
tag_0_features = features_df.filter(pl.col("tag_0") == True)["feature"].to_list()
print(len(tag_0_features))
tag_0_features

In [None]:
# Trial run on the date sample: row-wise sum and mean over the tag_0 features.
sample_train_df = sample_train_df.with_columns(
    pl.sum_horizontal(tag_0_features).alias('tag_0_sum'),
    pl.mean_horizontal(tag_0_features).alias('tag_0_mean')
)

In [None]:
# Every column of features_df except 'feature' is treated as a tag name.
tags_list = features_df.columns
tags_list.remove('feature')
tags_list

In [None]:
# For every tag, append four row-wise aggregates (sum / mean / min / max) computed
# over the features that carry that tag.
for tag in tqdm(tags_list):
    tagged_rows = features_df.filter(pl.col(tag) == True)
    tag_features = tagged_rows["feature"].to_list()
    tag_aggregates = [
        pl.sum_horizontal(tag_features).alias(f'{tag}_sum'),
        pl.mean_horizontal(tag_features).alias(f'{tag}_mean'),
        pl.min_horizontal(tag_features).alias(f'{tag}_min'),
        pl.max_horizontal(tag_features).alias(f'{tag}_max'),
    ]
    train_df = train_df.with_columns(tag_aggregates)

In [None]:
# Inspect the frame after the tag aggregate columns were added.
train_df.head()

In [None]:
# Estimated size in GB before downcasting the new columns.
train_df.estimated_size() / 1e9

In [None]:
# Downcast every column to the smallest dtype that holds its values.
train_df = train_df.select(pl.all().shrink_dtype())

In [None]:
# Estimated size in GB after downcasting, for comparison with the value above.
train_df.estimated_size() / 1e9

In [None]:
# Sorted distinct symbol_ids seen in the training data.
train_symbol_ids_list = sorted(train_df.select('symbol_id').unique()['symbol_id'].to_list())
print(len(train_symbol_ids_list))
train_symbol_ids_list

In [None]:
# Sorted distinct symbol_ids seen in the test data.
test_symbol_ids_list = sorted(test_df.select('symbol_id').unique()['symbol_id'].to_list())
print(len(test_symbol_ids_list))
test_symbol_ids_list

In [None]:
# Union of the train and test symbol_ids.
# NOTE(review): one_hot_cat_cols does not read this list; confirm its symbol count matches.
unique_symbol_ids_list = sorted(list(set(train_symbol_ids_list + test_symbol_ids_list)))
print(len(unique_symbol_ids_list))
unique_symbol_ids_list

In [None]:
# Load the lags table for inspection.
lags_df = pl.read_parquet(path + 'lags.parquet/')
print(lags_df.shape)
lags_df

In [None]:
# Load the sample submission for inspection.
sample_df = pl.read_csv(path + 'sample_submission.csv')
print(sample_df.shape)
sample_df

In [None]:
def one_hot_cat_cols(df, n_symbols=39):
    """Replace `symbol_id` with one 0/1 indicator column per symbol.

    Parameters
    ----------
    df : polars DataFrame
        Must contain a `symbol_id` column.
    n_symbols : int, default 39
        Indicators are created for the ids 0 .. n_symbols - 1, named
        `symbol_id_<id>`. The default keeps the previously hard-coded count.

    Returns
    -------
    polars DataFrame
        `df` with the indicator columns appended (in id order) and the original
        `symbol_id` column dropped.
    """
    # Build every indicator as a lazy expression and add them in a single
    # with_columns call instead of rebuilding the frame once per symbol.
    # Int8 is enough for a 0/1 flag; casting to Python `int` (Int64) would
    # materialise 8 bytes per row per column before any later downcast.
    indicator_exprs = [
        (pl.col('symbol_id') == symbol).cast(pl.Int8).alias(f'symbol_id_{symbol}')
        for symbol in range(n_symbols)
    ]

    return df.with_columns(indicator_exprs).drop('symbol_id')

In [None]:
# One-hot encode symbol_id on the training frame, then downcast all columns.
train_df = one_hot_cat_cols(train_df).select(pl.all().shrink_dtype())
print(train_df.shape)
train_df.head()

In [None]:
# Same encoding for the test frame.
# NOTE(review): test_df never received the tag_* aggregate columns that were added to
# train_df, so its column set differs from the training frame.
test_df = one_hot_cat_cols(test_df).select(pl.all().shrink_dtype())
print(test_df.shape)
test_df.head()

In [None]:
# Columns whose name contains 'feature' or 'symbol_id_'. The tag aggregates
# (e.g. 'tag_0_sum') match neither pattern and are therefore not listed here.
# NOTE(review): lgb_train does not use this list; it trains on every column of its
# input except date_id / time_id / weight.
feature_cols = [col for col in train_df.columns if ('feature' in col) | ('symbol_id_' in col)]
print(len(feature_cols))
feature_cols

In [None]:
# Number of K-fold splits; read as a global by lgb_train.
n_splits = 5

In [None]:
# https://www.kaggle.com/code/yuanzhezhou/jane-street-baseline-lgb-xgb-and-catboost

def r2_lgb(y_true, y_pred, sample_weight):
    """Weighted R2 against a zero baseline, returned as a custom-metric triple.

    The score is 1 - weighted_mean((y_pred - y_true)^2) / weighted_mean(y_true^2),
    with 1e-38 added to the denominator to avoid dividing by zero.

    Returns
    -------
    tuple
        ('r2', score, True); the last element flags that higher is better.
    """
    squared_error = (y_pred - y_true) ** 2
    squared_target = (y_true) ** 2

    weighted_error = np.average(squared_error, weights=sample_weight)
    weighted_target = np.average(squared_target, weights=sample_weight) + 1e-38

    score = 1 - weighted_error / weighted_target
    return 'r2', score, True

In [None]:
def lgb_train(train_data, y):
    """Train LightGBM regressors on selected K-fold splits and return mean SHAP importances.

    Reads two module-level names: ``n_splits`` (number of K-fold splits) and
    ``path`` (root directory under which the fitted models are pickled).

    Parameters
    ----------
    train_data : polars DataFrame
        Must contain 'date_id', 'time_id' and 'weight'. Every other column is
        used as a model input; 'weight' supplies the sample weights.
    y : polars Series
        Target values, row-aligned with ``train_data``.

    Returns
    -------
    numpy.ndarray
        Mean absolute SHAP value per input column, averaged over the folds
        that were trained.
    """
    X = train_data.drop(['date_id', 'time_id', 'weight'])
    weights = train_data['weight']
    print(X.shape)
    display(X.head())

    oof = np.zeros(len(y), dtype=float)
    # Marks the rows that actually received an out-of-fold prediction. Only some
    # folds are trained, so the remaining rows of `oof` stay at their initial 0.
    scored_mask = np.zeros(len(y), dtype=bool)

    base_params = {
        'verbosity': -1,
        'learning_rate': 0.3,
        'feature_fraction': 0.8,
        'device': 'gpu',
        'early_stopping_round': 50,
        'lambda_l2': 100
    }

    # Default (unshuffled) KFold.
    cv_fold = KFold(n_splits=n_splits)

    # Output directory is the same for every fold, so create it once up front.
    models_path = path + 'my_folder/models/20241225_01/'
    os.makedirs(models_path, exist_ok=True)

    shap_importance_list = []

    for fold, (train_idx, test_idx) in enumerate(cv_fold.split(X, y)):
        # Only the first and the last fold are trained.
        if fold not in [0, 4]:
            continue

        print('')
        print(f"{fold} Fold Start")

        X_train, X_val = X[train_idx, :].to_pandas(), X[test_idx, :].to_pandas()
        y_train, y_val = y[train_idx].to_pandas(), y[test_idx].to_pandas()
        weights_train, weights_val = weights[train_idx].to_pandas(), weights[test_idx].to_pandas()

        # n_estimators is only an upper bound; 'early_stopping_round' in
        # base_params decides when training actually stops.
        model = LGBMRegressor(
            **base_params,
            n_estimators=100000
        )

        # To track the competition-style metric during training, additionally
        # pass eval_metric=[r2_lgb] to fit().
        model.fit(X_train, y_train, sample_weight=weights_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_sample_weight=[weights_train, weights_val], callbacks=[log_evaluation(period=50)])

        val_preds = model.predict(X_val)

        oof[test_idx] = val_preds
        scored_mask[test_idx] = True

        # plot_metric creates its own figure and returns the axes, so no separate
        # plt.figure() is needed (it would only leave an empty figure behind).
        ax = lgb.plot_metric(model)
        ax.set_ylim(0, 1)
        plt.show()

        with open(models_path + f"lgb_model_{fold}.pkl", 'wb') as file:
            pickle.dump(model, file)

        print('Fold Val R2 score is:', r2_score(y_val, val_preds))
        print('Fold Val Weighted R2 score is:', r2_score(y_val, val_preds, sample_weight=weights_val))

        # SHAP is computed on a 0.1% sample of the validation rows; the fixed seed
        # keeps the sample (and hence the importances) reproducible across runs.
        sample_val = X_val.sample(frac=0.001, random_state=42)
        # Assumes X_val and y_val share the same default positional index after
        # to_pandas(), so the sampled index labels can be used with iloc.
        sample_y = y_val.iloc[sample_val.index]

        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X=sample_val, y=sample_y)
        shap_importance = np.abs(shap_values).mean(axis=0)

        shap_importance_list.append(shap_importance)

        print(f"{fold} Fold End")
        print('')

    # Score only the rows that were predicted. Including the untouched zeros of
    # the skipped folds would distort the overall out-of-fold R2.
    y_scored = y.to_numpy()[scored_mask]
    oof_scored = oof[scored_mask]
    weights_scored = weights.to_numpy()[scored_mask]

    print('R2 score is:', r2_score(y_scored, oof_scored))
    print('Weighted R2 score is:', r2_score(y_scored, oof_scored, sample_weight=weights_scored))

    return np.mean(shap_importance_list, axis=0)

In [None]:
# Train on the full engineered training frame; keeps the per-column mean |SHAP| values.
mean_shap_importance = lgb_train(train_df, y_sr)