In [1]:
import pandas as pd
import polars as pl
df = pd.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet")
df_pl = pl.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet")

# Three Cases
# 1. Fully Empty columns : we can just drop them
# 2. Partially Empty Columns: we can impute them
# 3. Full Columns : No imputation necessary

empty_columns = []
fully_filled_columns = []
partially_empty_columns = []

for feature in df_pl.columns:
    # Count empty and non-empty rows
    empty_rows = df_pl[feature].is_null().sum()
    nonempty_rows = len(df_pl[feature]) - empty_rows

    # Classify the columns based on the counts
    if nonempty_rows == 0:
        empty_columns.append(feature)
    elif empty_rows == 0:
        fully_filled_columns.append(feature)
    else:
        partially_empty_columns.append(feature)

# We don't use df_pl again, so delete it to conserve memory
del df_pl

df.sort_values(['time_id','date_id'])

# drop empty columns using imputer
df = df.drop(empty_columns, axis = 1)

# Fill in partially empty columns
for feature in partially_empty_columns:
    df[feature] = df.groupby('symbol_id')[feature].transform(lambda x: x.ffill().bfill())

# Split data temporally - in partition0 there are 170 days and 849 unique time ids per day
df = df.sort_values(['date_id', 'time_id'])
date_counts = df.date_id.value_counts()
date_counts = pd.DataFrame(date_counts.sort_index())
date_counts['cumulative_sum'] = date_counts['count'].cumsum()

total = len(df)
train_percentage = 0.6
val_percentage = 0.2
test_percentage = 0.2
apprx_train_len = int(total*train_percentage)
apprx_val_len = int(total*val_percentage) 
apprx_test_len = total - apprx_train_len - apprx_val_len

# Determine Splitting points

def split_func(row):
    s = row['cumulative_sum']
    if s <= apprx_train_len:
        return 'Train'
    elif (s > apprx_train_len) and (s <= apprx_train_len + apprx_val_len):
        return 'Val'
    elif (s > apprx_train_len + apprx_val_len):
        return 'Test'
    else:
        raise ValueError

date_counts['Split'] = date_counts.apply(split_func, axis = 1)
# print(date_counts.Split.value_counts())

last_train_data = date_counts[date_counts.Split == 'Train'].tail(1) 
first_test_data = date_counts[date_counts.Split == 'Test'].head(1)

# Once Splitting points are determined, then make the necessary splits
 
def split_func_df(row):
    s = row['date_id']
    if s <= last_train_data.index[0]:
        return 'Train'
    elif (s > last_train_data.index[0]) and (s < first_test_data.index[0]):
        return 'Val'
    elif (s >= first_test_data.index[0]):
        return 'Test'
    else:
        raise ValueError

# Potentially encode time_id as sine/cosine to capture cyclical nature
# train_df['time_sin'] = np.sin(2 * np.pi * train_df['time_id'] / max_time_id)
# train_df['time_cos'] = np.cos(2 * np.pi * train_df['time_id'] / max_time_id)


df['Split'] = df.apply(split_func_df, axis = 1)
df['Split'].value_counts()

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)

train_df = scaled_df[df.Split == 'Train']
val_df = scaled_df[df.Split == 'Val']
test_df = scaled_df[df.Split == 'Test']

# 

# Import libraries
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.linear_model import LinearRegression
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

# # Setup steps to sort columns into different categories

# TEMPORAL_FEATURES = ['date_id', 'time_id','symbol_id']
# MARKET_FEATURES = [f'feature_{i:02}' for i in range(0,79) if f'feature_{i:02}' in df.columns]
# RESPONDER_FEATURES = [f'responder_{i}' for i in range(0,9) if f'responder_{i}' in df.columns]
# RESPONDER_FEATURES.remove('responder_6')
# SYMBOL_FEATURES = ['symbol_id']
# # SYMBOL_FEATURES = [f'symbol_id_{i}' for i in range(max_symbol_id) if f'symbol_id_{i}' in df.columns]
# WEIGHT = ['WEIGHT']

# # Assemble Features
# ALL_FEATURES = MARKET_FEATURES + RESPONDER_FEATURES
# ALL_FEATURES = ALL_FEATURES + SYMBOL_FEATURES

# train_x = train_df[ALL_FEATURES]
# train_y = train_df[['responder_6']]

# val_x = val_df[ALL_FEATURES]
# val_y = val_df[['responder_6']]

ValueError: could not convert string to float: 'Train'

# LSTM model testing

In [2]:
import numpy as np
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        seq = data[i:i+sequence_length]
        target = data['responder_6'].iloc[i+sequence_length]
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)

# Create sequences
features = ['date_id', 'time_id', 'feature_06', 'feature_07', 'feature_05', 'feature_68', 'symbol_id', 'responder_6']
X_train, y_train = create_sequences(train_df[features], sequence_length=50)
X_val, y_val = create_sequences(val_df[features], sequence_length=50)

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense

# model = Sequential()
# model.add(LSTM(50, activation='relu', input_shape=(window_size, num_features)))
# model.add(Dense(1))  # Predicting a single value
# model.compile(optimizer='adam', loss='mse')
# model.fit(X_train, y_train, epochs=50, batch_size=32)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.mixed_precision import set_global_policy
from tensorflow.keras.callbacks import TensorBoard

# Enable mixed precision
# set_global_policy('mixed_float16')

# # Enable XLA optimization
# tf.config.optimizer.set_jit(True)

# Define the model
model = Sequential([
    LSTM(64, activation='tanh', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='tanh'),
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Create TensorFlow datasets
batch_size = 128  # Adjust for optimal GPU utilization
train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_data = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# TensorBoard callback with profiling
tensorboard_cb = TensorBoard(log_dir="./logs", profile_batch='2,10')

# Train the model
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=50,
    verbose=1,
    callbacks=[tensorboard_cb]
)


  super().__init__(**kwargs)


Epoch 1/50


I0000 00:00:1732942060.097745     122 service.cc:145] XLA service 0x7cdcf80130f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1732942060.097789     122 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1732942060.097795     122 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1732942060.353761     122 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 8ms/step - loss: 0.6930 - mae: 0.5415 - val_loss: 0.7864 - val_mae: 0.5747
Epoch 2/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 8ms/step - loss: 0.6879 - mae: 0.5369 - val_loss: 0.7864 - val_mae: 0.5747
Epoch 3/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 8ms/step - loss: 0.6873 - mae: 0.5366 - val_loss: 0.7864 - val_mae: 0.5747
Epoch 4/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 8ms/step - loss: 0.6883 - mae: 0.5368 - val_loss: 0.7864 - val_mae: 0.5747
Epoch 5/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 8ms/step - loss: 0.6872 - mae: 0.5364 - val_loss: 0.7863 - val_mae: 0.5747
Epoch 6/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 8ms/step - loss: 0.6868 - mae: 0.5364 - val_loss: 0.7863 - val_mae: 0.5747
Epoch 7/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 8ms/step - loss: 0.5649 - mae: 0.5079 - val_loss: 0.7072 - val_mae: 0.5572
Epoch 43/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.5833 - mae: 0.5117 - val_loss: 0.6800 - val_mae: 0.5560
Epoch 44/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.5564 - mae: 0.5024 - val_loss: 0.7145 - val_mae: 0.5637
Epoch 45/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 8ms/step - loss: 0.5648 - mae: 0.5041 - val_loss: 0.7170 - val_mae: 0.5595
Epoch 47/50
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.5708 - mae: 0.5114 - val_loss: 0.7365 - val_mae: 0.5616
Epoch 48/50
[1m5766/9047[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m21s[0m 7ms/step - loss: 0.6566 - mae: 0.5406