In [1]:
import pandas as pd
import polars as pl
df = pd.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet")
df_pl = pl.read_parquet("/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet")

# Three Cases
# 1. Fully Empty columns : we can just drop them
# 2. Partially Empty Columns: we can impute them
# 3. Full Columns : No imputation necessary

empty_columns = []
fully_filled_columns = []
partially_empty_columns = []

for feature in df_pl.columns:
    # Count empty and non-empty rows
    empty_rows = df_pl[feature].is_null().sum()
    nonempty_rows = len(df_pl[feature]) - empty_rows

    # Classify the columns based on the counts
    if nonempty_rows == 0:
        empty_columns.append(feature)
    elif empty_rows == 0:
        fully_filled_columns.append(feature)
    else:
        partially_empty_columns.append(feature)

# We don't use df_pl again, so delete it to conserve memory
del df_pl

df.sort_values(['time_id','date_id'])

# drop empty columns using imputer
df = df.drop(empty_columns, axis = 1)

# Fill in partially empty columns
for feature in partially_empty_columns:
    df[feature] = df.groupby('symbol_id')[feature].transform(lambda x: x.ffill().bfill())

# Split data temporally - in partition0 there are 170 days and 849 unique time ids per day
df = df.sort_values(['date_id', 'time_id'])
date_counts = df.date_id.value_counts()
date_counts = pd.DataFrame(date_counts.sort_index())
date_counts['cumulative_sum'] = date_counts['count'].cumsum()

# Organize columns into different groups
TIME_FEATURES = ['relative_timestamp']
MARKET_FEATURES = [f'feature_{i:02}' for i in range(0,79) if f'feature_{i:02}' in df.columns]
RESPONDER_FEATURES = [f'responder_{i}' for i in range(0,9) if f'responder_{i}' in df.columns]
# RESPONDER_FEATURES.remove('responder_6')
SYMBOL_FEATURES = ['symbol_id']

# Normalize Data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols_to_normalize = MARKET_FEATURES + RESPONDER_FEATURES
df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])

# Determine splitting points
total = len(df)
train_percentage = 0.6
val_percentage = 0.2
test_percentage = 0.2
apprx_train_len = int(total*train_percentage)
apprx_val_len = int(total*val_percentage) 
apprx_test_len = total - apprx_train_len - apprx_val_len

# Determine Splitting points

def split_func(row):
    s = row['cumulative_sum']
    if s <= apprx_train_len:
        return 'Train'
    elif (s > apprx_train_len) and (s <= apprx_train_len + apprx_val_len):
        return 'Val'
    elif (s > apprx_train_len + apprx_val_len):
        return 'Test'
    else:
        raise ValueError

date_counts['Split'] = date_counts.apply(split_func, axis = 1)
# print(date_counts.Split.value_counts())

last_train_data = date_counts[date_counts.Split == 'Train'].tail(1) 
first_test_data = date_counts[date_counts.Split == 'Test'].head(1)

# Once Splitting points are determined, then make the necessary splits
 
def split_func_df(row):
    s = row['date_id']
    if s <= last_train_data.index[0]:
        return 'Train'
    elif (s > last_train_data.index[0]) and (s < first_test_data.index[0]):
        return 'Val'
    elif (s >= first_test_data.index[0]):
        return 'Test'
    else:
        raise ValueError

# Potentially encode time_id as sine/cosine to capture cyclical nature
# train_df['time_sin'] = np.sin(2 * np.pi * train_df['time_id'] / max_time_id)
# train_df['time_cos'] = np.cos(2 * np.pi * train_df['time_id'] / max_time_id)

df['Split'] = df.apply(split_func_df, axis = 1)
df['Split'].value_counts()

train_df = df[df.Split == 'Train']
val_df = df[df.Split == 'Val']
test_df = df[df.Split == 'Test']

# LSTM model testing

In [2]:
import numpy as np
def create_sequences(data, sequence_length):
    X, y = [], []
    for i in range(len(data) - sequence_length):
        seq = data[i:i+sequence_length]
        target = data['responder_6'].iloc[i+sequence_length]
        X.append(seq)
        y.append(target)
    return np.array(X), np.array(y)

# Create sequences
features = ['date_id', 'time_id', 'feature_06', 'feature_07', 'feature_05', 'feature_68', 'symbol_id', 'responder_6']
X_train, y_train = create_sequences(train_df[features], sequence_length=50)
X_val, y_val = create_sequences(val_df[features], sequence_length=50)

In [3]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense

# model = Sequential()
# model.add(LSTM(50, activation='relu', input_shape=(window_size, num_features)))
# model.add(Dense(1))  # Predicting a single value
# model.compile(optimizer='adam', loss='mse')
# model.fit(X_train, y_train, epochs=50, batch_size=32)

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.mixed_precision import set_global_policy
from tensorflow.keras.callbacks import TensorBoard

# Enable mixed precision
set_global_policy('mixed_float16')

# Enable XLA optimization
tf.config.optimizer.set_jit(True)

# Define the model
model = Sequential([
    LSTM(64, activation='tanh', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(32, activation='tanh'),
    Dropout(0.2),
    Dense(1)  # Output layer for regression
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Create TensorFlow datasets
batch_size = 128  # Adjust for optimal GPU utilization
train_data = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_data = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# TensorBoard callback with profiling
tensorboard_cb = TensorBoard(log_dir="./logs", profile_batch='2,10')

# Train the model
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=20,
    verbose=1,
    callbacks=[tensorboard_cb]
)


  super().__init__(**kwargs)


Epoch 1/20


I0000 00:00:1733035403.709660     120 service.cc:145] XLA service 0x5cbd87e92d40 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1733035403.709703     120 service.cc:153]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
I0000 00:00:1733035403.709707     120 service.cc:153]   StreamExecutor device (1): Tesla T4, Compute Capability 7.5
I0000 00:00:1733035403.852975     123 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 8ms/step - loss: 0.9134 - mae: 0.6206 - val_loss: 1.0376 - val_mae: 0.6602
Epoch 2/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 8ms/step - loss: 0.9069 - mae: 0.6163 - val_loss: 1.0376 - val_mae: 0.6602
Epoch 3/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 8ms/step - loss: 0.9067 - mae: 0.6163 - val_loss: 1.0375 - val_mae: 0.6601
Epoch 4/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.9063 - mae: 0.6162 - val_loss: 1.0375 - val_mae: 0.6602
Epoch 5/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.9055 - mae: 0.6161 - val_loss: 1.0375 - val_mae: 0.6602
Epoch 6/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.9021 - mae: 0.6166 - val_loss: 1.0348 - val_mae: 0.6601
Epoch 7/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8

In [4]:
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=20,
    verbose=1,
    callbacks=[tensorboard_cb]
)

Epoch 1/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.7421 - mae: 0.5828 - val_loss: 0.8990 - val_mae: 0.6190
Epoch 2/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.7291 - mae: 0.5784 - val_loss: 0.8653 - val_mae: 0.6246
Epoch 3/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 8ms/step - loss: 0.6961 - mae: 0.5634 - val_loss: 0.9520 - val_mae: 0.6333
Epoch 4/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 8ms/step - loss: 0.6916 - mae: 0.5643 - val_loss: 0.9896 - val_mae: 0.6523
Epoch 5/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 8ms/step - loss: 0.7195 - mae: 0.5762 - val_loss: 0.9825 - val_mae: 0.6547
Epoch 6/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 8ms/step - loss: 0.8013 - mae: 0.5975 - val_loss: 0.8977 - val_mae: 0.6291
Epoch 7/20
[1m9047/9047[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

In [None]:
import matplotlib.pyplot as plt

# Get the data for responder_6 in the test set
responder_6_test_data = test_df[['date_id', 'time_id', 'feature_06', 'feature_07', 'feature_05', 'feature_68', 'symbol_id']]  # Assuming X_test is your test set features
responder_6_actual_values = test_df[responder_6]  # Assuming y_test is your test set labels

# Make predictions for responder_6's test data
responder_6_predictions = model.predict(responder_6_test_data)

# Plot the actual vs predicted values
plt.figure(figsize=(10, 6))  # Adjust figure size as needed 
plt.scatter(range(len(responder_6_actual_values)), responder_6_actual_values, label='Actual Values')
plt.plot(range(len(responder_6_predictions.squeeze())), responder_6_predictions.squeeze(), label='Predicted Values')  # Squeeze to remove extra dimension
plt.xlabel('Time Step')
plt.ylabel('Values')
plt.title(f'Responder 6 - Actual vs Predicted Values')
plt.legend()
plt.grid(True)
plt.show()