In [1]:
import pandas as pd
import polars as pl
# Single source of truth for the partition under analysis; both readers below load this file.
PARTITION_PATH = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet"

# pandas copy: this is the frame that gets cleaned, split and modelled further down.
df = pd.read_parquet(PARTITION_PATH)
# polars copy: only used to count nulls per column, then deleted.
df_pl = pl.read_parquet(PARTITION_PATH)

In [2]:
# Every column lands in exactly one of three buckets:
#   * entirely null  -> collected in empty_columns (dropped further down)
#   * some nulls     -> collected in partially_empty_columns (imputed further down)
#   * no nulls       -> collected in fully_filled_columns (left untouched)

empty_columns = []
fully_filled_columns = []
partially_empty_columns = []

row_total = len(df_pl)
for column_name in df_pl.columns:
    # One null count per column is enough to decide the bucket.
    null_total = df_pl[column_name].null_count()

    if null_total == row_total:
        # No non-null value at all (also the outcome for a zero-row frame).
        empty_columns.append(column_name)
    elif null_total == 0:
        fully_filled_columns.append(column_name)
    else:
        partially_empty_columns.append(column_name)

# The polars frame is not referenced after this point; release its memory.
del df_pl

# Put rows in chronological order *before* imputing: forward/backward fill copies
# values from neighbouring rows, so the result depends on row order.
# sort_values returns a new frame, so the result has to be assigned back.
# kind="stable" keeps the existing relative order of rows sharing a (date_id, time_id).
df = df.sort_values(['date_id', 'time_id'], kind='stable')

# Drop the columns that contain no data at all.
df = df.drop(columns=empty_columns)

# Fill gaps in the partially empty columns separately for each symbol:
# carry the last seen value forward, then back-fill whatever is still missing
# at the start of a symbol's history.
# NOTE(review): the back-fill copies a later observation into earlier rows, and this
# runs before the train/val/test split - confirm that look-ahead is acceptable.
if partially_empty_columns:
    symbol_key = df['symbol_id']
    filled = df.groupby(symbol_key)[partially_empty_columns].ffill()
    filled = filled.groupby(symbol_key).bfill()
    # A symbol with no observation at all for a column is still NaN afterwards.
    df[partially_empty_columns] = filled

# Split data temporally - in partition0 there are 170 days and 849 unique time ids per day
df = df.sort_values(['date_id', 'time_id'])

# Rows per date_id in ascending date order, plus the running total of rows up to and
# including each date. groupby().size() lets the column be named explicitly, so the
# 'count' lookup does not depend on how the installed pandas names value_counts() output.
date_counts = df.groupby('date_id').size().to_frame('count')
date_counts['cumulative_sum'] = date_counts['count'].cumsum()

# Target share of rows for each split.
total = len(df)
train_percentage = 0.6
val_percentage = 0.2
test_percentage = 0.2
assert abs(train_percentage + val_percentage + test_percentage - 1.0) < 1e-9, \
    "split percentages must sum to 1"

# Approximate row budgets; the test budget absorbs the rounding remainder.
apprx_train_len = int(total * train_percentage)
apprx_val_len = int(total * val_percentage)
apprx_test_len = total - apprx_train_len - apprx_val_len

# Determine Splitting points

def split_func(row):
    """Return the split label for one date from its cumulative row count.

    Reads ``row['cumulative_sum']`` and compares it with the module-level
    budgets ``apprx_train_len`` and ``apprx_val_len``:

    * at most ``apprx_train_len``                      -> ``'Train'``
    * above that, at most the train + val budget       -> ``'Val'``
    * above the train + val budget                     -> ``'Test'``

    Raises:
        ValueError: if none of the comparisons holds (e.g. the value is NaN).
    """
    rows_so_far = row['cumulative_sum']
    val_cutoff = apprx_train_len + apprx_val_len

    if rows_so_far <= apprx_train_len:
        return 'Train'
    if rows_so_far <= val_cutoff:
        return 'Val'
    if rows_so_far > val_cutoff:
        return 'Test'
    # Only reachable when every comparison is False.
    raise ValueError

# Give every date its split label, based on that date's cumulative row count.
date_counts['Split'] = date_counts.apply(split_func, axis=1)
# Uncomment to see how many dates landed in each split:
# print(date_counts.Split.value_counts())

# One-row frames holding the boundary dates: the final Train date and the first Test date.
last_train_data = date_counts[date_counts['Split'] == 'Train'].iloc[-1:]
first_test_data = date_counts[date_counts['Split'] == 'Test'].iloc[:1]

# Once Splitting points are determined, then make the necessary splits
 
def split_func_df(row):
    """Return the split label for one data row from its ``date_id``.

    Uses the module-level one-row frames ``last_train_data`` and
    ``first_test_data``; the first index entry of each is the boundary date.

    * ``date_id`` up to and including the last train date -> ``'Train'``
    * strictly between the two boundary dates             -> ``'Val'``
    * from the first test date onwards                    -> ``'Test'``

    Raises:
        ValueError: if none of the comparisons holds (e.g. the value is NaN).
    """
    day = row['date_id']

    if day <= last_train_data.index[0]:
        return 'Train'

    # Looked up only once the row is known not to be a Train row.
    first_test_day = first_test_data.index[0]
    if day < first_test_day:
        return 'Val'
    if day >= first_test_day:
        return 'Test'
    # Only reachable when every comparison is False.
    raise ValueError


# Tag every row with its split using whole-column comparisons. This applies the same
# rules as split_func_df (Train first, then Val, then Test) without calling a Python
# function once per row.
train_end_date = last_train_data.index[0]
test_start_date = first_test_data.index[0]

if df['date_id'].isna().any():
    # The row-wise rules raise ValueError for a date_id that matches no comparison.
    raise ValueError("date_id contains missing values; cannot assign a split")

df['Split'] = 'Val'
df.loc[df['date_id'] >= test_start_date, 'Split'] = 'Test'
# Assigned last so that Train takes precedence, as in split_func_df.
df.loc[df['date_id'] <= train_end_date, 'Split'] = 'Train'

# A bare expression in the middle of a cell displays nothing, so print the row counts.
print(df['Split'].value_counts())

train_df = df[df.Split == 'Train']
val_df = df[df.Split == 'Val']
test_df = df[df.Split == 'Test']

# The three splits must cover every row exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# Import libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

# Group the column names by role.

TEMPORAL_FEATURES = ['date_id', 'time_id','symbol_id']

# feature_00 .. feature_78, restricted to the columns still present in df.
MARKET_FEATURES = [
    column
    for column in (f'feature_{i:02}' for i in range(79))
    if column in df.columns
]

# responder_0 .. responder_8 present in df, minus the prediction target.
RESPONDER_FEATURES = [
    column
    for column in (f'responder_{i}' for i in range(9))
    if column in df.columns
]
# list.remove raises ValueError if responder_6 is not among the columns.
RESPONDER_FEATURES.remove('responder_6')
# NOTE(review): the remaining responder_* columns are used as model inputs below -
# confirm they are actually available at prediction time.

SYMBOL_FEATURES = ['symbol_id']
# SYMBOL_FEATURES = [f'symbol_id_{i}' for i in range(max_symbol_id) if f'symbol_id_{i}' in df.columns]

# NOTE(review): not referenced in the visible cells; verify the name matches the
# actual column (including letter case) before using it.
WEIGHT = ['WEIGHT']

# Model inputs: market features, the non-target responders, then the symbol id.
ALL_FEATURES = MARKET_FEATURES + RESPONDER_FEATURES + SYMBOL_FEATURES

# Inputs and single-column target frames for the train and validation splits.
train_x, train_y = train_df[ALL_FEATURES], train_df[['responder_6']]

val_x, val_y = val_df[ALL_FEATURES], val_df[['responder_6']]

# VAR model testing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.api import VAR
from statsmodels.tsa.stattools import adfuller

# Build a vector autoregression over every column of train_x.
# NOTE(review): train_x holds rows for many symbol_id values and includes the
# symbol_id column itself; VAR will treat all rows as one ordered multivariate
# series - confirm that is intended, and that no NaN values remain after imputation.
model = VAR(train_x)  # train_x is a pandas DataFrame (train_df[ALL_FEATURES])
# Run lag-order selection. The returned object exposes .summary() (used below),
# so it is a results object rather than a plain integer lag.
lags = model.select_order()
print(lags.summary())
# Next steps, left disabled. Before enabling:
#   * model.fit() needs an integer lag, not the selection object above
#     (presumably one of the selected orders - confirm against the statsmodels docs);
#   * `train_data` and `steps` are not defined in the visible cells.
# results = model.fit(lags)  # Determine the optimal lag using `model.select_order()`
# forecast = results.forecast(train_data.values[-lags:], steps=steps)