### Reading the data

In [None]:
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
# The same partition file is loaded twice: `df` as a pandas frame, `df_pl` as a polars frame.
partition_path = "/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id=0/part-0.parquet"
df = pd.read_parquet(partition_path)
df_pl = pl.read_parquet(partition_path)

In [None]:
# Order rows by symbol, then by date and time, and renumber the index to match.
df = df.sort_values(by=['symbol_id', 'date_id', 'time_id'], ignore_index=True)
df.head(100)

### Exploring the data

In [None]:
# Print pandas' concise summary of the frame (columns, dtypes, memory usage).
df.info()

In [None]:
# Column names of the polars frame, bucketed by how much of each column is null.
empty_columns, fully_filled_columns, partially_empty_columns = [], [], []

for feature in df_pl.columns:
    column = df_pl[feature]
    total_rows = len(column)
    empty_rows = column.is_null().sum()
    nonempty_rows = total_rows - empty_rows

    # Pick the bucket: entirely null, no nulls at all, or a mix of both.
    if nonempty_rows == 0:
        bucket = empty_columns
    elif empty_rows == 0:
        bucket = fully_filled_columns
    else:
        bucket = partially_empty_columns
    bucket.append(feature)

    # One line of counts per column.
    print(f'{feature} : total - {total_rows} - empty - {empty_rows} - nonempty - {nonempty_rows}')


In [None]:
from IPython.display import display
# Show summary statistics for every column except the identifier and weight columns,
# one row per column, colour-coded with a gradient.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    summary_stats = df.describe().drop(['date_id', 'time_id', 'symbol_id', 'weight'], axis = 1)
    styled_stats = summary_stats.T.style.background_gradient(cmap='coolwarm')
    display(styled_stats)

In [None]:
# A cell only renders its last expression, so every lookup is displayed explicitly.
display(df[df.date_id == 0].symbol_id.unique())   # symbols present on date 0
display(df[df.date_id == 1].symbol_id.unique())   # symbols present on date 1
display(df[df.symbol_id == 1].date_id.unique())   # dates on which symbol 1 appears
# Dates on which symbol 1 appears but symbol 7 does not.
display(set(df[df.symbol_id == 1].date_id.unique()) - set(df[df.symbol_id == 7].date_id.unique()))

In [None]:
# Rows of symbol 1 and symbol 7 on date 0.
mask1 = (df.symbol_id == 1) & (df.date_id == 0)
mask2 = (df.symbol_id == 7) & (df.date_id == 0)
# time_ids seen for symbol 1 but not for symbol 7 on that date (first 10 only).
# Both sides must be time_id sets; comparing against date_id values is meaningless.
list(set(df[mask1].time_id.unique()) - set(df[mask2].time_id.unique()))[:10]

### Data Processing

#### One Hot Encoding

In [None]:
# One indicator column per symbol (named `symbol_id_<id>`), plus the largest id seen.
symbol_ids = df['symbol_id']
encoded = pd.get_dummies(symbol_ids, prefix='symbol_id')
max_symbol_id = symbol_ids.max()
print(max_symbol_id)
encoded.head()

In [None]:
# Append the one-hot columns. Copies left behind by an earlier run of this cell are
# removed first, so re-running does not create duplicate columns.
# `symbol_id` itself is kept: the filtering cell groups by it and TEMPORAL_FEATURES lists it.
# Model inputs are chosen through explicit column lists, so keeping it does not change them.
df = pd.concat([df.drop(columns=encoded.columns, errors='ignore'), encoded], axis = 1)
df.head()

#### Percentage Based Column Filtering

In [None]:
# Per-symbol missing-value handling:
#   * a column that is >= `percentage_threshold` % null for ANY symbol is dropped from `df`;
#   * any other column with nulls is forward- then backward-filled within that symbol's rows.
# `processed_groups` records, per symbol, what was dropped/filled and the null percentage
# of each filled column.
percentage_threshold = 10.0

# Grouping key. `symbol_id` is used when the column is present; if only its one-hot
# columns (`symbol_id_<id>`) remain, the id is rebuilt from them.
if 'symbol_id' in df.columns:
    symbol_key = df['symbol_id']
else:
    onehot_cols = [col for col in df.columns
                   if isinstance(col, str) and col.startswith('symbol_id_')]
    onehot_ids = pd.Index([int(col[len('symbol_id_'):]) for col in onehot_cols]).to_numpy()
    hot_position = df[onehot_cols].to_numpy().argmax(axis=1)
    symbol_key = pd.Series(onehot_ids[hot_position], index=df.index, name='symbol_id')

records = []            # one dict per symbol that needed any processing
columns_to_drop = []    # ordered and unique; dropped once after the loop

for symbol_id, group in df.groupby(symbol_key):
    total_rows = len(group)
    null_counts = group.isnull().sum()

    dropped_columns = []
    filled_columns = []
    percent_filled = []

    for feature, empty_rows in null_counts.items():
        if empty_rows == 0:
            continue
        empty_rows_percentage = (empty_rows / total_rows) * 100

        if empty_rows_percentage >= percentage_threshold:
            # Dropping is deferred: removing the column here would break later symbols
            # that reach the same column.
            dropped_columns.append(feature)
            if feature not in columns_to_drop:
                columns_to_drop.append(feature)
        elif feature not in columns_to_drop:
            # Forward-fill and backward-fill missing values within this symbol only.
            df.loc[group.index, feature] = group[feature].ffill().bfill()
            filled_columns.append(feature)
            percent_filled.append(empty_rows_percentage)

    # Only symbols that had something dropped or filled get a row in the report.
    if dropped_columns or filled_columns:
        records.append({
            'Symbol ID': symbol_id,
            'Dropped Columns': dropped_columns,
            'Filled Columns': filled_columns,
            '% Filled': percent_filled,
        })

df = df.drop(columns=columns_to_drop)

# Built once from the collected records instead of concatenating inside the loop.
processed_groups = pd.DataFrame(
    records, columns=['Symbol ID', 'Dropped Columns', 'Filled Columns', '% Filled']
)
processed_groups

#### Temporal Splitting

In [None]:
# Chronological row order, then the number of rows recorded for each date.
df = df.sort_values(by=['date_id', 'time_id'])
date_counts = df['date_id'].value_counts()

In [None]:
# Rows per date in date order, with a running total used to place the split boundaries.
# Recomputed from `df` so the cell can be re-run, and the counts column is named
# explicitly because `value_counts` only calls it 'count' from pandas 2.0 onwards.
date_counts = (
    df['date_id']
    .value_counts()
    .sort_index()
    .rename('count')
    .to_frame()
)
date_counts['cumulative_sum'] = date_counts['count'].cumsum()
date_counts.head()

In [None]:
# Target split fractions and the approximate row counts they correspond to.
# The test size is whatever remains after flooring the train and validation sizes.
train_percentage, val_percentage, test_percentage = 0.6, 0.2, 0.2
total = len(df)
apprx_train_len = int(train_percentage * total)
apprx_val_len = int(val_percentage * total)
apprx_test_len = total - (apprx_train_len + apprx_val_len)

def split_func(row):
    """Label one `date_counts` row as 'Train', 'Val' or 'Test'.

    The label depends on where the row's 'cumulative_sum' falls relative to the
    module-level `apprx_train_len` and `apprx_val_len` row budgets.

    Raises:
        ValueError: if the value compares as neither <= nor > the boundaries (e.g. NaN).
    """
    running_total = row['cumulative_sum']
    val_end = apprx_train_len + apprx_val_len

    if running_total <= apprx_train_len:
        return 'Train'
    if running_total <= val_end:
        return 'Val'
    if running_total > val_end:
        return 'Test'
    raise ValueError

# Label every date with its split, then show how many dates fall in each split.
split_labels = date_counts.apply(split_func, axis = 'columns')
date_counts['Split'] = split_labels
print(date_counts['Split'].value_counts())
date_counts.head()

In [None]:
# One-row frames holding the last 'Train' date and the first 'Test' date;
# their index value is the boundary date_id.
is_train_date = date_counts['Split'] == 'Train'
is_test_date = date_counts['Split'] == 'Test'
last_train_data = date_counts[is_train_date].iloc[-1:]
first_test_data = date_counts[is_test_date].iloc[:1]


In [None]:
%%time
def split_func_df(row):
    """Label one data row as 'Train', 'Val' or 'Test' from its 'date_id'.

    Boundaries are read from the module-level one-row frames `last_train_data`
    and `first_test_data` (their first index value is the boundary date).

    Raises:
        ValueError: if the date compares as none of the three ranges (e.g. NaN).
    """
    day = row['date_id']

    train_end = last_train_data.index[0]
    if day <= train_end:
        return 'Train'

    # Looked up only once the row is known not to be a training row.
    test_start = first_test_data.index[0]
    if day > train_end and day < test_start:
        return 'Val'
    if day >= test_start:
        return 'Test'
    raise ValueError


# Label every row by date boundary with vectorized comparisons. This applies the same
# rule as split_func_df but avoids one Python call per row.
train_end = last_train_data.index[0]
test_start = first_test_data.index[0]

df['Split'] = 'Val'
df.loc[df['date_id'] >= test_start, 'Split'] = 'Test'
# 'Train' is assigned last so it takes precedence, matching the order of checks in split_func_df.
df.loc[df['date_id'] <= train_end, 'Split'] = 'Train'
df['Split'].value_counts()

In [None]:
# One frame per split label, selected on the 'Split' column.
train_df, val_df, test_df = (
    df[df['Split'] == label] for label in ('Train', 'Val', 'Test')
)

### Todo:
- Train on the common features only
- Train on all features

In [None]:
# Feature columns listed as "common"; names are `feature_` plus the zero-padded number.
_common_feature_numbers = (
    5, 6, 7, 9, 10, 11, 12, 13, 14,
    20, 22, 23, 24, 25, 28, 29,
    30, 34, 35, 36, 38,
    48, 49, 59,
    60, 61, 67, 68, 69,
    70, 71, 72,
)
common_features = [f'feature_{number:02}' for number in _common_feature_numbers]


#### Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# `feature_XX` columns still present in `df`, in `df` column order.
possible_feature_cols = [f'feature_{i:02}' for i in range(0,79)]
df_feature_cols = [col for col in df.columns if col in possible_feature_cols]

# Fit on the training rows only, so validation/test statistics do not leak into the
# scaler, then apply the same transform to every row.
# NOTE: not idempotent. Re-running this cell rescales already-scaled values.
std_scaler = StandardScaler()
std_scaler.fit(df.loc[df['Split'] == 'Train', df_feature_cols])
df[df_feature_cols] = std_scaler.transform(df[df_feature_cols])

# The split frames were copied out of `df` before scaling, so they are rebuilt here;
# otherwise the models would be trained on unscaled features.
train_df = df[df.Split == 'Train']
val_df = df[df.Split == 'Val']
test_df = df[df.Split == 'Test']

#### Setting the features

In [None]:
# Column groups used to assemble model inputs; only columns present in `df` are listed.
TEMPORAL_FEATURES = ['date_id', 'time_id','symbol_id']
MARKET_FEATURES = [f'feature_{i:02}' for i in range(0,79) if f'feature_{i:02}' in df.columns]
# responder_6 is excluded because it is the prediction target.
RESPONDER_FEATURES = [f'responder_{i}' for i in range(0,9)
                      if i != 6 and f'responder_{i}' in df.columns]
# `+ 1` so the one-hot column of the highest symbol id is included as well.
SYMBOL_FEATURES = [f'symbol_id_{i}' for i in range(max_symbol_id + 1) if f'symbol_id_{i}' in df.columns]
# The sample-weight column is named 'weight' (lowercase) in the data.
WEIGHT = ['weight']

In [None]:
# Build a NEW list. Aliasing MARKET_FEATURES and extending it with `+=` would mutate
# MARKET_FEATURES itself and append the responders again on every re-run.
ALL_FEATURES = MARKET_FEATURES + RESPONDER_FEATURES + SYMBOL_FEATURES

In [None]:
# Model inputs (ALL_FEATURES) and the single-column target frame for train and validation.
train_x, train_y = train_df[ALL_FEATURES], train_df[['responder_6']]
val_x, val_y = val_df[ALL_FEATURES], val_df[['responder_6']]

## Model

In [None]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

### Ridge Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np


# Ridge Regression
# Typical values of alpha are 0.01 to 10,000

# 50 alpha values from 0.01 to 10000 in logarithmic scaling
alpha_counter = np.logspace(-2, 4, 50)

# Fit on the training split and score on the validation split. The names X_train /
# X_test used previously are not defined anywhere in this notebook.
ridge_mse = []
for a in alpha_counter:
    ridge = Ridge(alpha=a)
    ridge.fit(train_x, train_y)
    ridge_y_pred = ridge.predict(val_x)
    ridge_mse.append(mean_squared_error(val_y, ridge_y_pred))

plt.plot(alpha_counter, ridge_mse)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Validation MSE')
plt.title('Ridge: validation MSE vs alpha')
plt.show()

# argmin() finds the index of the minimum value
min_mse_index = np.argmin(ridge_mse)

print(f'Minimum MSE = {ridge_mse[min_mse_index]} at alpha = {alpha_counter[min_mse_index]}')

In [None]:
# Lasso sweep over 50 alpha values from 0.001 to 10 in logarithmic scaling.
Lalpha_counter = np.logspace(-3, 1, 50)

# Fit on the training split and score on the validation split. The names X_train /
# X_test used previously are not defined anywhere in this notebook.
lasso_mse = []
for a in Lalpha_counter:
    lasso = Lasso(alpha=a)
    lasso.fit(train_x, train_y)
    lasso_y_pred = lasso.predict(val_x)
    lasso_mse.append(mean_squared_error(val_y, lasso_y_pred))

plt.plot(Lalpha_counter, lasso_mse)
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('Validation MSE')
plt.title('Lasso: validation MSE vs alpha')
plt.show()

# argmin() finds the index of the minimum value
min_mse_index = np.argmin(lasso_mse)

print(f'Minimum MSE = {lasso_mse[min_mse_index]} at alpha = {Lalpha_counter[min_mse_index]}')

### Checking Elastic Results

In [None]:
# ElasticNet grid: every (alpha, l1_ratio) pair is fitted on the training split and
# scored on the validation split.
l1_alpha_values = np.logspace(-3, 2, 10)  # 10 values from 0.001 to 100
l1_ratios = np.linspace(0.1, 1.0, 10)  # 10 values from 0.1 to 1.0

# Rows index alpha, columns index l1_ratio.
elastic_mse_results = np.zeros((len(l1_alpha_values), len(l1_ratios)))

for i, a in enumerate(l1_alpha_values):
    for j, l1 in enumerate(l1_ratios):
        elastic_net = ElasticNet(alpha=a, l1_ratio=l1)
        # X_train / X_test used previously are not defined anywhere in this notebook.
        elastic_net.fit(train_x, train_y)
        elastic_net_y_pred = elastic_net.predict(val_x)
        elastic_mse_results[i, j] = mean_squared_error(val_y, elastic_net_y_pred)

# Convert the flat argmin position back to (alpha index, l1_ratio index).
min_mse_index = np.unravel_index(np.argmin(elastic_mse_results, axis=None), elastic_mse_results.shape)
optimal_alpha = l1_alpha_values[min_mse_index[0]]
optimal_l1_ratio = l1_ratios[min_mse_index[1]]

print(f"Optimal Alpha: {optimal_alpha}")
print(f"Optimal L1 Ratio: {optimal_l1_ratio}")
print(f"Minimum MSE: {elastic_mse_results[min_mse_index]}")

### Decision Trees and XGBoost

In [None]:
# Import libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

# Registry of regression models, keyed by display name (insertion order is the run order).
regressors = {}
regressors["Decision Tree"] = DecisionTreeRegressor(max_depth=5, random_state=42)
regressors["Linear Regression"] = LinearRegression()
regressors["XGBoost"] = xgb.XGBRegressor(tree_method="hist")

# Registry of scoring functions, keyed by display name; each is called as fn(y_true, y_pred).
metrics = dict([
    ("Mean Squared Error", mean_squared_error),
    ("Mean Absolute Error", mean_absolute_error),
    ("R^2 Score", r2_score),
    ("Explained Variance", explained_variance_score),
])

In [None]:
import time

# Fit every registered regressor on the training split and report each metric on the
# training (and optionally validation) split, together with the fitting time.
use_val = True
results_train = {}   # model name -> list of train scores, in `metrics` order
results_val = {}     # model name -> list of validation scores, in `metrics` order

# The loop variable is `model_name` rather than `model`, so the string key never
# shares a name with the Keras `model` object defined later in the notebook.
for model_name, regressor in regressors.items():
    print(f"Model: {model_name}")

    # Time the fit only; perf_counter is monotonic, unlike the wall clock.
    start_time = time.perf_counter()
    regressor.fit(train_x, train_y)
    elapsed_time = time.perf_counter() - start_time

    # Predict and calculate metrics
    pred_y = regressor.predict(train_x)
    if use_val:
        pred_y_val = regressor.predict(val_x)

    train_list = []
    val_list = []
    for metric_name, metric_fn in metrics.items():
        score = metric_fn(train_y, pred_y)
        result_string = f"{metric_name}: Train - {round(score, 4)}"
        if use_val:
            score2 = metric_fn(val_y, pred_y_val)
            result_string += f" Val - {round(score2,4)}"
            val_list.append(score2)
        print(result_string)
        train_list.append(score)

    # Save results
    results_train[model_name] = train_list
    if use_val:
        results_val[model_name] = val_list

    print(f"Time taken: {elapsed_time:.2f} seconds")
    print()

#### Neural Nets

In [None]:
import tensorflow as tf

# Clear any existing Keras session state, then fix TensorFlow's random seed,
# before the network is built.
tf.keras.backend.clear_session()
tf.random.set_seed(42)

In [None]:
# Per-sample input shape: everything after the row dimension of the training inputs.
# `X_train` is not defined anywhere in this notebook; the training inputs are `train_x`.
input_shape = train_x.shape[1:]
print(input_shape)

In [None]:
# Two hidden Dense blocks (Dense -> BatchNormalization -> ReLU) and one linear output unit.
# The input shape comes from `train_x`; `X_train_scaled` is not defined in this notebook.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=train_x.shape[1:]),
    tf.keras.layers.Dense(300, kernel_initializer="he_normal", use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),
    tf.keras.layers.Dense(100, kernel_initializer="he_normal", use_bias=False),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Activation("relu"),

    # Single output neuron for regression
    # We want to predict just Responder_6
    tf.keras.layers.Dense(1, activation=None)
])

# Register the network under the key the compile/fit and evaluation cells look up.
regressors['Neural Net'] = model

# NOTE(review): the output file name looks like a leftover from another notebook; rename if desired.
tf.keras.utils.plot_model(model, "my_fashion_mnist_model.png", show_shapes=True)

In [None]:
# Compile the registered network with MSE loss and train it on float32 arrays;
# the validation split is passed only when `use_val` is set.
neural_net = regressors['Neural Net']
neural_net.compile(loss="mse", optimizer="adam", metrics=["mse", "mae"])

train_inputs = train_x.values.astype('float32')
train_targets = train_y.values.astype('float32')
if use_val:
    validation_arrays = (val_x.values.astype('float32'), val_y.values.astype('float32'))
else:
    validation_arrays = None

history = neural_net.fit(
    train_inputs,
    train_targets,
    batch_size=1024,
    validation_data=validation_arrays,
    epochs=30,
)

In [None]:
# Learning curves: one line per quantity recorded in the training history.
history_frame = pd.DataFrame(history.history)
history_ax = history_frame.plot(
    figsize=(8, 5),
    xlim=[0, 29],
    ylim=[0, 1],
    grid=True,
    xlabel="Epoch",
    style=["r--", "r--.", "b-", "b-*"],
)
history_ax.legend(loc="lower left")
plt.show()

In [None]:
# Model inputs and the single-column target frame for the held-out test split.
test_x, test_y = test_df[ALL_FEATURES], test_df[['responder_6']]

In [None]:
# Test-set predictions from the neural network. The registry is keyed by name, so it is
# looked up as 'Neural Net' (as in the compile/fit cell), not by the `model` variable.
pred_y_test = regressors['Neural Net'].predict(test_x.values.astype('float32'), batch_size = 1024)

In [None]:
# Performance on the held-out test split: every metric for every registered model.
results_test = {}   # model name -> list of test scores, in `metrics` order

# The loop variable is `model_name` rather than `model`, so the Keras `model` object
# is not overwritten by a string key.
for model_name, regressor in regressors.items():
    print(f"Model: {model_name}")
    if 'Neural Net' in model_name:
        # The network is fed float32 arrays in batches, as during training.
        pred_y_test = regressor.predict(test_x.values.astype('float32'),
                                        batch_size = 1024)
    else:
        pred_y_test = regressor.predict(test_x)

    results = []
    for metric_name, metric_fn in metrics.items():
        score = metric_fn(test_y, pred_y_test)
        print(f"{metric_name}: {round(score,4)}")
        results.append(score)
    results_test[model_name] = results

    print()

### Feature Importance

In [None]:
from xgboost import plot_importance
# Create the axes explicitly and pass them in. Without `ax`, plot_importance opens its
# own new figure, so a preceding plt.figure(figsize=...) only sizes an unused blank figure.
fig, ax = plt.subplots(figsize = (10,20))
plot_importance(regressors["XGBoost"], ax = ax)
fig.savefig('feature_importance.png', dpi = 300)