In [1]:
#import cudf as pd #Change1
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
import warnings
from itertools import combinations
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')



print('We will use RAPIDS version',pd.__version__)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score

train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
plt.rcParams.update({'font.size': 16})

train_model = True
model_to_restore = "model_cnn_1.hdf5"


We will use RAPIDS version 1.5.3


In [2]:
features = [col for col in train.columns if col not in ['row_id', 'time_id', 'date_id', 'target']]
print(features)
print(len(features))

train['bid_size_not_norm']=train['bid_size']
train['ask_size_not_norm']=train['ask_size']


['stock_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']
13


In [24]:
stock_id_ranges = {
    stock_id: [group['target'].min(), group['target'].max()]
    for stock_id, group in train.groupby('stock_id')
}



In [3]:
train['far_price'].fillna(0, inplace=True)
train['near_price'].fillna(1, inplace=True)

cols_group_by = ['date_id', 'seconds_in_bucket']
cat_cols = ['stock_id', 'imbalance_buy_sell_flag']
cols_fill_nan = [
    'imbalance_size', 'reference_price', 'matched_size', 'wap',
    'bid_price', 'bid_size', 'ask_price', 'ask_size', 
    'stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'bid_size_not_norm', 'ask_size_not_norm']
train_grouped_median = train.groupby(cols_group_by)[cols_fill_nan].transform('median')

train[cols_fill_nan] = train[cols_fill_nan].fillna(train_grouped_median)

display(train.isnull().sum())
print(f"before drop dataset size: {len(train)}")
train.dropna(inplace=True)
print(f"after drop dataset size: {len(train)}")

stock_id                    0
date_id                     0
seconds_in_bucket           0
imbalance_size              0
imbalance_buy_sell_flag     0
reference_price             0
matched_size                0
far_price                   0
near_price                  0
bid_price                   0
bid_size                    0
ask_price                   0
ask_size                    0
wap                         0
target                     88
time_id                     0
row_id                      0
bid_size_not_norm           0
ask_size_not_norm           0
dtype: int64

before drop dataset size: 5237980
after drop dataset size: 5237892


In [4]:
import joblib
from sklearn.preprocessing import RobustScaler, QuantileTransformer

num_cols = [feature for feature in features if feature not in cat_cols]
print(len(num_cols))

train[num_cols] = train[num_cols].astype('float32')
if train_model:
    robust_scaler = QuantileTransformer(output_distribution='normal')
    train[num_cols] = robust_scaler.fit_transform(train[num_cols])
    joblib.dump(robust_scaler, 'quantile_transformer.pkl')
else:
    #robust_scaler = joblib.load('robust_scaler.pkl')
    robust_scaler = QuantileTransformer(output_distribution='normal')

    train[num_cols] = robust_scaler.fit_transform(train[num_cols])




11


In [32]:
# 🧹 Function to reduce memory usage of a Pandas DataFrame
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """
    
    # 📏 Calculate the initial memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2

    # 🔄 Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Check if the column's data type is not 'object' (i.e., numeric)
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            # Check if the column's data type is an integer
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Check if the column's data type is a float
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    # ℹ️ Provide memory optimization information if 'verbose' is True
    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    # 🔄 Return the DataFrame with optimized memory usage
    return df

# 🏎️ Import Numba for just-in-time (JIT) compilation and parallel processing
from numba import njit, prange

# 📊 Function to compute triplet imbalance in parallel using Numba
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # 🔁 Loop through all combinations of triplets
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        
        # 🔁 Loop through rows of the DataFrame
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            # 🚫 Prevent division by zero
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

# 📈 Function to calculate triplet imbalance for given price data and a DataFrame
def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance using the Numba-optimized function
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features

# 📊 Function to generate imbalance features
def imbalance_features(df):
    import cudf
    df = cudf.from_pandas(df)
    
    # Define lists of price and size-related column names
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # V1 features
    # Calculate various features using Pandas eval function
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("ask_price + bid_price")/2
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("imbalance_size-matched_size")/df.eval("matched_size+imbalance_size")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    
    # Create features for pairwise price imbalances
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
        
    # V2 features
    # Calculate additional features
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    
    # Calculate various statistical aggregation features
    
        
    # V3 features
    # Calculate shifted and return features for specific columns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    
    # Calculate diff features for specific columns
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    df = df.to_pandas()
    # Replace infinite values with 0
    return df.replace([np.inf, -np.inf], 0)

def numba_imb_features(df):
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
        
    # Calculate triplet imbalance features using the Numba-optimized function
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
    return df

# 📅 Function to generate time and stock-related features
def other_features(df):
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = df["seconds_in_bucket"] % 60  # Seconds
    df["minute"] = df["seconds_in_bucket"] // 60  # Minutes

    # Map global features to the DataFrame
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())

    return df

# 🚀 Function to generate all features by combining imbalance and other features
def generate_all_features(df):
    # Select relevant columns for feature generation
    cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    df = df[cols]
    
    # Generate imbalance features
    df = imbalance_features(df)
    df = numba_imb_features(df)
    # Generate time and stock-related features
    df = other_features(df)
    gc.collect()  # Perform garbage collection to free up memory
    
    # Select and return the generated features
    feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id"]]
    
    return df[feature_name]

In [6]:
def encode(encoder, x):
    len_encoder = len(encoder)
    try:
        id = encoder[x]
    except KeyError:
        id = len_encoder
    return id

encoders = [{} for cat in cat_cols]


for i, cat in enumerate(cat_cols):
    print('encoding %s ...' % cat, end=' ')
    encoders[i] = {l: id for id, l in enumerate(train.loc[:, cat].astype(str).unique())}
    train[cat] = train[cat].astype(str).apply(lambda x: encode(encoders[i], x))

    print('Done')


embed_sizes = [len(encoder) + 1 for encoder in encoders] #+1 for possible unknown assets

encoding stock_id ... Done
encoding imbalance_buy_sell_flag ... Done


In [7]:
embed_sizes

[201, 4]

In [8]:
#X_train = train[features].copy(deep=True)
#y_train = train['target'].copy(deep=True)

#X_train.fillna(0, inplace = True)
#y_train.fillna(0, inplace = True)


X = train.drop(columns=['target'])  # Your features
#X['stock_id'] = X['stock_id'].astype("category")
y = train['target']  # Your target variable
groups = train['date_id'] # Extracting just the 'time_id' column for grouping

groups.reset_index(inplace=True, drop=True)

#train.fillna(0, inplace = True)
#X.fillna(0, inplace = True)
#y.fillna(0, inplace = True)

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization, Concatenate, Embedding, Flatten, Reshape, LSTM, Conv1D, MaxPooling1D
from tensorflow.keras import Model
from tensorflow.keras.losses import mean_absolute_error
import math

import gc
import warnings

gc.enable()

categorical_inputs = []
for cat in cat_cols:
    categorical_inputs.append(Input(shape=[1], name=cat))

categorical_embeddings = []
for i, cat in enumerate(cat_cols):
    categorical_embeddings.append(Embedding(embed_sizes[i], int(math.sqrt(embed_sizes[i]))+1)(categorical_inputs[i]))

concatenated = Concatenate()([Flatten()(cat_emb) for cat_emb in categorical_embeddings])
categorical_logits = Flatten()(concatenated)
categorical_logits = Dense(32,activation='relu')(categorical_logits)
categorical_logits =Dropout(0.25)(categorical_logits)
categorical_logits =BatchNormalization()(categorical_logits)
categorical_logits = Dense(32,activation='relu')(categorical_logits)


# Input for numerical data
numerical_inputs = Input(shape=(11,), name='num')
numerical_logits = numerical_inputs
numerical_logits = BatchNormalization()(numerical_logits)

numerical_logits = Dense(128,activation='relu')(numerical_logits)
numerical_logits = Dropout(0.25)(numerical_logits)
numerical_logits = BatchNormalization()(numerical_logits)
numerical_logits = Dense(128,activation='relu')(numerical_logits)
numerical_logits = Dense(64,activation='relu')(numerical_logits)


logits = Concatenate()([numerical_logits,categorical_logits])
logits = Dense(64,activation='relu')(logits)
out = Dense(1, activation='linear')(logits)

model = Model(inputs = categorical_inputs + [numerical_inputs], outputs=out)
model.compile(optimizer='adam',loss=mean_absolute_error)

model.summary(line_length=88)

2023-11-22 20:14:22.461723: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-22 20:14:22.477116: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-22 20:14:22.477137: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-22 20:14:22.477150: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-22 20:14:22.480503: I tensorflow/core/platform/cpu_feature_g

Model: "model"
________________________________________________________________________________________
 Layer (type)             Output Shape              Param    Connected to               
                                                    #                                   
 stock_id (InputLayer)    [(None, 1)]               0        []                         
                                                                                        
 imbalance_buy_sell_flag  [(None, 1)]               0        []                         
  (InputLayer)                                                                          
                                                                                        
 embedding (Embedding)    (None, 1, 15)             3015     ['stock_id[0][0]']         
                                                                                        
 embedding_1 (Embedding)  (None, 1, 3)              12       ['imbalance_buy_sell_flag[0
      

2023-11-22 20:14:23.182716: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-22 20:14:23.185385: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-11-22 20:14:23.185479: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [10]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, valid_idx = next(gss.split(train, groups=train['date_id']))



In [11]:
# https://www.kaggle.com/guowenrui/market-nn-if-you-like-you-can-use-it-and-upvote
class SWA(tf.keras.callbacks.Callback):
    
    def __init__(self, filepath, swa_epoch):
        super(SWA, self).__init__()
        self.filepath = filepath
        self.swa_epoch = swa_epoch 
    
    def on_train_begin(self, logs=None):
        self.nb_epoch = self.params['epochs']
        print('Stochastic weight averaging selected for last {} epochs.'
              .format(self.nb_epoch - self.swa_epoch))
        
    def on_epoch_end(self, epoch, logs=None):
        
        if epoch == self.swa_epoch:
            self.swa_weights = self.model.get_weights()
            
        elif epoch > self.swa_epoch:    
            for i in range(len(self.swa_weights)):
                self.swa_weights[i] = (self.swa_weights[i] * 
                    (epoch - self.swa_epoch) + self.model.get_weights()[i])/((epoch - self.swa_epoch)  + 1)  

        else:
            pass
        
    def on_train_end(self, logs=None):
        self.model.set_weights(self.swa_weights)
        print('Final model parameters set to stochastic weight average.')
        self.model.save_weights(self.filepath)
        print('Final stochastic averaged weights saved to file.')
        
class SnapshotCallbackBuilder:
    def __init__(self, nb_epochs, nb_snapshots, init_lr=0.15):
        self.T = nb_epochs
        self.M = nb_snapshots
        self.alpha_zero = init_lr

    def get_callbacks(self, model_prefix='Model'):

        callback_list = [
            callbacks.ModelCheckpoint("model.hdf5",monitor='val_my_iou_metric', 
                                   mode = 'max', save_best_only=True, verbose=500),
            swa,
            callbacks.LearningRateScheduler(schedule=self._cosine_anneal_schedule)
        ]

        return callback_list

    def _cosine_anneal_schedule(self, t):
        cos_inner = np.pi * (t % (self.T // self.M))  # t - 1 is used when t has 1-based indexing.
        cos_inner /= self.T // self.M
        cos_out = np.cos(cos_inner) + 1
        return float(self.alpha_zero / 2 * cos_out)

In [12]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, valid_idx = next(gss.split(X, groups=X['date_id']))

In [13]:
def get_input(market_train, market_target, indices):
    X_num = market_train[num_cols].iloc[indices].values
    X = {'num':X_num}
    for cat in cat_cols:
        X[cat] = market_train[cat].iloc[indices].values
    y = market_target.iloc[indices].values
    bid_size = market_train['bid_size_not_norm'].iloc[indices]
    ask_size = market_train['ask_size_not_norm'].iloc[indices]

    return X,y, bid_size, ask_size

# r, u and d are used to calculate the scoring metric
X_train,y_train, bid_size_train, ask_size_train = get_input(X, y, train_idx)
X_valid,y_valid, bid_size_valid, ask_size_valid = get_input(X,y, valid_idx)

In [14]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

if train_model:
    check_point = ModelCheckpoint('model.hdf5',verbose=True, save_best_only=True)
    early_stop = EarlyStopping(patience=15,verbose=True)
    reduce_lr = ReduceLROnPlateau( mode = 'max',factor=0.5, patience=5, min_lr=0.0001, verbose=1)
    swa = SWA('model_swa.hdf5',6)
    
    with tf.device('/GPU:0'):
        model.fit(X_train, y_train,
                  validation_data=(X_valid,y_valid),
                  batch_size=1000,
                  epochs=30,
                  verbose=False,
                  callbacks=[early_stop,check_point, reduce_lr, swa])

Stochastic weight averaging selected for last 24 epochs.


2023-11-22 20:14:25.317136: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f6034662220 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-22 20:14:25.317150: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti, Compute Capability 8.9
2023-11-22 20:14:25.319518: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-22 20:14:25.325916: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-22 20:14:25.367332: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.



Epoch 1: val_loss improved from inf to 6.08239, saving model to model.hdf5

Epoch 2: val_loss improved from 6.08239 to 6.07676, saving model to model.hdf5

Epoch 3: val_loss did not improve from 6.07676

Epoch 4: val_loss improved from 6.07676 to 6.07341, saving model to model.hdf5

Epoch 5: val_loss improved from 6.07341 to 6.06627, saving model to model.hdf5

Epoch 6: val_loss improved from 6.06627 to 6.06534, saving model to model.hdf5

Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.

Epoch 7: val_loss improved from 6.06534 to 6.06047, saving model to model.hdf5

Epoch 8: val_loss improved from 6.06047 to 6.05962, saving model to model.hdf5

Epoch 9: val_loss did not improve from 6.05962

Epoch 10: val_loss improved from 6.05962 to 6.05915, saving model to model.hdf5

Epoch 11: val_loss improved from 6.05915 to 6.05896, saving model to model.hdf5

Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.

Epoch 12: val_loss improved fr

In [20]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GroupShuffleSplit

def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out
    
y_min, y_max = -64, 64
if train_model:
    model.load_weights('model.hdf5')
else:
    model.load_weights(model_to_restore)
    
model.load_weights('model_swa.hdf5')


In [21]:
# Making predictions on the test set
predictions = model.predict(X_valid)[:,0]



In [28]:
X_valid['stock_id']


array([  0,   1,   2, ..., 189, 190, 196])

In [31]:
predictions_to_clip = zero_sum(predictions, bid_size_valid + ask_size_valid)

min_values = np.array([stock_id_ranges[stock_id][0] for stock_id in X_valid['stock_id']])
max_values = np.array([stock_id_ranges[stock_id][1] for stock_id in X_valid['stock_id']])
    
clipped_predictions = np.clip(predictions_to_clip, min_values, max_values)


# Calculate the mean absolute and squared error
print("Mean Absolute Error:", mean_absolute_error(y_valid, predictions))
print("Mean Relative Error:", mean_absolute_percentage_error(y_valid, predictions))



print("Mean Absolute Error CP:", mean_absolute_error(y_valid, clipped_predictions))
print("Mean Relative Error CP:", mean_absolute_percentage_error(y_valid, clipped_predictions))

Mean Absolute Error: 6.0543914551045726
Mean Relative Error: 2594223027671.914
Mean Absolute Error CP: 6.053992847317898
Mean Relative Error CP: 2591382232413.9473


In [18]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()


from tqdm.auto import tqdm

TRAIN_TARGET = "target"

def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out
    
y_min, y_max = -64, 64

# Making predictions on the test set

counter = 0
for (df_test, revealed_targets, sample_prediction) in tqdm(iter_test):
#     display(df_test); display(revealed_targets); display(sample_prediction)
    test_nan_ = df_test.isnull().sum()
    df_test['far_price'].fillna(0, inplace=True)
    df_test['near_price'].fillna(1, inplace=True)
    
    df_test['bid_size_not_norm']=df_test['bid_size']
    df_test['ask_size_not_norm']=df_test['ask_size']
    
    df_test[cols_fill_nan] = df_test[cols_fill_nan].fillna(train_grouped_median)
    test_nan = pd.DataFrame(dict(before=test_nan_, after=df_test.isnull().sum()))

    df_test[num_cols] = df_test[num_cols].astype('float32')

    df_test[num_cols] = robust_scaler.fit_transform(df_test[num_cols])
    
    X = {'num':df_test[num_cols].values}
    for i, column in enumerate(cat_cols):
        X[column] = df_test[column].astype(str).apply(lambda x: encode(encoders[i], x)).values

    #X_test = scaler.transform(X_test)

    df_test[TRAIN_TARGET] = model.predict(X)[:,0]
    if counter < 5:
        display(df_test.head())
        display(test_nan.T)

    preds = zero_sum(df_test[TRAIN_TARGET], df_test['bid_size_not_norm'] + df_test['ask_size_not_norm'])
    clipped_predictions = np.clip(preds, y_min, y_max)
    sample_prediction['target'] = clipped_predictions
    prediction = sample_prediction
    env.predict(prediction)
    counter += 1

ModuleNotFoundError: No module named 'optiver2023'