In [1]:
import pandas as pd
import numpy as np

from itertools import groupby
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
from itertools import combinations
from sklearn.decomposition import PCA
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
train = pd.read_csv('train.csv')
revealed_targets = pd.read_csv('example_test_files/revealed_targets.csv')
test = pd.read_csv('example_test_files/test.csv')
sample_submission = pd.read_csv('example_test_files/sample_submission.csv')

Add median MedianVolV2.csv from the [Optiver|Baseline|Models](https://www.kaggle.com/code/ravi20076/optiver-baseline-models)

In [3]:
median_vol = pd.read_csv("archive/MedianVolV2.csv")
median_vol.index.name = "stock_id";
median_vol = median_vol[['overall_medvol', "first5min_medvol", "last5min_medvol"]]

In [4]:
median_sizes = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median()
std_sizes = train.groupby('stock_id')['bid_size'].median() + train.groupby('stock_id')['ask_size'].median()

In [5]:
lgbm_columns = ['stock_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'overall_medvol', 'first5min_medvol',
       'last5min_medvol', 'bid_plus_ask_sizes', 'imbalance_ratio', 'imb_s1',
       'imb_s2', 'ask_x_size', 'bid_x_size', 'ask_minus_bid',
       'bid_price_over_ask_price', 'reference_price_minus_far_price',
       'reference_price_times_far_price', 'reference_price_times_near_price',
       'reference_price_minus_ask_price', 'reference_price_times_ask_price',
       'reference_price_ask_price_imb', 'reference_price_minus_bid_price',
       'reference_price_times_bid_price', 'reference_price_bid_price_imb',
       'reference_price_minus_wap', 'reference_price_times_wap',
       'reference_price_wap_imb', 'far_price_minus_near_price',
       'far_price_times_near_price', 'far_price_minus_ask_price',
       'far_price_times_ask_price', 'far_price_minus_bid_price',
       'far_price_times_bid_price', 'far_price_times_wap', 'far_price_wap_imb',
       'near_price_minus_ask_price', 'near_price_times_ask_price',
       'near_price_ask_price_imb', 'near_price_minus_bid_price',
       'near_price_times_bid_price', 'near_price_bid_price_imb',
       'near_price_minus_wap', 'near_price_wap_imb',
       'ask_price_minus_bid_price', 'ask_price_times_bid_price',
       'ask_price_minus_wap', 'ask_price_times_wap', 'ask_price_wap_imb',
       'bid_price_minus_wap', 'bid_price_times_wap', 'bid_price_wap_imb',
       'reference_price_far_price_near_price_imb2',
       'reference_price_far_price_ask_price_imb2',
       'reference_price_far_price_bid_price_imb2',
       'reference_price_far_price_wap_imb2',
       'reference_price_near_price_ask_price_imb2',
       'reference_price_near_price_bid_price_imb2',
       'reference_price_near_price_wap_imb2',
       'reference_price_ask_price_bid_price_imb2',
       'reference_price_ask_price_wap_imb2',
       'reference_price_bid_price_wap_imb2',
       'far_price_near_price_ask_price_imb2',
       'far_price_near_price_bid_price_imb2', 'far_price_near_price_wap_imb2',
       'far_price_ask_price_bid_price_imb2', 'far_price_ask_price_wap_imb2',
       'far_price_bid_price_wap_imb2', 'near_price_ask_price_bid_price_imb2',
       'near_price_ask_price_wap_imb2', 'near_price_bid_price_wap_imb2',
       'ask_price_bid_price_wap_imb2', 'pca_prices']
weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]

In [6]:
weights_df = pd.DataFrame(data=list(zip(range(0,201),weights)),columns=['stock_id','index_weight'])

In [7]:
train = train.merge(weights_df,on='stock_id')

In [8]:
def generate_prev_race(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    original_cols = df_in.columns
    df[f'wap_t-60'] = df_g['wap'].shift(6)
    df[f'target_t-60'] = df_g['target'].shift(6)
    df[f'initial_wap'] = df_g['wap_calc'].transform('first')
    df[f'initial_bid_size'] = df_g['bid_size'].transform('first')
    df[f'initial_ask_size'] = df_g['ask_size'].transform('first')
    cols = ['bid_price','ask_price','bid_size','ask_size','wap']
    for i in cols:
        df[f'{i}_t-60'] = df_g[i].shift(-1)

    for i in cols:
        df[f'{i}_t10'] = df_g[i].shift(1)

    return(df)

def generate_index(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    df[f'index_wap'] = df_g['wap_weighted'].transform('mean')
    return(df)

def generate_index_2(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    df[f'index_wap_t-60'] = df_g['index_wap'].shift(6)
    df[f'index_wap_init'] = df_g['index_wap'].transform('first')
    return(df)

def generate_index_3(df_in, df_g, rolling_window=10, factor=''):
    df = df_in.copy()
    df[f'index_wap_t-60'] = df_g['index_wap_move_to_init'].shift(6)
    return(df)

In [9]:
def feat_eng(df):
    
    cols = [c for c in df.columns if c not in ['row_id', 'time_id']]
    df = df[cols]
    df = df.merge(median_vol, how = "left", left_on = "stock_id", right_index = True)
    
    df['bid_plus_ask_sizes'] = df['bid_size'] + train['ask_size']
#     df['median_size'] = df['stock_id'].map(median_sizes.to_dict())
    df['std_size'] = df['stock_id'].map(std_sizes.to_dict())
#     df['high_volume'] = np.where(df['bid_plus_ask_sizes'] > df['median_size'], 1, 0) 
    df['imbalance_ratio'] = df['imbalance_size'] / df['matched_size']
    
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    df['ask_x_size'] = df.eval('ask_size*ask_price')
    df['bid_x_size'] = df.eval('bid_size*bid_price')
        
    df['ask_minus_bid'] = df['ask_x_size'] - df['bid_x_size'] 
    
    df["bid_size_over_ask_size"] = df["bid_size"].div(df["ask_size"])
    df["bid_price_over_ask_price"] = df["bid_price"].div(df["ask_price"])
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    
    for c in combinations(prices, 2):
        
        df[f'{c[0]}_minus_{c[1]}'] = (df[f'{c[0]}'] - df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_times_{c[1]}'] = (df[f'{c[0]}'] * df[f'{c[1]}']).astype(np.float32)
        df[f'{c[0]}_{c[1]}_imb'] = df.eval(f'({c[0]}-{c[1]})/({c[0]}+{c[1]})')

    for c in combinations(prices, 3):
        
        max_ = df[list(c)].max(axis=1)
        min_ = df[list(c)].min(axis=1)
        mid_ = df[list(c)].sum(axis=1)-min_-max_

        df[f'{c[0]}_{c[1]}_{c[2]}_imb2'] = (max_-mid_)/(mid_-min_)
    
        
    df.drop(columns=[
        # 'date_id', 
        'reference_price_far_price_imb',
        'reference_price_minus_near_price',
        'reference_price_near_price_imb',
        'far_price_near_price_imb',
        'far_price_ask_price_imb',
        'far_price_bid_price_imb',
        'far_price_minus_wap',
        'std_size',
        'bid_size_over_ask_size',
        'ask_price_bid_price_imb',
        'near_price_times_wap'
    ], inplace=True)
        
    # gc.collect()

    df.replace([np.inf, -np.inf], 0, inplace=True)
    
    return df

In [10]:
train['wap_calc'] = (train['bid_price']*train['ask_size']+train['ask_price']*train['bid_size'])/(train['ask_size']+train['bid_size'])

In [11]:
train['wap_weighted'] = train['wap']*train['index_weight']
train_g = train.groupby(['stock_id','date_id'])
train = generate_prev_race(train,train_g)
train['delta_wap'] = train['wap']/train['wap_t-60']

train_g = train.groupby(['seconds_in_bucket','date_id'])
train = generate_index(train,train_g)


train['wap_move_to_init'] = train['wap_calc']/train['initial_wap']
train_g = train.groupby(['date_id'])
train = generate_index_2(train,train_g)

train['index_wap_move_to_init'] = train['index_wap']/train['index_wap_init']
train_g = train.groupby(['date_id'])
train = generate_index_3(train,train_g)

train['bid_price_target'] = train['bid_price']-train['bid_price_t-60']
train['bid_price_t-60'] = train['bid_price_target']*10_000

train['wap_target'] = train['wap']-train['wap_t-60']
train['wap_price_t-60'] = train['wap_target']*10_000

train['ask_price_target'] = train['ask_price']-train['ask_price_t-60']
train['ask_price_t-60'] = train['ask_price_target']*10_000

In [12]:
targets = ['wap', 'bid_price', 'ask_price']
for i in targets:
    train[f'{i}_prev_move'] = (train[f'{i}']-train[f'{i}_t10']).fillna(0)*10000

In [20]:
y = train['wap_price_t-60'].values
X = feat_eng(train.drop(columns='target'))

In [21]:
prices = [c for c in X.columns if ('price' in c) and ('target' not in c) and ('60' not in c)]
# prices = [c for c in train.columns if 'price' in c]
pca_prices = PCA(n_components=1)
X['pca_prices'] = pca_prices.fit_transform(X[prices].fillna(1))

In [22]:
for x in X.columns:
    print(x)

stock_id
date_id
seconds_in_bucket
imbalance_size
imbalance_buy_sell_flag
reference_price
matched_size
far_price
near_price
bid_price
bid_size
ask_price
ask_size
wap
index_weight
wap_calc
wap_weighted
wap_t-60
target_t-60
initial_wap
initial_bid_size
initial_ask_size
bid_price_t-60
ask_price_t-60
bid_size_t-60
ask_size_t-60
bid_price_t10
ask_price_t10
bid_size_t10
ask_size_t10
wap_t10
delta_wap
index_wap
wap_move_to_init
index_wap_t-60
index_wap_init
index_wap_move_to_init
bid_price_target
wap_target
wap_price_t-60
ask_price_target
wap_prev_move
bid_price_prev_move
ask_price_prev_move
overall_medvol
first5min_medvol
last5min_medvol
bid_plus_ask_sizes
imbalance_ratio
imb_s1
imb_s2
ask_x_size
bid_x_size
ask_minus_bid
bid_price_over_ask_price
reference_price_minus_far_price
reference_price_times_far_price
reference_price_times_near_price
reference_price_minus_ask_price
reference_price_times_ask_price
reference_price_ask_price_imb
reference_price_minus_bid_price
reference_price_times_bid_p

In [41]:
X_train = X[[c for c in X.columns if ('target' not in c) and ('60' not in c)]].drop(columns=['delta_wap','date_id'])

In [42]:
%%time

m = lgb.LGBMRegressor(learning_rate=0.018052307589575444, max_depth=20, n_estimators=700,
              num_leaves=442, objective='mae', random_state=42,
              reg_alpha=0.02216069565875271, reg_lambda=0.01223572246957101)
m.fit(X_train, y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.313123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24076
[LightGBM] [Info] Number of data points in the train set: 5237980, number of used features: 99
CPU times: total: 46min 22s
Wall time: 7min 51s


In [43]:
m.booster_.save_model('lgbm_model_new.lgb')

<lightgbm.basic.Booster at 0x17c03e40810>

In [44]:
X.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price',
       ...
       'far_price_near_price_bid_price_imb2', 'far_price_near_price_wap_imb2',
       'far_price_ask_price_bid_price_imb2', 'far_price_ask_price_wap_imb2',
       'far_price_bid_price_wap_imb2', 'near_price_ask_price_bid_price_imb2',
       'near_price_ask_price_wap_imb2', 'near_price_bid_price_wap_imb2',
       'ask_price_bid_price_wap_imb2', 'pca_prices'],
      dtype='object', length=112)

In [45]:
feat_imp = pd.Series(m.feature_importances_, index=X_train.columns).sort_values()
print('Columns with poor contribution', feat_imp[feat_imp<100].index)
fig = px.bar(x=feat_imp, y=feat_imp.index, orientation='h')
fig.show()

Columns with poor contribution Index([], dtype='object')


In [39]:
feat_imp.sort_values()

wap_move_to_init               112
wap_calc                       165
ask_price_times_wap            183
ask_price_times_bid_price      196
reference_price_times_wap      196
                             ...  
ask_price_prev_move           7803
index_wap_move_to_init       28836
seconds_in_bucket            36423
index_wap                    37142
date_id                      54101
Length: 100, dtype: int32

In [38]:
test = feat_eng(test)
test['pca_prices'] = pca_prices.transform(test[prices].fillna(1))

KeyError: "['bid_price_t10', 'ask_price_t10', 'bid_price_prev_move', 'ask_price_prev_move'] not in index"

In [15]:
test.shape

(3, 81)

In [16]:
m.predict(test)

array([0.1014048 , 0.65292689, 0.39743031])

In [15]:
def zero_sum(prices, volumes):
    
#    I got this idea from https://github.com/gotoConversion/goto_conversion/
    
    std_error = np.sqrt(volumes)
    step = np.sum(prices)/np.sum(std_error)
    out = prices-std_error*step
    
    return out

In [16]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [17]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    
    feat = feat_eng(test)
    feat['pca_prices'] = pca_prices.transform(feat[prices].fillna(1))
    sample_prediction['target'] = m.predict(feat)
    
    sample_prediction['target'] = zero_sum(sample_prediction['target'], test.loc[:,'bid_size'] + test.loc[:,'ask_size'])
    
    env.predict(sample_prediction)
    
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
