In [1]:
# lib import
import copy

import numpy as np
import pandas as pd
# Mode switch for the whole notebook:
#   True  -> read train.csv, build X/Y and fit one model per fold (saved under ./models)
#   False -> skip training and load the previously saved fold models from `model_path`
TRAINING = False

In [2]:
def generate_features(df):
    """Build the feature frame fed to the models.

    The returned columns are, in this fixed order:

    1. the raw input columns plus the two size imbalances ``imb_s1`` and
       ``imb_s2`` (with ``prev_target`` last),
    2. ``{a}_{b}_imb = (a - b) / (a + b)`` for every pair of price columns,
    3. ``{a}_{b}_{c}_imb2 = (max - mid) / (mid - min)`` for every triple of
       price columns.

    The order matters: callers pass ``.values`` to the models, so columns are
    matched by position, not by name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every raw column listed in ``features`` below (all of
        them except ``imb_s1``/``imb_s2``). The frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the feature columns, indexed like ``df``.
        Ratios whose denominator is zero are left as inf/NaN, exactly as the
        arithmetic produces them.
    """
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price', 'far_price', 'near_price', 'ask_price',
                'bid_price', 'wap',
                'imb_s1', 'imb_s2', 'prev_target']
    prices = ['reference_price', 'far_price', 'near_price', 'ask_price',
              'bid_price', 'wap', 'prev_target']

    # Work on a private copy of just the raw inputs so the caller's frame is
    # not silently widened by dozens of derived columns.
    raw_columns = [col for col in features if col not in ('imb_s1', 'imb_s2')]
    work = df[raw_columns].copy()

    work['imb_s1'] = work.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    work['imb_s2'] = work.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    # Pairwise price imbalances. `a` always comes later in `prices` than `b`;
    # the iteration order fixes the column order and must not change.
    for i, a in enumerate(prices):
        for b in prices[:i]:
            name = f'{a}_{b}_imb'
            work[name] = work.eval(f'({a}-{b})/({a}+{b})')
            features.append(name)

    # Triplet imbalances over every combination with positions i > j > k.
    for i, a in enumerate(prices):
        for j, b in enumerate(prices[:i]):
            for c in prices[:j]:
                trio = work[[a, b, c]]
                max_ = trio.max(axis=1)
                min_ = trio.min(axis=1)
                # Middle value = total minus the two extremes.
                mid_ = trio.sum(axis=1) - min_ - max_

                name = f'{a}_{b}_{c}_imb2'
                work[name] = (max_ - mid_) / (mid_ - min_)
                features.append(name)

    return work[features]

In [3]:
# fix data
def add_targets(df):
    """Attach a sparse ``prev_target`` column to the training frame.

    The ``target`` of every row with ``seconds_in_bucket == 540`` is re-keyed
    to the following ``date_id`` of the same stock. Only a fixed random 10%
    of those (stock, date) entries is kept (seed 69); they are left-joined
    onto ``df``, so every other row gets ``prev_target = NaN``.

    Returns a new frame; ``df`` itself is not modified.
    """
    keys = ['stock_id', 'date_id']

    # Targets observed in the 540-second bucket, shifted forward by one date.
    at_540 = df.loc[df['seconds_in_bucket'] == 540, keys + ['target']]
    shifted = (
        at_540
        .assign(date_id=at_540['date_id'] + 1)
        .rename(columns={'target': 'prev_target'})
    )

    # Deterministic 10% subsample of the shifted targets.
    keep_n = int(len(shifted) * 0.1)
    subset = shifted.sample(n=keep_n, random_state=69)

    return df.merge(subset, on=keys, how='left')

In [4]:
def add_targets_test(tdf, rdf):
    """Attach ``prev_target`` to a test batch from the revealed targets.

    For each (stock_id, date_id) group in ``rdf`` the last ``revealed_target``
    is taken and left-joined onto ``tdf``. Rows without a match get NaN, and
    the column is always returned as float64.

    Returns a new frame; ``tdf`` is not modified.
    """
    keys = ['stock_id', 'date_id']

    last_revealed = (
        rdf.groupby(keys)
        .agg(prev_target=pd.NamedAgg(column='revealed_target', aggfunc='last'))
        .reset_index()
    )

    merged = tdf.merge(last_revealed, on=keys, how='left')
    # Force a numeric dtype even when nothing matched.
    return merged.assign(prev_target=merged['prev_target'].astype('float64'))

In [5]:
if TRAINING:
    # Load the competition training set and attach the sparse prev_target column.
    df = add_targets(
        pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
    )

    # Feature matrix and labels, restricted to rows whose target is finite
    # (NaN / inf labels cannot be trained on).
    Y = df['target'].values
    X = generate_features(df).values[np.isfinite(Y)]
    Y = Y[np.isfinite(Y)]

    # Row positions of the kept samples; folds are later taken as index % N_fold.
    index = np.arange(len(X))

In [6]:
import lightgbm as lgb 
import joblib
import os


# Fold models, filled by train(): fitted when TRAINING, otherwise loaded from disk.
models = []

# Number of cross-validation folds; fold k validates on rows with index % N_fold == k.
N_fold = 5

# Output directory for freshly trained models. Created without a shell call so
# it works on any platform and does not error when the cell is re-run.
# Kept best-effort: a failure is reported but does not stop an inference-only run.
try:
    os.makedirs('models', exist_ok=True)
except OSError as exc:
    print(f'Could not create ./models: {exc}')

# Directory the pre-trained fold models are loaded from when TRAINING is False.
# (Alternative location used earlier: '/kaggle/input/testing_data_set/models')
model_path ='/kaggle/input/optimize'

def train(model_dict, modelname='lgbm', fold=None):
    """Fit (TRAINING) or load (inference) the model of one fold and append it to ``models``.

    Parameters
    ----------
    model_dict : dict
        Maps a model name to an unfitted estimator used as a template.
    modelname : str
        Key into ``model_dict``; also the file-name prefix of the saved model.
    fold : int, optional
        Fold number in ``range(N_fold)``. When omitted, the module-level loop
        variable ``i`` is used, which is what the existing call site relies on.

    Side effects
    ------------
    Appends the fold's model to the global ``models`` list. In TRAINING mode
    the model is also written to ``./models/{modelname}_{fold}.model``.
    """
    if fold is None:
        # Backward compatibility: callers that do not pass the fold explicitly
        # select it through the global loop variable `i`.
        fold = i

    if not TRAINING:
        # joblib.load unpickles the file, so model_path must only contain
        # artefacts produced by this notebook.
        models.append(joblib.load(f'{model_path}/{modelname}_{fold}.model'))
        return

    # Fit a private copy of the template. Refitting the shared instance would
    # make every entry of `models` point at the same object, i.e. the ensemble
    # would silently consist of N copies of the last fold's model.
    model = copy.deepcopy(model_dict[modelname])

    is_valid = index % N_fold == fold
    model.fit(
        X[~is_valid], Y[~is_valid],
        eval_set=[(X[is_valid], Y[is_valid])],
        # Callback API (LightGBM >= 3.3); the equivalent fit() keywords
        # `early_stopping_rounds` / `verbose` were removed in LightGBM 4.0.
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=10),
        ],
    )
    models.append(model)
    joblib.dump(model, f'./models/{modelname}_{fold}.model')


        
# Estimator templates by name. The numeric values are fixed hyper-parameters
# for the L1-objective LightGBM regressor.
model_dict = dict(
    lgbm=lgb.LGBMRegressor(
        objective='regression_l1',
        boosting_type="gbdt",
        n_estimators=710,
        learning_rate=0.1823960595729828,
        num_leaves=845,
        max_depth=8,
        min_data_in_leaf=234,
        lambda_l1=9,
        lambda_l2=23,
        min_gain_to_split=0.490619353569984,
        bagging_fraction=0.8845141958452706,
        bagging_freq=1,
        feature_fraction=0.5939383574598411,
    ),
)

# One train()/load call per fold. The loop variable must stay named `i`:
# train() picks up the fold number from this global when none is passed.
for i in range(N_fold):
    train(model_dict, 'lgbm')

In [7]:
import optiver2023
# Create the competition environment and the iterator that yields the
# (test, revealed_targets, sample_prediction) batches consumed by the loop below.
# NOTE(review): optiver2023 is presumably supplied by the Kaggle runtime — confirm.
env = optiver2023.make_env()
iter_test = env.iter_test()

In [8]:
# Inference: for every batch served by the API, build features, average the
# fold models' predictions and submit them.
for (test, revealed_targets, sample_prediction) in iter_test:
    # Keep the rows in the order delivered by the API. Predictions are written
    # back to `sample_prediction` by position, so re-sorting the batch here
    # would misalign them whenever the batch is not already in sorted order.
    # The left merge inside add_targets_test preserves row order and returns a
    # new frame, so no defensive copy of `test` is needed either.
    st = add_targets_test(test, revealed_targets)

    feat = generate_features(st)

    # Ensemble = plain mean over the fold models.
    pred = np.mean([model.predict(feat.values) for model in models], axis=0)
    sample_prediction['target'] = pred
    env.predict(sample_prediction)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
