In [1]:
# lib import
import numpy as np
from numpy import inf
import pandas as pd
import tensorflow_decision_forests as tfdf

TRAINING = True

In [2]:
def generate_features(df):
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
               'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2'
               ]

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']

    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            if i>j:
                df[f'{a}_{b}_imb'] = df.eval(f'({a}-{b})/({a}+{b})')
                features.append(f'{a}_{b}_imb')

    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            for k,c in enumerate(prices):
                if i>j and j>k:
                    max_ = df[[a,b,c]].max(axis=1)
                    min_ = df[[a,b,c]].min(axis=1)
                    mid_ = df[[a,b,c]].sum(axis=1)-min_-max_

                    df[f'{a}_{b}_{c}_imb2'] = (max_-mid_)/(mid_-min_)
                    features.append(f'{a}_{b}_{c}_imb2')

    return df[features]

In [3]:
if TRAINING:
    # load dataset 
    df_train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
    df_features = generate_features(df_train)
    
    # data is defaulty normalized
    X = np.float32(df_features.values)
    Y = np.float32(df_train['target'].values)

    X = X[np.isfinite(Y)]
    Y = Y[np.isfinite(Y)]

    index = np.arange(len(X)) #array for indexing

    max_value = np.finfo(X.dtype).max #max float value allowed by numpy, perhaps lower? to prevent NAN during fit
    min_value = np.finfo(X.dtype).min
#     X[X==inf] = max_value
#     X[~np.isfinite(X)] = min_value
    X[X==inf] = 3.40282e+14
    X[~np.isfinite(X)] = -3.40282e+14
    
    del df_train
    del df_features

In [4]:
import joblib
import os


models = []

# test to train ratio
N_fold = 5

os.system('mkdir models')

model_path ='/kaggle/input/testing_data_set/models'

def train(model_dict, modelname='gbtm'):
    if TRAINING:
        model = model_dict[modelname]
        model.fit(X[index%N_fold!=0], Y[index%N_fold!=0],
          validation_data=(X[index%N_fold==0], Y[index%N_fold==0]),
          verbose = 2
         )
        models.append(model)
        joblib.dump(model, f'./models/{modelname}_{i}.model')
    else:
        models.append(joblib.load(f'{model_path}/{modelname}_{i}.model'))


model_dict = {
    'gbtm': tfdf.keras.GradientBoostedTreesModel(task = tfdf.keras.Task.REGRESSION,
                                             split_axis='AXIS_ALIGNED',
                                             categorical_algorithm='CART',
                                             growing_strategy='LOCAL',
                                             max_depth = 8,
                                             sampling_method='RANDOM',
                                             subsample=0.9,
                                             shrinkage=0.1,
                                             min_examples=10,
                                             num_candidate_attributes_ratio=0.9,
                                             early_stopping='LOSS_INCREASE',
                                             early_stopping_initial_iteration=100,
                                             random_seed = 69,
                                             num_trees=500
                                            ),
}
for i in range(N_fold):
    train(model_dict, 'gbtm')

Use /tmp/tmp15fgtvzg as temporary training directory




Reading training dataset...
Training tensor examples:
Features: Tensor("data:0", shape=(None, 49), dtype=float32)
Label: Tensor("data_1:0", shape=(None,), dtype=float32)
Weights: None
Normalized tensor features:
 {'data:0.0': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'strided_slice:0' shape=(None,) dtype=float32>), 'data:0.1': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'strided_slice_1:0' shape=(None,) dtype=float32>), 'data:0.2': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'strided_slice_2:0' shape=(None,) dtype=float32>), 'data:0.3': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'strided_slice_3:0' shape=(None,) dtype=float32>), 'data:0.4': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'strided_slice_4:0' shape=(None,) dtype=float32>), 'data:0.5': SemanticTensor(semantic=<Semantic.NUMERICAL: 1>, tensor=<tf.Tensor 'strided_slice_5:0' shape=(None,) dtype=float32>), 'data:0.

[INFO 23-11-06 02:45:53.9439 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-11-06 02:45:53.9440 UTC kernel.cc:774] Collect training examples
[INFO 23-11-06 02:45:53.9440 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-11-06 02:45:53.9459 UTC kernel.cc:393] Number of batches: 130948
[INFO 23-11-06 02:45:53.9459 UTC kernel.cc:394] Number of examples: 4190313
[INFO 23-11-06 02:45:55.7157 UTC kernel.cc:794] Training dataset:
Number of records: 4190313
Number of columns: 50

Number of columns by type:
	NUMERICAL: 50 (100%)

Columns:

NUMERICAL: 50 (100%)
	0: "__LABEL" NUMERICAL mean:-0.0497912 min:-302.23 max:446.07 sd:9.38648
	1: "data:0.0" NUMERICAL mean:270.009 min:0 max:540 sd:158.745
	2: "data:0.1" 

Model trained in 1:12:07.302940
Compiling model...
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: could not get source code
Model compiled.
Reading training dataset...
Training dataset read in 0:00:27.507813. Found 4190313 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(1047579, shape=(), dtype=int32)
Validation dataset read in 0:00:20.168994. Found 1047579 examples.
Training model...


[INFO 23-11-06 03:59:22.7081 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-11-06 03:59:22.7081 UTC kernel.cc:774] Collect training examples
[INFO 23-11-06 03:59:22.7081 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-11-06 03:59:22.7083 UTC kernel.cc:393] Number of batches: 261896
[INFO 23-11-06 03:59:22.7083 UTC kernel.cc:394] Number of examples: 4190313
[INFO 23-11-06 03:59:24.4941 UTC kernel.cc:794] Training dataset:
Number of records: 4190313
Number of columns: 50

Number of columns by type:
	NUMERICAL: 50 (100%)

Columns:

NUMERICAL: 50 (100%)
	0: "__LABEL" NUMERICAL mean:-0.0497912 min:-302.23 max:446.07 sd:9.38648
	1: "data:0.0" NUMERICAL mean:270.009 min:0 max:540 sd:158.745
	2: "data:0.1" 

Model trained in 0:01:01.827569
Compiling model...
Model compiled.
Reading training dataset...
Training dataset read in 0:00:26.353856. Found 4190313 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(1047579, shape=(), dtype=int32)
Validation dataset read in 0:00:21.510319. Found 1047579 examples.
Training model...


[INFO 23-11-06 04:01:43.7006 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-11-06 04:01:43.7007 UTC kernel.cc:774] Collect training examples
[INFO 23-11-06 04:01:43.7007 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-11-06 04:01:43.7009 UTC kernel.cc:393] Number of batches: 392844
[INFO 23-11-06 04:01:43.7009 UTC kernel.cc:394] Number of examples: 4190313
[INFO 23-11-06 04:01:45.4587 UTC kernel.cc:794] Training dataset:
Number of records: 4190313
Number of columns: 50

Number of columns by type:
	NUMERICAL: 50 (100%)

Columns:

NUMERICAL: 50 (100%)
	0: "__LABEL" NUMERICAL mean:-0.0497912 min:-302.23 max:446.07 sd:9.38648
	1: "data:0.0" NUMERICAL mean:270.009 min:0 max:540 sd:158.745
	2: "data:0.1" 

Model trained in 0:01:01.654858
Compiling model...
Model compiled.
Reading training dataset...
Training dataset read in 0:00:26.175233. Found 4190313 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(1047579, shape=(), dtype=int32)
Validation dataset read in 0:00:21.427597. Found 1047579 examples.
Training model...


[INFO 23-11-06 04:04:04.0838 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-11-06 04:04:04.0838 UTC kernel.cc:774] Collect training examples
[INFO 23-11-06 04:04:04.0838 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-11-06 04:04:04.0840 UTC kernel.cc:393] Number of batches: 523792
[INFO 23-11-06 04:04:04.0840 UTC kernel.cc:394] Number of examples: 4190313
[INFO 23-11-06 04:04:05.8406 UTC kernel.cc:794] Training dataset:
Number of records: 4190313
Number of columns: 50

Number of columns by type:
	NUMERICAL: 50 (100%)

Columns:

NUMERICAL: 50 (100%)
	0: "__LABEL" NUMERICAL mean:-0.0497912 min:-302.23 max:446.07 sd:9.38648
	1: "data:0.0" NUMERICAL mean:270.009 min:0 max:540 sd:158.745
	2: "data:0.1" 

Model trained in 0:01:01.897259
Compiling model...
Model compiled.
Reading training dataset...
Training dataset read in 0:00:25.883371. Found 4190313 examples.
Reading validation dataset...
Num validation examples: tf.Tensor(1047579, shape=(), dtype=int32)
Validation dataset read in 0:00:21.883271. Found 1047579 examples.
Training model...


[INFO 23-11-06 04:07:08.0367 UTC kernel.cc:773] Start Yggdrasil model training
[INFO 23-11-06 04:07:08.0368 UTC kernel.cc:774] Collect training examples
[INFO 23-11-06 04:07:08.0368 UTC kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

[INFO 23-11-06 04:07:08.0371 UTC kernel.cc:393] Number of batches: 654740
[INFO 23-11-06 04:07:08.0371 UTC kernel.cc:394] Number of examples: 4190313
[INFO 23-11-06 04:07:09.7881 UTC kernel.cc:794] Training dataset:
Number of records: 4190313
Number of columns: 50

Number of columns by type:
	NUMERICAL: 50 (100%)

Columns:

NUMERICAL: 50 (100%)
	0: "__LABEL" NUMERICAL mean:-0.0497912 min:-302.23 max:446.07 sd:9.38648
	1: "data:0.0" NUMERICAL mean:270.009 min:0 max:540 sd:158.745
	2: "data:0.1" 

Model trained in 0:01:02.116338
Compiling model...
Model compiled.


In [5]:
def mae(ypred, ytrue):
  return np.mean(np.abs((ypred.flatten() - ytrue)))
for m in models:
  pred = m.predict(X[index%N_fold==0])
  print(mae(pred, Y[index%N_fold==0]))

6.385889
6.385889
6.385889
6.385889
6.385889


In [6]:
tfdf.model_plotter.plot_model_in_colab(model=models[0], tree_idx=0, max_depth=8)

In [7]:
'''
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()
'''

'\nimport optiver2023\nenv = optiver2023.make_env()\niter_test = env.iter_test()\n'

In [8]:
'''
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    if counter == 0:
        print(test.head(3))
        print(revealed_targets.head(3))
        print(sample_prediction.head(3))
    sample_prediction['target'] = model.predict(test)
    env.predict(sample_prediction)
    counter += 1
'''

"\ncounter = 0\nfor (test, revealed_targets, sample_prediction) in iter_test:\n    if counter == 0:\n        print(test.head(3))\n        print(revealed_targets.head(3))\n        print(sample_prediction.head(3))\n    sample_prediction['target'] = model.predict(test)\n    env.predict(sample_prediction)\n    counter += 1\n"