# Objective of this notebook:
1. Explain how this notebook works, step by step.
2. Serve as a stepping stone so that its functions can be reused across other time-series (TS) prediction domains.

### Competition Explanation
1. Leaderboard score: 6476
2. Notebook score: 9766. It served as the baseline and building block for a later notebook that scored 11,480.
3. Notebook Credit: https://www.kaggle.com/code/tarlannazarov/own-jane-street-with-keras-nn/notebook

In [9]:
# Print the TensorFlow, Keras and NumPy versions so the environment is explicit and dependency issues can be avoided
import tensorflow
import keras
import numpy 
# Report each library's version, one per line, in import order.
for module in (tensorflow, keras, numpy):
    print(module.__version__)

2.7.0-dev20210806
2.6.0
1.19.5


In [5]:
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from random import choices


# Seed NumPy and TensorFlow with the same value for repeatable runs.
SEED = 1111
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Alternative data sources, kept for reference:
# train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
# train = pd.read_csv('../input/synthetic-jane-street-dataset/train.csv',engine='python')

# Load the local data split once; work on a copy so the raw frame stays untouched.
train_orig = pd.read_csv('./Jane Street Data/Splitted_Data/JaneStreet_Part0.csv')
train = train_orig.copy()

In [7]:
# Display the working copy of the training frame (the cell's last expression is rendered).
train

Unnamed: 0.1,Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,action
0,26362,6,1.758028,0.000596,-0.000543,-0.003567,-0.002244,0.000164,1,-3.172026,...,0.641876,0.795410,0.337828,1.292695,6.078467,-2.141768,-3.630178,4.806393,1181473,1
1,26363,6,2.880795,0.000708,0.000629,0.002001,0.001216,0.000674,-1,-1.862270,...,-0.814259,2.306749,4.012111,0.043977,-2.754400,-2.217010,-1.578796,0.741731,1273349,1
2,26364,6,9.118387,0.024360,0.028085,0.026842,0.053148,0.079602,1,0.285112,...,5.145386,-0.311186,0.262822,2.664963,1.783466,-0.820957,1.262784,-0.615489,2238809,1
3,26365,6,10.575239,-0.007252,-0.013773,-0.033410,-0.048703,-0.040168,-1,0.217926,...,-0.399924,1.578563,-0.193868,-1.338982,2.696016,-3.583999,2.477546,-0.361818,2189869,0
5,26367,6,3.907241,-0.033508,-0.056233,-0.078685,-0.082471,-0.086419,-1,1.418044,...,1.355057,1.659399,0.554493,0.424183,-1.588647,1.661917,-1.742249,0.360720,1011153,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168254,194616,41,0.127290,-0.002901,-0.004369,-0.015155,-0.026200,-0.019054,1,0.895137,...,1.453422,1.772596,-0.060673,5.001509,-0.784987,1.476904,0.847102,-1.697419,1787297,0
168255,194617,41,9.499657,-0.000010,-0.000433,-0.002241,-0.003893,-0.000241,-1,2.338139,...,1.654635,-3.354002,-1.561084,-0.021235,0.304516,0.578562,2.351930,-1.043938,2245831,0
168256,194618,41,0.880071,-0.009220,-0.010878,-0.016373,-0.005701,-0.004407,-1,3.033749,...,-0.441910,2.025904,1.111816,-0.007807,-0.383317,-0.112840,-0.702947,-0.997980,1805522,0
168258,194620,41,0.078104,-0.001270,-0.001045,0.006422,0.016154,0.010319,1,2.024954,...,0.218148,0.648862,-1.365715,0.255495,0.601280,3.224246,-0.650631,-1.370604,887604,1


In [10]:

# Keep only rows with date > 5, then drop zero-weight rows.
# The explicit .copy() makes `train` an independent frame, so the fill and the
# column assignment below do not act on a slice of the previous frame
# (avoids SettingWithCopyWarning / writes that fail to propagate).
train = train.query('date > 5').reset_index(drop=True)
train = train[train['weight'] != 0].copy()

# Impute missing values with each column's mean (re-assignment instead of inplace).
train = train.fillna(train.mean())

# Binary single-target label: 1 when `resp` is positive, else 0.
train['action'] = (train['resp'] > 0).astype(int)

# Every column whose name contains "feature" is a model input.
features = [c for c in train.columns if "feature" in c]

# Per-column means of all features except the first one in `features`.
f_mean = np.mean(train[features[1:]].values, axis=0)

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']

X_train = train[features]

# Multi-label target: one 0/1 column per entry of `resp_cols` (1 when that
# response is positive). Shape is (n_rows, len(resp_cols)).
# The single-target alternative would be: y_train = train.loc[:, 'action']
y_train = (train[resp_cols] > 0).astype('int').to_numpy()




def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a feed-forward multi-label classifier.

    Architecture: input -> BatchNorm -> Dropout -> repeated
    [Dense -> BatchNorm -> swish -> Dropout] -> Dense(num_labels) -> sigmoid.

    Args:
        num_columns: Number of input features (width of the input layer).
        num_labels: Number of sigmoid outputs.
        hidden_units: Sequence with the width of each hidden Dense layer.
        dropout_rates: Sequence of dropout rates. The first entry is applied
            to the normalised input, the following ones after each hidden
            layer, so at least ``len(hidden_units) + 1`` entries are needed.
            Extra entries are ignored.
        label_smoothing: Label-smoothing factor passed to BinaryCrossentropy.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` Model that tracks an ``AUC`` metric.

    Raises:
        ValueError: If ``dropout_rates`` has fewer than
            ``len(hidden_units) + 1`` entries.
    """
    # Fail fast with a clear message instead of an IndexError half-way
    # through building the graph.
    if len(dropout_rates) < len(hidden_units) + 1:
        raise ValueError(
            "dropout_rates needs at least len(hidden_units) + 1 entries "
            f"(got {len(dropout_rates)} for {len(hidden_units)} hidden layers)"
        )

    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    # Pair every hidden layer with the dropout rate that follows it.
    for units, rate in zip(hidden_units, dropout_rates[1:]):
        x = tf.keras.layers.Dense(units)(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(rate)(x)

    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        # `metrics` is documented as a list of metrics; pass one explicitly.
        metrics=[tf.keras.metrics.AUC(name="AUC")],
    )

    return model


# Training hyper-parameters.
batch_size = 5000
hidden_units = [150, 150, 150]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

# One sigmoid output per response column, so the label count follows `resp_cols`
# instead of a hard-coded 5.
clf = create_mlp(
    len(features), len(resp_cols), hidden_units, dropout_rates, label_smoothing, learning_rate
)

# Use the configured `batch_size` rather than repeating the literal.
clf.fit(X_train, y_train, epochs=10, batch_size=batch_size)

# Trained models, kept in a list so predictions can be averaged over several models.
models = [clf]

# Decision threshold used by the (commented-out) inference loop: predict 1 when score >= th.
th = 0.5000


# f = np.median
# models = models[-3:]
# import janestreet
# env = janestreet.make_env()
# for (test_df, pred_df) in tqdm(env.iter_test()):
#     if test_df['weight'].item() > 0:
#         x_tt = test_df.loc[:, features].values
#         if np.isnan(x_tt[:, 1:].sum()):
#             x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean
#         pred = np.mean([model(x_tt, training = False).numpy() for model in models],axis=0)
#         pred = f(pred)
#         pred_df.action = np.where(pred >= th, 1, 0).astype(int)
#     else:
#         pred_df.action = 0
#     env.predict(pred_df)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
