In [1]:
import tensorflow as tf
# Ask TensorFlow which GPU devices it can see and display the resulting list.
gpu_devices = tf.config.list_physical_devices("GPU")
gpu_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:


import pandas as pd
import os
import numpy as np
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

import seaborn as sns
import matplotlib.pyplot as plt
import datatable as dt


In [3]:
SEED = 123

# Apply the seed to both random number generators used in this notebook so
# that weight initialisation, dropout masks and batch shuffling are repeatable.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Read the competition training file with datatable and convert to pandas.
data = dt.fread('~/jane-street-market-prediction/train.csv').to_pandas()


In [4]:
# Per-date summary of `resp`: sum, sample standard deviation and mean.
# Aggregations are given by name rather than as NumPy callables: pandas
# currently maps np.sum/np.std/np.mean onto these same built-ins, and the
# string form keeps that behaviour explicit (notably std with ddof=1).
# fill_value=0 replaces missing aggregates (e.g. std of a single-row date).
profile = pd.pivot_table(
    data,
    index='date',
    values='resp',
    aggfunc={'resp': ['sum', 'std', 'mean']},
    fill_value=0,
)

# Split the dates into 10 equal-frequency bins by daily std of `resp`;
# labels=False yields integer bin ids, indexed by date.
vol_bins = pd.qcut(profile['std'], 10, labels=False)

In [5]:
# Tag every row with the volatility bin of its date.
# Rows whose date does not appear in `vol_bins` keep the default bin 0.
data['vol_bin'] = 0
for bin_id in np.unique(vol_bins):
    dates_in_bin = vol_bins[vol_bins == bin_id].index
    rows_in_bin = data['date'].isin(dates_in_bin)
    data.loc[rows_in_bin, 'vol_bin'] = bin_id

In [6]:
# Replace missing values with the corresponding column mean. Rebinding `data`
# (instead of inplace=True) keeps the cell idempotent on re-run.
data = data.fillna(data.mean())

# Binary trade label: 1 when the realised return is positive. This matches the
# `actual` column built for scoring later in the notebook (resp > 0); the
# comparison was previously inverted (resp < 0).
data['action'] = (data['resp'].values > 0).astype(int)

# Model inputs: every column whose name contains "feature", plus `weight`.
features = [c for c in data.columns if "feature" in c]
features.append('weight')

In [9]:
# Number of model inputs (all "feature" columns plus `weight`).
n_features = len(features)
n_features

131

In [10]:
# Column-wise mean of every model input except the first entry of `features`.
f_mean = data[features[1:]].values.mean(axis=0)

# Inputs and target used for training.
target_feature = 'vol_bin'
X_train = data[features]
y_train = data[target_feature]

In [11]:
def create_mlp(
    num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
):
    """Build and compile a dense network that re-injects the raw input in every block.

    The input is batch-normalised and passed through dropout. Each hidden
    block then applies Dense -> BatchNorm -> ReLU -> Dropout, concatenates the
    raw input back on, and applies BatchNorm -> Dense -> Dropout. A final
    Dense layer followed by a sigmoid produces one output per label.

    Args:
        num_columns: Width of the input vector.
        num_labels: Number of sigmoid output units.
        hidden_units: Dense width for each hidden block; both Dense layers of
            a block use the same width.
        dropout_rates: ``dropout_rates[0]`` is applied to the input and
            ``dropout_rates[i + 1]`` to both dropouts of hidden block ``i``,
            so ``len(hidden_units) + 1`` entries are read.
        label_smoothing: Passed to ``BinaryCrossentropy``.
        learning_rate: Passed to the Adam optimizer.

    Returns:
        The compiled Keras model, tracking an AUC metric named "AUC".
    """
    layers = tf.keras.layers

    net_input = layers.Input(shape=(num_columns,))
    hidden = layers.BatchNormalization()(net_input)
    hidden = layers.Dropout(dropout_rates[0])(hidden)

    for block_idx, width in enumerate(hidden_units):
        hidden = layers.Dense(width)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation(tf.keras.activations.relu)(hidden)
        hidden = layers.Dropout(dropout_rates[block_idx + 1])(hidden)
        # Skip connection: append the untouched input to the block's features.
        hidden = layers.Concatenate(axis=1)([hidden, net_input])
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Dense(width)(hidden)
        hidden = layers.Dropout(dropout_rates[block_idx + 1])(hidden)

    logits = layers.Dense(num_labels)(hidden)
    probabilities = layers.Activation("sigmoid")(logits)

    model = tf.keras.models.Model(inputs=net_input, outputs=probabilities)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )

    return model

In [None]:


# Training hyperparameters.
# batch_size is the single source of truth for the fit call below; it is set to
# 20000, the value the fit call was actually using.
batch_size = 20000
hidden_units = [130, 130, 130]
dropout_rates = [0.2, 0.2, 0.2, 0.2]  # input dropout + one rate per hidden block
label_smoothing = 1e-2
learning_rate = 1e-3

# One output unit per distinct target value.
num_labels = len(np.unique(y_train))

# create_mlp ends in `num_labels` independent sigmoid units trained with binary
# cross-entropy, so the target must be a (n_samples, num_labels) matrix of 0/1
# indicators. Feeding the raw integer class ids would treat values such as 7 as
# if they were probabilities. to_categorical expects ids in 0..num_labels-1.
y_train_onehot = tf.keras.utils.to_categorical(y_train, num_classes=num_labels)

clf = create_mlp(
    len(features), num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate
)

clf.fit(X_train, y_train_onehot, epochs=200, batch_size=batch_size)

# Decision threshold applied to predicted probabilities.
opt_th = 0.5000


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200

In [None]:
from math import sqrt

def utility_score(df, action_vec):
    """Calculate utility score of a dataframe according to formulas defined at
    https://www.kaggle.com/c/jane-street-market-prediction/overview/evaluation

    Args:
        df: Frame with `date`, `weight` and `resp` columns. It is not modified.
        action_vec: Per-row 0/1 actions, multiplied element-wise with
            ``weight * resp``.

    Returns:
        ``min(max(t, 0), 6) * sum(p_i)``, where ``p_i`` is the per-date sum of
        ``weight * resp * action``. Returns 0.0 when there are no dates or
        every daily total is zero, since ``t`` is undefined in that case.
    """
    # Build the per-row contribution as a standalone Series so the caller's
    # frame does not gain a `p` column as a side effect.
    p = df['weight'] * df['resp'] * action_vec
    p_i = p.groupby(df['date']).sum()

    n_dates = p_i.index.size
    total = p_i.sum()
    sum_of_squares = (p_i ** 2).sum()
    if n_dates == 0 or sum_of_squares == 0:
        # Nothing was traded (or nothing to score): the ratio below would be
        # 0/0, so report zero utility instead of NaN / ZeroDivisionError.
        return 0.0

    t = (total / sqrt(sum_of_squares)) * sqrt(250 / n_dates)
    return min(max(t, 0), 6) * total

In [None]:
# Assemble the per-row frame that utility_score needs. The loaded frame in this
# notebook is `data` (no `train` variable is defined anywhere), and `.copy()`
# makes pred_df independent so the assignments below do not write into a slice.
pred_df = data[['date', 'resp', 'weight']].copy()

# Reference action: trade exactly when the realised return is positive.
pred_df['actual'] = (data['resp'].values > 0).astype(int)

# Model.predict runs the forward pass in batches and returns a NumPy array,
# instead of pushing the whole training set through the model in one call.
preds = clf.predict(X_train.values, batch_size=batch_size)

# NOTE(review): the thresholding below needs one probability per row, but `clf`
# is trained on `target_feature` and has one output per distinct target value.
# Fail with an explicit message if the model is not single-output.
if preds.ndim != 2 or preds.shape[1] != 1:
    raise ValueError(
        f"expected one prediction per row, got predictions of shape {preds.shape}; "
        "train clf on a single binary target before scoring actions"
    )

pred_df['preds'] = preds.ravel()
pred_df['action'] = (pred_df['preds'] >= opt_th) * 1

In [None]:
# Utility of the model's actions relative to the utility of the `actual` actions.
model_utility = utility_score(pred_df, pred_df['action'])
reference_utility = utility_score(pred_df, pred_df['actual'])
model_utility / reference_utility
